Overview

Request 6063 (accepted)

No description set
Submit package Staging / x265 to package Essentials / x265

x265.changes Changed

@@ -1,4 +1,53 @@
 -------------------------------------------------------------------
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.6
+  New features:
+  * Segment based Ratecontrol (SBRC) feature
+  * Motion-Compensated Spatio-Temporal Filtering
+  * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
+    Quantization)
+  * Histogram-Based Scene Change Detection
+  * Film-Grain characteristics as a SEI message to support Film
+    Grain Synthesis(FGS)
+  * Add temporal layer implementation(Hierarchical B-frame
+    implementation)
+  Enhancements to existing features:
+  * Added Dolby Vision 8.4 Profile Support
+  API changes:
+  * Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+  * Add command line parameter for mcstf feature: "--no-mctf".
+  * Add command line parameters for the scene cut aware qp
+    feature: "--scenecut-aware-qp" and "--masking-strength".
+  * Add command line parameters for Histogram-Based Scene Change
+    Detection: "--hist-scenecut".
+  * Add film grain characteristics as a SEI message to the
+    bitstream: "--film-grain <filename>"
+  * cli: add new option --cra-nal (Force nal type to CRA to all
+    frames expect for the first frame, works only with keyint 1)
+  Optimizations:
+  * ARM64 NEON optimizations:- Several time-consuming C
+    functions have been optimized for the targeted platform -
+    aarch64. The overall performance increased by around 20%.
+  * SVE/SVE2 optimizations
+  Bug fixes:
+  * Linux bug to utilize all the cores
+  * Crash with hist-scenecut build when source resolution is not
+    multiple of minCuSize
+  * 32bit and 64bit builds generation for ARM
+  * bugs in zonefile feature (Reflect Zonefile Parameters inside
+    Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
+  * Add x86 ASM implementation for subsampling luma
+  * Fix for abrladder segfault with load reuse level 1
+  * Reorder miniGOP based on temporal layer hierarchy and add
+    support for more B frame
+  * Add MacOS aarch64 build support
+  * Fix boundary condition issue for Gaussian filter
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
+  and 0004-Do-not-build-with-assembly-support-on-arm.patch
+  (courtesy of Debian)
+
+-------------------------------------------------------------------
 Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
 
 - Build libx265_main10 and libx265_main12 unconditionally and
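
As an illustration only (not part of this request): the new 3.6 options named in the changelog above could be exercised roughly as follows. Option spellings are taken from the changelog and the cli.rst changes further down; the file names are placeholders, and the packaged x265 --fullhelp output remains the authoritative reference.

    # illustrative invocation; input.yuv and grain.bin are placeholder files
    x265 --input input.yuv --input-res 1920x1080 --fps 25 \
         --hist-scenecut --sbrc --mcstf \
         --film-grain grain.bin \
         --output out.hevc
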
x265.spec Changed

@@ -1,7 +1,7 @@
 #
 # spec file for package x265
 #
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
 # Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
 #
 # All modifications and additions to the file contributed by third parties
@@ -17,21 +17,22 @@
 #


-%define sover   199
+%define sover   209
 %define libname lib%{name}
 %define libsoname %{libname}-%{sover}
-%define uver    3_5
+%define uver    3_6
 Name:           x265
-Version:        3.5
+Version:        3.6
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
 Group:          Productivity/Multimedia/Video/Editors and Convertors
 URL:            https://bitbucket.org/multicoreware/x265_git
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
-Patch0:         arm.patch
 Patch1:         x265.pkgconfig.patch
 Patch2:         x265-fix_enable512.patch
+Patch3:         0001-Fix-arm-flags.patch
+Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
 BuildRequires:  cmake >= 2.8.8
 BuildRequires:  gcc-c++
 BuildRequires:  nasm >= 2.13
@@ -130,6 +131,8 @@
 %cmake_install
 find %{buildroot} -type f -name "*.a" -delete -print0

+%check
+
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig

0001-Fix-arm-flags.patch Added
41
 
1
@@ -0,0 +1,39 @@
2
+From: Sebastian Ramacher <sramacher@debian.org>
3
+Date: Sun, 21 Jun 2020 17:54:56 +0200
4
+Subject: Fix arm* flags
5
+
6
+---
7
+ source/CMakeLists.txt | 7 ++-----
8
+ 1 file changed, 2 insertions(+), 5 deletions(-)
9
+
10
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
11
+index ab5ddfe..eb9b19b 100755
12
+--- a/source/CMakeLists.txt
13
++++ b/source/CMakeLists.txt
14
+@@ -253,10 +253,7 @@ if(GCC)
15
+     elseif(ARM)
16
+         find_package(Neon)
17
+         if(CPU_HAS_NEON)
18
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
19
+             add_definitions(-DHAVE_NEON)
20
+-        else()
21
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
22
+         endif()
23
+     endif()
24
+   if(ARM64 OR CROSS_COMPILE_ARM64)
25
+@@ -265,13 +262,13 @@ if(GCC)
26
+         find_package(SVE2)
27
+         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
28
+             message(STATUS "Found SVE2")
29
+-          set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
30
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
31
+             add_definitions(-DHAVE_SVE2)
32
+             add_definitions(-DHAVE_SVE)
33
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
34
+         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
35
+             message(STATUS "Found SVE")
36
+-          set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
37
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
38
+             add_definitions(-DHAVE_SVE)
39
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
40
+         elseif(CPU_HAS_NEON)
41
0004-Do-not-build-with-assembly-support-on-arm.patch Added
30
 
1
@@ -0,0 +1,28 @@
2
+From: Sebastian Ramacher <sramacher@debian.org>
3
+Date: Fri, 31 May 2024 23:38:23 +0200
4
+Subject: Do not build with assembly support on arm*
5
+
6
+---
7
+ source/CMakeLists.txt | 9 ---------
8
+ 1 file changed, 9 deletions(-)
9
+
10
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
11
+index 672cc2d..f112330 100755
12
+--- a/source/CMakeLists.txt
13
++++ b/source/CMakeLists.txt
14
+@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
15
+         add_definitions(-DPPC64=1)
16
+         message(STATUS "Detected POWER PPC64 target processor")
17
+     endif()
18
+-elseif(ARMMATCH GREATER "-1")
19
+-    if(CROSS_COMPILE_ARM)
20
+-        message(STATUS "Cross compiling for ARM arch")
21
+-    else()
22
+-        set(CROSS_COMPILE_ARM 0)
23
+-    endif()
24
+-  message(STATUS "Detected ARM target processor")
25
+-    set(ARM 1)
26
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
27
+ elseif(ARM64MATCH GREATER "-1")
28
+     #if(CROSS_COMPILE_ARM64)
29
+         #message(STATUS "Cross compiling for ARM64 arch")
30
arm.patch Deleted
110
 
1
@@ -1,108 +0,0 @@
2
-Index: x265_3.4/source/CMakeLists.txt
3
-===================================================================
4
---- x265_3.4.orig/source/CMakeLists.txt
5
-+++ x265_3.4/source/CMakeLists.txt
6
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
7
-         add_definitions(-DPPC64=1)
8
-         message(STATUS "Detected POWER PPC64 target processor")
9
-     endif()
10
--elseif(ARMMATCH GREATER "-1")
11
--    if(CROSS_COMPILE_ARM)
12
--        message(STATUS "Cross compiling for ARM arch")
13
--    else()
14
--        set(CROSS_COMPILE_ARM 0)
15
--    endif()
16
--    set(ARM 1)
17
--    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
--        message(STATUS "Detected ARM64 target processor")
19
--        set(ARM64 1)
20
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
21
--    else()
22
--        message(STATUS "Detected ARM target processor")
23
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
24
--    endif()
25
-+elseif(${SYSPROC} MATCHES "armv5.*")
26
-+    message(STATUS "Detected ARMV5 system processor")
27
-+    set(ARMV5 1)
28
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
29
-+elseif(${SYSPROC} STREQUAL "armv6l")
30
-+    message(STATUS "Detected ARMV6 system processor")
31
-+    set(ARMV6 1)
32
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
33
-+elseif(${SYSPROC} STREQUAL "armv7l")
34
-+    message(STATUS "Detected ARMV7 system processor")
35
-+    set(ARMV7 1)
36
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
37
-+elseif(${SYSPROC} STREQUAL "aarch64")
38
-+    message(STATUS "Detected AArch64 system processor")
39
-+    set(ARMV7 1)
40
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
41
- else()
42
-     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
43
-     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
44
- endif()
45
--
46
- if(UNIX)
47
-     list(APPEND PLATFORM_LIBS pthread)
48
-     find_library(LIBRT rt)
49
-@@ -238,28 +238,9 @@ if(GCC)
50
-             endif()
51
-         endif()
52
-     endif()
53
--    if(ARM AND CROSS_COMPILE_ARM)
54
--        if(ARM64)
55
--            set(ARM_ARGS -fPIC)
56
--        else()
57
--            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
58
--        endif()
59
--        message(STATUS "cross compile arm")
60
--    elseif(ARM)
61
--        if(ARM64)
62
--            set(ARM_ARGS -fPIC)
63
--            add_definitions(-DHAVE_NEON)
64
--        else()
65
--            find_package(Neon)
66
--            if(CPU_HAS_NEON)
67
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
68
--                add_definitions(-DHAVE_NEON)
69
--            else()
70
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
71
--            endif()
72
--        endif()
73
-+    if(ARMV7)
74
-+        add_definitions(-fPIC)
75
-     endif()
76
--    add_definitions(${ARM_ARGS})
77
-     if(FPROFILE_GENERATE)
78
-         if(INTEL_CXX)
79
-             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
80
-Index: x265_3.4/source/common/cpu.cpp
81
-===================================================================
82
---- x265_3.4.orig/source/common/cpu.cpp
83
-+++ x265_3.4/source/common/cpu.cpp
84
-@@ -39,7 +39,7 @@
85
- #include <machine/cpu.h>
86
- #endif
87
- 
88
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
89
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
90
- #include <signal.h>
91
- #include <setjmp.h>
92
- static sigjmp_buf jmpbuf;
93
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
94
-     }
95
- 
96
-     canjump = 1;
97
--    PFX(cpu_neon_test)();
98
-     canjump = 0;
99
-     signal(SIGILL, oldsig);
100
- #endif // if !HAVE_NEON
101
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
102
-     // which may result in incorrect detection and the counters stuck enabled.
103
-     // right now Apple does not seem to support performance counters for this test
104
- #ifndef __MACH__
105
--    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
106
-+    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
107
- #endif
108
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
109
- #elif X265_ARCH_ARM64
110
baselibs.conf Changed

@@ -1,1 +1,1 @@
-libx265-199
+libx265-209
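
The soname bump (199 to 209) in the spec file and baselibs.conf above means dependent packages will relink against the new library. A quick, illustrative way to confirm the rebuilt shared object carries the expected SONAME (the install path is an assumption for a typical 64-bit layout):

    # illustrative check; adjust the path to the actual build or install location
    readelf -d /usr/lib64/libx265.so.209 | grep SONAME
    # expected output: Library soname: [libx265.so.209]
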
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.S Deleted
416
 
1
@@ -1,414 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm.S"
26
-
27
-.section .rodata
28
-
29
-.align 4
30
-
31
-.text
32
-
33
-
34
-
35
-.macro qpel_filter_0_32b
36
-    movi            v24.8h, #64
37
-    uxtl            v19.8h, v5.8b
38
-    smull           v17.4s, v19.4h, v24.4h
39
-    smull2          v18.4s, v19.8h, v24.8h
40
-.endm
41
-
42
-.macro qpel_filter_1_32b
43
-    movi            v16.8h, #58
44
-    uxtl            v19.8h, v5.8b
45
-    smull           v17.4s, v19.4h, v16.4h
46
-    smull2          v18.4s, v19.8h, v16.8h
47
-
48
-    movi            v24.8h, #10
49
-    uxtl            v21.8h, v1.8b
50
-    smull           v19.4s, v21.4h, v24.4h
51
-    smull2          v20.4s, v21.8h, v24.8h
52
-
53
-    movi            v16.8h, #17
54
-    uxtl            v23.8h, v2.8b
55
-    smull           v21.4s, v23.4h, v16.4h
56
-    smull2          v22.4s, v23.8h, v16.8h
57
-
58
-    movi            v24.8h, #5
59
-    uxtl            v1.8h, v6.8b
60
-    smull           v23.4s, v1.4h, v24.4h
61
-    smull2          v16.4s, v1.8h, v24.8h
62
-
63
-    sub             v17.4s, v17.4s, v19.4s
64
-    sub             v18.4s, v18.4s, v20.4s
65
-
66
-    uxtl            v1.8h, v4.8b
67
-    sshll           v19.4s, v1.4h, #2
68
-    sshll2          v20.4s, v1.8h, #2
69
-
70
-    add             v17.4s, v17.4s, v21.4s
71
-    add             v18.4s, v18.4s, v22.4s
72
-
73
-    uxtl            v1.8h, v0.8b
74
-    uxtl            v2.8h, v3.8b
75
-    ssubl           v21.4s, v2.4h, v1.4h
76
-    ssubl2          v22.4s, v2.8h, v1.8h
77
-
78
-    add             v17.4s, v17.4s, v19.4s
79
-    add             v18.4s, v18.4s, v20.4s
80
-    sub             v21.4s, v21.4s, v23.4s
81
-    sub             v22.4s, v22.4s, v16.4s
82
-    add             v17.4s, v17.4s, v21.4s
83
-    add             v18.4s, v18.4s, v22.4s
84
-.endm
85
-
86
-.macro qpel_filter_2_32b
87
-    movi            v16.4s, #11
88
-    uxtl            v19.8h, v5.8b
89
-    uxtl            v20.8h, v2.8b
90
-    saddl           v17.4s, v19.4h, v20.4h
91
-    saddl2          v18.4s, v19.8h, v20.8h
92
-
93
-    uxtl            v21.8h, v1.8b
94
-    uxtl            v22.8h, v6.8b
95
-    saddl           v19.4s, v21.4h, v22.4h
96
-    saddl2          v20.4s, v21.8h, v22.8h
97
-
98
-    mul             v19.4s, v19.4s, v16.4s
99
-    mul             v20.4s, v20.4s, v16.4s
100
-
101
-    movi            v16.4s, #40
102
-    mul             v17.4s, v17.4s, v16.4s
103
-    mul             v18.4s, v18.4s, v16.4s
104
-
105
-    uxtl            v21.8h, v4.8b
106
-    uxtl            v22.8h, v3.8b
107
-    saddl           v23.4s, v21.4h, v22.4h
108
-    saddl2          v16.4s, v21.8h, v22.8h
109
-
110
-    uxtl            v1.8h, v0.8b
111
-    uxtl            v2.8h, v7.8b
112
-    saddl           v21.4s, v1.4h, v2.4h
113
-    saddl2          v22.4s, v1.8h, v2.8h
114
-
115
-    shl             v23.4s, v23.4s, #2
116
-    shl             v16.4s, v16.4s, #2
117
-
118
-    add             v19.4s, v19.4s, v21.4s
119
-    add             v20.4s, v20.4s, v22.4s
120
-    add             v17.4s, v17.4s, v23.4s
121
-    add             v18.4s, v18.4s, v16.4s
122
-    sub             v17.4s, v17.4s, v19.4s
123
-    sub             v18.4s, v18.4s, v20.4s
124
-.endm
125
-
126
-.macro qpel_filter_3_32b
127
-    movi            v16.8h, #17
128
-    movi            v24.8h, #5
129
-
130
-    uxtl            v19.8h, v5.8b
131
-    smull           v17.4s, v19.4h, v16.4h
132
-    smull2          v18.4s, v19.8h, v16.8h
133
-
134
-    uxtl            v21.8h, v1.8b
135
-    smull           v19.4s, v21.4h, v24.4h
136
-    smull2          v20.4s, v21.8h, v24.8h
137
-
138
-    movi            v16.8h, #58
139
-    uxtl            v23.8h, v2.8b
140
-    smull           v21.4s, v23.4h, v16.4h
141
-    smull2          v22.4s, v23.8h, v16.8h
142
-
143
-    movi            v24.8h, #10
144
-    uxtl            v1.8h, v6.8b
145
-    smull           v23.4s, v1.4h, v24.4h
146
-    smull2          v16.4s, v1.8h, v24.8h
147
-
148
-    sub             v17.4s, v17.4s, v19.4s
149
-    sub             v18.4s, v18.4s, v20.4s
150
-
151
-    uxtl            v1.8h, v3.8b
152
-    sshll           v19.4s, v1.4h, #2
153
-    sshll2          v20.4s, v1.8h, #2
154
-
155
-    add             v17.4s, v17.4s, v21.4s
156
-    add             v18.4s, v18.4s, v22.4s
157
-
158
-    uxtl            v1.8h, v4.8b
159
-    uxtl            v2.8h, v7.8b
160
-    ssubl           v21.4s, v1.4h, v2.4h
161
-    ssubl2          v22.4s, v1.8h, v2.8h
162
-
163
-    add             v17.4s, v17.4s, v19.4s
164
-    add             v18.4s, v18.4s, v20.4s
165
-    sub             v21.4s, v21.4s, v23.4s
166
-    sub             v22.4s, v22.4s, v16.4s
167
-    add             v17.4s, v17.4s, v21.4s
168
-    add             v18.4s, v18.4s, v22.4s
169
-.endm
170
-
171
-
172
-
173
-
174
-.macro vextin8
175
-    ld1             {v3.16b}, x11, #16
176
-    mov             v7.d0, v3.d1
177
-    ext             v0.8b, v3.8b, v7.8b, #1
178
-    ext             v4.8b, v3.8b, v7.8b, #2
179
-    ext             v1.8b, v3.8b, v7.8b, #3
180
-    ext             v5.8b, v3.8b, v7.8b, #4
181
-    ext             v2.8b, v3.8b, v7.8b, #5
182
-    ext             v6.8b, v3.8b, v7.8b, #6
183
-    ext             v3.8b, v3.8b, v7.8b, #7
184
-.endm
185
-
186
-
187
-
188
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
189
-.macro HPS_FILTER a b filterhps
190
-    mov             w12, #8192
191
-    mov             w6, w10
192
-    sub             x3, x3, #\a
193
-    lsl             x3, x3, #1
194
-    mov             w9, #\a
195
-    cmp             w9, #4
196
-    b.eq            14f
197
-    cmp             w9, #12
198
-    b.eq            15f
199
-    b               7f
200
-14:
201
-    HPS_FILTER_4 \a \b \filterhps
202
-    b               10f
203
-15:
204
-    HPS_FILTER_12 \a \b \filterhps
205
-    b               10f
206
-7:
207
-    cmp             w5, #0
208
-    b.eq            8f
209
-    cmp             w5, #1
210
-    b.eq            9f
211
-8:
212
-loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
213
-    mov             w7, #\a
214
-    lsr             w7, w7, #3
215
-    mov             x11, x0
216
-    sub             x11, x11, #4
217
-loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
218
-    vextin8
219
-    \filterhps
220
-    dup             v16.4s, w12
221
-    sub             v17.4s, v17.4s, v16.4s
222
-    sub             v18.4s, v18.4s, v16.4s
223
-    xtn             v0.4h, v17.4s
224
-    xtn2            v0.8h, v18.4s
225
-    st1             {v0.8h}, x2, #16
226
-    subs            w7, w7, #1
227
-    sub             x11, x11, #8
228
-    b.ne            loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
229
-    subs            w6, w6, #1
230
-    add             x0, x0, x1
231
-    add             x2, x2, x3
232
-    b.ne            loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
233
-    b               10f
234
-9:
235
-loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
236
-    mov             w7, #\a
237
-    lsr             w7, w7, #3
238
-    mov             x11, x0
239
-    sub             x11, x11, #4
240
-loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
241
-    vextin8
242
-    \filterhps
243
-    dup             v16.4s, w12
244
-    sub             v17.4s, v17.4s, v16.4s
245
-    sub             v18.4s, v18.4s, v16.4s
246
-    xtn             v0.4h, v17.4s
247
-    xtn2            v0.8h, v18.4s
248
-    st1             {v0.8h}, x2, #16
249
-    subs            w7, w7, #1
250
-    sub             x11, x11, #8
251
-    b.ne            loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
252
-    subs            w6, w6, #1
253
-    add             x0, x0, x1
254
-    add             x2, x2, x3
255
-    b.ne            loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
256
-10:
257
-.endm
258
-
259
-.macro HPS_FILTER_4 w h filterhps
260
-    cmp             w5, #0
261
-    b.eq            11f
262
-    cmp             w5, #1
263
-    b.eq            12f
264
-11:
265
-loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
266
-    mov             x11, x0
267
-    sub             x11, x11, #4
268
-    vextin8
269
-    \filterhps
270
-    dup             v16.4s, w12
271
-    sub             v17.4s, v17.4s, v16.4s
272
-    xtn             v0.4h, v17.4s
273
-    st1             {v0.4h}, x2, #8
274
-    sub             x11, x11, #8
275
-    subs            w6, w6, #1
276
-    add             x0, x0, x1
277
-    add             x2, x2, x3
278
-    b.ne            loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
279
-    b               13f
280
-12:
281
-loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
282
-    mov             x11, x0
283
-    sub             x11, x11, #4
284
-    vextin8
285
-    \filterhps
286
-    dup             v16.4s, w12
287
-    sub             v17.4s, v17.4s, v16.4s
288
-    xtn             v0.4h, v17.4s
289
-    st1             {v0.4h}, x2, #8
290
-    sub             x11, x11, #8
291
-    subs            w6, w6, #1
292
-    add             x0, x0, x1
293
-    add             x2, x2, x3
294
-    b.ne            loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
295
-13:
296
-.endm
297
-
298
-.macro HPS_FILTER_12 w h filterhps
299
-    cmp             w5, #0
300
-    b.eq            14f
301
-    cmp             w5, #1
302
-    b.eq            15f
303
-14:
304
-loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
305
-    mov             x11, x0
306
-    sub             x11, x11, #4
307
-    vextin8
308
-    \filterhps
309
-    dup             v16.4s, w12
310
-    sub             v17.4s, v17.4s, v16.4s
311
-    sub             v18.4s, v18.4s, v16.4s
312
-    xtn             v0.4h, v17.4s
313
-    xtn2            v0.8h, v18.4s
314
-    st1             {v0.8h}, x2, #16
315
-    sub             x11, x11, #8
316
-
317
-    vextin8
318
-    \filterhps
319
-    dup             v16.4s, w12
320
-    sub             v17.4s, v17.4s, v16.4s
321
-    xtn             v0.4h, v17.4s
322
-    st1             {v0.4h}, x2, #8
323
-    add             x2, x2, x3
324
-    subs            w6, w6, #1
325
-    add             x0, x0, x1
326
-    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
327
-    b               16f
328
-15:
329
-loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
330
-    mov             x11, x0
331
-    sub             x11, x11, #4
332
-    vextin8
333
-    \filterhps
334
-    dup             v16.4s, w12
335
-    sub             v17.4s, v17.4s, v16.4s
336
-    sub             v18.4s, v18.4s, v16.4s
337
-    xtn             v0.4h, v17.4s
338
-    xtn2            v0.8h, v18.4s
339
-    st1             {v0.8h}, x2, #16
340
-    sub             x11, x11, #8
341
-
342
-    vextin8
343
-    \filterhps
344
-    dup             v16.4s, w12
345
-    sub             v17.4s, v17.4s, v16.4s
346
-    xtn             v0.4h, v17.4s
347
-    st1             {v0.4h}, x2, #8
348
-    add             x2, x2, x3
349
-    subs            w6, w6, #1
350
-    add             x0, x0, x1
351
-    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
352
-16:
353
-.endm
354
-
355
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
356
-.macro LUMA_HPS w h
357
-function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
358
-    mov             w10, #\h
359
-    cmp             w5, #0
360
-    b.eq            6f
361
-    sub             x0, x0, x1, lsl #2
362
-
363
-    add             x0, x0, x1
364
-    add             w10, w10, #7
365
-6:
366
-    cmp             w4, #0
367
-    b.eq            0f
368
-    cmp             w4, #1
369
-    b.eq            1f
370
-    cmp             w4, #2
371
-    b.eq            2f
372
-    cmp             w4, #3
373
-    b.eq            3f
374
-0:
375
-    HPS_FILTER  \w \h qpel_filter_0_32b
376
-    b               5f
377
-1:
378
-    HPS_FILTER  \w \h qpel_filter_1_32b
379
-    b               5f
380
-2:
381
-    HPS_FILTER  \w \h qpel_filter_2_32b
382
-    b               5f
383
-3:
384
-    HPS_FILTER  \w \h qpel_filter_3_32b
385
-    b               5f
386
-5:
387
-    ret
388
-endfunc
389
-.endm
390
-
391
-LUMA_HPS    4 4
392
-LUMA_HPS    4 8
393
-LUMA_HPS    4 16
394
-LUMA_HPS    8 4
395
-LUMA_HPS    8 8
396
-LUMA_HPS    8 16
397
-LUMA_HPS    8 32
398
-LUMA_HPS    12 16
399
-LUMA_HPS    16 4
400
-LUMA_HPS    16 8
401
-LUMA_HPS    16 12
402
-LUMA_HPS    16 16
403
-LUMA_HPS    16 32
404
-LUMA_HPS    16 64
405
-LUMA_HPS    24 32
406
-LUMA_HPS    32 8
407
-LUMA_HPS    32 16
408
-LUMA_HPS    32 24
409
-LUMA_HPS    32 32
410
-LUMA_HPS    32 64
411
-LUMA_HPS    48 64
412
-LUMA_HPS    64 16
413
-LUMA_HPS    64 32
414
-LUMA_HPS    64 48
415
-LUMA_HPS    64 64
416
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.h Deleted
57
 
1
@@ -1,55 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_IPFILTER8_AARCH64_H
26
-#define X265_IPFILTER8_AARCH64_H
27
-
28
-
29
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
30
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
31
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
32
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
33
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
34
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
35
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
36
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
37
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
38
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
39
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
40
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
41
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
42
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
43
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
44
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
45
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
46
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
47
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
48
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
49
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
50
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
51
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
52
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
53
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
54
-
55
-
56
-#endif // ifndef X265_IPFILTER8_AARCH64_H
57
x265_3.5.tar.gz/source/common/aarch64/pixel-util.h Deleted
42
 
1
@@ -1,40 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *          Hongbin Liu <liuhongbin1@huawei.com>
7
- *
8
- * This program is free software; you can redistribute it and/or modify
9
- * it under the terms of the GNU General Public License as published by
10
- * the Free Software Foundation; either version 2 of the License, or
11
- * (at your option) any later version.
12
- *
13
- * This program is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- * GNU General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU General Public License
19
- * along with this program; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
- *
22
- * This program is also available under a commercial proprietary license.
23
- * For more information, contact us at license @ x265.com.
24
- *****************************************************************************/
25
-
26
-#ifndef X265_PIXEL_UTIL_AARCH64_H
27
-#define X265_PIXEL_UTIL_AARCH64_H
28
-
29
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
30
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
31
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
32
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
33
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
34
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
35
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
36
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
37
-
38
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
39
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
40
-
41
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
42
x265_3.5.tar.gz/source/common/aarch64/pixel.h Deleted
107
 
1
@@ -1,105 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_I386_PIXEL_AARCH64_H
26
-#define X265_I386_PIXEL_AARCH64_H
27
-
28
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
29
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
30
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
31
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
32
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
33
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
34
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
35
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
36
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
37
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
38
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
39
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
40
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
41
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
42
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
43
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
44
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
45
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
46
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
47
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
48
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
49
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
50
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
51
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
52
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
53
-
54
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
55
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
56
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
57
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
58
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
59
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
60
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
61
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
62
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
63
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
64
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
65
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
66
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
67
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
68
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
69
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
70
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
71
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
72
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
73
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
74
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
75
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
76
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
77
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
78
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
79
-
80
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
81
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
82
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
83
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
84
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
85
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
86
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
87
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
88
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
89
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
90
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
91
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
92
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
93
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
94
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
95
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
96
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
97
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
98
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
99
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
100
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
101
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
102
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
103
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
104
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
105
-
106
-#endif // ifndef X265_I386_PIXEL_AARCH64_H
107
x265_3.6.tar.gz/.gitignore Added
38
 
1
@@ -0,0 +1,36 @@
2
+# Prerequisites
3
+*.d
4
+
5
+# Compiled Object files
6
+*.slo
7
+*.lo
8
+*.o
9
+*.obj
10
+
11
+# Precompiled Headers
12
+*.gch
13
+*.pch
14
+
15
+# Compiled Dynamic libraries
16
+*.so
17
+*.dylib
18
+*.dll
19
+
20
+# Fortran module files
21
+*.mod
22
+*.smod
23
+
24
+# Compiled Static libraries
25
+*.lai
26
+*.la
27
+*.a
28
+*.lib
29
+
30
+# Executables
31
+*.exe
32
+*.out
33
+*.app
34
+
35
+# Build directory
36
+build/
37
+
38
x265_3.5.tar.gz/build/README.txt -> x265_3.6.tar.gz/build/README.txt Changed

@@ -6,6 +6,9 @@
 
 Note: MSVC12 requires cmake 2.8.11 or later
 
+Note: When the SVE/SVE2 instruction set of Arm AArch64 architecture is to be used, the GCC10.x and onwards must
+      be installed in order to compile x265.
+
 
 = Optional Prerequisites =
 
@@ -88,3 +91,25 @@
 building out of a Mercurial source repository.  If you are building out of
 a release source package, the version will not change.  If Mercurial is not
 found, the version will be "unknown".
+
+= Build Instructions for cross-compilation for Arm AArch64 Targets=
+
+When the target platform is based on Arm AArch64 architecture, the x265 can be
+built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
+enviroment variables should be set to point to the cross compilers of the
+appropriate gcc. For example:
+
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+
+The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
+Then, the normal building process can be followed.
+
+Moreover, if the target platform supports SVE or SVE2 instruction set, the
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
+to true, respectively. For example:
+
+1. export CROSS_COMPILE_SVE2=true
+2. export CROSS_COMPILE_SVE=true
+
+Then, the normal building process can be followed.
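
Putting the added README instructions together, a cross-compile session might look roughly like the following sketch; the toolchain triplet and the SVE2 setting are assumptions that depend on the installed cross toolchain and the target CPU.

    # illustrative cross-compile flow based on the README additions above
    export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
    export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
    export CROSS_COMPILE_SVE2=true        # only if the target CPU supports SVE2
    cd build/aarch64-linux
    cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source
    make
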
x265_3.6.tar.gz/build/aarch64-darwin Added
2
 
1
+(directory)
2
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake Added
25
 
1
@@ -0,0 +1,23 @@
2
+# CMake toolchain file for cross compiling x265 for aarch64
3
+# This feature is only supported as experimental. Use with caution.
4
+# Please report bugs on bitbucket
5
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+
7
+set(CROSS_COMPILE_ARM64 1)
8
+set(CMAKE_SYSTEM_NAME Darwin)
9
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
10
+
11
+# specify the cross compiler
12
+set(CMAKE_C_COMPILER gcc-12)
13
+set(CMAKE_CXX_COMPILER g++-12)
14
+
15
+# specify the target environment
16
+SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
17
+
18
+# specify whether SVE/SVE2 is supported by the target platform
19
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
20
+    set(CROSS_COMPILE_SVE2 1)
21
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
22
+    set(CROSS_COMPILE_SVE 1)
23
+endif()
24
+
25
x265_3.6.tar.gz/build/aarch64-darwin/make-Makefiles.bash Added
6
 
1
@@ -0,0 +1,4 @@
2
+#!/bin/bash
3
+# Run this from within a bash shell
4
+
5
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
6
x265_3.5.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake Changed

@@ -3,13 +3,29 @@
 # Please report bugs on bitbucket
 # Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
 
-set(CROSS_COMPILE_ARM 1)
+set(CROSS_COMPILE_ARM64 1)
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
 # specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+if(DEFINED ENV{CMAKE_C_COMPILER})
+    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
+else()
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+endif()
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
+    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
+else()
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
 
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+    set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+    set(CROSS_COMPILE_SVE 1)
+endif()
x265_3.5.tar.gz/build/arm-linux/make-Makefiles.bash -> x265_3.6.tar.gz/build/arm-linux/make-Makefiles.bash Changed
7
 
1
@@ -1,4 +1,4 @@
2
 #!/bin/bash
3
 # Run this from within a bash shell
4
 
5
-cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
7
x265_3.5.tar.gz/doc/reST/cli.rst -> x265_3.6.tar.gz/doc/reST/cli.rst Changed
405
 
1
@@ -632,9 +632,8 @@
2
    auto-detection by the encoder. If specified, the encoder will
3
    attempt to bring the encode specifications within that specified
4
    level. If the encoder is unable to reach the level it issues a
5
-   warning and aborts the encode. If the requested requirement level is
6
-   higher than the actual level, the actual requirement level is
7
-   signaled.
8
+   warning and aborts the encode. The requested level will be signaled 
9
+   in the bitstream even if it is higher than the actual level.
10
 
11
    Beware, specifying a decoder level will force the encoder to enable
12
    VBV for constant rate factor encodes, which may introduce
13
@@ -714,11 +713,8 @@
14
    (main, main10, etc). Second, an encoder is created from this
15
    x265_param instance and the :option:`--level-idc` and
16
    :option:`--high-tier` parameters are used to reduce bitrate or other
17
-   features in order to enforce the target level. Finally, the encoder
18
-   re-examines the final set of parameters and detects the actual
19
-   minimum decoder requirement level and this is what is signaled in
20
-   the bitstream headers. The detected decoder level will only use High
21
-   tier if the user specified a High tier level.
22
+   features in order to enforce the target level. The detected decoder level
23
+   will only use High tier if the user specified a High tier level.
24
 
25
    The signaled profile will be determined by the encoder's internal
26
    bitdepth and input color space. If :option:`--keyint` is 0 or 1,
27
@@ -961,21 +957,21 @@
28
    Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
29
    with :option:`--analysis-save` and :option:`--analysis-load` respectively.
30
 
31
-   +--------------+------------------------------------------+
32
-   | Level        | Description                              |
33
-   +==============+==========================================+
34
-   | 1            | Lookahead information                    |
35
-   +--------------+------------------------------------------+
36
-   | 2 to 4       | Level 1 + intra/inter modes, ref's       |
37
-   +--------------+------------------------------------------+
38
-   | 5 and 6      | Level 2 + rect-amp                       |
39
-   +--------------+------------------------------------------+
40
-   | 7            | Level 5 + AVC size CU refinement         |
41
-   +--------------+------------------------------------------+
42
-   | 8 and 9      | Level 5 + AVC size Full CU analysis-info |
43
-   +--------------+------------------------------------------+
44
-   | 10           | Level 5 + Full CU analysis-info          |
45
-   +--------------+------------------------------------------+
46
+   +--------------+---------------------------------------------------+
47
+   | Level        | Description                                       |
48
+   +==============+===================================================+
49
+   | 1            | Lookahead information                             |
50
+   +--------------+---------------------------------------------------+
51
+   | 2 to 4       | Level 1 + intra/inter modes, depth, ref's, cutree |
52
+   +--------------+---------------------------------------------------+
53
+   | 5 and 6      | Level 2 + rect-amp                                |
54
+   +--------------+---------------------------------------------------+
55
+   | 7            | Level 5 + AVC size CU refinement                  |
56
+   +--------------+---------------------------------------------------+
57
+   | 8 and 9      | Level 5 + AVC size Full CU analysis-info          |
58
+   +--------------+---------------------------------------------------+
59
+   | 10           | Level 5 + Full CU analysis-info                   |
60
+   +--------------+---------------------------------------------------+
61
 
62
 .. option:: --refine-mv-type <string>
63
 
64
@@ -1332,6 +1328,11 @@
65
    Search range for HME level 0, 1 and 2.
66
    The Search Range for each HME level must be between 0 and 32768(excluding).
67
    Default search range is 16,32,48 for level 0,1,2 respectively.
68
+   
69
+.. option:: --mcstf, --no-mcstf
70
+
71
+    Enable Motion Compensated Temporal filtering.
72
+   Default: disabled
73
 
74
 Spatial/intra options
75
 =====================
76
@@ -1473,17 +1474,9 @@
77
 
78
 .. option:: --hist-scenecut, --no-hist-scenecut
79
 
80
-   Indicates that scenecuts need to be detected using luma edge and chroma histograms.
81
-   :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
82
-   :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
83
-   
84
-.. option:: --hist-threshold <0.0..1.0>
85
-
86
-   This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
87
-   This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value 
88
-   greater than 0.2 against the previous frame as scenecut. 
89
-   Increasing the threshold reduces the number of scenecuts detected.
90
-   Default 0.03.
91
+   Detect scenecuts based on the histogram, intensity and variance of the picture.
92
+   :option:`--hist-scenecut` enables and :option:`--no-hist-scenecut` disables
93
+   histogram-based scenecut detection.
94
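+   For example, a sketch that replaces the default scenecut algorithm with the histogram-based
+   one (file names are placeholders)::
+
+      x265 --input source.y4m --hist-scenecut --output out.hevc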
    
95
 .. option:: --radl <integer>
96
    
97
@@ -1766,6 +1759,12 @@
98
    Default 1.0.
99
    **Range of values:** 0.0 to 3.0
100
 
101
+.. option:: --sbrc, --no-sbrc
102
+
103
+   Enable or disable Segment Based Rate Control (SBRC). The segment duration depends on the
104
+   keyframe interval specified; if unspecified, the default keyframe interval is used.
105
+   Default: disabled.
106
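+   For example, a sketch that enables SBRC together with an explicit keyframe interval
+   (file names are placeholders)::
+
+      x265 --input source.y4m --sbrc --keyint 250 --output out.hevc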
+
107
 .. option:: --hevc-aq
108
 
109
    Enable adaptive quantization
110
@@ -1976,12 +1975,18 @@
111
    
112
    **CLI ONLY**
113
 
114
+.. option:: --scenecut-qp-config <filename>
115
+
116
+   Specify a text file which contains the scenecut aware QP options.
117
+   The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`
118
+
119
+   **CLI ONLY**
120
+
121
 .. option:: --scenecut-aware-qp <integer>
122
 
123
    It reduces the bits spent on the inter-frames within the scenecut window
124
    before and after a scenecut by increasing their QP in ratecontrol pass2 algorithm
125
-   without any deterioration in visual quality. If a scenecut falls within the window,
126
-   the QP of the inter-frames after this scenecut will not be modified.
127
+   without any deterioration in visual quality.
128
    :option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
129
 
130
    +-------+---------------------------------------------------------------+
131
@@ -2006,48 +2011,83 @@
132
    for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
133
    is enabled.
134
 
135
-   When :option:`--scenecut-aware-qp` is::
136
+   When :option:`--scenecut-aware-qp` is:
137
+
138
    * 1 (Forward masking):
139
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
140
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
141
+   or 
142
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
143
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
144
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
145
    * 2 (Backward masking):
146
-   --masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
147
+   --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
148
+   or 
149
+   --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
150
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
151
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
152
    * 3 (Bi-directional masking):
153
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
154
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
155
+   or 
156
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
157
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
158
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
159
+                       bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
160
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
161
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
162
 
163
    +-----------------+---------------------------------------------------------------+
164
    | Parameter       | Description                                                   |
165
    +=================+===============================================================+
166
-   | fwdWindow       | The duration(in milliseconds) for which there is a reduction  |
167
-   |                 | in the bits spent on the inter-frames after a scenecut by     |
168
-   |                 | increasing their QP. Default 500ms.                           |
169
-   |                 | **Range of values:** 0 to 1000                                |
170
+   | fwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
171
+   |                 | reduction in the bits spent on the inter-frames after a       |
172
+   |                 | scenecut by increasing their QP. Default 500ms.               |
173
+   |                 | **Range of values:** 0 to 2000                                |
174
+   +-----------------+---------------------------------------------------------------+
175
+   | fwdWindow       | The duration of a sub-window(in milliseconds) for which there |
176
+   |                 | is a reduction in the bits spent on the inter-frames after a  |
177
+   |                 | scenecut by increasing their QP. Default 500ms.               |
178
+   |                 | **Range of values:** 0 to 2000                                |
179
    +-----------------+---------------------------------------------------------------+
180
    | fwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
181
    |                 | after a scenecut. Default 5.                                  |
182
-   |                 | **Range of values:** 0 to 10                                  |
183
+   |                 | **Range of values:** 0 to 20                                  |
184
    +-----------------+---------------------------------------------------------------+
185
    | fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
186
    |                 | inter-frames after a scenecut. The offset is computed from    |
187
    |                 | fwdRefQPDelta when it is not explicitly specified.            |
188
-   |                 | **Range of values:** 0 to 10                                  |
189
+   |                 | **Range of values:** 0 to 20                                  |
190
+   +-----------------+---------------------------------------------------------------+
191
+   | bwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
192
+   |                 | reduction in the bits spent on the inter-frames before a      |
193
+   |                 | scenecut by increasing their QP. Default 100ms.               |
194
+   |                 | **Range of values:** 0 to 2000                                |
195
    +-----------------+---------------------------------------------------------------+
196
-   | bwdWindow       | The duration(in milliseconds) for which there is a reduction  |
197
-   |                 | in the bits spent on the inter-frames before a scenecut by    |
198
-   |                 | increasing their QP. Default 100ms.                           |
199
-   |                 | **Range of values:** 0 to 1000                                |
200
+   | bwdWindow       | The duration of a sub-window(in milliseconds) for which there |
201
+   |                 | is a reduction in the bits spent on the inter-frames before a |
202
+   |                 | scenecut by increasing their QP. Default 100ms.               |
203
+   |                 | **Range of values:** 0 to 2000                                |
204
    +-----------------+---------------------------------------------------------------+
205
    | bwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
206
    |                 | before a scenecut. The offset is computed from                |
207
    |                 | fwdRefQPDelta when it is not explicitly specified.            |
208
-   |                 | **Range of values:** 0 to 10                                  |
209
+   |                 | **Range of values:** 0 to 20                                  |
210
    +-----------------+---------------------------------------------------------------+
211
    | bwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
212
    |                 | inter-frames before a scenecut. The offset is computed from   |
213
    |                 | bwdRefQPDelta when it is not explicitly specified.            |
214
-   |                 | **Range of values:** 0 to 10                                  |
215
+   |                 | **Range of values:** 0 to 20                                  |
216
    +-----------------+---------------------------------------------------------------+
217
 
218
-   **CLI ONLY**
219
+   The value for the :option:`--masking-strength` parameter can be specified in different ways:
220
+   1. If only --scenecut-aware-qp is specified and --masking-strength is not, the default offset and window size values are used.
221
+   2. If --masking-strength is given in the first (max-window) format shown above, the window, refQpDelta and nonRefQpDelta values supplied by the user are applied to window 1 and the offsets for the remaining windows are derived with a 15% difference between windows.
222
+   3. If --masking-strength is given in the second (per-window) format shown above, the window, refQpDelta and nonRefQpDelta values supplied by the user for each window from 1 to 6 are used directly. Note: this format can be used to specify zero offsets for any particular window.
223
+
224
+   Sample config file (forward masking)::
225
+
226
+      --scenecut-aware-qp 1 --masking-strength 1000,8,12
227
+   
228
+   The above sample config file is available on `the downloads page <https://bitbucket.org/multicoreware/x265_git/downloads/scenecut_qp_config.txt>`_.
229
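+   For example, a sketch of a second-pass invocation that reads these options from the downloaded
+   config file (input/output names are placeholders)::
+
+      x265 --input source.y4m --pass 2 --scenecut-qp-config scenecut_qp_config.txt --output out.hevc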
 
230
 .. option:: --vbv-live-multi-pass, --no-vbv-live-multi-pass
231
 
232
@@ -2057,6 +2097,14 @@
233
    rate control mode.
234
 
235
    Default disabled. **Experimental feature**
236
+   
237
+
238
+.. option:: bEncFocusedFramesOnly
239
+
240
+   Used to trigger encoding of selected GOPs only; disabled by default.
241
+   
242
+   **API ONLY**
243
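+   A minimal sketch of setting this flag through the C API, assuming the field is exposed on
+   ``x265_param`` under the name shown above::
+
+      x265_param *param = x265_param_alloc();
+      x265_param_default_preset(param, "medium", NULL);
+      param->bEncFocusedFramesOnly = 1;  /* encode only the selected GOPs */
+      x265_encoder *encoder = x265_encoder_open(param);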
+   
244
 
245
 Quantization Options
246
 ====================
247
@@ -2427,6 +2475,81 @@
248
    Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.
249
    Required for HLG (Hybrid Log Gamma) signaling. Not signaled by default.
250
 
251
+.. option:: --video-signal-type-preset <string>
252
+
253
+   Specify combinations of color primaries, transfer characteristics, color matrix,
254
+   range of luma and chroma signals, and chroma sample location.
255
+   String format: <system-id>:<color-volume>
256
+   
257
+   This option takes precedence over the individual VUI parameters. If an individual VUI option
258
+   specified together with this one conflicts with the values implied by the system-id
259
+   or color-volume, the individual option is discarded.
260
+
261
+   system-id options and their corresponding values:
262
+   +----------------+---------------------------------------------------------------+
263
+   | system-id      | Value                                                         |
264
+   +================+===============================================================+
265
+   | BT601_525      | --colorprim smpte170m --transfer smpte170m                    |
266
+   |                | --colormatrix smpte170m --range limited --chromaloc 0         |
267
+   +----------------+---------------------------------------------------------------+
268
+   | BT601_626      | --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg|
269
+   |                | --range limited --chromaloc 0                                 |
270
+   +----------------+---------------------------------------------------------------+
271
+   | BT709_YCC      | --colorprim bt709 --transfer bt709 --colormatrix bt709        |
272
+   |                | --range limited --chromaloc 0                                 |
273
+   +----------------+---------------------------------------------------------------+
274
+   | BT709_RGB      | --colorprim bt709 --transfer bt709 --colormatrix gbr          |
275
+   |                | --range limited                                               |
276
+   +----------------+---------------------------------------------------------------+
277
+   | BT2020_YCC_NCL | --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709   |
278
+   |                | --range limited --chromaloc 2                                 |
279
+   +----------------+---------------------------------------------------------------+
280
+   | BT2020_RGB     | --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc|
281
+   |                | --range limited                                               |
282
+   +----------------+---------------------------------------------------------------+
283
+   | BT2100_PQ_YCC  | --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc|
284
+   |                | --range limited --chromaloc 2                                 |
285
+   +----------------+---------------------------------------------------------------+
286
+   | BT2100_PQ_ICTCP| --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp   |
287
+   |                | --range limited --chromaloc 2                                 |
288
+   +----------------+---------------------------------------------------------------+
289
+   | BT2100_PQ_RGB  | --colorprim bt2020 --transfer smpte2084 --colormatrix gbr     |
290
+   |                | --range limited                                               |
291
+   +----------------+---------------------------------------------------------------+
292
+   | BT2100_HLG_YCC | --colorprim bt2020 --transfer arib-std-b67                    |
293
+   |                | --colormatrix bt2020nc --range limited --chromaloc 2          |
294
+   +----------------+---------------------------------------------------------------+
295
+   | BT2100_HLG_RGB | --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr  |
296
+   |                | --range limited                                               |
297
+   +----------------+---------------------------------------------------------------+
298
+   | FR709_RGB      | --colorprim bt709 --transfer bt709 --colormatrix gbr          |
299
+   |                | --range full                                                  |
300
+   +----------------+---------------------------------------------------------------+
301
+   | FR2020_RGB     | --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr     |
302
+   |                | --range full                                                  |
303
+   +----------------+---------------------------------------------------------------+
304
+   | FRP3D65_YCC    | --colorprim smpte432 --transfer bt709 --colormatrix smpte170m |
305
+   |                | --range full --chromaloc 1                                    |
306
+   +----------------+---------------------------------------------------------------+
307
+
308
+   color-volume options and their corresponding values:
309
+   +----------------+---------------------------------------------------------------+
310
+   | color-volume   | Value                                                         |
311
+   +================+===============================================================+
312
+   | P3D65x1000n0005| --master-display G(13250,34500)B(7500,3000)R(34000,16000)     |
313
+   |                |                  WP(15635,16450)L(10000000,5)                 |
314
+   +----------------+---------------------------------------------------------------+
315
+   | P3D65x4000n005 | --master-display G(13250,34500)B(7500,3000)R(34000,16000)     |
316
+   |                |                  WP(15635,16450)L(40000000,50)                |
317
+   +----------------+---------------------------------------------------------------+
318
+   | BT2100x108n0005| --master-display G(8500,39850)B(6550,2300)R(34000,146000)     |
319
+   |                |                  WP(15635,16450)L(10000000,1)                 |
320
+   +----------------+---------------------------------------------------------------+
321
+
322
+   Note: The color-volume options can be used only with the system-id options BT2100_PQ_YCC,
323
+   BT2100_PQ_ICTCP, and BT2100_PQ_RGB; they are incompatible with the other system-id options.
324
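+   For example, a sketch of signalling a PQ (SMPTE ST 2084) stream mastered on a P3 D65,
+   1000-nit display::
+
+      --video-signal-type-preset BT2100_PQ_YCC:P3D65x1000n0005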
+
325
+
326
 Bitstream options
327
 =================
328
 
329
@@ -2454,6 +2577,16 @@
330
    the very first AUD will be skipped since it cannot be placed at the
331
    start of the access unit, where it belongs. Default disabled
332
 
333
+.. option:: --eob, --no-eob
334
+
335
+   Emit an end of bitstream NAL unit at the end of the bitstream.
336
+   Default disabled
337
+
338
+.. option:: --eos, --no-eos
339
+
340
+   Emit an end of sequence NAL unit at the end of every coded
341
+   video sequence. Default disabled
342
+
343
 .. option:: --hrd, --no-hrd
344
 
345
    Enable the signaling of HRD parameters to the decoder. The HRD
346
@@ -2480,7 +2613,7 @@
347
     The value is specified as a float or as an integer with the profile times 10,
348
     for example profile 5 is specified as "5" or "5.0" or "50".
349
     
350
-    Currently only profile 5, profile 8.1 and profile 8.2 enabled, Default 0 (disabled)
351
+    Currently only profile 5, profile 8.1, profile 8.2 and profile 8.4 are enabled. Default 0 (disabled)
352
 
353
 .. option:: --dolby-vision-rpu <filename>
354
 
355
@@ -2509,17 +2642,26 @@
356
    2. CRC
357
    3. Checksum
358
 
359
-.. option:: --temporal-layers,--no-temporal-layers
360
+.. option:: --temporal-layers <integer>
361
 
362
-   Enable a temporal sub layer. All referenced I/P/B frames are in the
363
-   base layer and all unreferenced B frames are placed in a temporal
364
-   enhancement layer. A decoder may choose to drop the enhancement layer 
365
-   and only decode and display the base layer slices.
366
-   
367
-   If used with a fixed GOP (:option:`--b-adapt` 0) and :option:`--bframes`
368
-   3 then the two layers evenly split the frame rate, with a cadence of
369
-   PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
370
-   interval that is a multiple of 4.
371
+   Enable the specified number of temporal sub-layers. For any frame in layer N,
372
+   all referenced frames are in layer N or N-1. A decoder may choose to drop the enhancement layers
373
+   and decode and display only the base layer slices. The allowed number of temporal sub-layers
374
+   is 2 to 5 (2 and 5 inclusive).
375
+
376
+   When enabled, temporal layers 3 through 5 configure a fixed miniGOP with the number of bframes
377
+   shown below, unless the miniGOP size is modified by lookahead decisions. Temporal layer 2 is a
378
+   special case that places all reference frames in the base layer and non-reference frames in the
379
+   enhancement layer, without any constraint on the number of bframes. Default disabled.
380
+   +----------------+--------+
381
+   | temporal layer | bframes|
382
+   +================+========+
383
+   | 3              | 3      |
384
+   +----------------+--------+
385
+   | 4              | 7      |
386
+   +----------------+--------+
387
+   | 5              | 15     |
388
+   +----------------+--------+
389
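+   For example, a sketch of an invocation enabling three temporal sub-layers
+   (file names are placeholders)::
+
+      x265 --input source.y4m --temporal-layers 3 --output out.hevc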
 
390
 .. option:: --log2-max-poc-lsb <integer>
391
 
392
@@ -2564,6 +2706,12 @@
393
    Emit SEI messages in a single NAL unit instead of multiple NALs. Default disabled.
394
    When HRD SEI is enabled the HM decoder will throw a warning.
395
 
396
+.. option:: --film-grain <filename>
397
+
398
+    Specify a file containing the film grain model characteristics, which are signalled as an SEI message to support Film Grain Synthesis (FGS).
399
+
400
+    **CLI ONLY**
401
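+    For example, a sketch passing a film grain characteristics file (the file name is a placeholder)::
+
+       x265 --input source.y4m --film-grain film_grain.cfg --output out.hevc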
+
402
 DCT Approximations
403
 =================
404
 
405
x265_3.5.tar.gz/doc/reST/introduction.rst -> x265_3.6.tar.gz/doc/reST/introduction.rst Changed
9
 
1
@@ -77,6 +77,6 @@
2
 to start is with the `Motion Picture Experts Group - Licensing Authority
3
 - HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
4
 
5
-x265 is a registered trademark of MulticoreWare, Inc.  The x265 logo is
6
+x265 is a registered trademark of MulticoreWare, Inc.  The X265 logo is
7
 a trademark of MulticoreWare, and may only be used with explicit written
8
 permission.  All rights reserved.
9
x265_3.5.tar.gz/doc/reST/releasenotes.rst -> x265_3.6.tar.gz/doc/reST/releasenotes.rst Changed
55
 
1
@@ -2,6 +2,53 @@
2
 Release Notes
3
 *************
4
 
5
+Version 3.6
6
+===========
7
+
8
+Release date - 4th April, 2024.
9
+
10
+New features
11
+------------
12
+1. Segment based Ratecontrol (SBRC) feature
13
+2. Motion-Compensated Spatio-Temporal Filtering
14
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
15
+4. Histogram-Based Scene Change Detection
16
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
17
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
18
+ 
19
+Enhancements to existing features
20
+---------------------------------
21
+1. Added Dolby Vision 8.4 Profile Support
22
+
23
+
24
+API changes
25
+-----------
26
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
27
+2. Add command line parameter for mcstf feature: "--no-mctf".
28
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
29
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
30
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
31
+6. cli: add new option --cra-nal (Force nal type to CRA for all frames except the first frame; works only with keyint 1)
32
+
33
+Optimizations
34
+---------------------
35
+ARM64 NEON optimizations: several time-consuming C functions have been optimized for the targeted platform (aarch64). The overall performance increased by around 20%.
36
+SVE/SVE2 optimizations
37
+
38
+
39
+Bug fixes
40
+---------
41
+1. Linux bug to utilize all the cores
42
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
43
+3. 32bit and 64bit builds generation for ARM
44
+4. Bugs in the zonefile feature (reflect zonefile parameters inside lookahead, extra IDR issue, Avg I Slice QP value issue, etc.)
45
+5. Add x86 ASM implementation for subsampling luma 
46
+6. Fix for abrladder segfault with load reuse level 1 
47
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame 
48
+8. Add MacOS aarch64 build support 
49
+9. Fix boundary condition issue for Gaussian filter
50
+
51
+
52
 Version 3.5
53
 ===========
54
 
55
x265_3.5.tar.gz/readme.rst -> x265_3.6.tar.gz/readme.rst Changed
10
 
1
@@ -2,7 +2,7 @@
2
 x265 HEVC Encoder
3
 =================
4
 
5
-| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
6
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/master/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265_git/wiki/>`_
7
 | **Download:** | `releases <http://ftp.videolan.org/pub/videolan/x265/>`_ 
8
 | **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
9
 
10
x265_3.5.tar.gz/source/CMakeLists.txt -> x265_3.6.tar.gz/source/CMakeLists.txt Changed
232
 
1
@@ -29,7 +29,7 @@
2
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 199)
6
+set(X265_BUILD 209)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -38,14 +38,20 @@
11
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
12
 
13
 # System architecture detection
14
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
15
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
16
+    string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
17
+else()
18
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
19
+endif()
20
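+# For example (a sketch), an Apple arm64 build could be configured with:
+#   cmake -DCMAKE_OSX_ARCHITECTURES=arm64 ../source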
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
21
-set(ARM_ALIASES armv6l armv7l aarch64)
22
+set(ARM_ALIASES armv6l armv7l)
23
+set(ARM64_ALIASES arm64 arm64e aarch64)
24
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
25
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
26
-set(POWER_ALIASES ppc64 ppc64le)
27
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
28
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
29
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
30
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
31
+if(X86MATCH GREATER "-1")
32
     set(X86 1)
33
     add_definitions(-DX265_ARCH_X86=1)
34
     if(CMAKE_CXX_FLAGS STREQUAL "-m32")
35
@@ -70,15 +76,18 @@
36
     else()
37
         set(CROSS_COMPILE_ARM 0)
38
     endif()
39
+   message(STATUS "Detected ARM target processor")
40
     set(ARM 1)
41
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
42
-        message(STATUS "Detected ARM64 target processor")
43
-        set(ARM64 1)
44
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
45
-    else()
46
-        message(STATUS "Detected ARM target processor")
47
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
48
-    endif()
49
+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
50
+elseif(ARM64MATCH GREATER "-1")
51
+    #if(CROSS_COMPILE_ARM64)
52
+        #message(STATUS "Cross compiling for ARM64 arch")
53
+    #else()
54
+        #set(CROSS_COMPILE_ARM64 0)
55
+    #endif()
56
+    message(STATUS "Detected ARM64 target processor")
57
+    set(ARM64 1)
58
+    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
59
 else()
60
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
61
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
62
@@ -239,26 +248,43 @@
63
         endif()
64
     endif()
65
     if(ARM AND CROSS_COMPILE_ARM)
66
-        if(ARM64)
67
-            set(ARM_ARGS -fPIC)
68
-        else()
69
-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
70
-        endif()
71
         message(STATUS "cross compile arm")
72
+       set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
73
     elseif(ARM)
74
-        if(ARM64)
75
-            set(ARM_ARGS -fPIC)
76
+        find_package(Neon)
77
+        if(CPU_HAS_NEON)
78
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
79
             add_definitions(-DHAVE_NEON)
80
         else()
81
-            find_package(Neon)
82
-            if(CPU_HAS_NEON)
83
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
84
-                add_definitions(-DHAVE_NEON)
85
-            else()
86
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
87
-            endif()
88
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
89
         endif()
90
     endif()
91
+   if(ARM64 OR CROSS_COMPILE_ARM64)
92
+        find_package(Neon)
93
+        find_package(SVE)
94
+        find_package(SVE2)
95
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
96
+            message(STATUS "Found SVE2")
97
+           set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
98
+            add_definitions(-DHAVE_SVE2)
99
+            add_definitions(-DHAVE_SVE)
100
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that uses SVE2
101
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
102
+            message(STATUS "Found SVE")
103
+           set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
104
+            add_definitions(-DHAVE_SVE)
105
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that uses SVE
106
+        elseif(CPU_HAS_NEON)
107
+            message(STATUS "Found NEON")
108
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
109
+            add_definitions(-DHAVE_NEON)
110
+        else()
111
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
112
+        endif()        
113
+    endif()
114
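+    # A sketch of forcing the SVE2 path when cross compiling (the toolchain file name is a placeholder):
+    #   cmake -DCROSS_COMPILE_ARM64=1 -DCROSS_COMPILE_SVE2=1 -DCMAKE_TOOLCHAIN_FILE=aarch64-toolchain.cmake ../source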
+   if(ENABLE_PIC)
115
+   list(APPEND ARM_ARGS -DPIC)
116
+   endif()
117
     add_definitions(${ARM_ARGS})
118
     if(FPROFILE_GENERATE)
119
         if(INTEL_CXX)
120
@@ -350,7 +376,7 @@
121
 endif(GCC)
122
 
123
 find_package(Nasm)
124
-if(ARM OR CROSS_COMPILE_ARM)
125
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
126
     option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
127
 elseif(NASM_FOUND AND X86)
128
     if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
129
@@ -384,7 +410,7 @@
130
 endif(EXTRA_LIB)
131
 mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
132
 
133
-if(X64)
134
+if(X64 OR ARM64 OR PPC64)
135
     # NOTE: We only officially support high-bit-depth compiles of x265
136
     # on 64bit architectures. Main10 plus large resolution plus slow
137
     # preset plus 32bit address space usually means malloc failure.  You
138
@@ -393,7 +419,7 @@
139
     # license" so to speak.  If it breaks you get to keep both halves.
140
     # You will need to disable assembly manually.
141
     option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
142
-endif(X64)
143
+endif(X64 OR ARM64 OR PPC64)
144
 if(HIGH_BIT_DEPTH)
145
     option(MAIN12 "Support Main12 instead of Main10" OFF)
146
     if(MAIN12)
147
@@ -440,6 +466,18 @@
148
 endif()
149
 add_definitions(-DX265_NS=${X265_NS})
150
 
151
+if(ARM64)
152
+  if(HIGH_BIT_DEPTH)
153
+    if(MAIN12)
154
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
155
+    else()
156
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
157
+    endif()
158
+  else()
159
+    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
160
+  endif()
161
+endif(ARM64)
162
+
163
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
164
 if(WARNINGS_AS_ERRORS)
165
     if(GCC)
166
@@ -536,11 +574,7 @@
167
     # compile ARM arch asm files here
168
         enable_language(ASM)
169
         foreach(ASM ${ARM_ASMS})
170
-            if(ARM64)
171
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
172
-            else()
173
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
174
-            endif()
175
+           set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
176
             list(APPEND ASM_SRCS ${ASM_SRC})
177
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
178
             add_custom_command(
179
@@ -549,6 +583,52 @@
180
                 ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
181
                 DEPENDS ${ASM_SRC})
182
         endforeach()
183
+   elseif(ARM64 OR CROSS_COMPILE_ARM64)
184
+    # compile ARM64 arch asm files here
185
+        enable_language(ASM)
186
+        foreach(ASM ${ARM_ASMS})
187
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
188
+            list(APPEND ASM_SRCS ${ASM_SRC})
189
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
190
+            add_custom_command(
191
+                OUTPUT ${ASM}.${SUFFIX}
192
+                COMMAND ${CMAKE_CXX_COMPILER}
193
+                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
194
+                DEPENDS ${ASM_SRC})
195
+        endforeach()
196
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
197
+            foreach(ASM ${ARM_ASMS_SVE})
198
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
199
+                list(APPEND ASM_SRCS ${ASM_SRC})
200
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
201
+                add_custom_command(
202
+                    OUTPUT ${ASM}.${SUFFIX}
203
+                    COMMAND ${CMAKE_CXX_COMPILER}
204
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
205
+                    DEPENDS ${ASM_SRC})
206
+            endforeach()
207
+            foreach(ASM ${ARM_ASMS_SVE2})
208
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
209
+                list(APPEND ASM_SRCS ${ASM_SRC})
210
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
211
+                add_custom_command(
212
+                    OUTPUT ${ASM}.${SUFFIX}
213
+                    COMMAND ${CMAKE_CXX_COMPILER}
214
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
215
+                    DEPENDS ${ASM_SRC})
216
+            endforeach()
217
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
218
+            foreach(ASM ${ARM_ASMS_SVE})
219
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
220
+                list(APPEND ASM_SRCS ${ASM_SRC})
221
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
222
+                add_custom_command(
223
+                    OUTPUT ${ASM}.${SUFFIX}
224
+                    COMMAND ${CMAKE_CXX_COMPILER}
225
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
226
+                    DEPENDS ${ASM_SRC})
227
+            endforeach()
228
+        endif()
229
     elseif(X86)
230
     # compile X86 arch asm files here
231
         foreach(ASM ${MSVC_ASMS})
232
x265_3.5.tar.gz/source/abrEncApp.cpp -> x265_3.6.tar.gz/source/abrEncApp.cpp Changed
2220
 
1
@@ -1,1111 +1,1111 @@
2
-/*****************************************************************************
3
-* Copyright (C) 2013-2020 MulticoreWare, Inc
4
-*
5
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
6
-*          Aruna Matheswaran <aruna@multicorewareinc.com>
7
-*
8
-* This program is free software; you can redistribute it and/or modify
9
-* it under the terms of the GNU General Public License as published by
10
-* the Free Software Foundation; either version 2 of the License, or
11
-* (at your option) any later version.
12
-*
13
-* This program is distributed in the hope that it will be useful,
14
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
-* GNU General Public License for more details.
17
-*
18
-* You should have received a copy of the GNU General Public License
19
-* along with this program; if not, write to the Free Software
20
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
-*
22
-* This program is also available under a commercial proprietary license.
23
-* For more information, contact us at license @ x265.com.
24
-*****************************************************************************/
25
-
26
-#include "abrEncApp.h"
27
-#include "mv.h"
28
-#include "slice.h"
29
-#include "param.h"
30
-
31
-#include <signal.h>
32
-#include <errno.h>
33
-
34
-#include <queue>
35
-
36
-using namespace X265_NS;
37
-
38
-/* Ctrl-C handler */
39
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
40
-static void sigint_handler(int)
41
-{
42
-    b_ctrl_c = 1;
43
-}
44
-
45
-namespace X265_NS {
46
-    // private namespace
47
-#define X265_INPUT_QUEUE_SIZE 250
48
-
49
-    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
50
-    {
51
-        m_numEncodes = numEncodes;
52
-        m_numActiveEncodes.set(numEncodes);
53
-        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
54
-        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
55
-
56
-        for (uint8_t i = 0; i < m_numEncodes; i++)
57
-        {
58
-            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
59
-            if (!m_passEnc[i])
60
-            {
61
-                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
62
-                ret = 4;
63
-            }
64
-            m_passEnc[i]->init(ret);
65
-        }
66
-
67
-        if (!allocBuffers())
68
-        {
69
-            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
70
-            ret = 4;
71
-        }
72
-
73
-        /* start passEncoder worker threads */
74
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
75
-            m_passEnc[pass]->startThreads();
76
-    }
77
-
78
-    bool AbrEncoder::allocBuffers()
79
-    {
80
-        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
81
-        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
82
-
83
-        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
84
-        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
85
-        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
86
-        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
87
-
88
-        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
89
-        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
90
-        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
91
-        m_readFlag = X265_MALLOC(int*, m_numEncodes);
92
-
93
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
94
-        {
95
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
96
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
97
-            {
98
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
99
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
100
-            }
101
-
102
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
103
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
104
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
105
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
106
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
107
-        }
108
-        return true;
109
-    fail:
110
-        return false;
111
-    }
112
-
113
-    void AbrEncoder::destroy()
114
-    {
115
-        x265_cleanup(); /* Free library singletons */
116
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
117
-        {
118
-            for (uint32_t index = 0; index < m_queueSize; index++)
119
-            {
120
-                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
121
-                x265_picture_free(m_inputPicBuffer[pass][index]);
122
-            }
123
-
124
-            X265_FREE(m_inputPicBuffer[pass]);
125
-            X265_FREE(m_analysisBuffer[pass]);
126
-            X265_FREE(m_readFlag[pass]);
127
-            delete[] m_picIdxReadCnt[pass];
128
-            delete[] m_analysisWrite[pass];
129
-            delete[] m_analysisRead[pass];
130
-            m_passEnc[pass]->destroy();
131
-            delete m_passEnc[pass];
132
-        }
133
-        X265_FREE(m_inputPicBuffer);
134
-        X265_FREE(m_analysisBuffer);
135
-        X265_FREE(m_readFlag);
136
-
137
-        delete[] m_picWriteCnt;
138
-        delete[] m_picReadCnt;
139
-        delete[] m_analysisWriteCnt;
140
-        delete[] m_analysisReadCnt;
141
-
142
-        X265_FREE(m_picIdxReadCnt);
143
-        X265_FREE(m_analysisWrite);
144
-        X265_FREE(m_analysisRead);
145
-
146
-        X265_FREE(m_passEnc);
147
-    }
148
-
149
-    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
150
-    {
151
-        m_id = id;
152
-        m_cliopt = cliopt;
153
-        m_parent = parent;
154
-        if(!(m_cliopt.enableScaler && m_id))
155
-            m_input = m_cliopt.input;
156
-        m_param = cliopt.param;
157
-        m_inputOver = false;
158
-        m_lastIdx = -1;
159
-        m_encoder = NULL;
160
-        m_scaler = NULL;
161
-        m_reader = NULL;
162
-        m_ret = 0;
163
-    }
164
-
165
-    int PassEncoder::init(int &result)
166
-    {
167
-        if (m_parent->m_numEncodes > 1)
168
-            setReuseLevel();
169
-                
170
-        if (!(m_cliopt.enableScaler && m_id))
171
-            m_reader = new Reader(m_id, this);
172
-        else
173
-        {
174
-            VideoDesc *src = NULL, *dst = NULL;
175
-            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
176
-            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
177
-            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
178
-            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
179
-            if (src != NULL && dst != NULL)
180
-            {
181
-                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
182
-                if (!m_scaler)
183
-                {
184
-                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
185
-                    result = 4;
186
-                }
187
-            }
188
-        }
189
-
190
-        /* note: we could try to acquire a different libx265 API here based on
191
-        * the profile found during option parsing, but it must be done before
192
-        * opening an encoder */
193
-
194
-        if (m_param)
195
-            m_encoder = m_cliopt.api->encoder_open(m_param);
196
-        if (!m_encoder)
197
-        {
198
-            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
199
-            m_ret = 2;
200
-            return -1;
201
-        }
202
-
203
-        /* get the encoder parameters post-initialization */
204
-        m_cliopt.api->encoder_parameters(m_encoder, m_param);
205
-
206
-        return 1;
207
-    }
208
-
209
-    void PassEncoder::setReuseLevel()
210
-    {
211
-        uint32_t r, padh = 0, padw = 0;
212
-
213
-        m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
214
-
215
-        m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
216
-        m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
217
-        m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
218
-        m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
219
-        m_param->bUseAnalysisFile = 0;
220
-
221
-        if (m_cliopt.loadLevel)
222
-        {
223
-            x265_param *refParam = m_parent->m_passEnc[m_cliopt.refId]->m_param;
224
-
225
-            if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
226
-                m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
227
-            {
228
-                m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
229
-                m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset;
230
-            }
231
-            else
232
-            {
233
-                int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
234
-                int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
235
-
236
-                double scaleFactorH = double(m_param->sourceHeight / srcH);
237
-                double scaleFactorW = double(m_param->sourceWidth / srcW);
238
-
239
-                int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
240
-                int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
241
-
242
-                if (absScaleFactorH == 20 && absScaleFactorW == 20)
243
-                {
244
-                    m_param->scaleFactor = 2;
245
-
246
-                    m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
247
-                    m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
248
-
249
-                }
250
-            }
251
-        }
252
-
253
-        int h = m_param->sourceHeight + m_param->confWinBottomOffset;
254
-        int w = m_param->sourceWidth + m_param->confWinRightOffset;
255
-        if (h & (m_param->minCUSize - 1))
256
-        {
257
-            r = h & (m_param->minCUSize - 1);
258
-            padh = m_param->minCUSize - r;
259
-            m_param->confWinBottomOffset += padh;
260
-
261
-        }
262
-
263
-        if (w & (m_param->minCUSize - 1))
264
-        {
265
-            r = w & (m_param->minCUSize - 1);
266
-            padw = m_param->minCUSize - r;
267
-            m_param->confWinRightOffset += padw;
268
-        }
269
-    }
270
-
271
-    void PassEncoder::startThreads()
272
-    {
273
-        /* Start slave worker threads */
274
-        m_threadActive = true;
275
-        start();
276
-        /* Start reader threads*/
277
-        if (m_reader != NULL)
278
-        {
279
-            m_reader->m_threadActive = true;
280
-            m_reader->start();
281
-        }
282
-        /* Start scaling worker threads */
283
-        if (m_scaler != NULL)
284
-        {
285
-            m_scaler->m_threadActive = true;
286
-            m_scaler->start();
287
-        }
288
-    }
289
-
290
-    void PassEncoder::copyInfo(x265_analysis_data * src)
291
-    {
292
-
293
-        uint32_t written = m_parent->m_analysisWriteCnt[m_id].get();
294
-
295
-        int index = written % m_parent->m_queueSize;
296
-        //If all streams have read analysis data, reuse that position in Queue
297
-
298
-        int read = m_parent->m_analysisRead[m_id][index].get();
299
-        int write = m_parent->m_analysisWrite[m_id][index].get();
300
-
301
-        int overwrite = written / m_parent->m_queueSize;
302
-        bool emptyIdxFound = 0;
303
-        while (!emptyIdxFound && overwrite)
304
-        {
305
-            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
306
-            {
307
-                read = m_parent->m_analysisRead[m_id][i].get();
308
-                write = m_parent->m_analysisWrite[m_id][i].get();
309
-                write *= m_cliopt.numRefs;
310
-
311
-                if (read == write)
312
-                {
313
-                    index = i;
314
-                    emptyIdxFound = 1;
315
-                }
316
-            }
317
-        }
318
-
319
-        x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBuffer[m_id][index];
320
-
321
-        x265_free_analysis_data(m_param, m_analysisInfo);
322
-        memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
323
-        x265_alloc_analysis_data(m_param, m_analysisInfo);
324
-
325
-        bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
326
-        if (m_param->bDisableLookahead && isVbv)
327
-        {
328
-            memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
329
-            memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
330
-            memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
331
-            memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
332
-        }
333
-
334
-        if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I)
335
-        {
336
-            if (m_param->analysisSaveReuseLevel < 2)
337
-                goto ret;
338
-            x265_analysis_intra_data *intraDst, *intraSrc;
339
-            intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
340
-            intraSrc = (x265_analysis_intra_data*)src->intraData;
341
-            memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
342
-            memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
343
-            memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
344
-            memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
345
-            if (m_param->rc.cuTree)
346
-                memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
347
-        }
348
-        else
349
-        {
350
-            bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
351
-            int numDir = src->sliceType == X265_TYPE_P ? 1 : 2;
352
-            memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
353
-            if (m_param->analysisSaveReuseLevel < 2)
354
-                goto ret;
355
-            x265_analysis_inter_data *interDst, *interSrc;
356
-            interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
357
-            interSrc = (x265_analysis_inter_data*)src->interData;
358
-            memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
359
-            memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
360
-            if (m_param->rc.cuTree)
361
-                memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
362
-            if (m_param->analysisSaveReuseLevel > 4)
363
-            {
364
-                memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
365
-                memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
366
-                if (m_param->analysisSaveReuseLevel == 10)
367
-                {
368
-                    memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
369
-                    for (int dir = 0; dir < numDir; dir++)
370
-                    {
371
-                        memcpy(interDst->mvpIdx[dir], interSrc->mvpIdx[dir], sizeof(uint8_t) * src->depthBytes);
372
-                        memcpy(interDst->refIdx[dir], interSrc->refIdx[dir], sizeof(int8_t) * src->depthBytes);
373
-                        memcpy(interDst->mv[dir], interSrc->mv[dir], sizeof(MV) * src->depthBytes);
374
-                    }
375
-                    if (bIntraInInter)
376
-                    {
377
-                        x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
378
-                        x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
379
-                        memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
380
-                        memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
381
-                    }
382
-               }
383
-            }
384
-            if (m_param->analysisSaveReuseLevel != 10)
385
-                memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
386
-        }
387
-
388
-ret:
389
-        //increment analysis Write counter 
390
-        m_parent->m_analysisWriteCnt[m_id].incr();
391
-        m_parent->m_analysisWrite[m_id][index].incr();
392
-        return;
393
-    }
394
-
395
-
396
-    bool PassEncoder::readPicture(x265_picture *dstPic)
397
-    {
398
-        /*Check and wait if there any input frames to read*/
399
-        int ipread = m_parent->m_picReadCnt[m_id].get();
400
-        int ipwrite = m_parent->m_picWriteCnt[m_id].get();
401
-
402
-        bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1);
403
-        while (!m_inputOver && (ipread == ipwrite))
404
-        {
405
-            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
406
-        }
407
-
408
-        if (m_threadActive && ipread < ipwrite)
409
-        {
410
-            /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
411
-            int readPos = ipread % m_parent->m_queueSize;
412
-            x265_analysis_data* analysisData = 0;
413
-
414
-            if (isAbrLoad)
415
-            {
416
-                /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
417
-                int analysisQId = m_cliopt.refId;
418
-                /*Check and wait if there any analysis Data to read*/
419
-                int analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].get();
420
-                int written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
421
-                int analysisRead = m_parent->m_analysisReadCnt[analysisQId].get();
422
-                
423
-                while (m_threadActive && written == analysisRead)
424
-                {
425
-                    analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
426
-                    written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
427
-                }
428
-
429
-                if (analysisRead < written)
430
-                {
431
-                    int analysisIdx = 0;
432
-                    if (!m_param->bDisableLookahead)
433
-                    {
434
-                        bool analysisdRead = false;
435
-                        while ((analysisRead < written) && !analysisdRead)
436
-                        {
437
-                            while (analysisWrite < ipread)
438
-                            {
439
-                                analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
440
-                                written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
441
-                            }
442
-                            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
443
-                            {
444
-                                analysisData = &m_parent->m_analysisBuffer[analysisQId][i];
445
-                                int read = m_parent->m_analysisRead[analysisQId][i].get();
446
-                                int write = m_parent->m_analysisWrite[analysisQId][i].get() * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
447
-                                if ((analysisData->poc == (uint32_t)(ipread)) && (read < write))
448
-                                {
449
-                                    analysisIdx = i;
450
-                                    analysisdRead = true;
451
-                                    break;
452
-                                }
453
-                            }
454
-                        }
455
-                    }
456
-                    else
457
-                    {
458
-                        analysisIdx = analysisRead % m_parent->m_queueSize;
459
-                        analysisData = &m_parent->m_analysisBuffer[analysisQId][analysisIdx];
460
-                        readPos = analysisData->poc % m_parent->m_queueSize;
461
-                        while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc))
462
-                        {
463
-                            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
464
-                        }
465
-                    }
466
-
467
-                    m_lastIdx = analysisIdx;
468
-                }
469
-                else
470
-                    return false;
471
-            }
472
-
473
-
474
-            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
475
-
476
-            x265_picture *pic = (x265_picture*)(dstPic);
477
-            pic->colorSpace = srcPic->colorSpace;
478
-            pic->bitDepth = srcPic->bitDepth;
479
-            pic->framesize = srcPic->framesize;
480
-            pic->height = srcPic->height;
481
-            pic->pts = srcPic->pts;
482
-            pic->dts = srcPic->dts;
483
-            pic->reorderedPts = srcPic->reorderedPts;
484
-            pic->width = srcPic->width;
485
-            pic->analysisData = srcPic->analysisData;
486
-            pic->userSEI = srcPic->userSEI;
487
-            pic->stride[0] = srcPic->stride[0];
488
-            pic->stride[1] = srcPic->stride[1];
489
-            pic->stride[2] = srcPic->stride[2];
490
-            pic->planes[0] = srcPic->planes[0];
491
-            pic->planes[1] = srcPic->planes[1];
492
-            pic->planes[2] = srcPic->planes[2];
493
-            if (isAbrLoad)
494
-                pic->analysisData = *analysisData;
495
-            return true;
496
-        }
497
-        else
498
-            return false;
499
-    }
500
-
501
-    void PassEncoder::threadMain()
502
-    {
503
+/*****************************************************************************
504
+* Copyright (C) 2013-2020 MulticoreWare, Inc
505
+*
506
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
507
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
508
+*
509
+* This program is free software; you can redistribute it and/or modify
510
+* it under the terms of the GNU General Public License as published by
511
+* the Free Software Foundation; either version 2 of the License, or
512
+* (at your option) any later version.
513
+*
514
+* This program is distributed in the hope that it will be useful,
515
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
516
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
517
+* GNU General Public License for more details.
518
+*
519
+* You should have received a copy of the GNU General Public License
520
+* along with this program; if not, write to the Free Software
521
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
522
+*
523
+* This program is also available under a commercial proprietary license.
524
+* For more information, contact us at license @ x265.com.
525
+*****************************************************************************/
526
+
527
+#include "abrEncApp.h"
528
+#include "mv.h"
529
+#include "slice.h"
530
+#include "param.h"
531
+
532
+#include <signal.h>
533
+#include <errno.h>
534
+
535
+#include <queue>
536
+
537
+using namespace X265_NS;
538
+
539
+/* Ctrl-C handler */
540
+static volatile sig_atomic_t b_ctrl_c /* = 0 */;
541
+static void sigint_handler(int)
542
+{
543
+    b_ctrl_c = 1;
544
+}
545
+
546
+namespace X265_NS {
547
+    // private namespace
548
+#define X265_INPUT_QUEUE_SIZE 250
549
+
550
+    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
551
+    {
552
+        m_numEncodes = numEncodes;
553
+        m_numActiveEncodes.set(numEncodes);
554
+        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
555
+        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
556
+
557
+        for (uint8_t i = 0; i < m_numEncodes; i++)
558
+        {
559
+            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
560
+            if (!m_passEnc[i])
561
+            {
562
+                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
563
+                ret = 4;
564
+            }
565
+            m_passEnc[i]->init(ret);
566
+        }
567
+
568
+        if (!allocBuffers())
569
+        {
570
+            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
571
+            ret = 4;
572
+        }
573
+
574
+        /* start passEncoder worker threads */
575
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
576
+            m_passEnc[pass]->startThreads();
577
+    }
578
+
579
+    bool AbrEncoder::allocBuffers()
580
+    {
581
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
582
+        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
583
+
584
+        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
585
+        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
586
+        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
587
+        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
588
+
589
+        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
590
+        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
591
+        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
592
+        m_readFlag = X265_MALLOC(int*, m_numEncodes);
593
+
594
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
595
+        {
596
+            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
597
+            for (uint32_t idx = 0; idx < m_queueSize; idx++)
598
+            {
599
+                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
600
+                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
601
+            }
602
+
603
+            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
604
+            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
605
+            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
606
+            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
607
+            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
608
+        }
609
+        return true;
610
+    fail:
611
+        return false;
612
+    }
613
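
AbrEncoder::allocBuffers() above sets up, per encoder pass, a ring of m_queueSize input pictures plus an analysis ring, each guarded by ThreadSafeInteger write/read counters. Below is a minimal sketch of the handshake those counters implement, reusing only the get()/incr()/waitForChange() calls visible in this file; it is illustrative and not part of the upstream sources.

    // Producer side: fill the next slot, then publish it by bumping the write counter.
    static void publishPicture(ThreadSafeInteger& writeCnt, x265_picture** ring,
                               uint32_t qSize, x265_picture* pic)
    {
        uint32_t w = writeCnt.get();
        ring[w % qSize] = pic;
        writeCnt.incr();              // wakes any thread blocked in waitForChange()
    }

    // Consumer side: block until the producer is ahead, then take the oldest slot.
    static x265_picture* fetchPicture(ThreadSafeInteger& writeCnt, ThreadSafeInteger& readCnt,
                                      x265_picture** ring, uint32_t qSize)
    {
        int r = readCnt.get();
        int w = writeCnt.get();
        while (r == w)
            w = writeCnt.waitForChange(w);   // sleep until the counter moves
        x265_picture* pic = ring[r % qSize];
        readCnt.incr();
        return pic;
    }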
+
614
+    void AbrEncoder::destroy()
615
+    {
616
+        x265_cleanup(); /* Free library singletons */
617
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
618
+        {
619
+            for (uint32_t index = 0; index < m_queueSize; index++)
620
+            {
621
+                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
622
+                x265_picture_free(m_inputPicBuffer[pass][index]);
623
+            }
624
+
625
+            X265_FREE(m_inputPicBuffer[pass]);
626
+            X265_FREE(m_analysisBuffer[pass]);
627
+            X265_FREE(m_readFlag[pass]);
628
+            delete[] m_picIdxReadCnt[pass];
629
+            delete[] m_analysisWrite[pass];
630
+            delete[] m_analysisRead[pass];
631
+            m_passEnc[pass]->destroy();
632
+            delete m_passEnc[pass];
633
+        }
634
+        X265_FREE(m_inputPicBuffer);
635
+        X265_FREE(m_analysisBuffer);
636
+        X265_FREE(m_readFlag);
637
+
638
+        delete[] m_picWriteCnt;
639
+        delete[] m_picReadCnt;
640
+        delete[] m_analysisWriteCnt;
641
+        delete[] m_analysisReadCnt;
642
+
643
+        X265_FREE(m_picIdxReadCnt);
644
+        X265_FREE(m_analysisWrite);
645
+        X265_FREE(m_analysisRead);
646
+
647
+        X265_FREE(m_passEnc);
648
+    }
649
+
650
+    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
651
+    {
652
+        m_id = id;
653
+        m_cliopt = cliopt;
654
+        m_parent = parent;
655
+        if(!(m_cliopt.enableScaler && m_id))
656
+            m_input = m_cliopt.input;
657
+        m_param = cliopt.param;
658
+        m_inputOver = false;
659
+        m_lastIdx = -1;
660
+        m_encoder = NULL;
661
+        m_scaler = NULL;
662
+        m_reader = NULL;
663
+        m_ret = 0;
664
+    }
665
+
666
+    int PassEncoder::init(int &result)
667
+    {
668
+        if (m_parent->m_numEncodes > 1)
669
+            setReuseLevel();
670
+                
671
+        if (!(m_cliopt.enableScaler && m_id))
672
+            m_reader = new Reader(m_id, this);
673
+        else
674
+        {
675
+            VideoDesc *src = NULL, *dst = NULL;
676
+            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
677
+            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
678
+            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
679
+            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
680
+            if (src != NULL && dst != NULL)
681
+            {
682
+                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
683
+                if (!m_scaler)
684
+                {
685
+                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
686
+                    result = 4;
687
+                }
688
+            }
689
+        }
690
+
691
+        if (m_cliopt.zoneFile)
692
+        {
693
+            if (!m_cliopt.parseZoneFile())
694
+            {
695
+                x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n");
696
+                fclose(m_cliopt.zoneFile);
697
+                m_cliopt.zoneFile = NULL;
698
+            }
699
+        }
700
+
701
+        /* note: we could try to acquire a different libx265 API here based on
702
+        * the profile found during option parsing, but it must be done before
703
+        * opening an encoder */
704
+
705
+        if (m_param)
706
+            m_encoder = m_cliopt.api->encoder_open(m_param);
707
+        if (!m_encoder)
708
+        {
709
+            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
710
+            m_ret = 2;
711
+            return -1;
712
+        }
713
+
714
+        /* get the encoder parameters post-initialization */
715
+        m_cliopt.api->encoder_parameters(m_encoder, m_param);
716
+
717
+        return 1;
718
+    }
719
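
PassEncoder::init() opens its encoder through the dispatch table in m_cliopt.api. For reference, the equivalent open/parameters sequence through the plain public API looks roughly like the sketch below; the preset, resolution and frame-rate values are placeholders, not taken from this patch.

    #include <x265.h>

    static x265_encoder* openEncoderSketch(int width, int height)
    {
        x265_param* p = x265_param_alloc();
        x265_param_default_preset(p, "medium", NULL);
        p->sourceWidth  = width;
        p->sourceHeight = height;
        p->fpsNum   = 25;
        p->fpsDenom = 1;
        x265_encoder* enc = x265_encoder_open(p);
        if (enc)
            x265_encoder_parameters(enc, p);   /* read back the post-init parameter set */
        /* p stays owned by the caller; release it with x265_param_free()
         * after x265_encoder_close(enc). */
        return enc;
    }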
+
720
+    void PassEncoder::setReuseLevel()
721
+    {
722
+        uint32_t r, padh = 0, padw = 0;
723
+
724
+        m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
725
+
726
+        m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
727
+        m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
728
+        m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
729
+        m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
730
+        m_param->bUseAnalysisFile = 0;
731
+
732
+        if (m_cliopt.loadLevel)
733
+        {
734
+            x265_param *refParam = m_parent->m_passEnc[m_cliopt.refId]->m_param;
735
+
736
+            if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
737
+                m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
738
+            {
739
+                m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
740
+                m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset;
741
+            }
742
+            else
743
+            {
744
+                int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
745
+                int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
746
+
747
+                double scaleFactorH = double(m_param->sourceHeight / srcH);
748
+                double scaleFactorW = double(m_param->sourceWidth / srcW);
749
+
750
+                int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
751
+                int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
752
+
753
+                if (absScaleFactorH == 20 && absScaleFactorW == 20)
754
+                {
755
+                    m_param->scaleFactor = 2;
756
+
757
+                    m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
758
+                    m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
759
+
760
+                }
761
+            }
762
+        }
763
+
764
+        int h = m_param->sourceHeight + m_param->confWinBottomOffset;
765
+        int w = m_param->sourceWidth + m_param->confWinRightOffset;
766
+        if (h & (m_param->minCUSize - 1))
767
+        {
768
+            r = h & (m_param->minCUSize - 1);
769
+            padh = m_param->minCUSize - r;
770
+            m_param->confWinBottomOffset += padh;
771
+
772
+        }
773
+
774
+        if (w & (m_param->minCUSize - 1))
775
+        {
776
+            r = w & (m_param->minCUSize - 1);
777
+            padw = m_param->minCUSize - r;
778
+            m_param->confWinRightOffset += padw;
779
+        }
780
+    }
781
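
The tail of setReuseLevel() pads the conformance window so the coded dimensions stay multiples of minCUSize. A worked example of that arithmetic (values chosen for illustration):

    // For a 1920x1080 source with minCUSize = 16:
    int h = 1080, minCUSize = 16;
    int r    = h & (minCUSize - 1);   // 1080 & 15 == 8
    int padh = minCUSize - r;         // 16 - 8  == 8 extra rows
    // confWinBottomOffset grows by 8, so the coded height becomes 1088.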
+
782
+    void PassEncoder::startThreads()
783
+    {
784
+        /* Start slave worker threads */
785
+        m_threadActive = true;
786
+        start();
787
+        /* Start reader threads*/
788
+        if (m_reader != NULL)
789
+        {
790
+            m_reader->m_threadActive = true;
791
+            m_reader->start();
792
+        }
793
+        /* Start scaling worker threads */
794
+        if (m_scaler != NULL)
795
+        {
796
+            m_scaler->m_threadActive = true;
797
+            m_scaler->start();
798
+        }
799
+    }
800
+
801
+    void PassEncoder::copyInfo(x265_analysis_data * src)
802
+    {
803
+
804
+        uint32_t written = m_parent->m_analysisWriteCnt[m_id].get();
805
+
806
+        int index = written % m_parent->m_queueSize;
807
+        //If all streams have read analysis data, reuse that position in Queue
808
+
809
+        int read = m_parent->m_analysisRead[m_id][index].get();
810
+        int write = m_parent->m_analysisWrite[m_id][index].get();
811
+
812
+        int overwrite = written / m_parent->m_queueSize;
813
+        bool emptyIdxFound = 0;
814
+        while (!emptyIdxFound && overwrite)
815
+        {
816
+            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
817
+            {
818
+                read = m_parent->m_analysisRead[m_id][i].get();
819
+                write = m_parent->m_analysisWrite[m_id][i].get();
820
+                write *= m_cliopt.numRefs;
821
+
822
+                if (read == write)
823
+                {
824
+                    index = i;
825
+                    emptyIdxFound = 1;
826
+                }
827
+            }
828
+        }
829
+
830
+        x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBuffer[m_id][index];
831
+
832
+        x265_free_analysis_data(m_param, m_analysisInfo);
833
+        memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
834
+        x265_alloc_analysis_data(m_param, m_analysisInfo);
835
+
836
+        bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
837
+        if (m_param->bDisableLookahead && isVbv)
838
+        {
839
+            memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
840
+            memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
841
+            memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
842
+            memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
843
+        }
844
+
845
+        if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I)
846
+        {
847
+            if (m_param->analysisSaveReuseLevel < 2)
848
+                goto ret;
849
+            x265_analysis_intra_data *intraDst, *intraSrc;
850
+            intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
851
+            intraSrc = (x265_analysis_intra_data*)src->intraData;
852
+            memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
853
+            memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
854
+            memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
855
+            memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
856
+            if (m_param->rc.cuTree)
857
+                memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
858
+        }
859
+        else
860
+        {
861
+            bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
862
+            int numDir = src->sliceType == X265_TYPE_P ? 1 : 2;
863
+            memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
864
+            if (m_param->analysisSaveReuseLevel < 2)
865
+                goto ret;
866
+            x265_analysis_inter_data *interDst, *interSrc;
867
+            interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
868
+            interSrc = (x265_analysis_inter_data*)src->interData;
869
+            memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
870
+            memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
871
+            if (m_param->rc.cuTree)
872
+                memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
873
+            if (m_param->analysisSaveReuseLevel > 4)
874
+            {
875
+                memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
876
+                memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
877
+                if (m_param->analysisSaveReuseLevel == 10)
878
+                {
879
+                    memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
880
+                    for (int dir = 0; dir < numDir; dir++)
881
+                    {
882
+                        memcpy(interDst->mvpIdx[dir], interSrc->mvpIdx[dir], sizeof(uint8_t) * src->depthBytes);
883
+                        memcpy(interDst->refIdx[dir], interSrc->refIdx[dir], sizeof(int8_t) * src->depthBytes);
884
+                        memcpy(interDst->mv[dir], interSrc->mv[dir], sizeof(MV) * src->depthBytes);
885
+                    }
886
+                    if (bIntraInInter)
887
+                    {
888
+                        x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
889
+                        x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
890
+                        memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
891
+                        memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
892
+                    }
893
+               }
894
+            }
895
+            if (m_param->analysisSaveReuseLevel != 10)
896
+                memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
897
+        }
898
+
899
+ret:
900
+        //increment analysis Write counter 
901
+        m_parent->m_analysisWriteCnt[m_id].incr();
902
+        m_parent->m_analysisWrite[m_id][index].incr();
903
+        return;
904
+    }
905
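
What copyInfo() shares with dependent passes is gated by analysisSaveReuseLevel; a rough summary of the branches above, as comments (not exhaustive, inferred from this function only):

    // level 1    : slice type and weight params (inter slices only)
    // levels 2-4 : + per-CU depth, (chroma) modes, partition sizes, cuQPOff with cu-tree
    // levels 5-9 : + partSize and mergeFlag
    // level 10   : + interDir, mvpIdx, refIdx and MVs (the ref list is copied only below 10)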
+
906
+
907
+    bool PassEncoder::readPicture(x265_picture *dstPic)
908
+    {
909
+        /*Check and wait if there any input frames to read*/
910
+        int ipread = m_parent->m_picReadCnt[m_id].get();
911
+        int ipwrite = m_parent->m_picWriteCnt[m_id].get();
912
+
913
+        bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1);
914
+        while (!m_inputOver && (ipread == ipwrite))
915
+        {
916
+            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
917
+        }
918
+
919
+        if (m_threadActive && ipread < ipwrite)
920
+        {
921
+            /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
922
+            int readPos = ipread % m_parent->m_queueSize;
923
+            x265_analysis_data* analysisData = 0;
924
+
925
+            if (isAbrLoad)
926
+            {
927
+                /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
928
+                int analysisQId = m_cliopt.refId;
929
+                /*Check and wait if there any analysis Data to read*/
930
+                int analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].get();
931
+                int written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
932
+                int analysisRead = m_parent->m_analysisReadCnt[analysisQId].get();
933
+                
934
+                while (m_threadActive && written == analysisRead)
935
+                {
936
+                    analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
937
+                    written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
938
+                }
939
+
940
+                if (analysisRead < written)
941
+                {
942
+                    int analysisIdx = 0;
943
+                    if (!m_param->bDisableLookahead)
944
+                    {
945
+                        bool analysisdRead = false;
946
+                        while ((analysisRead < written) && !analysisdRead)
947
+                        {
948
+                            while (analysisWrite < ipread)
949
+                            {
950
+                                analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
951
+                                written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
952
+                            }
953
+                            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
954
+                            {
955
+                                analysisData = &m_parent->m_analysisBuffer[analysisQId][i];
956
+                                int read = m_parent->m_analysisRead[analysisQId][i].get();
957
+                                int write = m_parent->m_analysisWrite[analysisQId][i].get() * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
958
+                                if ((analysisData->poc == (uint32_t)(ipread)) && (read < write))
959
+                                {
960
+                                    analysisIdx = i;
961
+                                    analysisdRead = true;
962
+                                    break;
963
+                                }
964
+                            }
965
+                        }
966
+                    }
967
+                    else
968
+                    {
969
+                        analysisIdx = analysisRead % m_parent->m_queueSize;
970
+                        analysisData = &m_parent->m_analysisBuffer[analysisQId][analysisIdx];
971
+                        readPos = analysisData->poc % m_parent->m_queueSize;
972
+                        while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc))
973
+                        {
974
+                            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
975
+                        }
976
+                    }
977
+
978
+                    m_lastIdx = analysisIdx;
979
+                }
980
+                else
981
+                    return false;
982
+            }
983
+
984
+
985
+            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
986
+
987
+            x265_picture *pic = (x265_picture*)(dstPic);
988
+            pic->colorSpace = srcPic->colorSpace;
989
+            pic->bitDepth = srcPic->bitDepth;
990
+            pic->framesize = srcPic->framesize;
991
+            pic->height = srcPic->height;
992
+            pic->pts = srcPic->pts;
993
+            pic->dts = srcPic->dts;
994
+            pic->reorderedPts = srcPic->reorderedPts;
995
+            pic->width = srcPic->width;
996
+            pic->analysisData = srcPic->analysisData;
997
+            pic->userSEI = srcPic->userSEI;
998
+            pic->stride[0] = srcPic->stride[0];
999
+            pic->stride[1] = srcPic->stride[1];
1000
+            pic->stride[2] = srcPic->stride[2];
1001
+            pic->planes[0] = srcPic->planes[0];
1002
+            pic->planes[1] = srcPic->planes[1];
1003
+            pic->planes[2] = srcPic->planes[2];
1004
+            if (isAbrLoad)
1005
+                pic->analysisData = *analysisData;
1006
+            return true;
1007
+        }
1008
+        else
1009
+            return false;
1010
+    }
1011
+
1012
+    void PassEncoder::threadMain()
1013
+    {
1014
         THREAD_NAME("PassEncoder", m_id);
1015
 
1016
         while (m_threadActive)
1017
         {
1018
-
1019
-#if ENABLE_LIBVMAF
1020
-            x265_vmaf_data* vmafdata = m_cliopt.vmafData;
1021
-#endif
1022
-            /* This allows muxers to modify bitstream format */
1023
-            m_cliopt.output->setParam(m_param);
1024
-            const x265_api* api = m_cliopt.api;
1025
-            ReconPlay* reconPlay = NULL;
1026
-            if (m_cliopt.reconPlayCmd)
1027
-                reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
1028
-            char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
1029
-
1030
-            if (m_cliopt.zoneFile)
1031
-            {
1032
-                if (!m_cliopt.parseZoneFile())
1033
-                {
1034
-                    x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n", profileName);
1035
-                    fclose(m_cliopt.zoneFile);
1036
-                    m_cliopt.zoneFile = NULL;
1037
-                }
1038
-            }
1039
-
1040
-            if (signal(SIGINT, sigint_handler) == SIG_ERR)
1041
-                x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
1042
-                    strerror(errno), profileName);
1043
-
1044
-            x265_picture pic_orig, pic_out;
1045
-            x265_picture *pic_in = &pic_orig;
1046
-            /* Allocate recon picture if analysis save/load is enabled */
1047
-            std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
1048
-            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
1049
-            uint32_t inFrameCount = 0;
1050
-            uint32_t outFrameCount = 0;
1051
-            x265_nal *p_nal;
1052
-            x265_stats stats;
1053
-            uint32_t nal;
1054
-            int16_t *errorBuf = NULL;
1055
-            bool bDolbyVisionRPU = false;
1056
-            uint8_t *rpuPayload = NULL;
1057
-            int inputPicNum = 1;
1058
-            x265_picture picField1, picField2;
1059
-            x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
1060
-            bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
1061
-
1062
-            if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
1063
-            {
1064
-                if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
1065
-                {
1066
-                    x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
1067
-                    m_ret = 3;
1068
-                    goto fail;
1069
-                }
1070
-                else
1071
-                    m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
1072
-            }
1073
-
1074
-            if (m_param->bField && m_param->interlaceMode)
1075
-            {
1076
-                api->picture_init(m_param, &picField1);
1077
-                api->picture_init(m_param, &picField2);
1078
-                // return back the original height of input
1079
-                m_param->sourceHeight *= 2;
1080
-                api->picture_init(m_param, &pic_orig);
1081
-            }
1082
-            else
1083
-                api->picture_init(m_param, &pic_orig);
1084
-
1085
-            if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
1086
-            {
1087
-                rpuPayload = X265_MALLOC(uint8_t, 1024);
1088
-                pic_in->rpu.payload = rpuPayload;
1089
-                if (pic_in->rpu.payload)
1090
-                    bDolbyVisionRPU = true;
1091
-            }
1092
-
1093
-            if (m_cliopt.bDither)
1094
-            {
1095
-                errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
1096
-                if (errorBuf)
1097
-                    memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
1098
-                else
1099
-                    m_cliopt.bDither = false;
1100
-            }
1101
-
1102
-            // main encoder loop
1103
-            while (pic_in && !b_ctrl_c)
1104
-            {
1105
-                pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
1106
-                if (m_cliopt.qpfile)
1107
-                {
1108
-                    if (!m_cliopt.parseQPFile(pic_orig))
1109
-                    {
1110
-                        x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
1111
-                            pic_in->poc, profileName);
1112
-                        fclose(m_cliopt.qpfile);
1113
-                        m_cliopt.qpfile = NULL;
1114
-                    }
1115
-                }
1116
-
1117
-                if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
1118
-                    pic_in = NULL;
1119
-                else if (readPicture(pic_in))
1120
-                    inFrameCount++;
1121
-                else
1122
-                    pic_in = NULL;
1123
-
1124
-                if (pic_in)
1125
-                {
1126
-                    if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
1127
-                    {
1128
-                        x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
1129
-                        pic_in->bitDepth = m_param->internalBitDepth;
1130
-                    }
1131
-                    /* Overwrite PTS */
1132
-                    pic_in->pts = pic_in->poc;
1133
-
1134
-                    // convert to field
1135
-                    if (m_param->bField && m_param->interlaceMode)
1136
-                    {
1137
-                        int height = pic_in->height >> 1;
1138
-
1139
-                        int static bCreated = 0;
1140
-                        if (bCreated == 0)
1141
-                        {
1142
-                            bCreated = 1;
1143
-                            inputPicNum = 2;
1144
-                            picField1.fieldNum = 1;
1145
-                            picField2.fieldNum = 2;
1146
-
1147
-                            picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
1148
-                            picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
1149
-                            picField1.height = picField2.height = pic_in->height >> 1;
1150
-                            picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
1151
-
1152
-                            size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
1153
-                            char* field1Buf = X265_MALLOC(char, fieldFrameSize);
1154
-                            char* field2Buf = X265_MALLOC(char, fieldFrameSize);
1155
-
1156
-                            int stride = picField1.stride0 = picField2.stride0 = pic_in->stride0;
1157
-                            uint64_t framesize = stride * (height >> x265_cli_cspspic_in->colorSpace.height0);
1158
-                            picField1.planes0 = field1Buf;
1159
-                            picField2.planes0 = field2Buf;
1160
-                            for (int i = 1; i < x265_cli_cspspic_in->colorSpace.planes; i++)
1161
-                            {
1162
-                                picField1.planesi = field1Buf + framesize;
1163
-                                picField2.planesi = field2Buf + framesize;
1164
-
1165
-                                stride = picField1.stridei = picField2.stridei = pic_in->stridei;
1166
-                                framesize += (stride * (height >> x265_cli_cspspic_in->colorSpace.heighti));
1167
-                            }
1168
-                            assert(framesize == picField1.framesize);
1169
-                        }
1170
-
1171
-                        picField1.pts = picField1.poc = pic_in->poc;
1172
-                        picField2.pts = picField2.poc = pic_in->poc + 1;
1173
-
1174
-                        picField1.userSEI = picField2.userSEI = pic_in->userSEI;
1175
-
1176
-                        //if (pic_in->userData)
1177
-                        //{
1178
-                        //    // Have to handle userData here
1179
-                        //}
1180
-
1181
-                        if (pic_in->framesize)
1182
-                        {
1183
-                            for (int i = 0; i < x265_cli_cspspic_in->colorSpace.planes; i++)
1184
-                            {
1185
-                                char* srcP1 = (char*)pic_in->planesi;
1186
-                                char* srcP2 = (char*)pic_in->planesi + pic_in->stridei;
1187
-                                char* p1 = (char*)picField1.planesi;
1188
-                                char* p2 = (char*)picField2.planesi;
1189
-
1190
-                                int stride = picField1.stridei;
1191
-
1192
-                                for (int y = 0; y < (height >> x265_cli_cspspic_in->colorSpace.heighti); y++)
1193
-                                {
1194
-                                    memcpy(p1, srcP1, stride);
1195
-                                    memcpy(p2, srcP2, stride);
1196
-                                    srcP1 += 2 * stride;
1197
-                                    srcP2 += 2 * stride;
1198
-                                    p1 += stride;
1199
-                                    p2 += stride;
1200
-                                }
1201
-                            }
1202
-                        }
1203
-                    }
1204
-
1205
-                    if (bDolbyVisionRPU)
1206
-                    {
1207
-                        if (m_param->bField && m_param->interlaceMode)
1208
-                        {
1209
-                            if (m_cliopt.rpuParser(&picField1) > 0)
1210
-                                goto fail;
1211
-                            if (m_cliopt.rpuParser(&picField2) > 0)
1212
-                                goto fail;
1213
-                        }
1214
-                        else
1215
-                        {
1216
-                            if (m_cliopt.rpuParser(pic_in) > 0)
1217
-                                goto fail;
1218
-                        }
1219
-                    }
1220
-                }
1221
-
1222
-                for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
1223
-                {
1224
-                    x265_picture *picInput = NULL;
1225
-                    if (inputPicNum == 2)
1226
-                        picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
1227
-                    else
1228
-                        picInput = pic_in;
1229
-
1230
-                    int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
1231
-
1232
-                    int idx = (inFrameCount - 1) % m_parent->m_queueSize;
1233
-                    m_parent->m_picIdxReadCntm_ididx.incr();
1234
-                    m_parent->m_picReadCntm_id.incr();
1235
-                    if (m_cliopt.loadLevel && picInput)
1236
-                    {
1237
-                        m_parent->m_analysisReadCntm_cliopt.refId.incr();
1238
-                        m_parent->m_analysisReadm_cliopt.refIdm_lastIdx.incr();
1239
-                    }
1240
-
1241
-                    if (numEncoded < 0)
1242
-                    {
1243
-                        b_ctrl_c = 1;
1244
-                        m_ret = 4;
1245
-                        break;
1246
-                    }
1247
-
1248
-                    if (reconPlay && numEncoded)
1249
-                        reconPlay->writePicture(*pic_recon);
1250
-
1251
-                    outFrameCount += numEncoded;
1252
-
1253
-                    if (isAbrSave && numEncoded)
1254
-                    {
1255
-                        copyInfo(analysisInfo);
1256
-                    }
1257
-
1258
-                    if (numEncoded && pic_recon && m_cliopt.recon)
1259
-                        m_cliopt.recon->writePicture(pic_out);
1260
-                    if (nal)
1261
-                    {
1262
-                        m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1263
-                        if (pts_queue)
1264
-                        {
1265
-                            pts_queue->push(-pic_out.pts);
1266
-                            if (pts_queue->size() > 2)
1267
-                                pts_queue->pop();
1268
-                        }
1269
-                    }
1270
-                    m_cliopt.printStatus(outFrameCount);
1271
-                }
1272
-            }
1273
-
1274
-            /* Flush the encoder */
1275
-            while (!b_ctrl_c)
1276
-            {
1277
-                int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
1278
-                if (numEncoded < 0)
1279
-                {
1280
-                    m_ret = 4;
1281
-                    break;
1282
-                }
1283
-
1284
-                if (reconPlay && numEncoded)
1285
-                    reconPlay->writePicture(*pic_recon);
1286
-
1287
-                outFrameCount += numEncoded;
1288
-                if (isAbrSave && numEncoded)
1289
-                {
1290
-                    copyInfo(analysisInfo);
1291
-                }
1292
-
1293
-                if (numEncoded && pic_recon && m_cliopt.recon)
1294
-                    m_cliopt.recon->writePicture(pic_out);
1295
-                if (nal)
1296
-                {
1297
-                    m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1298
-                    if (pts_queue)
1299
-                    {
1300
-                        pts_queue->push(-pic_out.pts);
1301
-                        if (pts_queue->size() > 2)
1302
-                            pts_queue->pop();
1303
-                    }
1304
-                }
1305
-
1306
-                m_cliopt.printStatus(outFrameCount);
1307
-
1308
-                if (!numEncoded)
1309
-                    break;
1310
-            }
1311
-
1312
-            if (bDolbyVisionRPU)
1313
-            {
1314
-                if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
1315
-                    x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
1316
-                        profileName);
1317
-                x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
1318
-                    profileName);
1319
-            }
1320
-
1321
-            /* clear progress report */
1322
-            if (m_cliopt.bProgress)
1323
-                fprintf(stderr, "%*s\r", 80, " ");
1324
-
1325
-        fail:
1326
-
1327
-            delete reconPlay;
1328
-
1329
-            api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
1330
-            if (m_param->csvfn && !b_ctrl_c)
1331
-#if ENABLE_LIBVMAF
1332
-                api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
1333
-#else
1334
-                api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
1335
-#endif
1336
-            api->encoder_close(m_encoder);
1337
-
1338
-            int64_t second_largest_pts = 0;
1339
-            int64_t largest_pts = 0;
1340
-            if (pts_queue && pts_queue->size() >= 2)
1341
-            {
1342
-                second_largest_pts = -pts_queue->top();
1343
-                pts_queue->pop();
1344
-                largest_pts = -pts_queue->top();
1345
-                pts_queue->pop();
1346
-                delete pts_queue;
1347
-                pts_queue = NULL;
1348
-            }
1349
-            m_cliopt.output->closeFile(largest_pts, second_largest_pts);
1350
-
1351
-            if (b_ctrl_c)
1352
-                general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
1353
-                    m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
1354
-
1355
-            api->param_free(m_param);
1356
-
1357
-            X265_FREE(errorBuf);
1358
-            X265_FREE(rpuPayload);
1359
-
1360
-            m_threadActive = false;
1361
-            m_parent->m_numActiveEncodes.decr();
1362
-        }
1363
-    }
1364
-
1365
-    void PassEncoder::destroy()
1366
-    {
1367
-        stop();
1368
-        if (m_reader)
1369
-        {
1370
-            m_reader->stop();
1371
-            delete m_reader;
1372
-        }
1373
-        else
1374
-        {
1375
-            m_scaler->stop();
1376
-            m_scaler->destroy();
1377
-            delete m_scaler;
1378
-        }
1379
-    }
1380
-
1381
-    Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
1382
-    {
1383
-        m_parentEnc = parentEnc;
1384
-        m_id = id;
1385
-        m_srcFormat = src;
1386
-        m_dstFormat = dst;
1387
-        m_threadActive = false;
1388
-        m_scaleFrameSize = 0;
1389
-        m_filterManager = NULL;
1390
-        m_threadId = threadId;
1391
-        m_threadTotal = threadNum;
1392
-
1393
-        int csp = dst->m_csp;
1394
-        uint32_t pixelbytes = dst->m_inputDepth > 8 ? 2 : 1;
1395
-        for (int i = 0; i < x265_cli_cspscsp.planes; i++)
1396
-        {
1397
-            int w = dst->m_width >> x265_cli_cspscsp.widthi;
1398
-            int h = dst->m_height >> x265_cli_cspscsp.heighti;
1399
-            m_scalePlanesi = w * h * pixelbytes;
1400
-            m_scaleFrameSize += m_scalePlanesi;
1401
-        }
1402
-
1403
-        if (src->m_height != dst->m_height || src->m_width != dst->m_width)
1404
-        {
1405
-            m_filterManager = new ScalerFilterManager;
1406
-            m_filterManager->init(4, m_srcFormat, m_dstFormat);
1407
-        }
1408
-    }
1409
-
1410
-    bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
1411
-    {
1412
-        if (!destination || !source)
1413
-            return false;
1414
-        x265_param* param = m_parentEnc->m_param;
1415
-        int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
1416
-        if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
1417
-        {
1418
-            void **srcPlane = NULL, **dstPlane = NULL;
1419
-            int srcStride3, dstStride3;
1420
-            destination->bitDepth = source->bitDepth;
1421
-            destination->colorSpace = source->colorSpace;
1422
-            destination->pts = source->pts;
1423
-            destination->dts = source->dts;
1424
-            destination->reorderedPts = source->reorderedPts;
1425
-            destination->poc = source->poc;
1426
-            destination->userSEI = source->userSEI;
1427
-            srcPlane = source->planes;
1428
-            dstPlane = destination->planes;
1429
-            srcStride0 = source->stride0;
1430
-            destination->stride0 = m_dstFormat->m_width * pixelBytes;
1431
-            dstStride0 = destination->stride0;
1432
-            if (param->internalCsp != X265_CSP_I400)
1433
-            {
1434
-                srcStride1 = source->stride1;
1435
-                srcStride2 = source->stride2;
1436
-                destination->stride1 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width1;
1437
-                destination->stride2 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width2;
1438
-                dstStride1 = destination->stride1;
1439
-                dstStride2 = destination->stride2;
1440
-            }
1441
-            if (m_scaleFrameSize)
1442
-            {
1443
-                m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
1444
-                return true;
1445
-            }
1446
-            else
1447
-                x265_log(param, X265_LOG_INFO, "Empty frame received\n");
1448
-        }
1449
-        return false;
1450
-    }
1451
-
1452
-    void Scaler::threadMain()
1453
-    {
1454
-        THREAD_NAME("Scaler", m_id);
1455
-
1456
-        /* unscaled picture is stored in the last index */
1457
-        uint32_t srcId = m_id - 1;
1458
-        int QDepth = m_parentEnc->m_parent->m_queueSize;
1459
-        while (!m_parentEnc->m_inputOver)
1460
-        {
1461
-
1462
-            uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
1463
-
1464
-            if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
1465
-                break;
1466
-
1467
-            if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
1468
-            {
1469
-                continue;
1470
-            }
1471
-            uint32_t written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
1472
-
1473
-            /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
1474
-            while (m_threadActive && (scaledWritten == written)) {
1475
-                written = m_parentEnc->m_parent->m_picWriteCntsrcId.waitForChange(written);
1476
-            }
1477
-
1478
-            if (m_threadActive && scaledWritten < written)
1479
-            {
1480
-
1481
-                int scaledWriteIdx = scaledWritten % QDepth;
1482
-                int overWritePicBuffer = scaledWritten / QDepth;
1483
-                int read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.get();
1484
-
1485
-                while (overWritePicBuffer && read < overWritePicBuffer)
1486
-                {
1487
-                    read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.waitForChange(read);
1488
-                }
1489
-
1490
-                if (!m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx)
1491
-                {
1492
-                    int framesize = 0;
1493
-                    int planesize3;
1494
-                    int csp = m_dstFormat->m_csp;
1495
-                    int stride3;
1496
-                    stride0 = m_dstFormat->m_width;
1497
-                    stride1 = stride0 >> x265_cli_cspscsp.width1;
1498
-                    stride2 = stride0 >> x265_cli_cspscsp.width2;
1499
-                    for (int i = 0; i < x265_cli_cspscsp.planes; i++)
1500
-                    {
1501
-                        uint32_t h = m_dstFormat->m_height >> x265_cli_cspscsp.heighti;
1502
-                        planesizei = h * stridei;
1503
-                        framesize += planesizei;
1504
-                    }
1505
-
1506
-                    m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx = x265_picture_alloc();
1507
-                    x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx);
1508
-
1509
-                    ((x265_picture*)m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth)->framesize = framesize;
1510
-                    for (int32_t j = 0; j < x265_cli_cspscsp.planes; j++)
1511
-                    {
1512
-                        m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth->planesj = X265_MALLOC(char, planesizej);
1513
-                    }
1514
-                }
1515
-
1516
-                x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffersrcIdscaledWritten % QDepth;
1517
-                x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx;
1518
-
1519
-                // Enqueue this picture up with the current encoder so that it will asynchronously encode
1520
-                if (!scalePic(destPic, srcPic))
1521
-                    x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
1522
-                else
1523
-                    m_parentEnc->m_parent->m_picWriteCntm_id.incr();
1524
-                m_scaledWriteCnt.incr();
1525
-                m_parentEnc->m_parent->m_picIdxReadCntsrcIdscaledWriteIdx.incr();
1526
-            }
1527
-            if (m_threadTotal > 1)
1528
-            {
1529
-                written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
1530
-                int totalWrite = written / m_threadTotal;
1531
-                if (written % m_threadTotal > m_threadId)
1532
-                    totalWrite++;
1533
-                if (totalWrite == m_scaledWriteCnt.get())
1534
-                {
1535
-                    m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
1536
-                    m_parentEnc->m_parent->m_picWriteCntm_id.poke();
1537
-                    break;
1538
-                }
1539
-            }
1540
-            else
1541
-            {
1542
-                /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
1543
-                scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
1544
-                written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
1545
-                if (written == scaledWritten)
1546
-                {
1547
-                    m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
1548
-                    m_parentEnc->m_parent->m_picWriteCntm_id.poke();
1549
-                    break;
1550
-                }
1551
-            }
1552
-
1553
-        }
1554
-        m_threadActive = false;
1555
-        destroy();
1556
-    }
1557
-
1558
-    Reader::Reader(int id, PassEncoder *parentEnc)
1559
-    {
1560
-        m_parentEnc = parentEnc;
1561
-        m_id = id;
1562
-        m_input = parentEnc->m_input;
1563
-    }
1564
-
1565
-    void Reader::threadMain()
1566
-    {
1567
-        THREAD_NAME("Reader", m_id);
1568
-
1569
-        int QDepth = m_parentEnc->m_parent->m_queueSize;
1570
-        x265_picture* src = x265_picture_alloc();
1571
-        x265_picture_init(m_parentEnc->m_param, src);
1572
-
1573
-        while (m_threadActive)
1574
-        {
1575
-            uint32_t written = m_parentEnc->m_parent->m_picWriteCntm_id.get();
1576
-            uint32_t writeIdx = written % QDepth;
1577
-            uint32_t read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.get();
1578
-            uint32_t overWritePicBuffer = written / QDepth;
1579
-
1580
-            if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
1581
-                break;
1582
-
1583
-            while (overWritePicBuffer && read < overWritePicBuffer)
1584
-            {
1585
-                read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.waitForChange(read);
1586
-            }
1587
-
1588
-            x265_picture* dest = m_parentEnc->m_parent->m_inputPicBufferm_idwriteIdx;
1589
-            if (m_input->readPicture(*src))
1590
-            {
1591
-                dest->poc = src->poc;
1592
-                dest->pts = src->pts;
1593
-                dest->userSEI = src->userSEI;
1594
-                dest->bitDepth = src->bitDepth;
1595
-                dest->framesize = src->framesize;
1596
-                dest->height = src->height;
1597
-                dest->width = src->width;
1598
-                dest->colorSpace = src->colorSpace;
1599
-                dest->userSEI = src->userSEI;
1600
-                dest->rpu.payload = src->rpu.payload;
1601
-                dest->picStruct = src->picStruct;
1602
-                dest->stride0 = src->stride0;
1603
-                dest->stride1 = src->stride1;
1604
-                dest->stride2 = src->stride2;
1605
-
1606
-                if (!dest->planes0)
1607
-                    dest->planes0 = X265_MALLOC(char, dest->framesize);
1608
-
1609
-                memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
1610
-                dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
1611
-                dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
1612
-                m_parentEnc->m_parent->m_picWriteCntm_id.incr();
1613
-            }
1614
-            else
1615
-            {
1616
-                m_threadActive = false;
1617
-                m_parentEnc->m_inputOver = true;
1618
-                m_parentEnc->m_parent->m_picWriteCntm_id.poke();
1619
-            }
1620
-        }
1621
-        x265_picture_free(src);
1622
-    }
1623
-}
1624
+
1625
+#if ENABLE_LIBVMAF
1626
+            x265_vmaf_data* vmafdata = m_cliopt.vmafData;
1627
+#endif
1628
+            /* This allows muxers to modify bitstream format */
1629
+            m_cliopt.output->setParam(m_param);
1630
+            const x265_api* api = m_cliopt.api;
1631
+            ReconPlay* reconPlay = NULL;
1632
+            if (m_cliopt.reconPlayCmd)
1633
+                reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
1634
+            char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
1635
+
1636
+            if (signal(SIGINT, sigint_handler) == SIG_ERR)
1637
+                x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
1638
+                    strerror(errno), profileName);
1639
+
1640
+            x265_picture pic_orig, pic_out;
1641
+            x265_picture *pic_in = &pic_orig;
1642
+            /* Allocate recon picture if analysis save/load is enabled */
1643
+            std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
1644
+            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
1645
+            uint32_t inFrameCount = 0;
1646
+            uint32_t outFrameCount = 0;
1647
+            x265_nal *p_nal;
1648
+            x265_stats stats;
1649
+            uint32_t nal;
1650
+            int16_t *errorBuf = NULL;
1651
+            bool bDolbyVisionRPU = false;
1652
+            uint8_t *rpuPayload = NULL;
1653
+            int inputPicNum = 1;
1654
+            x265_picture picField1, picField2;
1655
+            x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
1656
+            bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
1657
+
1658
+            if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
1659
+            {
1660
+                if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
1661
+                {
1662
+                    x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
1663
+                    m_ret = 3;
1664
+                    goto fail;
1665
+                }
1666
+                else
1667
+                    m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
1668
+            }
1669
+
1670
+            if (m_param->bField && m_param->interlaceMode)
1671
+            {
1672
+                api->picture_init(m_param, &picField1);
1673
+                api->picture_init(m_param, &picField2);
1674
+                // return back the original height of input
1675
+                m_param->sourceHeight *= 2;
1676
+                api->picture_init(m_param, &pic_orig);
1677
+            }
1678
+            else
1679
+                api->picture_init(m_param, &pic_orig);
1680
+
1681
+            if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
1682
+            {
1683
+                rpuPayload = X265_MALLOC(uint8_t, 1024);
1684
+                pic_in->rpu.payload = rpuPayload;
1685
+                if (pic_in->rpu.payload)
1686
+                    bDolbyVisionRPU = true;
1687
+            }
1688
+
1689
+            if (m_cliopt.bDither)
1690
+            {
1691
+                errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
1692
+                if (errorBuf)
1693
+                    memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
1694
+                else
1695
+                    m_cliopt.bDither = false;
1696
+            }
1697
+
1698
+            // main encoder loop
1699
+            while (pic_in && !b_ctrl_c)
1700
+            {
1701
+                pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
1702
+                if (m_cliopt.qpfile)
1703
+                {
1704
+                    if (!m_cliopt.parseQPFile(pic_orig))
1705
+                    {
1706
+                        x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
1707
+                            pic_in->poc, profileName);
1708
+                        fclose(m_cliopt.qpfile);
1709
+                        m_cliopt.qpfile = NULL;
1710
+                    }
1711
+                }
1712
+
1713
+                if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
1714
+                    pic_in = NULL;
1715
+                else if (readPicture(pic_in))
1716
+                    inFrameCount++;
1717
+                else
1718
+                    pic_in = NULL;
1719
+
1720
+                if (pic_in)
1721
+                {
1722
+                    if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
1723
+                    {
1724
+                        x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
1725
+                        pic_in->bitDepth = m_param->internalBitDepth;
1726
+                    }
1727
+                    /* Overwrite PTS */
1728
+                    pic_in->pts = pic_in->poc;
1729
+
1730
+                    // convert to field
1731
+                    if (m_param->bField && m_param->interlaceMode)
1732
+                    {
1733
+                        int height = pic_in->height >> 1;
1734
+
1735
+                        int static bCreated = 0;
1736
+                        if (bCreated == 0)
1737
+                        {
1738
+                            bCreated = 1;
1739
+                            inputPicNum = 2;
1740
+                            picField1.fieldNum = 1;
1741
+                            picField2.fieldNum = 2;
1742
+
1743
+                            picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
1744
+                            picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
1745
+                            picField1.height = picField2.height = pic_in->height >> 1;
1746
+                            picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
1747
+
1748
+                            size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
1749
+                            char* field1Buf = X265_MALLOC(char, fieldFrameSize);
1750
+                            char* field2Buf = X265_MALLOC(char, fieldFrameSize);
1751
+
1752
+                            int stride = picField1.stride[0] = picField2.stride[0] = pic_in->stride[0];
1753
+                            uint64_t framesize = stride * (height >> x265_cli_csps[pic_in->colorSpace].height[0]);
1754
+                            picField1.planes[0] = field1Buf;
1755
+                            picField2.planes[0] = field2Buf;
1756
+                            for (int i = 1; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
1757
+                            {
1758
+                                picField1.planes[i] = field1Buf + framesize;
1759
+                                picField2.planes[i] = field2Buf + framesize;
1760
+
1761
+                                stride = picField1.stride[i] = picField2.stride[i] = pic_in->stride[i];
1762
+                                framesize += (stride * (height >> x265_cli_csps[pic_in->colorSpace].height[i]));
1763
+                            }
1764
+                            assert(framesize == picField1.framesize);
1765
+                        }
1766
+
1767
+                        picField1.pts = picField1.poc = pic_in->poc;
1768
+                        picField2.pts = picField2.poc = pic_in->poc + 1;
1769
+
1770
+                        picField1.userSEI = picField2.userSEI = pic_in->userSEI;
1771
+
1772
+                        //if (pic_in->userData)
1773
+                        //{
1774
+                        //    // Have to handle userData here
1775
+                        //}
1776
+
1777
+                        if (pic_in->framesize)
1778
+                        {
1779
+                            for (int i = 0; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
1780
+                            {
1781
+                                char* srcP1 = (char*)pic_in->planes[i];
1782
+                                char* srcP2 = (char*)pic_in->planes[i] + pic_in->stride[i];
1783
+                                char* p1 = (char*)picField1.planes[i];
1784
+                                char* p2 = (char*)picField2.planes[i];
1785
+
1786
+                                int stride = picField1.stride[i];
1787
+
1788
+                                for (int y = 0; y < (height >> x265_cli_csps[pic_in->colorSpace].height[i]); y++)
1789
+                                {
1790
+                                    memcpy(p1, srcP1, stride);
1791
+                                    memcpy(p2, srcP2, stride);
1792
+                                    srcP1 += 2 * stride;
1793
+                                    srcP2 += 2 * stride;
1794
+                                    p1 += stride;
1795
+                                    p2 += stride;
1796
+                                }
1797
+                            }
1798
+                        }
1799
+                    }
1800
+
1801
+                    if (bDolbyVisionRPU)
1802
+                    {
1803
+                        if (m_param->bField && m_param->interlaceMode)
1804
+                        {
1805
+                            if (m_cliopt.rpuParser(&picField1) > 0)
1806
+                                goto fail;
1807
+                            if (m_cliopt.rpuParser(&picField2) > 0)
1808
+                                goto fail;
1809
+                        }
1810
+                        else
1811
+                        {
1812
+                            if (m_cliopt.rpuParser(pic_in) > 0)
1813
+                                goto fail;
1814
+                        }
1815
+                    }
1816
+                }
1817
+
1818
+                for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
1819
+                {
1820
+                    x265_picture *picInput = NULL;
1821
+                    if (inputPicNum == 2)
1822
+                        picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
1823
+                    else
1824
+                        picInput = pic_in;
1825
+
1826
+                    int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
1827
+
1828
+                    int idx = (inFrameCount - 1) % m_parent->m_queueSize;
1829
+                    m_parent->m_picIdxReadCnt[m_id][idx].incr();
1830
+                    m_parent->m_picReadCnt[m_id].incr();
1831
+                    if (m_cliopt.loadLevel && picInput)
1832
+                    {
1833
+                        m_parent->m_analysisReadCnt[m_cliopt.refId].incr();
1834
+                        m_parent->m_analysisRead[m_cliopt.refId][m_lastIdx].incr();
1835
+                    }
1836
+
1837
+                    if (numEncoded < 0)
1838
+                    {
1839
+                        b_ctrl_c = 1;
1840
+                        m_ret = 4;
1841
+                        break;
1842
+                    }
1843
+
1844
+                    if (reconPlay && numEncoded)
1845
+                        reconPlay->writePicture(*pic_recon);
1846
+
1847
+                    outFrameCount += numEncoded;
1848
+
1849
+                    if (isAbrSave && numEncoded)
1850
+                    {
1851
+                        copyInfo(analysisInfo);
1852
+                    }
1853
+
1854
+                    if (numEncoded && pic_recon && m_cliopt.recon)
1855
+                        m_cliopt.recon->writePicture(pic_out);
1856
+                    if (nal)
1857
+                    {
1858
+                        m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1859
+                        if (pts_queue)
1860
+                        {
1861
+                            pts_queue->push(-pic_out.pts);
1862
+                            if (pts_queue->size() > 2)
1863
+                                pts_queue->pop();
1864
+                        }
1865
+                    }
1866
+                    m_cliopt.printStatus(outFrameCount);
1867
+                }
1868
+            }
1869
+
1870
+            /* Flush the encoder */
1871
+            while (!b_ctrl_c)
1872
+            {
1873
+                int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
1874
+                if (numEncoded < 0)
1875
+                {
1876
+                    m_ret = 4;
1877
+                    break;
1878
+                }
1879
+
1880
+                if (reconPlay && numEncoded)
1881
+                    reconPlay->writePicture(*pic_recon);
1882
+
1883
+                outFrameCount += numEncoded;
1884
+                if (isAbrSave && numEncoded)
1885
+                {
1886
+                    copyInfo(analysisInfo);
1887
+                }
1888
+
1889
+                if (numEncoded && pic_recon && m_cliopt.recon)
1890
+                    m_cliopt.recon->writePicture(pic_out);
1891
+                if (nal)
1892
+                {
1893
+                    m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1894
+                    if (pts_queue)
1895
+                    {
1896
+                        pts_queue->push(-pic_out.pts);
1897
+                        if (pts_queue->size() > 2)
1898
+                            pts_queue->pop();
1899
+                    }
1900
+                }
1901
+
1902
+                m_cliopt.printStatus(outFrameCount);
1903
+
1904
+                if (!numEncoded)
1905
+                    break;
1906
+            }
1907
+
1908
+            if (bDolbyVisionRPU)
1909
+            {
1910
+                if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
1911
+                    x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
1912
+                        profileName);
1913
+                x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
1914
+                    profileName);
1915
+            }
1916
+
1917
+            /* clear progress report */
1918
+            if (m_cliopt.bProgress)
1919
+                fprintf(stderr, "%*s\r", 80, " ");
1920
+
1921
+        fail:
1922
+
1923
+            delete reconPlay;
1924
+
1925
+            api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
1926
+            if (m_param->csvfn && !b_ctrl_c)
1927
+#if ENABLE_LIBVMAF
1928
+                api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
1929
+#else
1930
+                api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
1931
+#endif
1932
+            api->encoder_close(m_encoder);
1933
+
1934
+            int64_t second_largest_pts = 0;
1935
+            int64_t largest_pts = 0;
1936
+            if (pts_queue && pts_queue->size() >= 2)
1937
+            {
1938
+                second_largest_pts = -pts_queue->top();
1939
+                pts_queue->pop();
1940
+                largest_pts = -pts_queue->top();
1941
+                pts_queue->pop();
1942
+                delete pts_queue;
1943
+                pts_queue = NULL;
1944
+            }
1945
+            m_cliopt.output->closeFile(largest_pts, second_largest_pts);
1946
+
1947
+            if (b_ctrl_c)
1948
+                general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
1949
+                    m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
1950
+
1951
+            api->param_free(m_param);
1952
+
1953
+            X265_FREE(errorBuf);
1954
+            X265_FREE(rpuPayload);
1955
+
1956
+            m_threadActive = false;
1957
+            m_parent->m_numActiveEncodes.decr();
1958
+        }
1959
+    }
1960
+
1961
+    void PassEncoder::destroy()
1962
+    {
1963
+        stop();
1964
+        if (m_reader)
1965
+        {
1966
+            m_reader->stop();
1967
+            delete m_reader;
1968
+        }
1969
+        else
1970
+        {
1971
+            m_scaler->stop();
1972
+            m_scaler->destroy();
1973
+            delete m_scaler;
1974
+        }
1975
+    }
1976
+
1977
+    Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
1978
+    {
1979
+        m_parentEnc = parentEnc;
1980
+        m_id = id;
1981
+        m_srcFormat = src;
1982
+        m_dstFormat = dst;
1983
+        m_threadActive = false;
1984
+        m_scaleFrameSize = 0;
1985
+        m_filterManager = NULL;
1986
+        m_threadId = threadId;
1987
+        m_threadTotal = threadNum;
1988
+
1989
+        int csp = dst->m_csp;
1990
+        uint32_t pixelbytes = dst->m_inputDepth > 8 ? 2 : 1;
1991
+        for (int i = 0; i < x265_cli_csps[csp].planes; i++)
1992
+        {
1993
+            int w = dst->m_width >> x265_cli_csps[csp].width[i];
1994
+            int h = dst->m_height >> x265_cli_csps[csp].height[i];
1995
+            m_scalePlanes[i] = w * h * pixelbytes;
1996
+            m_scaleFrameSize += m_scalePlanes[i];
1997
+        }
1998
+
1999
+        if (src->m_height != dst->m_height || src->m_width != dst->m_width)
2000
+        {
2001
+            m_filterManager = new ScalerFilterManager;
2002
+            m_filterManager->init(4, m_srcFormat, m_dstFormat);
2003
+        }
2004
+    }
2005
+
2006
+    bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
2007
+    {
2008
+        if (!destination || !source)
2009
+            return false;
2010
+        x265_param* param = m_parentEnc->m_param;
2011
+        int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
2012
+        if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
2013
+        {
2014
+            void **srcPlane = NULL, **dstPlane = NULL;
2015
+            int srcStride[3], dstStride[3];
2016
+            destination->bitDepth = source->bitDepth;
2017
+            destination->colorSpace = source->colorSpace;
2018
+            destination->pts = source->pts;
2019
+            destination->dts = source->dts;
2020
+            destination->reorderedPts = source->reorderedPts;
2021
+            destination->poc = source->poc;
2022
+            destination->userSEI = source->userSEI;
2023
+            srcPlane = source->planes;
2024
+            dstPlane = destination->planes;
2025
+            srcStride[0] = source->stride[0];
2026
+            destination->stride[0] = m_dstFormat->m_width * pixelBytes;
2027
+            dstStride[0] = destination->stride[0];
2028
+            if (param->internalCsp != X265_CSP_I400)
2029
+            {
2030
+                srcStride[1] = source->stride[1];
2031
+                srcStride[2] = source->stride[2];
2032
+                destination->stride[1] = destination->stride[0] >> x265_cli_csps[param->internalCsp].width[1];
2033
+                destination->stride[2] = destination->stride[0] >> x265_cli_csps[param->internalCsp].width[2];
2034
+                dstStride[1] = destination->stride[1];
2035
+                dstStride[2] = destination->stride[2];
2036
+            }
2037
+            if (m_scaleFrameSize)
2038
+            {
2039
+                m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
2040
+                return true;
2041
+            }
2042
+            else
2043
+                x265_log(param, X265_LOG_INFO, "Empty frame received\n");
2044
+        }
2045
+        return false;
2046
+    }
2047
+
2048
+    void Scaler::threadMain()
2049
+    {
2050
+        THREAD_NAME("Scaler", m_id);
2051
+
2052
+        /* unscaled picture is stored in the last index */
2053
+        uint32_t srcId = m_id - 1;
2054
+        int QDepth = m_parentEnc->m_parent->m_queueSize;
2055
+        while (!m_parentEnc->m_inputOver)
2056
+        {
2057
+
2058
+            uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
2059
+
2060
+            if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
2061
+                break;
2062
+
2063
+            if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
2064
+            {
2065
+                continue;
2066
+            }
2067
+            uint32_t written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
2068
+
2069
+            /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
2070
+            while (m_threadActive && (scaledWritten == written)) {
2071
+                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].waitForChange(written);
2072
+            }
2073
+
2074
+            if (m_threadActive && scaledWritten < written)
2075
+            {
2076
+
2077
+                int scaledWriteIdx = scaledWritten % QDepth;
2078
+                int overWritePicBuffer = scaledWritten / QDepth;
2079
+                int read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][scaledWriteIdx].get();
2080
+
2081
+                while (overWritePicBuffer && read < overWritePicBuffer)
2082
+                {
2083
+                    read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][scaledWriteIdx].waitForChange(read);
2084
+                }
2085
+
2086
+                if (!m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx])
2087
+                {
2088
+                    int framesize = 0;
2089
+                    int planesize[3];
2090
+                    int csp = m_dstFormat->m_csp;
2091
+                    int stride[3];
2092
+                    stride[0] = m_dstFormat->m_width;
2093
+                    stride[1] = stride[0] >> x265_cli_csps[csp].width[1];
2094
+                    stride[2] = stride[0] >> x265_cli_csps[csp].width[2];
2095
+                    for (int i = 0; i < x265_cli_csps[csp].planes; i++)
2096
+                    {
2097
+                        uint32_t h = m_dstFormat->m_height >> x265_cli_csps[csp].height[i];
2098
+                        planesize[i] = h * stride[i];
2099
+                        framesize += planesize[i];
2100
+                    }
2101
+
2102
+                    m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx] = x265_picture_alloc();
2103
+                    x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx]);
2104
+
2105
+                    ((x265_picture*)m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWritten % QDepth])->framesize = framesize;
2106
+                    for (int32_t j = 0; j < x265_cli_csps[csp].planes; j++)
2107
+                    {
2108
+                        m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWritten % QDepth]->planes[j] = X265_MALLOC(char, planesize[j]);
2109
+                    }
2110
+                }
2111
+
2112
+                x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffer[srcId][scaledWritten % QDepth];
2113
+                x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx];
2114
+
2115
+                // Enqueue this picture up with the current encoder so that it will asynchronously encode
2116
+                if (!scalePic(destPic, srcPic))
2117
+                    x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
2118
+                else
2119
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].incr();
2120
+                m_scaledWriteCnt.incr();
2121
+                m_parentEnc->m_parent->m_picIdxReadCnt[srcId][scaledWriteIdx].incr();
2122
+            }
2123
+            if (m_threadTotal > 1)
2124
+            {
2125
+                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
2126
+                int totalWrite = written / m_threadTotal;
2127
+                if (written % m_threadTotal > m_threadId)
2128
+                    totalWrite++;
2129
+                if (totalWrite == m_scaledWriteCnt.get())
2130
+                {
2131
+                    m_parentEnc->m_parent->m_picWriteCnt[srcId].poke();
2132
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
2133
+                    break;
2134
+                }
2135
+            }
2136
+            else
2137
+            {
2138
+                /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
2139
+                scaledWritten = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
2140
+                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
2141
+                if (written == scaledWritten)
2142
+                {
2143
+                    m_parentEnc->m_parent->m_picWriteCnt[srcId].poke();
2144
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
2145
+                    break;
2146
+                }
2147
+            }
2148
+
2149
+        }
2150
+        m_threadActive = false;
2151
+        destroy();
2152
+    }
2153
+
2154
+    Reader::Reader(int id, PassEncoder *parentEnc)
2155
+    {
2156
+        m_parentEnc = parentEnc;
2157
+        m_id = id;
2158
+        m_input = parentEnc->m_input;
2159
+    }
2160
+
2161
+    void Reader::threadMain()
2162
+    {
2163
+        THREAD_NAME("Reader", m_id);
2164
+
2165
+        int QDepth = m_parentEnc->m_parent->m_queueSize;
2166
+        x265_picture* src = x265_picture_alloc();
2167
+        x265_picture_init(m_parentEnc->m_param, src);
2168
+
2169
+        while (m_threadActive)
2170
+        {
2171
+            uint32_t written = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
2172
+            uint32_t writeIdx = written % QDepth;
2173
+            uint32_t read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][writeIdx].get();
2174
+            uint32_t overWritePicBuffer = written / QDepth;
2175
+
2176
+            if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
2177
+                break;
2178
+
2179
+            while (overWritePicBuffer && read < overWritePicBuffer)
2180
+            {
2181
+                read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][writeIdx].waitForChange(read);
2182
+            }
2183
+
2184
+            x265_picture* dest = m_parentEnc->m_parent->m_inputPicBuffer[m_id][writeIdx];
2185
+            if (m_input->readPicture(*src))
2186
+            {
2187
+                dest->poc = src->poc;
2188
+                dest->pts = src->pts;
2189
+                dest->userSEI = src->userSEI;
2190
+                dest->bitDepth = src->bitDepth;
2191
+                dest->framesize = src->framesize;
2192
+                dest->height = src->height;
2193
+                dest->width = src->width;
2194
+                dest->colorSpace = src->colorSpace;
2195
+                dest->userSEI = src->userSEI;
2196
+                dest->rpu.payload = src->rpu.payload;
2197
+                dest->picStruct = src->picStruct;
2198
+                dest->stride[0] = src->stride[0];
2199
+                dest->stride[1] = src->stride[1];
2200
+                dest->stride[2] = src->stride[2];
2201
+
2202
+                if (!dest->planes[0])
2203
+                    dest->planes[0] = X265_MALLOC(char, dest->framesize);
2204
+
2205
+                memcpy(dest->planes[0], src->planes[0], src->framesize * sizeof(char));
2206
+                dest->planes[1] = (char*)dest->planes[0] + src->stride[0] * src->height;
2207
+                dest->planes[2] = (char*)dest->planes[1] + src->stride[1] * (src->height >> x265_cli_csps[src->colorSpace].height[1]);
2208
+                m_parentEnc->m_parent->m_picWriteCntm_id.incr();
2209
+            }
2210
+            else
2211
+            {
2212
+                m_threadActive = false;
2213
+                m_parentEnc->m_inputOver = true;
2214
+                m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2215
+            }
2216
+        }
2217
+        x265_picture_free(src);
2218
+    }
2219
+}
2220
x265_3.5.tar.gz/source/abrEncApp.h -> x265_3.6.tar.gz/source/abrEncApp.h Changed
9
 
1
@@ -91,6 +91,7 @@
2
         FILE*    m_qpfile;
3
         FILE*    m_zoneFile;
4
         FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
5
+        FILE*    m_scenecutAwareQpConfig;
6
 
7
         int m_ret;
8
 
9
x265_3.5.tar.gz/source/cmake/FindNeon.cmake -> x265_3.6.tar.gz/source/cmake/FindNeon.cmake Changed
27
 
1
@@ -1,10 +1,21 @@
2
 include(FindPackageHandleStandardArgs)
3
 
4
 # Check the version of neon supported by the ARM CPU
5
-execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
6
-                OUTPUT_VARIABLE neon_version
7
-                ERROR_QUIET
8
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
9
+if(APPLE)
10
+    execute_process(COMMAND sysctl -a
11
+                    COMMAND grep "hw.optional.neon: 1"
12
+                    OUTPUT_VARIABLE neon_version
13
+                    ERROR_QUIET
14
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
15
+else()
16
+    execute_process(COMMAND cat /proc/cpuinfo
17
+                    COMMAND grep Features
18
+                    COMMAND grep neon
19
+                    OUTPUT_VARIABLE neon_version
20
+                    ERROR_QUIET
21
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
22
+endif()
23
+
24
 if(neon_version)
25
     set(CPU_HAS_NEON 1)
26
 endif()
27
x265_3.6.tar.gz/source/cmake/FindSVE.cmake Added
23
 
1
@@ -0,0 +1,21 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE supported by the ARM CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.sve: 1"
8
+                    OUTPUT_VARIABLE sve_version
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep -e "sve$" -e "sve[[:space:]]"
15
+                    OUTPUT_VARIABLE sve_version
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve_version)
21
+    set(CPU_HAS_SVE 1)
22
+endif()
23
x265_3.6.tar.gz/source/cmake/FindSVE2.cmake Added
24
 
1
@@ -0,0 +1,22 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE2 supported by the ARM CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.sve2: 1"
8
+                    OUTPUT_VARIABLE sve2_version
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep sve2
15
+                    OUTPUT_VARIABLE sve2_version
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve2_version)
21
+    set(CPU_HAS_SVE 1)
22
+    set(CPU_HAS_SVE2 1)
23
+endif()
24
x265_3.5.tar.gz/source/common/CMakeLists.txt -> x265_3.6.tar.gz/source/common/CMakeLists.txt Changed
76
 
1
@@ -84,35 +84,42 @@
2
 endif(ENABLE_ASSEMBLY AND X86)
3
 
4
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
5
-    if(ARM64)
6
-        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
7
-            message(STATUS "Detected CXX compiler using -O3 optimization level")
8
-            add_definitions(-DAUTO_VECTORIZE=1)
9
-        endif()
10
-        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
11
-
12
-        # add ARM assembly/intrinsic files here
13
-        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
14
-        set(VEC_PRIMITIVES)
15
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
16
 
17
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
18
-        foreach(SRC ${C_SRCS})
19
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
20
-        endforeach()
21
-    else()
22
-        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
23
+    # add ARM assembly/intrinsic files here
24
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
25
+    set(VEC_PRIMITIVES)
26
 
27
-        # add ARM assembly/intrinsic files here
28
-        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
29
-        set(VEC_PRIMITIVES)
30
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
31
+    foreach(SRC ${C_SRCS})
32
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
33
+    endforeach()
34
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
35
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
36
 
37
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
38
-        foreach(SRC ${C_SRCS})
39
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
40
-        endforeach()
41
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
42
+    if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
43
+        message(STATUS "Detected CXX compiler using -O3 optimization level")
44
+        add_definitions(-DAUTO_VECTORIZE=1)
45
     endif()
46
+
47
+    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
48
+    enable_language(ASM)
49
+
50
+    # add ARM assembly/intrinsic files here
51
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
52
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
53
+    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
54
+    set(VEC_PRIMITIVES)
55
+
56
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
57
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
58
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
59
+    foreach(SRC ${C_SRCS})
60
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
61
+    endforeach()
62
     source_group(Assembly FILES ${ASM_PRIMITIVES})
63
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
64
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
65
 
66
 if(POWER)
67
     set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
68
@@ -169,4 +176,6 @@
69
     scalinglist.cpp scalinglist.h
70
     quant.cpp quant.h contexts.h
71
     deblock.cpp deblock.h
72
-    scaler.cpp scaler.h)
73
+    scaler.cpp scaler.h
74
+    ringmem.cpp ringmem.h
75
+    temporalfilter.cpp temporalfilter.h)
76
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp Added
302
 
1
@@ -0,0 +1,300 @@
2
+#include "common.h"
3
+#include "x265.h"
4
+#include "arm64-utils.h"
5
+#include <arm_neon.h>
6
+
7
+#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
8
+namespace X265_NS
9
+{
10
+
11
+
12
+
13
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
14
+{
15
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
16
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
17
+
18
+    a0 = *(uint8x8_t *)(src + 0 * sstride);
19
+    a1 = *(uint8x8_t *)(src + 1 * sstride);
20
+    a2 = *(uint8x8_t *)(src + 2 * sstride);
21
+    a3 = *(uint8x8_t *)(src + 3 * sstride);
22
+    a4 = *(uint8x8_t *)(src + 4 * sstride);
23
+    a5 = *(uint8x8_t *)(src + 5 * sstride);
24
+    a6 = *(uint8x8_t *)(src + 6 * sstride);
25
+    a7 = *(uint8x8_t *)(src + 7 * sstride);
26
+
27
+    b0 = vtrn1_u32(a0, a4);
28
+    b1 = vtrn1_u32(a1, a5);
29
+    b2 = vtrn1_u32(a2, a6);
30
+    b3 = vtrn1_u32(a3, a7);
31
+    b4 = vtrn2_u32(a0, a4);
32
+    b5 = vtrn2_u32(a1, a5);
33
+    b6 = vtrn2_u32(a2, a6);
34
+    b7 = vtrn2_u32(a3, a7);
35
+
36
+    a0 = vtrn1_u16(b0, b2);
37
+    a1 = vtrn1_u16(b1, b3);
38
+    a2 = vtrn2_u16(b0, b2);
39
+    a3 = vtrn2_u16(b1, b3);
40
+    a4 = vtrn1_u16(b4, b6);
41
+    a5 = vtrn1_u16(b5, b7);
42
+    a6 = vtrn2_u16(b4, b6);
43
+    a7 = vtrn2_u16(b5, b7);
44
+
45
+    b0 = vtrn1_u8(a0, a1);
46
+    b1 = vtrn2_u8(a0, a1);
47
+    b2 = vtrn1_u8(a2, a3);
48
+    b3 = vtrn2_u8(a2, a3);
49
+    b4 = vtrn1_u8(a4, a5);
50
+    b5 = vtrn2_u8(a4, a5);
51
+    b6 = vtrn1_u8(a6, a7);
52
+    b7 = vtrn2_u8(a6, a7);
53
+
54
+    *(uint8x8_t *)(dst + 0 * dstride) = b0;
55
+    *(uint8x8_t *)(dst + 1 * dstride) = b1;
56
+    *(uint8x8_t *)(dst + 2 * dstride) = b2;
57
+    *(uint8x8_t *)(dst + 3 * dstride) = b3;
58
+    *(uint8x8_t *)(dst + 4 * dstride) = b4;
59
+    *(uint8x8_t *)(dst + 5 * dstride) = b5;
60
+    *(uint8x8_t *)(dst + 6 * dstride) = b6;
61
+    *(uint8x8_t *)(dst + 7 * dstride) = b7;
62
+}
63
+
64
+
65
+
66
+
67
+
68
+
69
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
70
+{
71
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
72
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
73
+    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
74
+    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
75
+
76
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
77
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
78
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
79
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
80
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
81
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
82
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
83
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
84
+    a8 = *(uint16x8_t *)(src + 8 * sstride);
85
+    a9 = *(uint16x8_t *)(src + 9 * sstride);
86
+    aA = *(uint16x8_t *)(src + 10 * sstride);
87
+    aB = *(uint16x8_t *)(src + 11 * sstride);
88
+    aC = *(uint16x8_t *)(src + 12 * sstride);
89
+    aD = *(uint16x8_t *)(src + 13 * sstride);
90
+    aE = *(uint16x8_t *)(src + 14 * sstride);
91
+    aF = *(uint16x8_t *)(src + 15 * sstride);
92
+
93
+    b0 = vtrn1q_u64(a0, a8);
94
+    b1 = vtrn1q_u64(a1, a9);
95
+    b2 = vtrn1q_u64(a2, aA);
96
+    b3 = vtrn1q_u64(a3, aB);
97
+    b4 = vtrn1q_u64(a4, aC);
98
+    b5 = vtrn1q_u64(a5, aD);
99
+    b6 = vtrn1q_u64(a6, aE);
100
+    b7 = vtrn1q_u64(a7, aF);
101
+    b8 = vtrn2q_u64(a0, a8);
102
+    b9 = vtrn2q_u64(a1, a9);
103
+    bA = vtrn2q_u64(a2, aA);
104
+    bB = vtrn2q_u64(a3, aB);
105
+    bC = vtrn2q_u64(a4, aC);
106
+    bD = vtrn2q_u64(a5, aD);
107
+    bE = vtrn2q_u64(a6, aE);
108
+    bF = vtrn2q_u64(a7, aF);
109
+
110
+    c0 = vtrn1q_u32(b0, b4);
111
+    c1 = vtrn1q_u32(b1, b5);
112
+    c2 = vtrn1q_u32(b2, b6);
113
+    c3 = vtrn1q_u32(b3, b7);
114
+    c4 = vtrn2q_u32(b0, b4);
115
+    c5 = vtrn2q_u32(b1, b5);
116
+    c6 = vtrn2q_u32(b2, b6);
117
+    c7 = vtrn2q_u32(b3, b7);
118
+    c8 = vtrn1q_u32(b8, bC);
119
+    c9 = vtrn1q_u32(b9, bD);
120
+    cA = vtrn1q_u32(bA, bE);
121
+    cB = vtrn1q_u32(bB, bF);
122
+    cC = vtrn2q_u32(b8, bC);
123
+    cD = vtrn2q_u32(b9, bD);
124
+    cE = vtrn2q_u32(bA, bE);
125
+    cF = vtrn2q_u32(bB, bF);
126
+
127
+    d0 = vtrn1q_u16(c0, c2);
128
+    d1 = vtrn1q_u16(c1, c3);
129
+    d2 = vtrn2q_u16(c0, c2);
130
+    d3 = vtrn2q_u16(c1, c3);
131
+    d4 = vtrn1q_u16(c4, c6);
132
+    d5 = vtrn1q_u16(c5, c7);
133
+    d6 = vtrn2q_u16(c4, c6);
134
+    d7 = vtrn2q_u16(c5, c7);
135
+    d8 = vtrn1q_u16(c8, cA);
136
+    d9 = vtrn1q_u16(c9, cB);
137
+    dA = vtrn2q_u16(c8, cA);
138
+    dB = vtrn2q_u16(c9, cB);
139
+    dC = vtrn1q_u16(cC, cE);
140
+    dD = vtrn1q_u16(cD, cF);
141
+    dE = vtrn2q_u16(cC, cE);
142
+    dF = vtrn2q_u16(cD, cF);
143
+
144
+    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);
145
+    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
146
+    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
147
+    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
148
+    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
149
+    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
150
+    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
151
+    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
152
+    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
153
+    *(uint16x8_t *)(dst + 9 * dstride)  = vtrn2q_u8(d8, d9);
154
+    *(uint16x8_t *)(dst + 10 * dstride)  = vtrn1q_u8(dA, dB);
155
+    *(uint16x8_t *)(dst + 11 * dstride)  = vtrn2q_u8(dA, dB);
156
+    *(uint16x8_t *)(dst + 12 * dstride)  = vtrn1q_u8(dC, dD);
157
+    *(uint16x8_t *)(dst + 13 * dstride)  = vtrn2q_u8(dC, dD);
158
+    *(uint16x8_t *)(dst + 14 * dstride)  = vtrn1q_u8(dE, dF);
159
+    *(uint16x8_t *)(dst + 15 * dstride)  = vtrn2q_u8(dE, dF);
160
+
161
+
162
+}
163
+
164
+
165
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
166
+{
167
+    //assumption: there is no partial overlap
168
+    transpose16x16(dst, src, dstride, sstride);
169
+    transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
170
+    if (dst == src)
171
+    {
172
+        uint8_t tmp[16 * 16] __attribute__((aligned(64)));
173
+        transpose16x16(tmp, src + 16, 16, sstride);
174
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
175
+        for (int i = 0; i < 16; i++)
176
+        {
177
+            COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
178
+        }
179
+    }
180
+    else
181
+    {
182
+        transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
183
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
184
+    }
185
+
186
+}
187
+
188
+
189
+
190
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
191
+{
192
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
193
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
194
+
195
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
196
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
197
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
198
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
199
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
200
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
201
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
202
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
203
+
204
+    b0 = vtrn1q_u64(a0, a4);
205
+    b1 = vtrn1q_u64(a1, a5);
206
+    b2 = vtrn1q_u64(a2, a6);
207
+    b3 = vtrn1q_u64(a3, a7);
208
+    b4 = vtrn2q_u64(a0, a4);
209
+    b5 = vtrn2q_u64(a1, a5);
210
+    b6 = vtrn2q_u64(a2, a6);
211
+    b7 = vtrn2q_u64(a3, a7);
212
+
213
+    a0 = vtrn1q_u32(b0, b2);
214
+    a1 = vtrn1q_u32(b1, b3);
215
+    a2 = vtrn2q_u32(b0, b2);
216
+    a3 = vtrn2q_u32(b1, b3);
217
+    a4 = vtrn1q_u32(b4, b6);
218
+    a5 = vtrn1q_u32(b5, b7);
219
+    a6 = vtrn2q_u32(b4, b6);
220
+    a7 = vtrn2q_u32(b5, b7);
221
+
222
+    b0 = vtrn1q_u16(a0, a1);
223
+    b1 = vtrn2q_u16(a0, a1);
224
+    b2 = vtrn1q_u16(a2, a3);
225
+    b3 = vtrn2q_u16(a2, a3);
226
+    b4 = vtrn1q_u16(a4, a5);
227
+    b5 = vtrn2q_u16(a4, a5);
228
+    b6 = vtrn1q_u16(a6, a7);
229
+    b7 = vtrn2q_u16(a6, a7);
230
+
231
+    *(uint16x8_t *)(dst + 0 * dstride) = b0;
232
+    *(uint16x8_t *)(dst + 1 * dstride) = b1;
233
+    *(uint16x8_t *)(dst + 2 * dstride) = b2;
234
+    *(uint16x8_t *)(dst + 3 * dstride) = b3;
235
+    *(uint16x8_t *)(dst + 4 * dstride) = b4;
236
+    *(uint16x8_t *)(dst + 5 * dstride) = b5;
237
+    *(uint16x8_t *)(dst + 6 * dstride) = b6;
238
+    *(uint16x8_t *)(dst + 7 * dstride) = b7;
239
+}
240
+
241
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
242
+{
243
+    //assumption: there is no partial overlap
244
+    transpose8x8(dst, src, dstride, sstride);
245
+    transpose8x8(dst + 8 * dstride + 8, src + 8 * sstride + 8, dstride, sstride);
246
+
247
+    if (dst == src)
248
+    {
249
+        uint16_t tmp[8 * 8];
250
+        transpose8x8(tmp, src + 8, 8, sstride);
251
+        transpose8x8(dst + 8, src + 8 * sstride, dstride, sstride);
252
+        for (int i = 0; i < 8; i++)
253
+        {
254
+            COPY_16(dst + (8 + i)*dstride, tmp + 8 * i);
255
+        }
256
+    }
257
+    else
258
+    {
259
+        transpose8x8(dst + 8 * dstride, src + 8, dstride, sstride);
260
+        transpose8x8(dst + 8, src + 8 * sstride, dstride, sstride);
261
+    }
262
+
263
+}
264
+
265
+
266
+
267
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
268
+{
269
+    //assumption: there is no partial overlap
270
+    for (int i = 0; i < 4; i++)
271
+    {
272
+        transpose8x8(dst + i * 8 * (1 + dstride), src + i * 8 * (1 + sstride), dstride, sstride);
273
+        for (int j = i + 1; j < 4; j++)
274
+        {
275
+            if (dst == src)
276
+            {
277
+                uint16_t tmp[8 * 8] __attribute__((aligned(64)));
278
+                transpose8x8(tmp, src + 8 * i + 8 * j * sstride, 8, sstride);
279
+                transpose8x8(dst + 8 * i + 8 * j * dstride, src + 8 * j + 8 * i * sstride, dstride, sstride);
280
+                for (int k = 0; k < 8; k++)
281
+                {
282
+                    COPY_16(dst + 8 * j + (8 * i + k)*dstride, tmp + 8 * k);
283
+                }
284
+            }
285
+            else
286
+            {
287
+                transpose8x8(dst + 8 * (j + i * dstride), src + 8 * (i + j * sstride), dstride, sstride);
288
+                transpose8x8(dst + 8 * (i + j * dstride), src + 8 * (j + i * sstride), dstride, sstride);
289
+            }
290
+
291
+        }
292
+    }
293
+}
294
+
295
+
296
+
297
+
298
+}
299
+
300
+
301
+
302
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h Added
17
 
1
@@ -0,0 +1,15 @@
2
+#ifndef __ARM64_UTILS_H__
3
+#define __ARM64_UTILS_H__
4
+
5
+
6
+namespace X265_NS
7
+{
8
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
9
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
10
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
11
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
12
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
13
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
14
+}
15
+
16
+#endif
17
x265_3.5.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp Changed
2102
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
4
  *          Yimeng Su <yimeng.su@huawei.com>
5
+ *          Sebastian Pop <spop@amazon.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -22,11 +23,659 @@
10
  * For more information, contact us at license @ x265.com.
11
  *****************************************************************************/
12
 
13
+
14
 #include "common.h"
15
 #include "primitives.h"
16
 #include "x265.h"
17
 #include "cpu.h"
18
 
19
+extern "C" {
20
+#include "fun-decls.h"
21
+}
22
+
23
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
24
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
25
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
26
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
27
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
28
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
29
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
30
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
31
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
32
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
33
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## neon)
34
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
35
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve)
36
+#define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
37
+#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)
38
+#define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
39
+
40
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
41
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
42
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
43
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
44
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
45
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
46
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
47
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
48
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
49
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
50
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
51
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
52
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
53
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
54
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
55
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
56
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
57
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
58
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
59
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
60
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
61
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
62
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
63
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
64
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
65
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
66
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
67
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
68
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
69
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu)
70
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
71
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
72
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
73
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
74
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
75
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
76
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
77
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
78
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
79
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
80
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
81
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
82
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
83
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
84
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
85
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
86
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
87
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
88
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
89
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
90
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
91
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
92
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
93
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
94
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
95
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
96
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
97
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
98
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
99
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
100
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
101
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
102
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
103
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
104
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
105
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
106
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
107
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
108
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
109
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
110
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
111
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
112
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
113
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve); \
114
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve); \
115
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
116
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve); \
117
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve); \
118
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
119
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
120
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
121
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
122
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
123
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
124
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
125
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
126
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
127
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
128
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
129
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
130
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
131
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
132
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
133
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
134
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
135
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
136
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
137
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
138
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
139
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
140
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
141
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
142
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
143
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
144
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
145
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
146
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
147
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
148
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
149
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
150
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
151
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
152
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
153
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
154
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
155
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve2); \
156
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
157
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
158
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
159
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
160
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
161
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve2); \
162
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
163
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## sve2); \
164
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## sve2); \
165
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## sve2); \
166
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve2); \
167
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## sve2); \
168
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve2); \
169
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
170
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve2); \
171
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
172
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
173
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
174
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
175
+    p.puLUMA_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
176
+    p.puLUMA_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
177
+    p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
178
+    p.puLUMA_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
179
+    p.puLUMA_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
180
+    p.puLUMA_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
181
+    p.puLUMA_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
182
+    p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
183
+    p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
184
+    p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
185
+    p.puLUMA_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
186
+    p.puLUMA_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
187
+    p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
188
+    p.puLUMA_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
189
+    p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
190
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
191
+    p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
192
+    p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
193
+    p.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
194
+    p.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
195
+    p.puLUMA_32x8.prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
196
+    p.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
197
+    p.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
198
+    p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
199
+    p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
200
+    p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
201
+#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
202
+#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
203
+#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
204
+#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
205
+#define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
206
+#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
207
+#define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
208
+#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
209
+#define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
210
+#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
211
+#define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
212
+
213
+
214
+#define ALL_LUMA_PU_T(prim, fname) \
215
+    p.puLUMA_4x4.prim   = fname<LUMA_4x4>; \
216
+    p.puLUMA_8x8.prim   = fname<LUMA_8x8>; \
217
+    p.puLUMA_16x16.prim = fname<LUMA_16x16>; \
218
+    p.puLUMA_32x32.prim = fname<LUMA_32x32>; \
219
+    p.puLUMA_64x64.prim = fname<LUMA_64x64>; \
220
+    p.puLUMA_8x4.prim   = fname<LUMA_8x4>; \
221
+    p.puLUMA_4x8.prim   = fname<LUMA_4x8>; \
222
+    p.puLUMA_16x8.prim  = fname<LUMA_16x8>; \
223
+    p.puLUMA_8x16.prim  = fname<LUMA_8x16>; \
224
+    p.puLUMA_16x32.prim = fname<LUMA_16x32>; \
225
+    p.puLUMA_32x16.prim = fname<LUMA_32x16>; \
226
+    p.puLUMA_64x32.prim = fname<LUMA_64x32>; \
227
+    p.puLUMA_32x64.prim = fname<LUMA_32x64>; \
228
+    p.puLUMA_16x12.prim = fname<LUMA_16x12>; \
229
+    p.puLUMA_12x16.prim = fname<LUMA_12x16>; \
230
+    p.puLUMA_16x4.prim  = fname<LUMA_16x4>; \
231
+    p.puLUMA_4x16.prim  = fname<LUMA_4x16>; \
232
+    p.puLUMA_32x24.prim = fname<LUMA_32x24>; \
233
+    p.puLUMA_24x32.prim = fname<LUMA_24x32>; \
234
+    p.puLUMA_32x8.prim  = fname<LUMA_32x8>; \
235
+    p.puLUMA_8x32.prim  = fname<LUMA_8x32>; \
236
+    p.puLUMA_64x48.prim = fname<LUMA_64x48>; \
237
+    p.puLUMA_48x64.prim = fname<LUMA_48x64>; \
238
+    p.puLUMA_64x16.prim = fname<LUMA_64x16>; \
239
+    p.puLUMA_16x64.prim = fname<LUMA_16x64>
240
+
241
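(Editorial note, not part of the patch.) ALL_LUMA_PU_T is the one variant that does not point at assembly: each slot receives an instantiation of a C++ function template (fname<LUMA_WxH>), which is how the mixed C/asm interp_8tap_hv_pp_cpu further down is hooked up for every luma partition. A hedged sketch of the same idea with illustrative names follows.

    // Illustrative partition enum and table; not the x265 definitions.
    enum MyPartition { MY_LUMA_8x8, MY_LUMA_16x16, MY_NUM_PARTITIONS };

    struct MySlot { void (*hv_filter)(const unsigned char *, unsigned char *); };

    // One template, instantiated per partition size, so the size is a
    // compile-time constant inside the function body.
    template<int size>
    void my_hv_filter(const unsigned char *src, unsigned char *dst)
    {
        (void)src; (void)dst; // real code would filter a size-specific block here
    }

    void setupTemplateSlots(MySlot (&p)[MY_NUM_PARTITIONS])
    {
        p[MY_LUMA_8x8].hv_filter   = my_hv_filter<MY_LUMA_8x8>;   // cf. fname<LUMA_8x8>
        p[MY_LUMA_16x16].hv_filter = my_hv_filter<MY_LUMA_16x16>; // cf. fname<LUMA_16x16>
    }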
+#define ALL_CHROMA_420_PU_TYPED(prim, fncdef, fname, cpu)               \
242
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
243
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
244
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
245
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
246
+    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## cpu); \
247
+    p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim   = fncdef PFX(fname ## _2x4_ ## cpu); \
248
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
249
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
250
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
251
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
252
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
253
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
254
+    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
255
+    p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim   = fncdef PFX(fname ## _6x8_ ## cpu); \
256
+    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
257
+    p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
258
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
259
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
260
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
261
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
262
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
263
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
264
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
265
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
266
+#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname)               \
267
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
268
+    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## neon); \
269
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
270
+    p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim   = fncdef PFX(fname ## _6x8_ ## neon); \
271
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
272
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
273
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## neon); \
274
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
275
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## neon); \
276
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
277
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
278
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
279
+    p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim   = fncdef PFX(fname ## _2x4_ ## neon); \
280
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
281
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
282
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
283
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
284
+    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(fname ## _8x6_ ## neon); \
285
+    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(fname ## _8x2_ ## neon); \
286
+    p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim   = fncdef PFX(fname ## _2x8_ ## neon); \
287
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
288
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon)
289
+#define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname)               \
290
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
291
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
292
+#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname)               \
293
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
294
+    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## neon); \
295
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
296
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
297
+#define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu)               \
298
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
299
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
300
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
301
+    p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim   = fncdef PFX(fname ## _2x4_ ## cpu); \
302
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
303
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
304
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
305
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
306
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
307
+    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
308
+    p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim   = fncdef PFX(fname ## _6x8_ ## cpu); \
309
+    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
310
+    p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
311
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
312
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
313
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
314
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
315
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
316
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
317
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
318
+#define CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, fncdef)               \
319
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
320
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
321
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
322
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
323
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
324
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
325
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
326
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
327
+    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(filterPixelToShort ## _8x6_ ## neon); \
328
+    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(filterPixelToShort ## _8x2_ ## neon); \
329
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
330
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
331
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
332
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
333
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
334
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon)
335
+#define CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef)               \
336
+    p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim   = fncdef PFX(filterPixelToShort ## _2x4_ ## sve); \
337
+    p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim   = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
338
+    p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim   = fncdef PFX(filterPixelToShort ## _6x8_ ## sve); \
339
+    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(filterPixelToShort ## _4x2_ ## sve); \
340
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
341
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
342
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
343
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve)
344
+#define ALL_CHROMA_420_PU(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
345
+#define CHROMA_420_PU_NEON_1(prim, fname) CHROMA_420_PU_TYPED_NEON_1(prim, , fname)
346
+#define CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
347
+#define CHROMA_420_PU_NEON_2(prim, fname) CHROMA_420_PU_TYPED_NEON_2(prim, , fname)
348
+#define CHROMA_420_PU_MULTIPLE_ARCHS(prim, fname, cpu) CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, , fname, cpu)
349
+#define CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(prim) CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, )
350
+#define CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
351
+
352
+
353
+#define ALL_CHROMA_420_4x4_PU_TYPED(prim, fncdef, fname, cpu) \
354
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
355
+    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
356
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
357
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
358
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
359
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
360
+    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
361
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
362
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
363
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
364
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
365
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
366
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
367
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
368
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
369
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
370
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
371
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
372
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
373
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
374
+#define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
375
+
376
+#define ALL_CHROMA_422_PU_TYPED(prim, fncdef, fname, cpu)               \
377
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
378
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
379
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
380
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
381
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
382
+    p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
383
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
384
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
385
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
386
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
387
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
388
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu); \
389
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim  = fncdef PFX(fname ## _8x12_ ## cpu); \
390
+    p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim  = fncdef PFX(fname ## _6x16_ ## cpu); \
391
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
392
+    p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim  = fncdef PFX(fname ## _2x16_ ## cpu); \
393
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(fname ## _16x24_ ## cpu); \
394
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(fname ## _12x32_ ## cpu); \
395
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
396
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim  = fncdef PFX(fname ## _4x32_ ## cpu); \
397
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(fname ## _32x48_ ## cpu); \
398
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(fname ## _24x64_ ## cpu); \
399
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
400
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim  = fncdef PFX(fname ## _8x64_ ## cpu)
401
+#define CHROMA_422_PU_TYPED_NEON_1(prim, fncdef, fname)               \
402
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
403
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
404
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
405
+    p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim  = fncdef PFX(fname ## _6x16_ ## neon); \
406
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(fname ## _12x32_ ## neon); \
407
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim  = fncdef PFX(fname ## _4x32_ ## neon); \
408
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
409
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
410
+    p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim   = fncdef PFX(fname ## _2x8_ ## neon); \
411
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
412
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
413
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
414
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon); \
415
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim  = fncdef PFX(fname ## _8x12_ ## neon); \
416
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
417
+    p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim  = fncdef PFX(fname ## _2x16_ ## neon); \
418
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(fname ## _16x24_ ## neon); \
419
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
420
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(fname ## _24x64_ ## neon); \
421
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim  = fncdef PFX(fname ## _8x64_ ## neon)
422
+#define CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname)               \
423
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
424
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
425
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(fname ## _32x48_ ## sve); \
426
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
427
+#define CHROMA_422_PU_TYPED_NEON_2(prim, fncdef, fname)               \
428
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
429
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
430
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
431
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim  = fncdef PFX(fname ## _4x32_ ## neon)
432
+#define CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname)               \
433
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
434
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
435
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
436
+    p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim   = fncdef PFX(fname ## _2x8_ ## sve2); \
437
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
438
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
439
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
440
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
441
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2); \
442
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim  = fncdef PFX(fname ## _8x12_ ## sve2); \
443
+    p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim  = fncdef PFX(fname ## _6x16_ ## sve2); \
444
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
445
+    p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim  = fncdef PFX(fname ## _2x16_ ## sve2); \
446
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(fname ## _16x24_ ## sve2); \
447
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(fname ## _12x32_ ## sve2); \
448
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
449
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(fname ## _32x48_ ## sve2); \
450
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(fname ## _24x64_ ## sve2); \
451
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
452
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim  = fncdef PFX(fname ## _8x64_ ## sve2)
453
+#define CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef)               \
454
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
455
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
456
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
457
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
458
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
459
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
460
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
461
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
462
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon); \
463
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim  = fncdef PFX(filterPixelToShort ## _8x12_ ## neon); \
464
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
465
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(filterPixelToShort ## _16x24_ ## neon); \
466
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(filterPixelToShort ## _12x32_ ## neon); \
467
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
468
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim  = fncdef PFX(filterPixelToShort ## _4x32_ ## neon); \
469
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(filterPixelToShort ## _24x64_ ## neon); \
470
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim  = fncdef PFX(filterPixelToShort ## _8x64_ ## neon)
471
+#define CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef)               \
472
+    p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim   = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
473
+    p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim  = fncdef PFX(filterPixelToShort ## _2x16_ ## sve); \
474
+    p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim  = fncdef PFX(filterPixelToShort ## _6x16_ ## sve); \
475
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
476
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
477
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(filterPixelToShort ## _32x48_ ## sve); \
478
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve)
479
+#define ALL_CHROMA_422_PU(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
480
+#define CHROMA_422_PU_NEON_1(prim, fname) CHROMA_422_PU_TYPED_NEON_1(prim, , fname)
481
+#define CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
482
+#define CHROMA_422_PU_NEON_2(prim, fname) CHROMA_422_PU_TYPED_NEON_2(prim, , fname)
483
+#define CHROMA_422_PU_CAN_USE_SVE2(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, , fname)
484
+#define CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
485
+#define CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
486
+
487
+#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
488
+    p.chromaX265_CSP_I444.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
489
+    p.chromaX265_CSP_I444.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
490
+    p.chromaX265_CSP_I444.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
491
+    p.chromaX265_CSP_I444.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
492
+    p.chromaX265_CSP_I444.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
493
+    p.chromaX265_CSP_I444.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
494
+    p.chromaX265_CSP_I444.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
495
+    p.chromaX265_CSP_I444.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
496
+    p.chromaX265_CSP_I444.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
497
+    p.chromaX265_CSP_I444.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
498
+    p.chromaX265_CSP_I444.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
499
+    p.chromaX265_CSP_I444.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
500
+    p.chromaX265_CSP_I444.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
501
+    p.chromaX265_CSP_I444.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
502
+    p.chromaX265_CSP_I444.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
503
+    p.chromaX265_CSP_I444.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
504
+    p.chromaX265_CSP_I444.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
505
+    p.chromaX265_CSP_I444.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
506
+    p.chromaX265_CSP_I444.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
507
+    p.chromaX265_CSP_I444.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
508
+    p.chromaX265_CSP_I444.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
509
+    p.chromaX265_CSP_I444.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
510
+    p.chromaX265_CSP_I444.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
511
+    p.chromaX265_CSP_I444.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
512
+    p.chromaX265_CSP_I444.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
513
+#define CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
514
+    p.chromaX265_CSP_I444.puLUMA_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
515
+    p.chromaX265_CSP_I444.puLUMA_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
516
+    p.chromaX265_CSP_I444.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
517
+    p.chromaX265_CSP_I444.puLUMA_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
518
+    p.chromaX265_CSP_I444.puLUMA_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
519
+    p.chromaX265_CSP_I444.puLUMA_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
520
+    p.chromaX265_CSP_I444.puLUMA_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
521
+    p.chromaX265_CSP_I444.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
522
+    p.chromaX265_CSP_I444.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
523
+    p.chromaX265_CSP_I444.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
524
+    p.chromaX265_CSP_I444.puLUMA_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
525
+    p.chromaX265_CSP_I444.puLUMA_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
526
+    p.chromaX265_CSP_I444.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
527
+    p.chromaX265_CSP_I444.puLUMA_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
528
+    p.chromaX265_CSP_I444.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
529
+#define CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
530
+    p.chromaX265_CSP_I444.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
531
+    p.chromaX265_CSP_I444.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
532
+    p.chromaX265_CSP_I444.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
533
+    p.chromaX265_CSP_I444.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
534
+    p.chromaX265_CSP_I444.puLUMA_32x8.prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
535
+    p.chromaX265_CSP_I444.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
536
+    p.chromaX265_CSP_I444.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
537
+    p.chromaX265_CSP_I444.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
538
+    p.chromaX265_CSP_I444.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
539
+    p.chromaX265_CSP_I444.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
540
+#define ALL_CHROMA_444_PU(prim, fname, cpu) ALL_CHROMA_444_PU_TYPED(prim, , fname, cpu)
541
+#define CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
542
+#define CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
543
+
544
+#define ALL_CHROMA_420_VERT_FILTERS(cpu)                             \
545
+    ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
546
+    ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, cpu); \
547
+    ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
548
+    ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, cpu)
549
+
550
+#define CHROMA_420_VERT_FILTERS_NEON()                             \
551
+    ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, neon)
552
+
553
+#define CHROMA_420_VERT_FILTERS_CAN_USE_SVE2()                             \
554
+    ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
555
+    ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, sve2); \
556
+    ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, sve2)
557
+
558
+#define SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(W, H) \
559
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vsp = PFX(interp_4tap_vert_sp_ ## W ## x ## H ## _ ## neon)
560
+
561
+#define SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(W, H, cpu) \
562
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vpp = PFX(interp_4tap_vert_pp_ ## W ## x ## H ## _ ## cpu); \
563
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vps = PFX(interp_4tap_vert_ps_ ## W ## x ## H ## _ ## cpu); \
564
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vss = PFX(interp_4tap_vert_ss_ ## W ## x ## H ## _ ## cpu)
565
+
566
+#define CHROMA_422_VERT_FILTERS_NEON() \
567
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 8); \
568
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 16); \
569
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 8); \
570
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 16); \
571
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 12); \
572
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 4); \
573
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 32); \
574
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 16); \
575
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 32); \
576
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 24); \
577
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(12, 32); \
578
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 8); \
579
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 32); \
580
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 64); \
581
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 32); \
582
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 64); \
583
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 48); \
584
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(24, 64); \
585
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 16); \
586
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 64)
587
+
588
+#define CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(cpu) \
589
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 8, cpu); \
590
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 16, cpu); \
591
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 8, cpu); \
592
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 16, cpu); \
593
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 12, cpu); \
594
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 4, cpu); \
595
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 32, cpu); \
596
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 16, cpu); \
597
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 32, cpu); \
598
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 24, cpu); \
599
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(12, 32, cpu); \
600
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 8, cpu); \
601
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 32, cpu); \
602
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 64, cpu); \
603
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 32, cpu); \
604
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 64, cpu); \
605
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 48, cpu); \
606
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(24, 64, cpu); \
607
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 16, cpu); \
608
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 64, cpu)
609
+
610
+#define ALL_CHROMA_444_VERT_FILTERS(cpu) \
611
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
612
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu); \
613
+    ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
614
+    ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, cpu)
615
+
616
+#define CHROMA_444_VERT_FILTERS_NEON() \
617
+    ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, neon)
618
+
619
+#define CHROMA_444_VERT_FILTERS_CAN_USE_SVE2() \
620
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
621
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sve2); \
622
+    ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, sve2)
623
+
624
+#define ALL_CHROMA_420_FILTERS(cpu)                               \
625
+    ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
626
+    ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
627
+    ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, cpu);  \
628
+    ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, cpu)
629
+
630
+#define CHROMA_420_FILTERS_NEON()                               \
631
+    ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
632
+    ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, neon)
633
+
634
+#define CHROMA_420_FILTERS_CAN_USE_SVE2()                               \
635
+    ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, sve2);  \
636
+    ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, sve2)
637
+
638
+#define ALL_CHROMA_422_FILTERS(cpu) \
639
+    ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
640
+    ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
641
+    ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, cpu);  \
642
+    ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, cpu)
643
+
644
+#define CHROMA_422_FILTERS_NEON() \
645
+    ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
646
+    ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, neon)
647
+
648
+#define CHROMA_422_FILTERS_CAN_USE_SVE2() \
649
+    ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, sve2);  \
650
+    ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, sve2)
651
+
652
+#define ALL_CHROMA_444_FILTERS(cpu) \
653
+    ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
654
+    ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
655
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu);  \
656
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu)
657
+
658
+#define CHROMA_444_FILTERS_NEON() \
659
+    ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
660
+    ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, neon)
661
+
662
+#define CHROMA_444_FILTERS_CAN_USE_SVE2() \
663
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sve2);  \
664
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sve2)
665
+
666
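(Editorial note, not part of the patch.) The *_NEON, *_CAN_USE_SVE2 and *_SVE_FILTER_PIXEL_TO_SHORT splits above exist so that, at init time, the full Neon table can be installed first and only the block sizes that actually have SVE/SVE2 kernels are overwritten when the CPU reports those features. A minimal sketch of that override order is below; the feature bits and function names are illustrative, not the x265 cpu-mask API.

    #include <cstdint>

    struct MyFilterSlots { void (*vert_pp_8x8)(); void (*vert_pp_32x32)(); };

    extern "C" void my_vert_pp_8x8_neon()   {}
    extern "C" void my_vert_pp_32x32_neon() {}
    extern "C" void my_vert_pp_32x32_sve2() {} // only the wide block has an SVE2 kernel

    enum { MY_CPU_NEON = 1u << 0, MY_CPU_SVE2 = 1u << 1 };

    void setupMyFilters(MyFilterSlots &p, uint32_t cpuMask)
    {
        if (cpuMask & MY_CPU_NEON)
        {
            p.vert_pp_8x8   = my_vert_pp_8x8_neon;    // baseline: everything Neon
            p.vert_pp_32x32 = my_vert_pp_32x32_neon;
        }
        if (cpuMask & MY_CPU_SVE2)
        {
            p.vert_pp_32x32 = my_vert_pp_32x32_sve2;  // selective override
        }
    }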
 
667
 #if defined(__GNUC__)
668
 #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
669
@@ -35,18 +684,19 @@
670
 #define GCC_4_9_0 40900
671
 #define GCC_5_1_0 50100
672
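(Editorial note, not part of the patch.) GCC_VERSION packs major/minor/patch into one integer so that compiler-version gates reduce to a plain comparison against constants like GCC_4_9_0 and GCC_5_1_0. A worked example, assuming a hypothetical GCC 8.3.0 build:

    // 8 * 10000 + 3 * 100 + 0 = 80300, which is >= GCC_5_1_0 (50100),
    // so any "GCC_VERSION < GCC_5_1_0" guard evaluates false.
    static_assert(8 * 10000 + 3 * 100 + 0 == 80300, "worked example");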
 
673
-extern "C" {
674
-#include "pixel.h"
675
-#include "pixel-util.h"
676
-#include "ipfilter8.h"
677
-}
678
+#include "pixel-prim.h"
679
+#include "filter-prim.h"
680
+#include "dct-prim.h"
681
+#include "loopfilter-prim.h"
682
+#include "intrapred-prim.h"
683
 
684
-namespace X265_NS {
685
+namespace X265_NS
686
+{
687
 // private x265 namespace
688
 
689
 
690
 template<int size>
691
-void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
692
+void interp_8tap_hv_pp_cpu(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
693
 {
694
     ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
695
     const int halfFilterSize = NTAPS_LUMA >> 1;
696
@@ -56,164 +706,1259 @@
697
     primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
698
 }
699
 
700
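(Editorial note, not part of the patch.) interp_8tap_hv_pp_cpu above is the classic separable two-pass interpolation: a horizontal pass writes 16-bit intermediates into a stack buffer with extra rows of vertical margin for the filter taps, then a vertical pass reads those intermediates back and produces pixels. A hedged sketch of that structure is below; the margin handling and the filters themselves are simplified, and all names are illustrative rather than the x265 primitives.

    #include <cstdint>
    #include <cstddef>

    namespace hv_sketch {

    const int kTaps  = 8;   // 8-tap luma filter, as with NTAPS_LUMA
    const int kBlock = 16;  // block size used by this sketch

    // Horizontal pass: widen/filter into a 16-bit intermediate buffer,
    // including (kTaps - 1) extra rows of vertical margin for the second pass.
    void horiz_ps(const uint8_t *src, std::ptrdiff_t srcStride,
                  int16_t *dst, std::ptrdiff_t dstStride, int rows)
    {
        for (int y = 0; y < rows; y++)
            for (int x = 0; x < kBlock; x++)
                dst[y * dstStride + x] = src[y * srcStride + x]; // real code filters here
    }

    // Vertical pass: filter the intermediates and clip back to pixels.
    void vert_sp(const int16_t *src, std::ptrdiff_t srcStride,
                 uint8_t *dst, std::ptrdiff_t dstStride)
    {
        for (int y = 0; y < kBlock; y++)
            for (int x = 0; x < kBlock; x++)
                dst[y * dstStride + x] = (uint8_t)src[y * srcStride + x];
    }

    void hv_pp(const uint8_t *src, std::ptrdiff_t srcStride,
               uint8_t *dst, std::ptrdiff_t dstStride)
    {
        const int half = kTaps >> 1;
        int16_t immed[kBlock * (kBlock + kTaps - 1)];
        // Start (half - 1) rows above the block so the vertical filter has its taps.
        horiz_ps(src - (half - 1) * srcStride, srcStride, immed, kBlock, kBlock + kTaps - 1);
        vert_sp(immed + (half - 1) * kBlock, kBlock, dst, dstStride);
    }

    } // namespace hv_sketch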
-
701
-/* Temporary workaround because luma_vsp assembly primitive has not been completed
702
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
703
- * Otherwise, segment fault occurs. */
704
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)
705
+void setupNeonPrimitives(EncoderPrimitives &p)
706
 {
707
-    if (cpuMask & X265_CPU_NEON)
708
-    {
709
-        asmp.puLUMA_8x4.luma_vsp   = cp.puLUMA_8x4.luma_vsp;
710
-        asmp.puLUMA_8x8.luma_vsp   = cp.puLUMA_8x8.luma_vsp;
711
-        asmp.puLUMA_8x16.luma_vsp  = cp.puLUMA_8x16.luma_vsp;
712
-        asmp.puLUMA_8x32.luma_vsp  = cp.puLUMA_8x32.luma_vsp;
713
-        asmp.puLUMA_12x16.luma_vsp = cp.puLUMA_12x16.luma_vsp;
714
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
715
-        asmp.puLUMA_16x4.luma_vsp  = cp.puLUMA_16x4.luma_vsp;
716
-        asmp.puLUMA_16x8.luma_vsp  = cp.puLUMA_16x8.luma_vsp;
717
-        asmp.puLUMA_16x12.luma_vsp = cp.puLUMA_16x12.luma_vsp;
718
-        asmp.puLUMA_16x16.luma_vsp = cp.puLUMA_16x16.luma_vsp;
719
-        asmp.puLUMA_16x32.luma_vsp = cp.puLUMA_16x32.luma_vsp;
720
-        asmp.puLUMA_16x64.luma_vsp = cp.puLUMA_16x64.luma_vsp;
721
-        asmp.puLUMA_32x16.luma_vsp = cp.puLUMA_32x16.luma_vsp;
722
-        asmp.puLUMA_32x24.luma_vsp = cp.puLUMA_32x24.luma_vsp;
723
-        asmp.puLUMA_32x32.luma_vsp = cp.puLUMA_32x32.luma_vsp;
724
-        asmp.puLUMA_32x64.luma_vsp = cp.puLUMA_32x64.luma_vsp;
725
-        asmp.puLUMA_48x64.luma_vsp = cp.puLUMA_48x64.luma_vsp;
726
-        asmp.puLUMA_64x16.luma_vsp = cp.puLUMA_64x16.luma_vsp;
727
-        asmp.puLUMA_64x32.luma_vsp = cp.puLUMA_64x32.luma_vsp;
728
-        asmp.puLUMA_64x48.luma_vsp = cp.puLUMA_64x48.luma_vsp;
729
-        asmp.puLUMA_64x64.luma_vsp = cp.puLUMA_64x64.luma_vsp;    
730
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
731
-        asmp.puLUMA_4x4.luma_vsp   = cp.puLUMA_4x4.luma_vsp;
732
-        asmp.puLUMA_4x8.luma_vsp   = cp.puLUMA_4x8.luma_vsp;
733
-        asmp.puLUMA_4x16.luma_vsp  = cp.puLUMA_4x16.luma_vsp;
734
-        asmp.puLUMA_24x32.luma_vsp = cp.puLUMA_24x32.luma_vsp;
735
-        asmp.puLUMA_32x8.luma_vsp  = cp.puLUMA_32x8.luma_vsp;
736
+    setupPixelPrimitives_neon(p);
737
+    setupFilterPrimitives_neon(p);
738
+    setupDCTPrimitives_neon(p);
739
+    setupLoopFilterPrimitives_neon(p);
740
+    setupIntraPrimitives_neon(p);
741
+
742
+    ALL_CHROMA_420_PU(p2sNONALIGNED, filterPixelToShort, neon);
743
+    ALL_CHROMA_422_PU(p2sALIGNED, filterPixelToShort, neon);
744
+    ALL_CHROMA_444_PU(p2sALIGNED, filterPixelToShort, neon);
745
+    ALL_LUMA_PU(convert_p2sALIGNED, filterPixelToShort, neon);
746
+    ALL_CHROMA_420_PU(p2sALIGNED, filterPixelToShort, neon);
747
+    ALL_CHROMA_422_PU(p2sNONALIGNED, filterPixelToShort, neon);
748
+    ALL_CHROMA_444_PU(p2sNONALIGNED, filterPixelToShort, neon);
749
+    ALL_LUMA_PU(convert_p2sNONALIGNED, filterPixelToShort, neon);
750
+
751
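(Editorial note, not part of the patch.) Several primitives above appear twice, once with an ALIGNED and once with a NONALIGNED suffix: the table keeps separate slots so callers with known-aligned buffers can be routed to a faster kernel where one exists, and in this Neon setup both slots are simply wired to the same routine. A small illustrative sketch, with names that are not the x265 API:

    #include <cstdint>

    extern "C" void my_p2s_16x16_neon(const uint8_t *, int16_t *) {}

    struct MyP2SSlots
    {
        // [0] = NONALIGNED, [1] = ALIGNED, mirroring the two suffixes above.
        void (*p2s[2])(const uint8_t *, int16_t *);
    };

    void wireP2S(MyP2SSlots &s)
    {
        s.p2s[0] = my_p2s_16x16_neon; // NONALIGNED slot
        s.p2s[1] = my_p2s_16x16_neon; // ALIGNED slot shares the same Neon kernel
    }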
+#if !HIGH_BIT_DEPTH
752
+    ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
753
+    ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
754
+    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
755
+    ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
756
+    ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
757
+    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
758
+    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
759
+    ALL_CHROMA_420_VERT_FILTERS(neon);
760
+    CHROMA_422_VERT_FILTERS_NEON();
761
+    CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
762
+    ALL_CHROMA_444_VERT_FILTERS(neon);
763
+    ALL_CHROMA_420_FILTERS(neon);
764
+    ALL_CHROMA_422_FILTERS(neon);
765
+    ALL_CHROMA_444_FILTERS(neon);
766
+
767
+    // Blockcopy_pp
768
+    ALL_LUMA_PU(copy_pp, blockcopy_pp, neon);
769
+    ALL_CHROMA_420_PU(copy_pp, blockcopy_pp, neon);
770
+    ALL_CHROMA_422_PU(copy_pp, blockcopy_pp, neon);
771
+    p.cuBLOCK_4x4.copy_pp   = PFX(blockcopy_pp_4x4_neon);
772
+    p.cuBLOCK_8x8.copy_pp   = PFX(blockcopy_pp_8x8_neon);
773
+    p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
774
+    p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_neon);
775
+    p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_neon);
776
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
777
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
778
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
779
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_neon);
780
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
781
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
782
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
783
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_neon);
784
+
785
+#endif // !HIGH_BIT_DEPTH
786
+
787
+    // Blockcopy_ss
788
+    p.cuBLOCK_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
789
+    p.cuBLOCK_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
790
+    p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_neon);
791
+    p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_neon);
792
+    p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_neon);
793
+
794
+    // Blockcopy_ps
795
+    p.cuBLOCK_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
796
+    p.cuBLOCK_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
797
+    p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_neon);
798
+    p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_neon);
799
+    p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_neon);
800
+
801
+    // Blockcopy_sp
802
+    p.cuBLOCK_4x4.copy_sp   = PFX(blockcopy_sp_4x4_neon);
803
+    p.cuBLOCK_8x8.copy_sp   = PFX(blockcopy_sp_8x8_neon);
804
+    p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_neon);
805
+    p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_neon);
806
+    p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
807
+
808
+    // chroma blockcopy_ss
809
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
810
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
811
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_neon);
812
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_neon);
813
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss   = PFX(blockcopy_ss_4x8_neon);
814
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss  = PFX(blockcopy_ss_8x16_neon);
815
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_neon);
816
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_neon);
817
+
818
+    // chroma blockcopy_ps
819
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
820
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
821
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_neon);
822
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_neon);
823
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps   = PFX(blockcopy_ps_4x8_neon);
824
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps  = PFX(blockcopy_ps_8x16_neon);
825
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_neon);
826
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_neon);
827
+
828
+    // chroma blockcopy_sp
829
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp   = PFX(blockcopy_sp_4x4_neon);
830
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp   = PFX(blockcopy_sp_8x8_neon);
831
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_neon);
832
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_neon);
833
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp   = PFX(blockcopy_sp_4x8_neon);
834
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp  = PFX(blockcopy_sp_8x16_neon);
835
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_neon);
836
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_neon);
837
+
838
+    // Block_fill
839
+    ALL_LUMA_TU(blockfill_sALIGNED, blockfill_s, neon);
840
+    ALL_LUMA_TU(blockfill_sNONALIGNED, blockfill_s, neon);
841
+
842
+    // copy_count
843
+    p.cuBLOCK_4x4.copy_cnt     = PFX(copy_cnt_4_neon);
844
+    p.cuBLOCK_8x8.copy_cnt     = PFX(copy_cnt_8_neon);
845
+    p.cuBLOCK_16x16.copy_cnt   = PFX(copy_cnt_16_neon);
846
+    p.cuBLOCK_32x32.copy_cnt   = PFX(copy_cnt_32_neon);
847
+
848
+    // count nonzero
849
+    p.cuBLOCK_4x4.count_nonzero     = PFX(count_nonzero_4_neon);
850
+    p.cuBLOCK_8x8.count_nonzero     = PFX(count_nonzero_8_neon);
851
+    p.cuBLOCK_16x16.count_nonzero   = PFX(count_nonzero_16_neon);
852
+    p.cuBLOCK_32x32.count_nonzero   = PFX(count_nonzero_32_neon);
853
+
854
+    // cpy2Dto1D_shl
855
+    p.cuBLOCK_4x4.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_4x4_neon);
856
+    p.cuBLOCK_8x8.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_8x8_neon);
857
+    p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon);
858
+    p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon);
859
+    p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon);
860
+
861
+    // cpy2Dto1D_shr
862
+    p.cuBLOCK_4x4.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
863
+    p.cuBLOCK_8x8.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
864
+    p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
865
+    p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
866
+
867
+    // cpy1Dto2D_shl
868
+    p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_4x4_neon);
869
+    p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_8x8_neon);
870
+    p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_16x16_neon);
871
+    p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_32x32_neon);
872
+    p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_64x64_neon);
873
+
874
+    p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_4x4_neon);
875
+    p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_8x8_neon);
876
+    p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_neon);
877
+    p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_neon);
878
+    p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_neon);
879
+
880
+    // cpy1Dto2D_shr
881
+    p.cuBLOCK_4x4.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
882
+    p.cuBLOCK_8x8.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
883
+    p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon);
884
+    p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
885
+    p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
886
+
887
+#if !HIGH_BIT_DEPTH
888
+    // pixel_avg_pp
889
+    ALL_LUMA_PU(pixelavg_ppNONALIGNED, pixel_avg_pp, neon);
890
+    ALL_LUMA_PU(pixelavg_ppALIGNED, pixel_avg_pp, neon);
891
+
892
+    // addAvg
893
+    ALL_LUMA_PU(addAvgNONALIGNED, addAvg, neon);
894
+    ALL_LUMA_PU(addAvgALIGNED, addAvg, neon);
895
+    ALL_CHROMA_420_PU(addAvgNONALIGNED, addAvg, neon);
896
+    ALL_CHROMA_422_PU(addAvgNONALIGNED, addAvg, neon);
897
+    ALL_CHROMA_420_PU(addAvgALIGNED, addAvg, neon);
898
+    ALL_CHROMA_422_PU(addAvgALIGNED, addAvg, neon);
899
+
900
+    // sad
901
+    ALL_LUMA_PU(sad, pixel_sad, neon);
902
+    ALL_LUMA_PU(sad_x3, sad_x3, neon);
903
+    ALL_LUMA_PU(sad_x4, sad_x4, neon);
904
+
905
+    // sse_pp
906
+    p.cuBLOCK_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_neon);
907
+    p.cuBLOCK_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
908
+    p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
909
+    p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
910
+    p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_neon);
911
+
912
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_neon);
913
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
914
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
915
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
916
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp   = PFX(pixel_sse_pp_4x8_neon);
917
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp  = PFX(pixel_sse_pp_8x16_neon);
918
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
919
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_neon);
920
+
921
+    // sse_ss
922
+    p.cuBLOCK_4x4.sse_ss   = PFX(pixel_sse_ss_4x4_neon);
923
+    p.cuBLOCK_8x8.sse_ss   = PFX(pixel_sse_ss_8x8_neon);
924
+    p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_neon);
925
+    p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_neon);
926
+    p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_neon);
927
+
928
+    // ssd_s
929
+    p.cuBLOCK_4x4.ssd_sNONALIGNED   = PFX(pixel_ssd_s_4x4_neon);
930
+    p.cuBLOCK_8x8.ssd_sNONALIGNED   = PFX(pixel_ssd_s_8x8_neon);
931
+    p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_neon);
932
+    p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_neon);
933
+
934
+    p.cuBLOCK_4x4.ssd_sALIGNED   = PFX(pixel_ssd_s_4x4_neon);
935
+    p.cuBLOCK_8x8.ssd_sALIGNED   = PFX(pixel_ssd_s_8x8_neon);
936
+    p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_neon);
937
+    p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_neon);
938
+
939
+    // pixel_var
940
+    p.cuBLOCK_8x8.var   = PFX(pixel_var_8x8_neon);
941
+    p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_neon);
942
+    p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_neon);
943
+    p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_neon);
944
+
945
+    // calc_Residual
946
+    p.cuBLOCK_4x4.calcresidualNONALIGNED   = PFX(getResidual4_neon);
947
+    p.cuBLOCK_8x8.calcresidualNONALIGNED   = PFX(getResidual8_neon);
948
+    p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_neon);
949
+    p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_neon);
950
+
951
+    p.cuBLOCK_4x4.calcresidualALIGNED   = PFX(getResidual4_neon);
952
+    p.cuBLOCK_8x8.calcresidualALIGNED   = PFX(getResidual8_neon);
953
+    p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_neon);
954
+    p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_neon);
955
+
956
+    // pixel_sub_ps
957
+    p.cuBLOCK_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
958
+    p.cuBLOCK_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
959
+    p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
960
+    p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
961
+    p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_neon);
962
+
963
+    // chroma sub_ps
964
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
965
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
966
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
967
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
968
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps   = PFX(pixel_sub_ps_4x8_neon);
969
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps  = PFX(pixel_sub_ps_8x16_neon);
970
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
971
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_neon);
972
+
973
+    // pixel_add_ps
974
+    p.cuBLOCK_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_neon);
975
+    p.cuBLOCK_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_neon);
976
+    p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
977
+    p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
978
+    p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_neon);
979
+
980
+    p.cuBLOCK_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_neon);
981
+    p.cuBLOCK_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_neon);
982
+    p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
983
+    p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
984
+    p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_neon);
985
+
986
+    // chroma add_ps
987
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_neon);
988
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_neon);
989
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
990
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
991
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED   = PFX(pixel_add_ps_4x8_neon);
992
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED  = PFX(pixel_add_ps_8x16_neon);
993
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_neon);
994
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_neon);
995
+
996
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_neon);
997
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_neon);
998
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
999
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
1000
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED   = PFX(pixel_add_ps_4x8_neon);
1001
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED  = PFX(pixel_add_ps_8x16_neon);
1002
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_neon);
1003
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_neon);
1004
+
1005
+    //scale2D_64to32
1006
+    p.scale2D_64to32  = PFX(scale2D_64to32_neon);
1007
+
1008
+    // scale1D_128to64
1009
+    p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_neon);
1010
+    p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_neon);
1011
+
1012
+    // planecopy
1013
+    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
1014
+
1015
+    // satd
1016
+    ALL_LUMA_PU(satd, pixel_satd, neon);
1017
+
1018
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd   = PFX(pixel_satd_4x4_neon);
1019
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd   = PFX(pixel_satd_8x8_neon);
1020
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
1021
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
1022
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd   = PFX(pixel_satd_8x4_neon);
1023
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd   = PFX(pixel_satd_4x8_neon);
1024
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd  = PFX(pixel_satd_16x8_neon);
1025
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd  = PFX(pixel_satd_8x16_neon);
1026
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
1027
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
1028
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
1029
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
1030
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd  = PFX(pixel_satd_16x4_neon);
1031
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd  = PFX(pixel_satd_4x16_neon);
1032
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
1033
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
1034
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd  = PFX(pixel_satd_32x8_neon);
1035
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd  = PFX(pixel_satd_8x32_neon);
1036
+
1037
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd   = PFX(pixel_satd_4x8_neon);
1038
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd  = PFX(pixel_satd_8x16_neon);
1039
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
1040
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
1041
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd   = PFX(pixel_satd_4x4_neon);
1042
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd   = PFX(pixel_satd_8x8_neon);
1043
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd  = PFX(pixel_satd_4x16_neon);
1044
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
1045
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd  = PFX(pixel_satd_8x32_neon);
1046
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
1047
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
1048
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd  = PFX(pixel_satd_8x12_neon);
1049
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd   = PFX(pixel_satd_8x4_neon);
1050
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
1051
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
1052
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd  = PFX(pixel_satd_16x8_neon);
1053
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd  = PFX(pixel_satd_4x32_neon);
1054
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
1055
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
1056
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
1057
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd  = PFX(pixel_satd_8x64_neon);
1058
+
1059
+    // sa8d
1060
+    p.cuBLOCK_4x4.sa8d   = PFX(pixel_satd_4x4_neon);
1061
+    p.cuBLOCK_8x8.sa8d   = PFX(pixel_sa8d_8x8_neon);
1062
+    p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1063
+    p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1064
+    p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1065
+    p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_neon);
1066
+    p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1067
+    p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1068
+    p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1069
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
1070
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
1071
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
1072
+
1073
+    // dequant_scaling
1074
+    p.dequant_scaling = PFX(dequant_scaling_neon);
1075
+    p.dequant_normal  = PFX(dequant_normal_neon);
1076
+
1077
+    // ssim_4x4x2_core
1078
+    p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
1079
+
1080
+    // ssimDist
1081
+    p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_neon);
1082
+    p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_neon);
1083
+    p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_neon);
1084
+    p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_neon);
1085
+    p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_neon);
1086
+
1087
+    // normFact
1088
+    p.cuBLOCK_8x8.normFact = PFX(normFact8_neon);
1089
+    p.cuBLOCK_16x16.normFact = PFX(normFact16_neon);
1090
+    p.cuBLOCK_32x32.normFact = PFX(normFact32_neon);
1091
+    p.cuBLOCK_64x64.normFact = PFX(normFact64_neon);
1092
+
1093
+    // psy_cost_pp
1094
+    p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1095
+
1096
+    p.weight_pp = PFX(weight_pp_neon);
1097
+#if !defined(__APPLE__)
1098
+    p.scanPosLast = PFX(scanPosLast_neon);
1099
 #endif
1100
+    p.costCoeffNxN = PFX(costCoeffNxN_neon);
1101
 #endif
1102
-    }
1103
-}
1104
 
1105
+    // quant
1106
+    p.quant = PFX(quant_neon);
1107
+    p.nquant = PFX(nquant_neon);
1108
+}
1109
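The assignments above fill x265's EncoderPrimitives function-pointer table with AArch64 NEON kernels: each entry maps a partition or CU size to an assembly routine wrapped in the PFX() macro, and in this hunk the ALIGNED and NONALIGNED slots are both pointed at the same NEON implementation. The following is only a minimal sketch of that dispatch pattern; the struct layout, enum values, PFX expansion, and kernel bodies are simplified placeholders, not the actual x265 definitions.

#include <cstdint>
#include <cstdio>

// Stand-in for x265's PFX() macro, assumed here to prepend an internal
// symbol prefix to a kernel name; the real expansion differs.
#define PFX(fn) demo_##fn

// One primitive signature: SATD cost of two pixel blocks.
typedef int (*satd_t)(const uint8_t* a, intptr_t strideA,
                      const uint8_t* b, intptr_t strideB);

enum { BLOCK_4x4, NUM_BLOCKS };          // simplified block-size index
enum { NONALIGNED, ALIGNED, NUM_ALIGN }; // aligned/unaligned variants

struct EncoderPrimitivesSketch
{
    struct CUPrimitives { satd_t satd[NUM_ALIGN]; };
    CUPrimitives cu[NUM_BLOCKS];
};

// Portable C fallback and a stand-in for a NEON kernel.
static int demo_satd_4x4_c(const uint8_t*, intptr_t, const uint8_t*, intptr_t)    { return 1; }
static int demo_satd_4x4_neon(const uint8_t*, intptr_t, const uint8_t*, intptr_t) { return 2; }

// C fallbacks are registered first ...
static void setupCPrimitives(EncoderPrimitivesSketch& p)
{
    p.cu[BLOCK_4x4].satd[NONALIGNED] = PFX(satd_4x4_c);
    p.cu[BLOCK_4x4].satd[ALIGNED]    = PFX(satd_4x4_c);
}

// ... then overwritten with NEON kernels, reusing the same routine for both
// alignment slots, mirroring the registrations in the hunk above.
static void setupNeonPrimitivesSketch(EncoderPrimitivesSketch& p)
{
    p.cu[BLOCK_4x4].satd[NONALIGNED] = PFX(satd_4x4_neon);
    p.cu[BLOCK_4x4].satd[ALIGNED]    = PFX(satd_4x4_neon);
}

int main()
{
    EncoderPrimitivesSketch p;
    setupCPrimitives(p);
    setupNeonPrimitivesSketch(p);
    uint8_t blk[16] = {0};
    std::printf("%d\n", p.cu[BLOCK_4x4].satd[ALIGNED](blk, 4, blk, 4)); // prints 2
    return 0;
}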
 
1110
-void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) 
1111
+#if defined(HAVE_SVE2) || defined(HAVE_SVE)
1112
+void setupSvePrimitives(EncoderPrimitives &p)
1113
 {
1114
-    if (cpuMask & X265_CPU_NEON)
1115
-    {
1116
-        p.puLUMA_4x4.satd   = PFX(pixel_satd_4x4_neon);
1117
-        p.puLUMA_4x8.satd   = PFX(pixel_satd_4x8_neon);
1118
-        p.puLUMA_4x16.satd  = PFX(pixel_satd_4x16_neon);
1119
-        p.puLUMA_8x4.satd   = PFX(pixel_satd_8x4_neon);
1120
-        p.puLUMA_8x8.satd   = PFX(pixel_satd_8x8_neon);
1121
-        p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1122
-        
1123
-        p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd    = PFX(pixel_satd_4x4_neon);
1124
-        p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd    = PFX(pixel_satd_4x8_neon);
1125
-        p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd   = PFX(pixel_satd_4x16_neon);
1126
-        p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd    = PFX(pixel_satd_8x4_neon);
1127
-        p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd    = PFX(pixel_satd_8x8_neon);
1128
-        p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd  = PFX(pixel_satd_12x16_neon);
1129
-        
1130
-        p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd    = PFX(pixel_satd_4x4_neon);
1131
-        p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd    = PFX(pixel_satd_4x8_neon);
1132
-        p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd   = PFX(pixel_satd_4x16_neon);
1133
-        p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd   = PFX(pixel_satd_4x32_neon);
1134
-        p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd    = PFX(pixel_satd_8x4_neon);
1135
-        p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd    = PFX(pixel_satd_8x8_neon);
1136
-        p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd  = PFX(pixel_satd_12x32_neon);
1137
-
1138
-        p.puLUMA_4x4.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_4x4_neon);
1139
-        p.puLUMA_4x8.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_4x8_neon);
1140
-        p.puLUMA_4x16.pixelavg_ppNONALIGNED  = PFX(pixel_avg_pp_4x16_neon);
1141
-        p.puLUMA_8x4.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_8x4_neon);
1142
-        p.puLUMA_8x8.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_8x8_neon);
1143
-        p.puLUMA_8x16.pixelavg_ppNONALIGNED  = PFX(pixel_avg_pp_8x16_neon);
1144
-        p.puLUMA_8x32.pixelavg_ppNONALIGNED  = PFX(pixel_avg_pp_8x32_neon);
1145
-
1146
-        p.puLUMA_4x4.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_4x4_neon);
1147
-        p.puLUMA_4x8.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_4x8_neon);
1148
-        p.puLUMA_4x16.pixelavg_ppALIGNED  = PFX(pixel_avg_pp_4x16_neon);
1149
-        p.puLUMA_8x4.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_8x4_neon);
1150
-        p.puLUMA_8x8.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_8x8_neon);
1151
-        p.puLUMA_8x16.pixelavg_ppALIGNED  = PFX(pixel_avg_pp_8x16_neon);
1152
-        p.puLUMA_8x32.pixelavg_ppALIGNED  = PFX(pixel_avg_pp_8x32_neon);
1153
-
1154
-        p.puLUMA_8x4.sad_x3   = PFX(sad_x3_8x4_neon);
1155
-        p.puLUMA_8x8.sad_x3   = PFX(sad_x3_8x8_neon);
1156
-        p.puLUMA_8x16.sad_x3  = PFX(sad_x3_8x16_neon);
1157
-        p.puLUMA_8x32.sad_x3  = PFX(sad_x3_8x32_neon);
1158
-
1159
-        p.puLUMA_8x4.sad_x4   = PFX(sad_x4_8x4_neon);
1160
-        p.puLUMA_8x8.sad_x4   = PFX(sad_x4_8x8_neon);
1161
-        p.puLUMA_8x16.sad_x4  = PFX(sad_x4_8x16_neon);
1162
-        p.puLUMA_8x32.sad_x4  = PFX(sad_x4_8x32_neon);
1163
-
1164
-        // quant
1165
-        p.quant = PFX(quant_neon);
1166
-        // luma_hps
1167
-        p.puLUMA_4x4.luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
1168
-        p.puLUMA_4x8.luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
1169
-        p.puLUMA_4x16.luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
1170
-        p.puLUMA_8x4.luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
1171
-        p.puLUMA_8x8.luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
1172
-        p.puLUMA_8x16.luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
1173
-        p.puLUMA_8x32.luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
1174
-        p.puLUMA_12x16.luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
1175
-        p.puLUMA_24x32.luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
1176
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
1177
-        p.puLUMA_16x4.luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
1178
-        p.puLUMA_16x8.luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
1179
-        p.puLUMA_16x12.luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
1180
-        p.puLUMA_16x16.luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
1181
-        p.puLUMA_16x32.luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
1182
-        p.puLUMA_16x64.luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
1183
-        p.puLUMA_32x8.luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
1184
-        p.puLUMA_32x16.luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
1185
-        p.puLUMA_32x24.luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
1186
-        p.puLUMA_32x32.luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
1187
-        p.puLUMA_32x64.luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
1188
-        p.puLUMA_48x64.luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
1189
-        p.puLUMA_64x16.luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
1190
-        p.puLUMA_64x32.luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
1191
-        p.puLUMA_64x48.luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
1192
-        p.puLUMA_64x64.luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
1193
-#endif
1194
-
1195
-        p.puLUMA_8x4.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;
1196
-        p.puLUMA_8x8.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
1197
-        p.puLUMA_8x16.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
1198
-        p.puLUMA_8x32.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
1199
-        p.puLUMA_12x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
1200
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
1201
-        p.puLUMA_16x4.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
1202
-        p.puLUMA_16x8.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
1203
-        p.puLUMA_16x12.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
1204
-        p.puLUMA_16x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
1205
-        p.puLUMA_16x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
1206
-        p.puLUMA_16x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
1207
-        p.puLUMA_32x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
1208
-        p.puLUMA_32x24.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
1209
-        p.puLUMA_32x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
1210
-        p.puLUMA_32x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
1211
-        p.puLUMA_48x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;
1212
-        p.puLUMA_64x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x16>;
1213
-        p.puLUMA_64x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x32>;
1214
-        p.puLUMA_64x48.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x48>;
1215
-        p.puLUMA_64x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x64>;
1216
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
1217
-        p.puLUMA_4x4.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x4>;
1218
-        p.puLUMA_4x8.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x8>;
1219
-        p.puLUMA_4x16.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_4x16>;
1220
-        p.puLUMA_24x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_24x32>;
1221
-        p.puLUMA_32x8.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_32x8>;
1222
+    // When these primitives are implemented with the SVE/SVE2 instruction set,
1223
+    // change the following definitions to point to the SVE/SVE2 implementations.
1224
+    setupPixelPrimitives_neon(p);
1225
+    setupFilterPrimitives_neon(p);
1226
+    setupDCTPrimitives_neon(p);
1227
+    setupLoopFilterPrimitives_neon(p);
1228
+    setupIntraPrimitives_neon(p);
1229
+
1230
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sNONALIGNED);
1231
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1232
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1233
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1234
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1235
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1236
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1237
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1238
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sALIGNED);
1239
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1240
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1241
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1242
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1243
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1244
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1245
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1246
+
1247
+#if !HIGH_BIT_DEPTH
1248
+    ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
1249
+    ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
1250
+    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
1251
+    ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
1252
+    ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
1253
+    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
1254
+    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
1255
+    ALL_CHROMA_420_VERT_FILTERS(neon);
1256
+    CHROMA_422_VERT_FILTERS_NEON();
1257
+    CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
1258
+    ALL_CHROMA_444_VERT_FILTERS(neon);
1259
+    ALL_CHROMA_420_FILTERS(neon);
1260
+    ALL_CHROMA_422_FILTERS(neon);
1261
+    ALL_CHROMA_444_FILTERS(neon);
1262
+
1263
+
1264
+    // Blockcopy_pp
1265
+    LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
1266
+    LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1267
+    CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
1268
+    CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1269
+    CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
1270
+    CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1271
+    p.cuBLOCK_4x4.copy_pp   = PFX(blockcopy_pp_4x4_neon);
1272
+    p.cuBLOCK_8x8.copy_pp   = PFX(blockcopy_pp_8x8_neon);
1273
+    p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1274
+    p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1275
+    p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_sve);
1276
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
1277
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
1278
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1279
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1280
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
1281
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
1282
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
1283
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_sve);
1284
+
1285
+#endif // !HIGH_BIT_DEPTH
1286
+
1287
+    // Blockcopy_ss
1288
+    p.cuBLOCK_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
1289
+    p.cuBLOCK_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
1290
+    p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1291
+    p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1292
+    p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_sve);
1293
+
1294
+    // Blockcopy_ps
1295
+    p.cuBLOCK_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
1296
+    p.cuBLOCK_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
1297
+    p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1298
+    p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1299
+    p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_sve);
1300
+
1301
+    // Blockcopy_sp
1302
+    p.cuBLOCK_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
1303
+    p.cuBLOCK_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
1304
+    p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1305
+    p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1306
+    p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
1307
+
1308
+    // chroma blockcopy_ss
1309
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
1310
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
1311
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1312
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1313
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss   = PFX(blockcopy_ss_4x8_neon);
1314
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss  = PFX(blockcopy_ss_8x16_neon);
1315
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_sve);
1316
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_sve);
1317
+
1318
+    // chroma blockcopy_ps
1319
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
1320
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
1321
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1322
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1323
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps   = PFX(blockcopy_ps_4x8_sve);
1324
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps  = PFX(blockcopy_ps_8x16_sve);
1325
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_sve);
1326
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_sve);
1327
+
1328
+    // chroma blockcopy_sp
1329
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
1330
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
1331
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1332
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1333
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp   = PFX(blockcopy_sp_4x8_sve);
1334
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp  = PFX(blockcopy_sp_8x16_sve);
1335
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_sve);
1336
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_sve);
1337
+
1338
+    // Block_fill
1339
+    LUMA_TU_NEON(blockfill_sALIGNED, blockfill_s);
1340
+    LUMA_TU_CAN_USE_SVE(blockfill_sALIGNED, blockfill_s);
1341
+    LUMA_TU_NEON(blockfill_sNONALIGNED, blockfill_s);
1342
+    LUMA_TU_CAN_USE_SVE(blockfill_sNONALIGNED, blockfill_s);
1343
+
1344
+    // copy_count
1345
+    p.cuBLOCK_4x4.copy_cnt     = PFX(copy_cnt_4_neon);
1346
+    p.cuBLOCK_8x8.copy_cnt     = PFX(copy_cnt_8_neon);
1347
+    p.cuBLOCK_16x16.copy_cnt   = PFX(copy_cnt_16_neon);
1348
+    p.cuBLOCK_32x32.copy_cnt   = PFX(copy_cnt_32_neon);
1349
+
1350
+    // count nonzero
1351
+    p.cuBLOCK_4x4.count_nonzero     = PFX(count_nonzero_4_neon);
1352
+    p.cuBLOCK_8x8.count_nonzero     = PFX(count_nonzero_8_neon);
1353
+    p.cuBLOCK_16x16.count_nonzero   = PFX(count_nonzero_16_neon);
1354
+    p.cuBLOCK_32x32.count_nonzero   = PFX(count_nonzero_32_neon);
1355
+
1356
+    // cpy2Dto1D_shl
1357
+    p.cuBLOCK_4x4.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_4x4_neon);
1358
+    p.cuBLOCK_8x8.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_8x8_neon);
1359
+    p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
1360
+    p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
1361
+    p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
1362
+
1363
+    // cpy2Dto1D_shr
1364
+    p.cuBLOCK_4x4.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
1365
+    p.cuBLOCK_8x8.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
1366
+    p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
1367
+    p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
1368
+
1369
+    // cpy1Dto2D_shl
1370
+    p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_4x4_neon);
1371
+    p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_8x8_neon);
1372
+    p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_16x16_sve);
1373
+    p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_32x32_sve);
1374
+    p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_64x64_sve);
1375
+
1376
+    p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_4x4_neon);
1377
+    p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_8x8_neon);
1378
+    p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
1379
+    p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
1380
+    p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
1381
+
1382
+    // cpy1Dto2D_shr
1383
+    p.cuBLOCK_4x4.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
1384
+    p.cuBLOCK_8x8.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
1385
+    p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
1386
+    p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
1387
+    p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
1388
+
1389
+#if !HIGH_BIT_DEPTH
1390
+    // pixel_avg_pp
1391
+    ALL_LUMA_PU(pixelavg_ppNONALIGNED, pixel_avg_pp, neon);
1392
+    ALL_LUMA_PU(pixelavg_ppALIGNED, pixel_avg_pp, neon);
1393
+
1394
+    // addAvg
1395
+    ALL_LUMA_PU(addAvgNONALIGNED, addAvg, neon);
1396
+    ALL_LUMA_PU(addAvgALIGNED, addAvg, neon);
1397
+    ALL_CHROMA_420_PU(addAvgNONALIGNED, addAvg, neon);
1398
+    ALL_CHROMA_422_PU(addAvgNONALIGNED, addAvg, neon);
1399
+    ALL_CHROMA_420_PU(addAvgALIGNED, addAvg, neon);
1400
+    ALL_CHROMA_422_PU(addAvgALIGNED, addAvg, neon);
1401
+
1402
+    // sad
1403
+    ALL_LUMA_PU(sad, pixel_sad, neon);
1404
+    ALL_LUMA_PU(sad_x3, sad_x3, neon);
1405
+    ALL_LUMA_PU(sad_x4, sad_x4, neon);
1406
+
1407
+    // sse_pp
1408
+    p.cuBLOCK_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
1409
+    p.cuBLOCK_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
1410
+    p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1411
+    p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
1412
+    p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_neon);
1413
+
1414
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
1415
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
1416
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1417
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
1418
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp   = PFX(pixel_sse_pp_4x8_sve);
1419
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp  = PFX(pixel_sse_pp_8x16_neon);
1420
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
1421
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_neon);
1422
+
1423
+    // sse_ss
1424
+    p.cuBLOCK_4x4.sse_ss   = PFX(pixel_sse_ss_4x4_neon);
1425
+    p.cuBLOCK_8x8.sse_ss   = PFX(pixel_sse_ss_8x8_neon);
1426
+    p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_neon);
1427
+    p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_neon);
1428
+    p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_neon);
1429
+
1430
+    // ssd_s
1431
+    p.cuBLOCK_4x4.ssd_sNONALIGNED   = PFX(pixel_ssd_s_4x4_neon);
1432
+    p.cuBLOCK_8x8.ssd_sNONALIGNED   = PFX(pixel_ssd_s_8x8_neon);
1433
+    p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_neon);
1434
+    p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_neon);
1435
+
1436
+    p.cuBLOCK_4x4.ssd_sALIGNED   = PFX(pixel_ssd_s_4x4_neon);
1437
+    p.cuBLOCK_8x8.ssd_sALIGNED   = PFX(pixel_ssd_s_8x8_neon);
1438
+    p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_neon);
1439
+    p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_neon);
1440
+
1441
+    // pixel_var
1442
+    p.cuBLOCK_8x8.var   = PFX(pixel_var_8x8_neon);
1443
+    p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_neon);
1444
+    p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_neon);
1445
+    p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_neon);
1446
+
1447
+    // calc_Residual
1448
+    p.cuBLOCK_4x4.calcresidualNONALIGNED   = PFX(getResidual4_neon);
1449
+    p.cuBLOCK_8x8.calcresidualNONALIGNED   = PFX(getResidual8_neon);
1450
+    p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_neon);
1451
+    p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_neon);
1452
+
1453
+    p.cuBLOCK_4x4.calcresidualALIGNED   = PFX(getResidual4_neon);
1454
+    p.cuBLOCK_8x8.calcresidualALIGNED   = PFX(getResidual8_neon);
1455
+    p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_neon);
1456
+    p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_neon);
1457
+
1458
+    // pixel_sub_ps
1459
+    p.cuBLOCK_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
1460
+    p.cuBLOCK_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
1461
+    p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1462
+    p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
1463
+    p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_neon);
1464
+
1465
+    // chroma sub_ps
1466
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
1467
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
1468
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1469
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
1470
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps   = PFX(pixel_sub_ps_4x8_neon);
1471
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps  = PFX(pixel_sub_ps_8x16_sve);
1472
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
1473
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_neon);
1474
+
1475
+    // pixel_add_ps
1476
+    p.cuBLOCK_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_neon);
1477
+    p.cuBLOCK_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_neon);
1478
+    p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
1479
+    p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
1480
+    p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_neon);
1481
+
1482
+    p.cuBLOCK_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_neon);
1483
+    p.cuBLOCK_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_neon);
1484
+    p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
1485
+    p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
1486
+    p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_neon);
1487
+
1488
+    // chroma add_ps
1489
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_neon);
1490
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_neon);
1491
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
1492
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
1493
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED   = PFX(pixel_add_ps_4x8_neon);
1494
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED  = PFX(pixel_add_ps_8x16_neon);
1495
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_neon);
1496
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_neon);
1497
+
1498
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_neon);
1499
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_neon);
1500
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
1501
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
1502
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED   = PFX(pixel_add_ps_4x8_neon);
1503
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED  = PFX(pixel_add_ps_8x16_neon);
1504
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_neon);
1505
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_neon);
1506
+
1507
+    //scale2D_64to32
1508
+    p.scale2D_64to32  = PFX(scale2D_64to32_neon);
1509
+
1510
+    // scale1D_128to64
1511
+    p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_neon);
1512
+    p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_neon);
1513
+
1514
+    // planecopy
1515
+    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
1516
+
1517
+    // satd
1518
+    p.puLUMA_4x4.satd   = PFX(pixel_satd_4x4_sve);
1519
+    p.puLUMA_8x8.satd   = PFX(pixel_satd_8x8_neon);
1520
+    p.puLUMA_16x16.satd = PFX(pixel_satd_16x16_neon);
1521
+    p.puLUMA_32x32.satd = PFX(pixel_satd_32x32_sve);
1522
+    p.puLUMA_64x64.satd = PFX(pixel_satd_64x64_neon);
1523
+    p.puLUMA_8x4.satd   = PFX(pixel_satd_8x4_sve);
1524
+    p.puLUMA_4x8.satd   = PFX(pixel_satd_4x8_neon);
1525
+    p.puLUMA_16x8.satd  = PFX(pixel_satd_16x8_neon);
1526
+    p.puLUMA_8x16.satd  = PFX(pixel_satd_8x16_neon);
1527
+    p.puLUMA_16x32.satd = PFX(pixel_satd_16x32_neon);
1528
+    p.puLUMA_32x16.satd = PFX(pixel_satd_32x16_sve);
1529
+    p.puLUMA_64x32.satd = PFX(pixel_satd_64x32_neon);
1530
+    p.puLUMA_32x64.satd = PFX(pixel_satd_32x64_neon);
1531
+    p.puLUMA_16x12.satd = PFX(pixel_satd_16x12_neon);
1532
+    p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1533
+    p.puLUMA_16x4.satd  = PFX(pixel_satd_16x4_neon);
1534
+    p.puLUMA_4x16.satd  = PFX(pixel_satd_4x16_neon);
1535
+    p.puLUMA_32x24.satd = PFX(pixel_satd_32x24_neon);
1536
+    p.puLUMA_24x32.satd = PFX(pixel_satd_24x32_neon);
1537
+    p.puLUMA_32x8.satd  = PFX(pixel_satd_32x8_neon);
1538
+    p.puLUMA_8x32.satd  = PFX(pixel_satd_8x32_neon);
1539
+    p.puLUMA_64x48.satd = PFX(pixel_satd_64x48_sve);
1540
+    p.puLUMA_48x64.satd = PFX(pixel_satd_48x64_neon);
1541
+    p.puLUMA_64x16.satd = PFX(pixel_satd_64x16_neon);
1542
+    p.puLUMA_16x64.satd = PFX(pixel_satd_16x64_neon);
1543
+
1544
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd   = PFX(pixel_satd_4x4_sve);
1545
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd   = PFX(pixel_satd_8x8_neon);
1546
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
1547
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
1548
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd   = PFX(pixel_satd_8x4_sve);
1549
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd   = PFX(pixel_satd_4x8_neon);
1550
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd  = PFX(pixel_satd_16x8_neon);
1551
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd  = PFX(pixel_satd_8x16_neon);
1552
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
1553
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
1554
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
1555
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
1556
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd  = PFX(pixel_satd_16x4_neon);
1557
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd  = PFX(pixel_satd_4x16_neon);
1558
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
1559
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
1560
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd  = PFX(pixel_satd_32x8_neon);
1561
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd  = PFX(pixel_satd_8x32_neon);
1562
+
1563
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd   = PFX(pixel_satd_4x8_neon);
1564
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd  = PFX(pixel_satd_8x16_neon);
1565
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
1566
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
1567
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd   = PFX(pixel_satd_4x4_sve);
1568
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd   = PFX(pixel_satd_8x8_neon);
1569
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd  = PFX(pixel_satd_4x16_neon);
1570
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
1571
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd  = PFX(pixel_satd_8x32_neon);
1572
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
1573
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
1574
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd  = PFX(pixel_satd_8x12_sve);
1575
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd   = PFX(pixel_satd_8x4_sve);
1576
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
1577
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
1578
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd  = PFX(pixel_satd_16x8_neon);
1579
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd  = PFX(pixel_satd_4x32_neon);
1580
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
1581
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
1582
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
1583
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd  = PFX(pixel_satd_8x64_neon);
1584
+
1585
+    // sa8d
1586
+    p.cuBLOCK_4x4.sa8d   = PFX(pixel_satd_4x4_sve);
1587
+    p.cuBLOCK_8x8.sa8d   = PFX(pixel_sa8d_8x8_neon);
1588
+    p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1589
+    p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1590
+    p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1591
+    p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_sve);
1592
+    p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1593
+    p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1594
+    p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1595
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
1596
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
1597
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
1598
+
1599
+    // dequant_scaling
1600
+    p.dequant_scaling = PFX(dequant_scaling_neon);
1601
+    p.dequant_normal  = PFX(dequant_normal_neon);
1602
+
1603
+    // ssim_4x4x2_core
1604
+    p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
1605
+
1606
+    // ssimDist
1607
+    p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_neon);
1608
+    p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_neon);
1609
+    p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_neon);
1610
+    p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_neon);
1611
+    p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_neon);
1612
+
1613
+    // normFact
1614
+    p.cuBLOCK_8x8.normFact = PFX(normFact8_neon);
1615
+    p.cuBLOCK_16x16.normFact = PFX(normFact16_neon);
1616
+    p.cuBLOCK_32x32.normFact = PFX(normFact32_neon);
1617
+    p.cuBLOCK_64x64.normFact = PFX(normFact64_neon);
1618
+
1619
+    // psy_cost_pp
1620
+    p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1621
+
1622
+    p.weight_pp = PFX(weight_pp_neon);
1623
+#if !defined(__APPLE__)
1624
+    p.scanPosLast = PFX(scanPosLast_neon);
1625
+#endif
1626
+    p.costCoeffNxN = PFX(costCoeffNxN_neon);
1627
 #endif
1628
+
1629
+    // quant
1630
+    p.quant = PFX(quant_sve);
1631
+    p.nquant = PFX(nquant_neon);
1632
+}
1633
 #endif
1634
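setupSvePrimitives() above still registers mostly NEON kernels, plus a handful of _sve routines such as the blockcopy, satd and quant variants, until dedicated SVE implementations exist. The patch moves the per-size registrations out of the old setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) body and into these per-ISA setup functions, so a runtime dispatcher over them would plausibly look like the sketch below. Only setupSvePrimitives, setupSve2Primitives and X265_CPU_NEON appear in the diff itself; the SVE/SVE2 flag names and bit values and the NEON setup function name are assumptions.

// Hypothetical cpuMask dispatch over the per-ISA setup routines introduced by
// this patch; flag values and the NEON setup name are assumed for the sketch.
struct EncoderPrimitives;                       // opaque here; defined by x265

void setupNeonPrimitives(EncoderPrimitives &p); // assumed name of the NEON setup
void setupSvePrimitives(EncoderPrimitives &p);  // added by this patch
void setupSve2Primitives(EncoderPrimitives &p); // added by this patch

enum CpuFlagsSketch
{
    X265_CPU_NEON = 1 << 0,                     // values assumed for the sketch
    X265_CPU_SVE  = 1 << 1,
    X265_CPU_SVE2 = 1 << 2
};

void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
{
#if defined(HAVE_SVE2)
    if (cpuMask & X265_CPU_SVE2) { setupSve2Primitives(p); return; }
#endif
#if defined(HAVE_SVE2) || defined(HAVE_SVE)
    if (cpuMask & X265_CPU_SVE)  { setupSvePrimitives(p);  return; }
#endif
    if (cpuMask & X265_CPU_NEON)
        setupNeonPrimitives(p);                 // NEON baseline for AArch64
}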
 
1635
+#if defined(HAVE_SVE2)
1636
+void setupSve2Primitives(EncoderPrimitives &p)
1637
+{
1638
+    // When these primitives are implemented with the SVE/SVE2 instruction set,
1639
+    // change the following definitions to point to the SVE/SVE2 implementations.
1640
+    setupPixelPrimitives_neon(p);
1641
+    setupFilterPrimitives_neon(p);
1642
+    setupDCTPrimitives_neon(p);
1643
+    setupLoopFilterPrimitives_neon(p);
1644
+    setupIntraPrimitives_neon(p);
1645
+
1646
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sNONALIGNED);
1647
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1648
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1649
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1650
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1651
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1652
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1653
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1654
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sALIGNED);
1655
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1656
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1657
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1658
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1659
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1660
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1661
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1662
+
1663
 #if !HIGH_BIT_DEPTH
1664
-        p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1665
+    LUMA_PU_MULTIPLE_ARCHS_1(luma_vpp, interp_8tap_vert_pp, neon);
1666
+    LUMA_PU_MULTIPLE_ARCHS_2(luma_vpp, interp_8tap_vert_pp, sve2);
1667
+    LUMA_PU_MULTIPLE_ARCHS_1(luma_vsp, interp_8tap_vert_sp, sve2);
1668
+    LUMA_PU_MULTIPLE_ARCHS_2(luma_vsp, interp_8tap_vert_sp, neon);
1669
+    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sve2);
1670
+    ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
1671
+    ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
1672
+    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, sve2);
1673
+    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
1674
+    CHROMA_420_VERT_FILTERS_NEON();
1675
+    CHROMA_420_VERT_FILTERS_CAN_USE_SVE2();
1676
+    CHROMA_422_VERT_FILTERS_NEON();
1677
+    CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(sve2);
1678
+    CHROMA_444_VERT_FILTERS_NEON();
1679
+    CHROMA_444_VERT_FILTERS_CAN_USE_SVE2();
1680
+    CHROMA_420_FILTERS_NEON();
1681
+    CHROMA_420_FILTERS_CAN_USE_SVE2();
1682
+    CHROMA_422_FILTERS_NEON();
1683
+    CHROMA_422_FILTERS_CAN_USE_SVE2();
1684
+    CHROMA_444_FILTERS_NEON();
1685
+    CHROMA_444_FILTERS_CAN_USE_SVE2();
1686
+
1687
+    // Blockcopy_pp
1688
+    LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
1689
+    LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1690
+    CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
1691
+    CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1692
+    CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
1693
+    CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1694
+    p.cuBLOCK_4x4.copy_pp   = PFX(blockcopy_pp_4x4_neon);
1695
+    p.cuBLOCK_8x8.copy_pp   = PFX(blockcopy_pp_8x8_neon);
1696
+    p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1697
+    p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1698
+    p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_sve);
1699
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
1700
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
1701
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1702
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1703
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
1704
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
1705
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
1706
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_sve);
1707
+
1708
 #endif // !HIGH_BIT_DEPTH
1709
 
1710
+    // Blockcopy_ss
1711
+    p.cuBLOCK_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
1712
+    p.cuBLOCK_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
1713
+    p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1714
+    p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1715
+    p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_sve);
1716
+
1717
+    // Blockcopy_ps
1718
+    p.cuBLOCK_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
1719
+    p.cuBLOCK_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
1720
+    p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1721
+    p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1722
+    p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_sve);
1723
+
1724
+    // Blockcopy_sp
1725
+    p.cuBLOCK_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
1726
+    p.cuBLOCK_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
1727
+    p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1728
+    p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1729
+    p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
1730
+
1731
+    // chroma blockcopy_ss
1732
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
1733
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
1734
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1735
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1736
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss   = PFX(blockcopy_ss_4x8_neon);
1737
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss  = PFX(blockcopy_ss_8x16_neon);
1738
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_sve);
1739
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_sve);
1740
+
1741
+    // chroma blockcopy_ps
1742
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
1743
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
1744
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1745
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1746
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps   = PFX(blockcopy_ps_4x8_sve);
1747
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps  = PFX(blockcopy_ps_8x16_sve);
1748
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_sve);
1749
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_sve);
1750
+
1751
+    // chroma blockcopy_sp
1752
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
1753
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
1754
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1755
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1756
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp   = PFX(blockcopy_sp_4x8_sve);
1757
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp  = PFX(blockcopy_sp_8x16_sve);
1758
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_sve);
1759
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_sve);
1760
+
1761
+    // Block_fill
1762
+    LUMA_TU_NEON(blockfill_sALIGNED, blockfill_s);
1763
+    LUMA_TU_CAN_USE_SVE(blockfill_sALIGNED, blockfill_s);
1764
+    LUMA_TU_NEON(blockfill_sNONALIGNED, blockfill_s);
1765
+    LUMA_TU_CAN_USE_SVE(blockfill_sNONALIGNED, blockfill_s);
1766
+
1767
+    // copy_count
1768
+    p.cuBLOCK_4x4.copy_cnt     = PFX(copy_cnt_4_neon);
1769
+    p.cuBLOCK_8x8.copy_cnt     = PFX(copy_cnt_8_neon);
1770
+    p.cuBLOCK_16x16.copy_cnt   = PFX(copy_cnt_16_neon);
1771
+    p.cuBLOCK_32x32.copy_cnt   = PFX(copy_cnt_32_neon);
1772
+
1773
+    // count nonzero
1774
+    p.cuBLOCK_4x4.count_nonzero     = PFX(count_nonzero_4_neon);
1775
+    p.cuBLOCK_8x8.count_nonzero     = PFX(count_nonzero_8_neon);
1776
+    p.cuBLOCK_16x16.count_nonzero   = PFX(count_nonzero_16_neon);
1777
+    p.cuBLOCK_32x32.count_nonzero   = PFX(count_nonzero_32_neon);
1778
+
1779
+    // cpy2Dto1D_shl
1780
+    p.cuBLOCK_4x4.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_4x4_neon);
1781
+    p.cuBLOCK_8x8.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_8x8_neon);
1782
+    p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
1783
+    p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
1784
+    p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
1785
+
1786
+    // cpy2Dto1D_shr
1787
+    p.cuBLOCK_4x4.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
1788
+    p.cuBLOCK_8x8.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
1789
+    p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
1790
+    p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
1791
+
1792
+    // cpy1Dto2D_shl
1793
+    p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_4x4_neon);
1794
+    p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_8x8_neon);
1795
+    p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_16x16_sve);
1796
+    p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_32x32_sve);
1797
+    p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_64x64_sve);
1798
+
1799
+    p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_4x4_neon);
1800
+    p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_8x8_neon);
1801
+    p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
1802
+    p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
1803
+    p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
1804
+
1805
+    // cpy1Dto2D_shr
1806
+    p.cuBLOCK_4x4.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
1807
+    p.cuBLOCK_8x8.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
1808
+    p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
1809
+    p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
1810
+    p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
1811
+
1812
+#if !HIGH_BIT_DEPTH
1813
+    // pixel_avg_pp
1814
+    LUMA_PU_NEON_2(pixelavg_ppNONALIGNED, pixel_avg_pp);
1815
+    LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_ppNONALIGNED, pixel_avg_pp, sve2);
1816
+    LUMA_PU_NEON_2(pixelavg_ppALIGNED, pixel_avg_pp);
1817
+    LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_ppALIGNED, pixel_avg_pp, sve2);
1818
+
1819
+    // addAvg
1820
+    LUMA_PU_NEON_3(addAvgNONALIGNED, addAvg);
1821
+    LUMA_PU_CAN_USE_SVE2(addAvgNONALIGNED, addAvg);
1822
+    LUMA_PU_NEON_3(addAvgALIGNED, addAvg);
1823
+    LUMA_PU_CAN_USE_SVE2(addAvgALIGNED, addAvg);
1824
+    CHROMA_420_PU_NEON_2(addAvgNONALIGNED, addAvg);
1825
+    CHROMA_420_PU_MULTIPLE_ARCHS(addAvgNONALIGNED, addAvg, sve2);
1826
+    CHROMA_420_PU_NEON_2(addAvgALIGNED, addAvg);
1827
+    CHROMA_420_PU_MULTIPLE_ARCHS(addAvgALIGNED, addAvg, sve2);
1828
+    CHROMA_422_PU_NEON_2(addAvgNONALIGNED, addAvg);
1829
+    CHROMA_422_PU_CAN_USE_SVE2(addAvgNONALIGNED, addAvg);
1830
+    CHROMA_422_PU_NEON_2(addAvgALIGNED, addAvg);
1831
+    CHROMA_422_PU_CAN_USE_SVE2(addAvgALIGNED, addAvg);
1832
+
1833
+    // sad
1834
+    ALL_LUMA_PU(sad, pixel_sad, sve2);
1835
+    ALL_LUMA_PU(sad_x3, sad_x3, sve2);
1836
+    ALL_LUMA_PU(sad_x4, sad_x4, sve2);
1837
+
1838
+    // sse_pp
1839
+    p.cuBLOCK_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
1840
+    p.cuBLOCK_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
1841
+    p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1842
+    p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_sve2);
1843
+    p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_sve2);
1844
+
1845
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
1846
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
1847
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1848
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_sve2);
1849
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp   = PFX(pixel_sse_pp_4x8_sve);
1850
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp  = PFX(pixel_sse_pp_8x16_neon);
1851
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
1852
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_sve2);
1853
+
1854
+    // sse_ss
1855
+    p.cuBLOCK_4x4.sse_ss   = PFX(pixel_sse_ss_4x4_sve2);
1856
+    p.cuBLOCK_8x8.sse_ss   = PFX(pixel_sse_ss_8x8_sve2);
1857
+    p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_sve2);
1858
+    p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_sve2);
1859
+    p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_sve2);
1860
+
1861
+    // ssd_s
1862
+    p.cuBLOCK_4x4.ssd_sNONALIGNED   = PFX(pixel_ssd_s_4x4_sve2);
1863
+    p.cuBLOCK_8x8.ssd_sNONALIGNED   = PFX(pixel_ssd_s_8x8_sve2);
1864
+    p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_sve2);
1865
+    p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_sve2);
1866
+
1867
+    p.cuBLOCK_4x4.ssd_sALIGNED   = PFX(pixel_ssd_s_4x4_sve2);
1868
+    p.cuBLOCK_8x8.ssd_sALIGNED   = PFX(pixel_ssd_s_8x8_sve2);
1869
+    p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_sve2);
1870
+    p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_sve2);
1871
+
1872
+    // pixel_var
1873
+    p.cuBLOCK_8x8.var   = PFX(pixel_var_8x8_sve2);
1874
+    p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_sve2);
1875
+    p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_sve2);
1876
+    p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_sve2);
1877
+
1878
+    // calc_Residual
1879
+    p.cuBLOCK_4x4.calcresidualNONALIGNED   = PFX(getResidual4_neon);
1880
+    p.cuBLOCK_8x8.calcresidualNONALIGNED   = PFX(getResidual8_neon);
1881
+    p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_sve2);
1882
+    p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_sve2);
1883
+
1884
+    p.cuBLOCK_4x4.calcresidualALIGNED   = PFX(getResidual4_neon);
1885
+    p.cuBLOCK_8x8.calcresidualALIGNED   = PFX(getResidual8_neon);
1886
+    p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_sve2);
1887
+    p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_sve2);
1888
+
1889
+    // pixel_sub_ps
1890
+    p.cuBLOCK_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
1891
+    p.cuBLOCK_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
1892
+    p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1893
+    p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_sve2);
1894
+    p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_sve2);
1895
+
1896
+    // chroma sub_ps
1897
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
1898
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
1899
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1900
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_sve2);
1901
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps   = PFX(pixel_sub_ps_4x8_neon);
1902
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps  = PFX(pixel_sub_ps_8x16_sve);
1903
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
1904
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_sve2);
1905
+
1906
+    // pixel_add_ps
1907
+    p.cuBLOCK_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_sve2);
1908
+    p.cuBLOCK_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_sve2);
1909
+    p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_sve2);
1910
+    p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_sve2);
1911
+    p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_sve2);
1912
+
1913
+    p.cuBLOCK_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_sve2);
1914
+    p.cuBLOCK_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_sve2);
1915
+    p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_sve2);
1916
+    p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_sve2);
1917
+    p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_sve2);
1918
+
1919
+    // chroma add_ps
1920
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_sve2);
1921
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_sve2);
1922
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_sve2);
1923
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_sve2);
1924
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED   = PFX(pixel_add_ps_4x8_sve2);
1925
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED  = PFX(pixel_add_ps_8x16_sve2);
1926
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_sve2);
1927
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_sve2);
1928
+
1929
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_sve2);
1930
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_sve2);
1931
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_sve2);
1932
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_sve2);
1933
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED   = PFX(pixel_add_ps_4x8_sve2);
1934
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED  = PFX(pixel_add_ps_8x16_sve2);
1935
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_sve2);
1936
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_sve2);
1937
+
1938
+    //scale2D_64to32
1939
+    p.scale2D_64to32  = PFX(scale2D_64to32_neon);
1940
+
1941
+    // scale1D_128to64
1942
+    p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_sve2);
1943
+    p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_sve2);
1944
+
1945
+    // planecopy
1946
+    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
1947
+
1948
+    // satd
1949
+    p.puLUMA_4x4.satd   = PFX(pixel_satd_4x4_sve);
1950
+    p.puLUMA_8x8.satd   = PFX(pixel_satd_8x8_neon);
1951
+    p.puLUMA_16x16.satd = PFX(pixel_satd_16x16_neon);
1952
+    p.puLUMA_32x32.satd = PFX(pixel_satd_32x32_sve);
1953
+    p.puLUMA_64x64.satd = PFX(pixel_satd_64x64_neon);
1954
+    p.puLUMA_8x4.satd   = PFX(pixel_satd_8x4_sve);
1955
+    p.puLUMA_4x8.satd   = PFX(pixel_satd_4x8_neon);
1956
+    p.puLUMA_16x8.satd  = PFX(pixel_satd_16x8_neon);
1957
+    p.puLUMA_8x16.satd  = PFX(pixel_satd_8x16_neon);
1958
+    p.puLUMA_16x32.satd = PFX(pixel_satd_16x32_neon);
1959
+    p.puLUMA_32x16.satd = PFX(pixel_satd_32x16_sve);
1960
+    p.puLUMA_64x32.satd = PFX(pixel_satd_64x32_neon);
1961
+    p.puLUMA_32x64.satd = PFX(pixel_satd_32x64_neon);
1962
+    p.puLUMA_16x12.satd = PFX(pixel_satd_16x12_neon);
1963
+    p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1964
+    p.puLUMA_16x4.satd  = PFX(pixel_satd_16x4_neon);
1965
+    p.puLUMA_4x16.satd  = PFX(pixel_satd_4x16_neon);
1966
+    p.puLUMA_32x24.satd = PFX(pixel_satd_32x24_neon);
1967
+    p.puLUMA_24x32.satd = PFX(pixel_satd_24x32_neon);
1968
+    p.puLUMA_32x8.satd  = PFX(pixel_satd_32x8_neon);
1969
+    p.puLUMA_8x32.satd  = PFX(pixel_satd_8x32_neon);
1970
+    p.puLUMA_64x48.satd = PFX(pixel_satd_64x48_sve);
1971
+    p.puLUMA_48x64.satd = PFX(pixel_satd_48x64_neon);
1972
+    p.puLUMA_64x16.satd = PFX(pixel_satd_64x16_neon);
1973
+    p.puLUMA_16x64.satd = PFX(pixel_satd_16x64_neon);
1974
+
1975
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = PFX(pixel_satd_4x4_sve);
1976
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = PFX(pixel_satd_8x8_neon);
1977
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
1978
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
1979
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = PFX(pixel_satd_8x4_sve);
1980
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = PFX(pixel_satd_4x8_neon);
1981
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = PFX(pixel_satd_16x8_neon);
1982
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = PFX(pixel_satd_8x16_neon);
1983
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
1984
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
1985
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
1986
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
1987
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = PFX(pixel_satd_16x4_neon);
1988
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = PFX(pixel_satd_4x16_neon);
1989
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
1990
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
1991
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = PFX(pixel_satd_32x8_neon);
1992
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = PFX(pixel_satd_8x32_neon);
1993
+
1994
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = PFX(pixel_satd_4x8_neon);
1995
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = PFX(pixel_satd_8x16_neon);
1996
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
1997
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
1998
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = PFX(pixel_satd_4x4_sve);
1999
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = PFX(pixel_satd_8x8_neon);
2000
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = PFX(pixel_satd_4x16_neon);
2001
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
2002
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = PFX(pixel_satd_8x32_neon);
2003
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
2004
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
2005
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = PFX(pixel_satd_8x12_sve);
2006
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = PFX(pixel_satd_8x4_sve);
2007
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
2008
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
2009
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = PFX(pixel_satd_16x8_neon);
2010
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = PFX(pixel_satd_4x32_neon);
2011
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
2012
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
2013
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
2014
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = PFX(pixel_satd_8x64_neon);
2015
+
2016
+    // sa8d
2017
+    p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_sve);
2018
+    p.cu[BLOCK_8x8].sa8d   = PFX(pixel_sa8d_8x8_neon);
2019
+    p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
2020
+    p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
2021
+    p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
2022
+    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
2023
+    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
2024
+    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
2025
+    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
2026
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
2027
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
2028
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
2029
+
2030
+    // dequant_scaling
2031
+    p.dequant_scaling = PFX(dequant_scaling_sve2);
2032
+    p.dequant_normal  = PFX(dequant_normal_sve2);
2033
+
2034
+    // ssim_4x4x2_core
2035
+    p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_sve2);
2036
+
2037
+    // ssimDist
2038
+    p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_sve2);
2039
+    p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_sve2);
2040
+    p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_sve2);
2041
+    p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_sve2);
2042
+    p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_sve2);
2043
+
2044
+    // normFact
2045
+    p.cu[BLOCK_8x8].normFact = PFX(normFact8_sve2);
2046
+    p.cu[BLOCK_16x16].normFact = PFX(normFact16_sve2);
2047
+    p.cu[BLOCK_32x32].normFact = PFX(normFact32_sve2);
2048
+    p.cu[BLOCK_64x64].normFact = PFX(normFact64_sve2);
2049
+
2050
+    // psy_cost_pp
2051
+    p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
2052
+
2053
+    p.weight_pp = PFX(weight_pp_neon);
2054
+#if !defined(__APPLE__)
2055
+    p.scanPosLast = PFX(scanPosLast_neon);
2056
+#endif
2057
+    p.costCoeffNxN = PFX(costCoeffNxN_neon);
2058
+#endif
2059
+
2060
+    // quant
2061
+    p.quant = PFX(quant_sve);
2062
+    p.nquant = PFX(nquant_neon);
2063
+}
2064
+#endif
2065
+
2066
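For reference, the satd bindings above compute the sum of absolute transformed differences: the residual between the two blocks is passed through a Hadamard transform (4x4 here, 8x8 for the sa8d bindings) and the absolute coefficients are summed. A scalar sketch of the 4x4 case, assuming 8-bit pixels and the conventional halving of the raw transform sum (both are illustration assumptions, not taken from this diff):

#include <cstdint>
#include <cstdlib>

static int satd_4x4_c(const uint8_t* pix1, intptr_t stride1,
                      const uint8_t* pix2, intptr_t stride2)
{
    int d[4][4];
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];

    // 4-point Hadamard butterflies, first along rows, then along columns.
    for (int y = 0; y < 4; y++)
    {
        int s0 = d[y][0] + d[y][1], s1 = d[y][2] + d[y][3];
        int t0 = d[y][0] - d[y][1], t1 = d[y][2] - d[y][3];
        d[y][0] = s0 + s1; d[y][1] = s0 - s1;
        d[y][2] = t0 + t1; d[y][3] = t0 - t1;
    }
    int sum = 0;
    for (int x = 0; x < 4; x++)
    {
        int s0 = d[0][x] + d[1][x], s1 = d[2][x] + d[3][x];
        int t0 = d[0][x] - d[1][x], t1 = d[2][x] - d[3][x];
        sum += std::abs(s0 + s1) + std::abs(s0 - s1)
             + std::abs(t0 + t1) + std::abs(t0 - t1);
    }
    return sum >> 1;   // conventional halving of the raw transform sum
}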
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
2067
+{
2068
+
2069
+#ifdef HAVE_SVE2
2070
+    if (cpuMask & X265_CPU_SVE2)
2071
+    {
2072
+        setupSve2Primitives(p);
2073
     }
2074
+    else if (cpuMask & X265_CPU_SVE)
2075
+    {
2076
+        setupSvePrimitives(p);
2077
+    }
2078
+    else if (cpuMask & X265_CPU_NEON)
2079
+    {
2080
+        setupNeonPrimitives(p);
2081
+    }
2082
+
2083
+#elif defined(HAVE_SVE)
2084
+    if (cpuMask & X265_CPU_SVE)
2085
+    {
2086
+        setupSvePrimitives(p);
2087
+    }
2088
+    else if (cpuMask & X265_CPU_NEON)
2089
+    {
2090
+        setupNeonPrimitives(p);
2091
+    }
2092
+
2093
+#else
2094
+    if (cpuMask & X265_CPU_NEON)
2095
+    {
2096
+        setupNeonPrimitives(p);
2097
+    }
2098
+#endif
2099
+
2100
 }
2101
 } // namespace X265_NS
2102
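setupAssemblyPrimitives() above overwrites the portable primitive table in strict priority order: SVE2 bindings win over SVE, which win over plain NEON, and the HAVE_SVE2/HAVE_SVE guards keep the SVE paths out of builds whose assembler cannot emit them. A minimal sketch of the same cascade, using hypothetical flag values and stub setup functions rather than the real x265 symbols:

#include <cstdio>

// Hypothetical CPU feature bits, for illustration only.
enum { CPU_NEON = 1 << 0, CPU_SVE = 1 << 1, CPU_SVE2 = 1 << 2 };

struct Primitives { const char* satd_impl; };

static void setupNeon(Primitives& p) { p.satd_impl = "neon"; }
static void setupSve(Primitives& p)  { p.satd_impl = "sve";  }
static void setupSve2(Primitives& p) { p.satd_impl = "sve2"; }

// Mirrors the if / else-if chain above: the widest supported extension wins.
static void setupAsm(Primitives& p, int cpuMask)
{
    if (cpuMask & CPU_SVE2)
        setupSve2(p);
    else if (cpuMask & CPU_SVE)
        setupSve(p);
    else if (cpuMask & CPU_NEON)
        setupNeon(p);
}

int main()
{
    Primitives p{"c"};
    setupAsm(p, CPU_NEON | CPU_SVE);
    std::printf("selected: %s\n", p.satd_impl);  // prints "selected: sve"
    return 0;
}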
x265_3.6.tar.gz/source/common/aarch64/asm-sve.S Added
41
 
1
@@ -0,0 +1,39 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+.macro ABS2_SVE a b c
30
+    abs             \a, \c\()/m, \a
31
+    abs             \b, \c\()/m, \b
32
+.endm
33
+
34
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
35
+    ABS2_SVE        \z0, \z1, p0
36
+    ABS2_SVE        \z2, \z3, p0
37
+    ABS2_SVE        \z4, \z5, p0
38
+    ABS2_SVE        \z6, \z7, p0
39
+.endm
40
+
41
x265_3.5.tar.gz/source/common/aarch64/asm.S -> x265_3.6.tar.gz/source/common/aarch64/asm.S Changed
173
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -21,34 +22,74 @@
12
  * For more information, contact us at license @ x265.com.
13
  *****************************************************************************/
14
 
15
+#ifndef ASM_S_  // #include guards
16
+#define ASM_S_
17
+
18
 .arch           armv8-a
19
 
20
+#define PFX3(prefix, name) prefix ## _ ## name
21
+#define PFX2(prefix, name) PFX3(prefix, name)
22
+#define PFX(name)          PFX2(X265_NS, name)
23
+
24
+#ifdef __APPLE__
25
+#define PREFIX 1
26
+#endif
27
+
28
 #ifdef PREFIX
29
 #define EXTERN_ASM _
30
+#define HAVE_AS_FUNC 0
31
+#elif defined __clang__
32
+#define EXTERN_ASM
33
+#define HAVE_AS_FUNC 0
34
+#define PREFIX 1
35
 #else
36
 #define EXTERN_ASM
37
+#define HAVE_AS_FUNC 1
38
 #endif
39
 
40
 #ifdef __ELF__
41
 #define ELF
42
 #else
43
+#ifdef PREFIX
44
+#define ELF #
45
+#else
46
 #define ELF @
47
 #endif
48
-
49
-#define HAVE_AS_FUNC 1
50
+#endif
51
 
52
 #if HAVE_AS_FUNC
53
 #define FUNC
54
 #else
55
+#ifdef PREFIX
56
+#define FUNC #
57
+#else
58
 #define FUNC @
59
 #endif
60
+#endif
61
+
62
+#define GLUE(a, b) a ## b
63
+#define JOIN(a, b) GLUE(a, b)
64
+
65
+#define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
66
+
67
+#ifdef __APPLE__
68
+.macro endfunc
69
+ELF .size \name, . - \name
70
+FUNC .endfunc
71
+.endm
72
+#endif
73
 
74
 .macro function name, export=1
75
+#ifdef __APPLE__
76
+    .global \name
77
+    endfunc
78
+#else
79
     .macro endfunc
80
 ELF     .size   \name, . - \name
81
 FUNC    .endfunc
82
         .purgem endfunc
83
     .endm
84
+#endif
85
         .align  2
86
 .if \export == 1
87
         .global EXTERN_ASM\name
88
@@ -64,6 +105,83 @@
89
 .endif
90
 .endm
91
 
92
+.macro  const   name, align=2
93
+    .macro endconst
94
+ELF     .size   \name, . - \name
95
+        .purgem endconst
96
+    .endm
97
+#ifdef __MACH__
98
+    .const_data
99
+#else
100
+    .section .rodata
101
+#endif
102
+    .align          \align
103
+\name:
104
+.endm
105
+
106
+.macro  movrel rd, val, offset=0
107
+#if defined(__APPLE__)
108
+  .if \offset < 0
109
+        adrp            \rd, \val@PAGE
110
+        add             \rd, \rd, \val@PAGEOFF
111
+        sub             \rd, \rd, -(\offset)
112
+  .else
113
+        adrp            \rd, \val+(\offset)@PAGE
114
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
115
+  .endif
116
+#elif defined(PIC) && defined(_WIN32)
117
+  .if \offset < 0
118
+        adrp            \rd, \val
119
+        add             \rd, \rd, :lo12:\val
120
+        sub             \rd, \rd, -(\offset)
121
+  .else
122
+        adrp            \rd, \val+(\offset)
123
+        add             \rd, \rd, :lo12:\val+(\offset)
124
+  .endif
125
+#else
126
+        adrp            \rd, \val+(\offset)
127
+        add             \rd, \rd, :lo12:\val+(\offset)
128
+#endif
129
+.endm
130
 
131
 #define FENC_STRIDE 64
132
 #define FDEC_STRIDE 32
133
+
134
+.macro SUMSUB_AB sum, diff, a, b
135
+    add             \sum,  \a, \b
136
+    sub             \diff, \a, \b
137
+.endm
138
+
139
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
140
+    SUMSUB_AB       \s1, \d1, \a, \b
141
+    SUMSUB_AB       \s2, \d2, \c, \d
142
+.endm
143
+
144
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
145
+    SUMSUB_ABCD     \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
146
+    SUMSUB_ABCD     \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
147
+.endm
148
+
149
+.macro ABS2 a b
150
+    abs             \a, \a
151
+    abs             \b, \b
152
+.endm
153
+
154
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
155
+    ABS2            \v0, \v1
156
+    ABS2            \v2, \v3
157
+    ABS2            \v4, \v5
158
+    ABS2            \v6, \v7
159
+.endm
160
+
161
+.macro vtrn t1, t2, s1, s2
162
+    trn1            \t1, \s1, \s2
163
+    trn2            \t2, \s1, \s2
164
+.endm
165
+
166
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
167
+    vtrn            \t1, \t2, \s1, \s2
168
+    vtrn            \t3, \t4, \s3, \s4
169
+.endm
170
+
171
+#endif
172
\ No newline at end of file
173
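The PFX macros added to asm.S above mangle every assembly entry point with the build's namespace so the C++ side can link against it; assuming X265_NS is x265 (the usual single-library default), PFX(pixel_satd_4x4_sve) becomes the symbol x265_pixel_satd_4x4_sve. A preprocessor-only illustration:

#include <cstdio>

// Same token-pasting scheme as asm.S; X265_NS = x265 is an assumption here.
#define X265_NS x265
#define PFX3(prefix, name) prefix ## _ ## name
#define PFX2(prefix, name) PFX3(prefix, name)
#define PFX(name)          PFX2(X265_NS, name)

// Stringize the pasted token to show the final symbol name.
#define STR2(x) #x
#define STR(x)  STR2(x)

int main()
{
    std::puts(STR(PFX(pixel_satd_4x4_sve)));  // prints: x265_pixel_satd_4x4_sve
    return 0;
}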
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-common.S Added
56
 
1
@@ -0,0 +1,54 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
33
+.macro cpy1Dto2D_shr_start
34
+    add             x2, x2, x2
35
+    dup             v0.8h, w3
36
+    cmeq            v1.8h, v1.8h, v1.8h
37
+    sshl            v1.8h, v1.8h, v0.8h
38
+    sri             v1.8h, v1.8h, #1
39
+    neg             v0.8h, v0.8h
40
+.endm
41
+
42
+.macro cpy2Dto1D_shr_start
43
+    add             x2, x2, x2
44
+    dup             v0.8h, w3
45
+    cmeq            v1.8h, v1.8h, v1.8h
46
+    sshl            v1.8h, v1.8h, v0.8h
47
+    sri             v1.8h, v1.8h, #1
48
+    neg             v0.8h, v0.8h
49
+.endm
50
+
51
+const xtn_xtn2_table, align=4
52
+.byte    0, 2, 4, 6, 8, 10, 12, 14
53
+.byte    16, 18, 20, 22, 24, 26, 28, 30
54
+endconst
55
+
56
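The two *_shr_start macros above build the constants for a rounded right shift without touching memory: cmeq fills v1 with -1, the sshl/sri pair turns that into -(1 << (shift - 1)), and neg makes the shift count negative so a later sshl acts as an arithmetic right shift. Per coefficient, the copy loops that use these macros therefore compute the value sketched below (a scalar restatement, assuming shift > 0):

#include <cstdint>

// Scalar equivalent of the NEON shr copy loops: subtracting the negative
// rounding constant adds 1 << (shift - 1) before the arithmetic shift.
static inline int16_t copy_shr_one(int16_t coeff, int shift)
{
    int16_t neg_round = (int16_t)-(1 << (shift - 1));   // what v1 holds
    return (int16_t)((coeff - neg_round) >> shift);     // sub, then sshl by -shift
}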
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S Added
1418
 
1
@@ -0,0 +1,1416 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "blockcopy8-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
41
+ *
42
+ * r0   - a
43
+ * r1   - stridea
44
+ * r2   - b
45
+ * r3   - strideb */
46
+
47
+function PFX(blockcopy_sp_4x4_sve)
48
+    ptrue           p0.h, vl4
49
+.rept 2
50
+    ld1h            {z0.h}, p0/z, [x2]
51
+    add             x2, x2, x3, lsl #1
52
+    st1b            {z0.h}, p0, [x0]
53
+    add             x0, x0, x1
54
+    ld1h            {z1.h}, p0/z, [x2]
55
+    add             x2, x2, x3, lsl #1
56
+    st1b            {z1.h}, p0, [x0]
57
+    add             x0, x0, x1
58
+.endr
59
+    ret
60
+endfunc
61
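As the register comment above documents, blockcopy_sp copies a block of int16_t coefficients into a pixel plane; the ld1h/st1b pairs perform the narrowing store. A scalar sketch of the operation, assuming 8-bit pixels and illustrative bx/by size parameters:

#include <cstdint>

// Narrowing copy: each int16_t source value is stored as one byte,
// matching the ld1h (load halfwords) / st1b (store low bytes) pairs.
static void blockcopy_sp_c(uint8_t* a, intptr_t stridea,
                           const int16_t* b, intptr_t strideb,
                           int bx, int by)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (uint8_t)b[x];
        a += stridea;
        b += strideb;
    }
}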
+
62
+function PFX(blockcopy_sp_8x8_sve)
63
+    ptrue           p0.h, vl8
64
+.rept 4
65
+    ld1h            {z0.h}, p0/z, [x2]
66
+    add             x2, x2, x3, lsl #1
67
+    st1b            {z0.h}, p0, [x0]
68
+    add            x0, x0, x1
69
+    ld1h            {z1.h}, p0/z, [x2]
70
+    add             x2, x2, x3, lsl #1
71
+    st1b            {z1.h}, p0, [x0]
72
+    add            x0, x0, x1
73
+.endr
74
+    ret
75
+endfunc
76
+
77
+function PFX(blockcopy_sp_16x16_sve)
78
+    rdvl            x9, #1
79
+    cmp             x9, #16
80
+    bgt             .vl_gt_16_blockcopy_sp_16_16
81
+    lsl             x3, x3, #1
82
+    movrel          x11, xtn_xtn2_table
83
+    ld1             {v31.16b}, x11
84
+.rept 8
85
+    ld1             {v0.8h-v1.8h}, x2, x3
86
+    ld1             {v2.8h-v3.8h}, x2, x3
87
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
88
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
89
+    st1             {v0.16b}, x0, x1
90
+    st1             {v1.16b}, x0, x1
91
+.endr
92
+    ret
93
+.vl_gt_16_blockcopy_sp_16_16:
94
+    ptrue           p0.h, vl16
95
+.rept 8
96
+    ld1h            {z0.h}, p0/z, x2
97
+    st1b            {z0.h}, p0, x0
98
+    add             x2, x2, x3, lsl #1
99
+    add             x0, x0, x1
100
+    ld1h            {z1.h}, p0/z, x2
101
+    st1b            {z1.h}, p0, x0
102
+    add             x2, x2, x3, lsl #1
103
+    add             x0, x0, x1
104
+.endr
105
+    ret
106
+endfunc
107
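blockcopy_sp_16x16_sve and the larger routines that follow open with rdvl x9, #1 and a compare against 16: the SVE vector length in bytes selects either the 128-bit path or a wider predicated path. The same quantity is available to C/C++ through the ACLE intrinsic svcntb(), as in this sketch (assumes a compiler with SVE support, e.g. -march=armv8-a+sve):

#include <arm_sve.h>
#include <cstdint>

// Pick a code path by the hardware vector length, mirroring the
// "rdvl x9, #1; cmp x9, #16; bgt ..." prologue of the SVE routines.
static int blockcopy_path_for_vl(void)
{
    uint64_t vl_bytes = svcntb();   // SVE vector length in bytes (rdvl #1)
    if (vl_bytes > 48) return 3;    // 512-bit or wider vectors: widest path
    if (vl_bytes > 16) return 2;    // 256- or 384-bit vectors
    return 1;                       // 128-bit vectors: NEON-sized path
}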
+
108
+function PFX(blockcopy_sp_32x32_sve)
109
+    mov             w12, #4
110
+    rdvl            x9, #1
111
+    cmp             x9, #16
112
+    bgt             .vl_gt_16_blockcopy_sp_32_32
113
+    lsl             x3, x3, #1
114
+    movrel          x11, xtn_xtn2_table
115
+    ld1             {v31.16b}, x11
116
+.loop_csp32_sve:
117
+    sub             w12, w12, #1
118
+.rept 4
119
+    ld1             {v0.8h-v3.8h}, x2, x3
120
+    ld1             {v4.8h-v7.8h}, x2, x3
121
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
122
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
123
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
124
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
125
+    st1             {v0.16b-v1.16b}, x0, x1
126
+    st1             {v2.16b-v3.16b}, x0, x1
127
+.endr
128
+    cbnz            w12, .loop_csp32_sve
129
+    ret
130
+.vl_gt_16_blockcopy_sp_32_32:
131
+    cmp             x9, #48
132
+    bgt             .vl_gt_48_blockcopy_sp_32_32
133
+    ptrue           p0.h, vl16
134
+.vl_gt_16_loop_csp32_sve:
135
+    sub             w12, w12, #1
136
+.rept 4
137
+    ld1h            {z0.h}, p0/z, x2
138
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
139
+    st1b            {z0.h}, p0, x0
140
+    st1b            {z1.h}, p0, x0, #1, mul vl
141
+    add             x2, x2, x3, lsl #1
142
+    add             x0, x0, x1
143
+    ld1h            {z2.h}, p0/z, x2
144
+    ld1h            {z3.h}, p0/z, x2, #1, mul vl
145
+    st1b            {z2.h}, p0, x0
146
+    st1b            {z3.h}, p0, x0, #1, mul vl
147
+    add             x2, x2, x3, lsl #1
148
+    add             x0, x0, x1
149
+.endr
150
+    cbnz            w12, .vl_gt_16_loop_csp32_sve
151
+    ret
152
+.vl_gt_48_blockcopy_sp_32_32:
153
+    ptrue           p0.h, vl32
154
+.vl_gt_48_loop_csp32_sve:
155
+    sub             w12, w12, #1
156
+.rept 4
157
+    ld1h            {z0.h}, p0/z, x2
158
+    st1b            {z0.h}, p0, x0
159
+    add             x2, x2, x3, lsl #1
160
+    add             x0, x0, x1
161
+    ld1h            {z1.h}, p0/z, x2
162
+    st1b            {z1.h}, p0, x0
163
+    add             x2, x2, x3, lsl #1
164
+    add             x0, x0, x1
165
+.endr
166
+    cbnz            w12, .vl_gt_48_loop_csp32_sve
167
+    ret
168
+endfunc
169
+
170
+function PFX(blockcopy_ps_16x16_sve)
171
+    rdvl            x9, #1
172
+    cmp             x9, #16
173
+    bgt             .vl_gt_16_blockcopy_ps_16_16
174
+    lsl             x1, x1, #1
175
+.rept 8
176
+    ld1             {v4.16b}, x2, x3
177
+    ld1             {v5.16b}, x2, x3
178
+    uxtl            v0.8h, v4.8b
179
+    uxtl2           v1.8h, v4.16b
180
+    uxtl            v2.8h, v5.8b
181
+    uxtl2           v3.8h, v5.16b
182
+    st1             {v0.8h-v1.8h}, x0, x1
183
+    st1             {v2.8h-v3.8h}, x0, x1
184
+.endr
185
+    ret
186
+.vl_gt_16_blockcopy_ps_16_16:
187
+    ptrue           p0.b, vl32
188
+.rept 16
189
+    ld1b            {z1.h}, p0/z, x2
190
+    st1h            {z1.h}, p0, x0
191
+    add             x0, x0, x1, lsl #1
192
+    add             x2, x2, x3
193
+.endr
194
+    ret
195
+endfunc
196
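blockcopy_ps is the inverse of blockcopy_sp: each pixel is widened to an int16_t, which is what the uxtl/uxtl2 instructions and the ld1b-into-halfword-lanes loads above express. A scalar sketch, again assuming 8-bit pixels and illustrative bx/by parameters:

#include <cstdint>

// Widening copy: each 8-bit pixel becomes one int16_t destination value.
static void blockcopy_ps_c(int16_t* a, intptr_t stridea,
                           const uint8_t* b, intptr_t strideb,
                           int bx, int by)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)b[x];
        a += stridea;
        b += strideb;
    }
}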
+
197
+function PFX(blockcopy_ps_32x32_sve)
198
+    rdvl            x9, #1
199
+    cmp             x9, #16
200
+    bgt             .vl_gt_16_blockcopy_ps_32_32
201
+    lsl             x1, x1, #1
202
+    mov             w12, #4
203
+.loop_cps32_sve:
204
+    sub             w12, w12, #1
205
+.rept 4
206
+    ld1             {v16.16b-v17.16b}, x2, x3
207
+    ld1             {v18.16b-v19.16b}, x2, x3
208
+    uxtl            v0.8h, v16.8b
209
+    uxtl2           v1.8h, v16.16b
210
+    uxtl            v2.8h, v17.8b
211
+    uxtl2           v3.8h, v17.16b
212
+    uxtl            v4.8h, v18.8b
213
+    uxtl2           v5.8h, v18.16b
214
+    uxtl            v6.8h, v19.8b
215
+    uxtl2           v7.8h, v19.16b
216
+    st1             {v0.8h-v3.8h}, x0, x1
217
+    st1             {v4.8h-v7.8h}, x0, x1
218
+.endr
219
+    cbnz            w12, .loop_cps32_sve
220
+    ret
221
+.vl_gt_16_blockcopy_ps_32_32:
222
+    cmp             x9, #48
223
+    bgt             .vl_gt_48_blockcopy_ps_32_32
224
+    ptrue           p0.b, vl32
225
+.rept 32
226
+    ld1b            {z2.h}, p0/z, x2
227
+    ld1b            {z3.h}, p0/z, x2, #1, mul vl
228
+    st1h            {z2.h}, p0, x0
229
+    st1h            {z3.h}, p0, x0, #1, mul vl
230
+    add             x0, x0, x1, lsl #1
231
+    add             x2, x2, x3
232
+.endr
233
+    ret
234
+.vl_gt_48_blockcopy_ps_32_32:
235
+    ptrue           p0.b, vl64
236
+.rept 32
237
+    ld1b            {z2.h}, p0/z, x2
238
+    st1h            {z2.h}, p0, x0
239
+    add             x0, x0, x1, lsl #1
240
+    add             x2, x2, x3
241
+.endr
242
+    ret
243
+endfunc
244
+
245
+function PFX(blockcopy_ps_64x64_sve)
246
+    rdvl            x9, #1
247
+    cmp             x9, #16
248
+    bgt             .vl_gt_16_blockcopy_ps_64_64
249
+    lsl             x1, x1, #1
250
+    sub             x1, x1, #64
251
+    mov             w12, #16
252
+.loop_cps64_sve:
253
+    sub             w12, w12, #1
254
+.rept 4
255
+    ld1             {v16.16b-v19.16b}, x2, x3
256
+    uxtl            v0.8h, v16.8b
257
+    uxtl2           v1.8h, v16.16b
258
+    uxtl            v2.8h, v17.8b
259
+    uxtl2           v3.8h, v17.16b
260
+    uxtl            v4.8h, v18.8b
261
+    uxtl2           v5.8h, v18.16b
262
+    uxtl            v6.8h, v19.8b
263
+    uxtl2           v7.8h, v19.16b
264
+    st1             {v0.8h-v3.8h}, x0, #64
265
+    st1             {v4.8h-v7.8h}, x0, x1
266
+.endr
267
+    cbnz            w12, .loop_cps64_sve
268
+    ret
269
+.vl_gt_16_blockcopy_ps_64_64:
270
+    cmp             x9, #48
271
+    bgt             .vl_gt_48_blockcopy_ps_64_64
272
+    ptrue           p0.b, vl32
273
+.rept 64
274
+    ld1b            {z4.h}, p0/z, x2
275
+    ld1b            {z5.h}, p0/z, x2, #1, mul vl
276
+    ld1b            {z6.h}, p0/z, x2, #2, mul vl
277
+    ld1b            {z7.h}, p0/z, x2, #3, mul vl
278
+    st1h            {z4.h}, p0, x0
279
+    st1h            {z5.h}, p0, x0, #1, mul vl
280
+    st1h            {z6.h}, p0, x0, #2, mul vl
281
+    st1h            {z7.h}, p0, x0, #3, mul vl
282
+    add             x0, x0, x1, lsl #1
283
+    add             x2, x2, x3
284
+.endr
285
+    ret
286
+.vl_gt_48_blockcopy_ps_64_64:
287
+    cmp             x9, #112
288
+    bgt             .vl_gt_112_blockcopy_ps_64_64
289
+    ptrue           p0.b, vl64
290
+.rept 64
291
+    ld1b            {z4.h}, p0/z, x2
292
+    ld1b            {z5.h}, p0/z, x2, #1, mul vl
293
+    st1h            {z4.h}, p0, x0
294
+    st1h            {z5.h}, p0, x0, #1, mul vl
295
+    add             x0, x0, x1, lsl #1
296
+    add             x2, x2, x3
297
+.endr
298
+    ret
299
+.vl_gt_112_blockcopy_ps_64_64:
300
+    ptrue           p0.b, vl128
301
+.rept 64
302
+    ld1b            {z4.h}, p0/z, x2
303
+    st1h            {z4.h}, p0, x0
304
+    add             x0, x0, x1, lsl #1
305
+    add             x2, x2, x3
306
+.endr
307
+    ret
308
+
309
+endfunc
310
+
311
+function PFX(blockcopy_ss_16x16_sve)
312
+    rdvl            x9, #1
313
+    cmp             x9, #16
314
+    bgt             .vl_gt_16_blockcopy_ss_16_16
315
+    lsl             x1, x1, #1
316
+    lsl             x3, x3, #1
317
+.rept 8
318
+    ld1             {v0.8h-v1.8h}, x2, x3
319
+    ld1             {v2.8h-v3.8h}, x2, x3
320
+    st1             {v0.8h-v1.8h}, x0, x1
321
+    st1             {v2.8h-v3.8h}, x0, x1
322
+.endr
323
+    ret
324
+.vl_gt_16_blockcopy_ss_16_16:
325
+    ptrue           p0.h, vl16
326
+.rept 16
327
+    ld1h            {z0.h}, p0/z, x2
328
+    st1h            {z0.h}, p0, x0
329
+    add             x2, x2, x3, lsl #1
330
+    add             x0, x0, x1, lsl #1
331
+.endr
332
+    ret
333
+endfunc
334
+
335
+function PFX(blockcopy_ss_32x32_sve)
336
+    rdvl            x9, #1
337
+    cmp             x9, #16
338
+    bgt             .vl_gt_16_blockcopy_ss_32_32
339
+    lsl             x1, x1, #1
340
+    lsl             x3, x3, #1
341
+    mov             w12, #4
342
+.loop_css32_sve:
343
+    sub             w12, w12, #1
344
+.rept 8
345
+    ld1             {v0.8h-v3.8h}, x2, x3
346
+    st1             {v0.8h-v3.8h}, x0, x1
347
+.endr
348
+    cbnz            w12, .loop_css32_sve
349
+    ret
350
+.vl_gt_16_blockcopy_ss_32_32:
351
+    cmp             x9, #48
352
+    bgt             .vl_gt_48_blockcopy_ss_32_32
353
+    ptrue           p0.h, vl16
354
+.rept 32
355
+    ld1h            {z0.h}, p0/z, x2
356
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
357
+    st1h            {z0.h}, p0, x0
358
+    st1h            {z1.h}, p0, x0, #1, mul vl
359
+    add             x2, x2, x3, lsl #1
360
+    add             x0, x0, x1, lsl #1
361
+.endr
362
+    ret
363
+.vl_gt_48_blockcopy_ss_32_32:
364
+    ptrue           p0.h, vl32
365
+.rept 32
366
+    ld1h            {z0.h}, p0/z, x2
367
+    st1h            {z0.h}, p0, x0
368
+    add             x2, x2, x3, lsl #1
369
+    add             x0, x0, x1, lsl #1
370
+.endr
371
+    ret
372
+endfunc
373
+
374
+function PFX(blockcopy_ss_64x64_sve)
375
+    rdvl            x9, #1
376
+    cmp             x9, #16
377
+    bgt             .vl_gt_16_blockcopy_ss_64_64
378
+    lsl             x1, x1, #1
379
+    sub             x1, x1, #64
380
+    lsl             x3, x3, #1
381
+    sub             x3, x3, #64
382
+    mov             w12, #8
383
+.loop_css64_sve:
384
+    sub             w12, w12, #1
385
+.rept 8
386
+    ld1             {v0.8h-v3.8h}, x2, #64
387
+    ld1             {v4.8h-v7.8h}, x2, x3
388
+    st1             {v0.8h-v3.8h}, x0, #64
389
+    st1             {v4.8h-v7.8h}, x0, x1
390
+.endr
391
+    cbnz            w12, .loop_css64_sve
392
+    ret
393
+.vl_gt_16_blockcopy_ss_64_64:
394
+    cmp             x9, #48
395
+    bgt             .vl_gt_48_blockcopy_ss_64_64
396
+    mov             w12, #8
397
+    ptrue           p0.b, vl32
398
+.vl_gt_16_loop_css64_sve:
399
+    sub             w12, w12, #1
400
+.rept 8
401
+    ld1b            {z0.b}, p0/z, x2
402
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
403
+    ld1b            {z2.b}, p0/z, x2, #2, mul vl
404
+    ld1b            {z3.b}, p0/z, x2, #3, mul vl
405
+    st1b            {z0.b}, p0, x0
406
+    st1b            {z1.b}, p0, x0, #1, mul vl
407
+    st1b            {z2.b}, p0, x0, #2, mul vl
408
+    st1b            {z3.b}, p0, x0, #3, mul vl
409
+    add             x2, x2, x3, lsl #1
410
+    add             x0, x0, x1, lsl #1
411
+.endr
412
+    cbnz            w12, .vl_gt_16_loop_css64_sve
413
+    ret
414
+.vl_gt_48_blockcopy_ss_64_64:
415
+    cmp             x9, #112
416
+    bgt             .vl_gt_112_blockcopy_ss_64_64
417
+    mov             w12, #8
418
+    ptrue           p0.b, vl64
419
+.vl_gt_48_loop_css64_sve:
420
+    sub             w12, w12, #1
421
+.rept 8
422
+    ld1b            {z0.b}, p0/z, x2
423
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
424
+    st1b            {z0.b}, p0, x0
425
+    st1b            {z1.b}, p0, x0, #1, mul vl
426
+    add             x2, x2, x3, lsl #1
427
+    add             x0, x0, x1, lsl #1
428
+.endr
429
+    cbnz            w12, .vl_gt_48_loop_css64_sve
430
+    ret
431
+.vl_gt_112_blockcopy_ss_64_64:
432
+    mov             w12, #8
433
+    ptrue           p0.b, vl128
434
+.vl_gt_112_loop_css64_sve:
435
+    sub             w12, w12, #1
436
+.rept 8
437
+    ld1b            {z0.b}, p0/z, x2
438
+    st1b            {z0.b}, p0, x0
439
+    add             x2, x2, x3, lsl #1
440
+    add             x0, x0, x1, lsl #1
441
+.endr
442
+    cbnz            w12, .vl_gt_112_loop_css64_sve
443
+    ret
444
+endfunc
445
+
446
+/******** Chroma blockcopy********/
447
+function PFX(blockcopy_ss_16x32_sve)
448
+    rdvl            x9, #1
449
+    cmp             x9, #16
450
+    bgt             .vl_gt_16_blockcopy_ss_16_32
451
+    lsl             x1, x1, #1
452
+    lsl             x3, x3, #1
453
+.rept 16
454
+    ld1             {v0.8h-v1.8h}, x2, x3
455
+    ld1             {v2.8h-v3.8h}, x2, x3
456
+    st1             {v0.8h-v1.8h}, x0, x1
457
+    st1             {v2.8h-v3.8h}, x0, x1
458
+.endr
459
+    ret
460
+.vl_gt_16_blockcopy_ss_16_32:
461
+    ptrue           p0.h, vl16
462
+.rept 32
463
+    ld1h            {z0.h}, p0/z, x2
464
+    st1h            {z0.h}, p0, x0
465
+    add             x2, x2, x3, lsl #1
466
+    add             x0, x0, x1, lsl #1
467
+.endr
468
+    ret
469
+endfunc
470
+
471
+function PFX(blockcopy_ss_32x64_sve)
472
+    rdvl            x9, #1
473
+    cmp             x9, #16
474
+    bgt             .vl_gt_16_blockcopy_ss_32_64
475
+    lsl             x1, x1, #1
476
+    lsl             x3, x3, #1
477
+    mov             w12, #8
478
+.loop_css32x64_sve:
479
+    sub             w12, w12, #1
480
+.rept 8
481
+    ld1             {v0.8h-v3.8h}, x2, x3
482
+    st1             {v0.8h-v3.8h}, x0, x1
483
+.endr
484
+    cbnz            w12, .loop_css32x64_sve
485
+    ret
486
+.vl_gt_16_blockcopy_ss_32_64:
487
+    cmp             x9, #48
488
+    bgt             .vl_gt_48_blockcopy_ss_32_64
489
+    mov             w12, #8
490
+    ptrue           p0.b, vl32
491
+.vl_gt_32_loop_css32x64_sve:
492
+    sub             w12, w12, #1
493
+.rept 8
494
+    ld1b            {z0.b}, p0/z, x2
495
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
496
+    st1b            {z0.b}, p0, x0
497
+    st1b            {z1.b}, p0, x0, #1, mul vl
498
+    add             x2, x2, x3, lsl #1
499
+    add             x0, x0, x1, lsl #1
500
+.endr
501
+    cbnz            w12, .vl_gt_32_loop_css32x64_sve
502
+    ret
503
+.vl_gt_48_blockcopy_ss_32_64:
504
+    mov             w12, #8
505
+    ptrue           p0.b, vl64
506
+.vl_gt_48_loop_css32x64_sve:
507
+    sub             w12, w12, #1
508
+.rept 8
509
+    ld1b            {z0.b}, p0/z, x2
510
+    st1b            {z0.b}, p0, x0
511
+    add             x2, x2, x3, lsl #1
512
+    add             x0, x0, x1, lsl #1
513
+.endr
514
+    cbnz            w12, .vl_gt_48_loop_css32x64_sve
515
+    ret
516
+endfunc
517
+
518
+// chroma blockcopy_ps
519
+function PFX(blockcopy_ps_4x8_sve)
520
+    ptrue           p0.h, vl4
521
+.rept 8
522
+    ld1b            {z0.h}, p0/z, x2
523
+    st1h            {z0.h}, p0, x0
524
+    add             x0, x0, x1, lsl #1
525
+    add             x2, x2, x3
526
+.endr
527
+    ret
528
+endfunc
529
+
530
+function PFX(blockcopy_ps_8x16_sve)
531
+    ptrue           p0.h, vl8
532
+.rept 16
533
+    ld1b            {z0.h}, p0/z, x2
534
+    st1h            {z0.h}, p0, x0
535
+    add             x0, x0, x1, lsl #1
536
+    add             x2, x2, x3
537
+.endr
538
+    ret
539
+endfunc
540
+
541
+function PFX(blockcopy_ps_16x32_sve)
542
+    rdvl            x9, #1
543
+    cmp             x9, #16
544
+    bgt             .vl_gt_16_blockcopy_ps_16_32
545
+    lsl             x1, x1, #1
546
+.rept 16
547
+    ld1             {v4.16b}, x2, x3
548
+    ld1             {v5.16b}, x2, x3
549
+    uxtl            v0.8h, v4.8b
550
+    uxtl2           v1.8h, v4.16b
551
+    uxtl            v2.8h, v5.8b
552
+    uxtl2           v3.8h, v5.16b
553
+    st1             {v0.8h-v1.8h}, x0, x1
554
+    st1             {v2.8h-v3.8h}, x0, x1
555
+.endr
556
+    ret
557
+.vl_gt_16_blockcopy_ps_16_32:
558
+    ptrue           p0.b, vl32
559
+.rept 32
560
+    ld1b            {z1.h}, p0/z, x2
561
+    st1h            {z1.h}, p0, x0
562
+    add             x0, x0, x1, lsl #1
563
+    add             x2, x2, x3
564
+.endr
565
+    ret
566
+endfunc
567
+
568
+function PFX(blockcopy_ps_32x64_sve)
569
+    rdvl            x9, #1
570
+    cmp             x9, #16
571
+    bgt             .vl_gt_16_blockcopy_ps_32_64
572
+    lsl             x1, x1, #1
573
+    mov             w12, #8
574
+.loop_cps32x64_sve:
575
+    sub             w12, w12, #1
576
+.rept 4
577
+    ld1             {v16.16b-v17.16b}, x2, x3
578
+    ld1             {v18.16b-v19.16b}, x2, x3
579
+    uxtl            v0.8h, v16.8b
580
+    uxtl2           v1.8h, v16.16b
581
+    uxtl            v2.8h, v17.8b
582
+    uxtl2           v3.8h, v17.16b
583
+    uxtl            v4.8h, v18.8b
584
+    uxtl2           v5.8h, v18.16b
585
+    uxtl            v6.8h, v19.8b
586
+    uxtl2           v7.8h, v19.16b
587
+    st1             {v0.8h-v3.8h}, x0, x1
588
+    st1             {v4.8h-v7.8h}, x0, x1
589
+.endr
590
+    cbnz            w12, .loop_cps32x64_sve
591
+    ret
592
+.vl_gt_16_blockcopy_ps_32_64:
593
+    cmp             x9, #48
594
+    bgt             .vl_gt_48_blockcopy_ps_32_64
595
+    ptrue           p0.b, vl32
596
+.rept 64
597
+    ld1b            {z2.h}, p0/z, x2
598
+    ld1b            {z3.h}, p0/z, x2, #1, mul vl
599
+    st1h            {z2.h}, p0, x0
600
+    st1h            {z3.h}, p0, x0, #1, mul vl
601
+    add             x0, x0, x1, lsl #1
602
+    add             x2, x2, x3
603
+.endr
604
+    ret
605
+.vl_gt_48_blockcopy_ps_32_64:
606
+    ptrue           p0.b, vl64
607
+.rept 64
608
+    ld1b            {z2.h}, p0/z, x2
609
+    st1h            {z2.h}, p0, x0
610
+    add             x0, x0, x1, lsl #1
611
+    add             x2, x2, x3
612
+.endr
613
+    ret
614
+endfunc
615
+
616
+// chroma blockcopy_sp
617
+function PFX(blockcopy_sp_4x8_sve)
618
+    ptrue           p0.h, vl4
619
+.rept 8
620
+    ld1h            {z0.h}, p0/z, x2
621
+    st1b            {z0.h}, p0, x0
622
+    add             x2, x2, x3, lsl #1
623
+    add             x0, x0, x1
624
+.endr
625
+    ret
626
+endfunc
627
+
628
+function PFX(blockcopy_sp_8x16_sve)
629
+    ptrue           p0.h, vl8
630
+.rept 16
631
+    ld1h            {z0.h}, p0/z, x2
632
+    st1b            {z0.h}, p0, x0
633
+    add             x2, x2, x3, lsl #1
634
+    add             x0, x0, x1
635
+.endr
636
+    ret
637
+endfunc
638
+
639
+function PFX(blockcopy_sp_16x32_sve)
640
+    rdvl            x9, #1
641
+    cmp             x9, #16
642
+    bgt             .vl_gt_16_blockcopy_sp_16_32
643
+    ptrue           p0.h, vl8
644
+.rept 32
645
+    ld1h            {z0.h}, p0/z, x2
646
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
647
+    st1b            {z0.h}, p0, x0
648
+    st1b            {z1.h}, p0, x0, #1, mul vl
649
+    add             x2, x2, x3, lsl #1
650
+    add             x0, x0, x1
651
+.endr
652
+    ret
653
+.vl_gt_16_blockcopy_sp_16_32:
654
+    ptrue           p0.h, vl16
655
+.rept 32
656
+    ld1h            {z0.h}, p0/z, x2
657
+    st1b            {z0.h}, p0, x0
658
+    add             x2, x2, x3, lsl #1
659
+    add             x0, x0, x1
660
+.endr
661
+    ret
662
+endfunc
663
+
664
+function PFX(blockcopy_sp_32x64_sve)
665
+    rdvl            x9, #1
666
+    cmp             x9, #16
667
+    bgt             .vl_gt_16_blockcopy_sp_32_64
668
+    ptrue           p0.h, vl8
669
+.rept 64
670
+    ld1h            {z0.h}, p0/z, x2
671
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
672
+    ld1h            {z2.h}, p0/z, x2, #2, mul vl
673
+    ld1h            {z3.h}, p0/z, x2, #3, mul vl
674
+    st1b            {z0.h}, p0, x0
675
+    st1b            {z1.h}, p0, x0, #1, mul vl
676
+    st1b            {z2.h}, p0, x0, #2, mul vl
677
+    st1b            {z3.h}, p0, x0, #3, mul vl
678
+    add             x2, x2, x3, lsl #1
679
+    add             x0, x0, x1
680
+.endr
681
+    ret
682
+.vl_gt_16_blockcopy_sp_32_64:
683
+    cmp             x9, #48
684
+    bgt             .vl_gt_48_blockcopy_sp_32_64
685
+    ptrue           p0.h, vl16
686
+.rept 64
687
+    ld1h            {z0.h}, p0/z, x2
688
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
689
+    st1b            {z0.h}, p0, x0
690
+    st1b            {z1.h}, p0, x0, #1, mul vl
691
+    add             x2, x2, x3, lsl #1
692
+    add             x0, x0, x1
693
+.endr
694
+    ret
695
+.vl_gt_48_blockcopy_sp_32_64:
696
+    ptrue           p0.h, vl32
697
+.rept 64
698
+    ld1h            {z0.h}, p0/z, x2
699
+    st1b            {z0.h}, p0, x0
700
+    add             x2, x2, x3, lsl #1
701
+    add             x0, x0, x1
702
+.endr
703
+    ret
704
+endfunc
705
+
706
+/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
707
+
708
+function PFX(blockcopy_pp_32x8_sve)
709
+    rdvl            x9, #1
710
+    cmp             x9, #16
711
+    bgt             .vl_gt_16_blockcopy_pp_32_8
712
+.rept 8
713
+    ld1             {v0.16b-v1.16b}, x2, x3
714
+    st1             {v0.16b-v1.16b}, x0, x1
715
+.endr
716
+    ret
717
+.vl_gt_16_blockcopy_pp_32_8:
718
+    ptrue           p0.b, vl32
719
+.rept 8
720
+    ld1b            {z0.b}, p0/z, x2
721
+    st1b            {z0.b}, p0, x0
722
+    add             x2, x2, x3
723
+    add             x0, x0, x1
724
+.endr
725
+    ret
726
+endfunc
727
+
728
+.macro blockcopy_pp_32xN_sve h
729
+function PFX(blockcopy_pp_32x\h\()_sve)
730
+    mov             w12, #\h / 8
731
+    rdvl            x9, #1
732
+    cmp             x9, #16
733
+    bgt             .vl_gt_16_blockcopy_pp_32xN_\h
734
+.loop_sve_32x\h\():
735
+    sub             w12, w12, #1
736
+.rept 8
737
+    ld1             {v0.16b-v1.16b}, x2, x3
738
+    st1             {v0.16b-v1.16b}, x0, x1
739
+.endr
740
+    cbnz            w12, .loop_sve_32x\h
741
+    ret
742
+.vl_gt_16_blockcopy_pp_32xN_\h:
743
+    ptrue           p0.b, vl32
744
+.L_gt_16_blockcopy_pp_32xN_\h:
745
+    sub             w12, w12, #1
746
+.rept 8
747
+    ld1b            {z0.b}, p0/z, x2
748
+    st1b            {z0.b}, p0, x0
749
+    add             x2, x2, x3
750
+    add             x0, x0, x1
751
+.endr
752
+    cbnz            w12, .L_gt_16_blockcopy_pp_32xN_\h
753
+    ret
754
+endfunc
755
+.endm
756
+
757
+blockcopy_pp_32xN_sve 16
758
+blockcopy_pp_32xN_sve 24
759
+blockcopy_pp_32xN_sve 32
760
+blockcopy_pp_32xN_sve 64
761
+blockcopy_pp_32xN_sve 48
762
+
763
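The blockcopy_pp_32xN_sve macro above stamps out one function per block height, folding the height into the outer loop count (\h / 8 iterations of eight rows each). Conceptually it plays the same role as a height-templated copy in C++, sketched here with hypothetical names:

#include <cstdint>
#include <cstring>

// Height-templated 32-pixel-wide copy: one instantiation per height, much
// like the assembler macro's expansions blockcopy_pp_32x16_sve ... 32x64_sve.
template <int H>
void blockcopy_pp_32xH(uint8_t* dst, intptr_t dstStride,
                       const uint8_t* src, intptr_t srcStride)
{
    for (int y = 0; y < H; y++)
    {
        std::memcpy(dst, src, 32);   // one 32-pixel row
        dst += dstStride;
        src += srcStride;
    }
}

// Usage corresponding to the instantiations above:
//   blockcopy_pp_32xH<16>(dst, dstride, src, sstride);
//   blockcopy_pp_32xH<64>(dst, dstride, src, sstride);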
+.macro blockcopy_pp_64xN_sve h
764
+function PFX(blockcopy_pp_64x\h\()_sve)
765
+    mov             w12, #\h / 4
766
+    rdvl            x9, #1
767
+    cmp             x9, #16
768
+    bgt             .vl_gt_16_blockcopy_pp_64xN_\h
769
+.loop_sve_64x\h\():
770
+    sub             w12, w12, #1
771
+.rept 4
772
+    ld1             {v0.16b-v3.16b}, x2, x3
773
+    st1             {v0.16b-v3.16b}, x0, x1
774
+.endr
775
+    cbnz            w12, .loop_sve_64x\h
776
+    ret
777
+.vl_gt_16_blockcopy_pp_64xN_\h:
778
+    cmp             x9, #48
779
+    bgt             .vl_gt_48_blockcopy_pp_64xN_\h
780
+    ptrue           p0.b, vl32
781
+.L_le_32_blockcopy_pp_64xN_\h:
782
+    sub             w12, w12, #1
783
+.rept 4
784
+    ld1b            {z0.b}, p0/z, x2
785
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
786
+    st1b            {z0.b}, p0, x0
787
+    st1b            {z1.b}, p0, x0, #1, mul vl
788
+    add             x2, x2, x3
789
+    add             x0, x0, x1
790
+.endr
791
+    cbnz            w12, .L_le_32_blockcopy_pp_64xN_\h
792
+    ret
793
+.vl_gt_48_blockcopy_pp_64xN_\h:
794
+    ptrue           p0.b, vl64
795
+.L_blockcopy_pp_64xN_\h:
796
+    sub             w12, w12, #1
797
+.rept 4
798
+    ld1b            {z0.b}, p0/z, x2
799
+    st1b            {z0.b}, p0, x0
800
+    add             x2, x2, x3
801
+    add             x0, x0, x1
802
+.endr
803
+    cbnz            w12, .L_blockcopy_pp_64xN_\h
804
+    ret
805
+endfunc
806
+.endm
807
+
808
+blockcopy_pp_64xN_sve 16
809
+blockcopy_pp_64xN_sve 32
810
+blockcopy_pp_64xN_sve 48
811
+blockcopy_pp_64xN_sve 64
812
+
813
+function PFX(blockfill_s_32x32_sve)
814
+    rdvl            x9, #1
815
+    cmp             x9, #16
816
+    bgt             .vl_gt_16_blockfill_s_32_32
817
+    dup             v0.8h, w2
818
+    mov             v1.16b, v0.16b
819
+    mov             v2.16b, v0.16b
820
+    mov             v3.16b, v0.16b
821
+    lsl             x1, x1, #1
822
+.rept 32
823
+    st1             {v0.8h-v3.8h}, x0, x1
824
+.endr
825
+    ret
826
+.vl_gt_16_blockfill_s_32_32:
827
+    cmp             x9, #48
828
+    bgt             .vl_gt_48_blockfill_s_32_32
829
+    dup             z0.h, w2
830
+    ptrue           p0.h, vl16
831
+.rept 32
832
+    st1h            {z0.h}, p0, x0
833
+    st1h            {z0.h}, p0, x0, #1, mul vl
834
+    add             x0, x0, x1, lsl #1
835
+.endr
836
+    ret
837
+.vl_gt_48_blockfill_s_32_32:
838
+    dup             z0.h, w2
839
+    ptrue           p0.h, vl32
840
+.rept 32
841
+    st1h            {z0.h}, p0, x0
842
+    add             x0, x0, x1, lsl #1
843
+.endr
844
+    ret
845
+endfunc
846
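blockfill_s_32x32 writes one 16-bit value across a strided 32x32 block; the three paths above differ only in how many lanes each store covers. A scalar sketch of what is computed (the stride is in int16_t units):

#include <cstdint>

// Fill every position of a 32x32 int16_t block with the same value.
static void blockfill_s_32x32_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < 32; y++)
    {
        for (int x = 0; x < 32; x++)
            dst[x] = val;
        dst += dstride;
    }
}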
+
847
+// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
848
+.macro cpy2Dto1D_shl_start_sve
849
+    add             x2, x2, x2
850
+    mov             z0.h, w3
851
+.endm
852
+
853
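cpy2Dto1D_shl gathers a strided 2-D coefficient block into a contiguous 1-D buffer while shifting each value left; the add x2, x2, x2 in the start macro converts the element stride to bytes. A scalar sketch, with the block size as an illustrative parameter (the real primitives are fixed-size 16/32/64 specializations):

#include <cstdint>

// Scalar equivalent of cpy2Dto1D_shl: read a strided 2-D coefficient block,
// shift each value left, and write it to a contiguous 1-D buffer.
static void cpy2Dto1D_shl_c(int16_t* dst, const int16_t* src,
                            intptr_t srcStride, int shift, int size)
{
    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
            dst[x] = (int16_t)(src[x] << shift);
        src += srcStride;
        dst += size;
    }
}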
+function PFX(cpy2Dto1D_shl_16x16_sve)
854
+    dup             z0.h, w3
855
+    rdvl            x9, #1
856
+    cmp             x9, #16
857
+    bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
858
+    cpy2Dto1D_shl_start_sve
859
+    mov             w12, #4
860
+.loop_cpy2Dto1D_shl_16_sve:
861
+    sub             w12, w12, #1
862
+.rept 4
863
+    ld1             {v2.16b-v3.16b}, x1, x2
864
+    sshl            v2.8h, v2.8h, v0.8h
865
+    sshl            v3.8h, v3.8h, v0.8h
866
+    st1             {v2.16b-v3.16b}, x0, #32
867
+.endr
868
+    cbnz            w12, .loop_cpy2Dto1D_shl_16_sve
869
+    ret
870
+.vl_gt_16_cpy2Dto1D_shl_16x16:
871
+    ptrue           p0.h, vl16
872
+.rept 16
873
+    ld1h            {z1.h}, p0/z, x1
874
+    lsl             z1.h, p0/m, z1.h, z0.h
875
+    st1h            {z1.h}, p0, x0
876
+    add             x1, x1, x2, lsl #1
877
+    add             x0, x0, #32
878
+.endr
879
+    ret
880
+endfunc
881
+
882
+function PFX(cpy2Dto1D_shl_32x32_sve)
883
+    dup             z0.h, w3
884
+    rdvl            x9, #1
885
+    cmp             x9, #16
886
+    bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
887
+    cpy2Dto1D_shl_start_sve
888
+    mov             w12, #16
889
+.loop_cpy2Dto1D_shl_32_sve:
890
+    sub             w12, w12, #1
891
+.rept 2
892
+    ld1             {v2.16b-v5.16b}, x1, x2
893
+    sshl            v2.8h, v2.8h, v0.8h
894
+    sshl            v3.8h, v3.8h, v0.8h
895
+    sshl            v4.8h, v4.8h, v0.8h
896
+    sshl            v5.8h, v5.8h, v0.8h
897
+    st1             {v2.16b-v5.16b}, x0, #64
898
+.endr
899
+    cbnz            w12, .loop_cpy2Dto1D_shl_32_sve
900
+    ret
901
+.vl_gt_16_cpy2Dto1D_shl_32x32:
902
+    cmp             x9, #48
903
+    bgt             .vl_gt_48_cpy2Dto1D_shl_32x32
904
+    ptrue           p0.h, vl16
905
+.rept 32
906
+    ld1h            {z1.h}, p0/z, x1
907
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
908
+    lsl             z1.h, p0/m, z1.h, z0.h
909
+    lsl             z2.h, p0/m, z2.h, z0.h
910
+    st1h            {z1.h}, p0, x0
911
+    st1h            {z2.h}, p0, x0, #1, mul vl
912
+    add             x1, x1, x2, lsl #1
913
+    add             x0, x0, #64
914
+.endr
915
+    ret
916
+.vl_gt_48_cpy2Dto1D_shl_32x32:
917
+    ptrue           p0.h, vl32
918
+.rept 32
919
+    ld1h            {z1.h}, p0/z, x1
920
+    lsl             z1.h, p0/m, z1.h, z0.h
921
+    st1h            {z1.h}, p0, x0
922
+    add             x1, x1, x2, lsl #1
923
+    add             x0, x0, #64
924
+.endr
925
+    ret
926
+endfunc
927
+
928
+function PFX(cpy2Dto1D_shl_64x64_sve)
929
+    rdvl            x9, #1
930
+    cmp             x9, #16
931
+    bgt             .vl_gt_16_cpy2Dto1D_shl_64x64
932
+    cpy2Dto1D_shl_start_sve
933
+    mov             w12, #32
934
+    sub             x2, x2, #64
935
+.loop_cpy2Dto1D_shl_64_sve:
936
+    sub             w12, w12, #1
937
+.rept 2
938
+    ld1             {v2.16b-v5.16b}, x1, #64
939
+    ld1             {v16.16b-v19.16b}, x1, x2
940
+    sshl            v2.8h, v2.8h, v0.8h
941
+    sshl            v3.8h, v3.8h, v0.8h
942
+    sshl            v4.8h, v4.8h, v0.8h
943
+    sshl            v5.8h, v5.8h, v0.8h
944
+    sshl            v16.8h, v16.8h, v0.8h
945
+    sshl            v17.8h, v17.8h, v0.8h
946
+    sshl            v18.8h, v18.8h, v0.8h
947
+    sshl            v19.8h, v19.8h, v0.8h
948
+    st1             {v2.16b-v5.16b}, x0, #64
949
+    st1             {v16.16b-v19.16b}, x0, #64
950
+.endr
951
+    cbnz            w12, .loop_cpy2Dto1D_shl_64_sve
952
+    ret
953
+.vl_gt_16_cpy2Dto1D_shl_64x64:
954
+    dup             z0.h, w3
955
+    mov             x8, #64
956
+    mov             w12, #64
957
+.L_init_cpy2Dto1D_shl_64x64:
958
+    sub             w12, w12, 1
959
+    mov             x9, #0
960
+    whilelt         p0.h, x9, x8
961
+.L_cpy2Dto1D_shl_64x64:
962
+    ld1h            {z1.h}, p0/z, x1, x9, lsl #1
963
+    lsl             z1.h, p0/m, z1.h, z0.h
964
+    st1h            {z1.h}, p0, x0, x9, lsl #1
965
+    inch            x9
966
+    whilelt         p0.h, x9, x8
967
+    b.first         .L_cpy2Dto1D_shl_64x64
968
+    add             x1, x1, x2, lsl #1
969
+    addvl           x0, x0, #1
970
+    cbnz            w12, .L_init_cpy2Dto1D_shl_64x64
971
+    ret
972
+endfunc
973
+
974
+// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
975
+
976
+function PFX(cpy2Dto1D_shr_4x4_sve)
977
+    dup             z0.h, w3
978
+    sub             w4, w3, #1
979
+    dup             z1.h, w4
980
+    ptrue           p0.h, vl8
981
+    mov             z2.h, #1
982
+    lsl             z2.h, p0/m, z2.h, z1.h
983
+    lsl             x2, x2, #1
984
+    index           z3.d, #0, x2
985
+    index           z4.d, #0, #8
986
+.rept 2
987
+    ld1d            {z5.d}, p0/z, x1, z3.d
988
+    add             x1, x1, x2, lsl #1
989
+    add             z5.h, p0/m, z5.h, z2.h
990
+    asr             z5.h, p0/m, z5.h, z0.h
991
+    st1d            {z5.d}, p0, x0, z4.d
992
+    add             x0, x0, #16
993
+.endr
994
+    ret
995
+endfunc
996
+
997
+function PFX(cpy2Dto1D_shr_8x8_sve)
998
+    dup             z0.h, w3
999
+    sub             w4, w3, #1
1000
+    dup             z1.h, w4
1001
+    ptrue           p0.h, vl8
1002
+    mov             z2.h, #1
1003
+    lsl             z2.h, p0/m, z2.h, z1.h
1004
+.rept 8
1005
+    ld1d            {z5.d}, p0/z, x1
1006
+    add             x1, x1, x2, lsl #1
1007
+    add             z5.h, p0/m, z5.h, z2.h
1008
+    asr             z5.h, p0/m, z5.h, z0.h
1009
+    st1d            {z5.d}, p0, x0
1010
+    add             x0, x0, #16
1011
+.endr
1012
+    ret
1013
+endfunc
1014
+
1015
+function PFX(cpy2Dto1D_shr_16x16_sve)
1016
+    dup             z0.h, w3
1017
+    sub             w4, w3, #1
1018
+    dup             z1.h, w4
1019
+    rdvl            x9, #1
1020
+    cmp             x9, #16
1021
+    bgt             .vl_gt_16_cpy2Dto1D_shr_16x16
1022
+    ptrue           p0.h, vl8
1023
+    mov             z2.h, #1
1024
+    lsl             z2.h, p0/m, z2.h, z1.h
1025
+.rept 16
1026
+    ld1d            {z5.d}, p0/z, x1
1027
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1028
+    add             x1, x1, x2, lsl #1
1029
+    add             z5.h, p0/m, z5.h, z2.h
1030
+    add             z6.h, p0/m, z6.h, z2.h
1031
+    asr             z5.h, p0/m, z5.h, z0.h
1032
+    asr             z6.h, p0/m, z6.h, z0.h
1033
+    st1d            {z5.d}, p0, x0
1034
+    st1d            {z6.d}, p0, x0, #1, mul vl
1035
+    add             x0, x0, #32
1036
+.endr
1037
+    ret
1038
+.vl_gt_16_cpy2Dto1D_shr_16x16:
1039
+    ptrue           p0.h, vl16
1040
+    mov             z2.h, #1
1041
+    lsl             z2.h, p0/m, z2.h, z1.h
1042
+.rept 16
1043
+    ld1d            {z5.d}, p0/z, x1
1044
+    add             x1, x1, x2, lsl #1
1045
+    add             z5.h, p0/m, z5.h, z2.h
1046
+    asr             z5.h, p0/m, z5.h, z0.h
1047
+    st1d            {z5.d}, p0, x0
1048
+    add             x0, x0, #32
1049
+.endr
1050
+    ret
1051
+endfunc
1052
+
1053
+function PFX(cpy2Dto1D_shr_32x32_sve)
1054
+    rdvl            x9, #1
1055
+    cmp             x9, #16
1056
+    bgt             .vl_gt_16_cpy2Dto1D_shr_32x32
1057
+    cpy2Dto1D_shr_start
1058
+    mov             w12, #16
1059
+.loop_cpy2Dto1D_shr_32_sve:
1060
+    sub             w12, w12, #1
1061
+.rept 2
1062
+    ld1             {v2.8h-v5.8h}, x1, x2
1063
+    sub             v2.8h, v2.8h, v1.8h
1064
+    sub             v3.8h, v3.8h, v1.8h
1065
+    sub             v4.8h, v4.8h, v1.8h
1066
+    sub             v5.8h, v5.8h, v1.8h
1067
+    sshl            v2.8h, v2.8h, v0.8h
1068
+    sshl            v3.8h, v3.8h, v0.8h
1069
+    sshl            v4.8h, v4.8h, v0.8h
1070
+    sshl            v5.8h, v5.8h, v0.8h
1071
+    st1             {v2.8h-v5.8h}, x0, #64
1072
+.endr
1073
+    cbnz            w12, .loop_cpy2Dto1D_shr_32_sve
1074
+    ret
1075
+.vl_gt_16_cpy2Dto1D_shr_32x32:
1076
+    dup             z0.h, w3
1077
+    sub             w4, w3, #1
1078
+    dup             z1.h, w4
1079
+    cmp             x9, #48
1080
+    bgt             .vl_gt_48_cpy2Dto1D_shr_32x32
1081
+    ptrue           p0.h, vl16
1082
+    mov             z2.h, #1
1083
+    lsl             z2.h, p0/m, z2.h, z1.h
1084
+.rept 32
1085
+    ld1d            {z5.d}, p0/z, x1
1086
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1087
+    add             x1, x1, x2, lsl #1
1088
+    add             z5.h, p0/m, z5.h, z2.h
1089
+    add             z6.h, p0/m, z6.h, z2.h
1090
+    asr             z5.h, p0/m, z5.h, z0.h
1091
+    asr             z6.h, p0/m, z6.h, z0.h
1092
+    st1d            {z5.d}, p0, x0
1093
+    st1d            {z6.d}, p0, x0, #1, mul vl
1094
+    add             x0, x0, #64
1095
+.endr
1096
+    ret
1097
+.vl_gt_48_cpy2Dto1D_shr_32x32:
1098
+    ptrue           p0.h, vl32
1099
+    mov             z2.h, #1
1100
+    lsl             z2.h, p0/m, z2.h, z1.h
1101
+.rept 32
1102
+    ld1d            {z5.d}, p0/z, x1
1103
+    add             x1, x1, x2, lsl #1
1104
+    add             z5.h, p0/m, z5.h, z2.h
1105
+    asr             z5.h, p0/m, z5.h, z0.h
1106
+    st1d            {z5.d}, p0, x0
1107
+    add             x0, x0, #64
1108
+.endr
1109
+    ret
1110
+endfunc
1111
+
1112
+// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1113
+
1114
+function PFX(cpy1Dto2D_shl_16x16_sve)
1115
+    dup             z0.h, w3
1116
+    rdvl            x9, #1
1117
+    cmp             x9, #16
1118
+    bgt             .vl_gt_16_cpy1Dto2D_shl_16x16
1119
+    ptrue           p0.h, vl8
1120
+.rept 16
1121
+    ld1h            {z1.h}, p0/z, x1
1122
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
1123
+    lsl             z1.h, p0/m, z1.h, z0.h
1124
+    lsl             z2.h, p0/m, z2.h, z0.h
1125
+    st1h            {z1.h}, p0, x0
1126
+    st1h            {z2.h}, p0, x0, #1, mul vl
1127
+    add             x1, x1, #32
1128
+    add             x0, x0, x2, lsl #1
1129
+.endr
1130
+    ret
1131
+.vl_gt_16_cpy1Dto2D_shl_16x16:
1132
+    ptrue           p0.h, vl16
1133
+.rept 16
1134
+    ld1h            {z1.h}, p0/z, x1
1135
+    lsl             z1.h, p0/m, z1.h, z0.h
1136
+    st1h            {z1.h}, p0, x0
1137
+    add             x1, x1, #32
1138
+    add             x0, x0, x2, lsl #1
1139
+.endr
1140
+    ret
1141
+endfunc
1142
+
1143
+function PFX(cpy1Dto2D_shl_32x32_sve)
1144
+    dup             z0.h, w3
1145
+    rdvl            x9, #1
1146
+    cmp             x9, #16
1147
+    bgt             .vl_gt_16_cpy1Dto2D_shl_32x32
1148
+    ptrue           p0.h, vl8
1149
+.rept 32
1150
+    ld1h            {z1.h}, p0/z, x1
1151
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
1152
+    ld1h            {z3.h}, p0/z, x1, #2, mul vl
1153
+    ld1h            {z4.h}, p0/z, x1, #3, mul vl
1154
+    lsl             z1.h, p0/m, z1.h, z0.h
1155
+    lsl             z2.h, p0/m, z2.h, z0.h
1156
+    lsl             z3.h, p0/m, z3.h, z0.h
1157
+    lsl             z4.h, p0/m, z4.h, z0.h
1158
+    st1h            {z1.h}, p0, x0
1159
+    st1h            {z2.h}, p0, x0, #1, mul vl
1160
+    st1h            {z3.h}, p0, x0, #2, mul vl
1161
+    st1h            {z4.h}, p0, x0, #3, mul vl
1162
+    add             x1, x1, #64
1163
+    add             x0, x0, x2, lsl #1
1164
+.endr
1165
+    ret
1166
+.vl_gt_16_cpy1Dto2D_shl_32x32:
1167
+    cmp             x9, #48
1168
+    bgt             .vl_gt_48_cpy1Dto2D_shl_32x32
1169
+    ptrue           p0.h, vl16
1170
+.rept 32
1171
+    ld1h            {z1.h}, p0/z, x1
1172
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
1173
+    lsl             z1.h, p0/m, z1.h, z0.h
1174
+    lsl             z2.h, p0/m, z2.h, z0.h
1175
+    st1h            {z1.h}, p0, x0
1176
+    st1h            {z2.h}, p0, x0, #1, mul vl
1177
+    add             x1, x1, #64
1178
+    add             x0, x0, x2, lsl #1
1179
+.endr
1180
+    ret
1181
+.vl_gt_48_cpy1Dto2D_shl_32x32:
1182
+    ptrue           p0.h, vl32
1183
+.rept 32
1184
+    ld1h            {z1.h}, p0/z, x1
1185
+    lsl             z1.h, p0/m, z1.h, z0.h
1186
+    st1h            {z1.h}, p0, x0
1187
+    add             x1, x1, #64
1188
+    add             x0, x0, x2, lsl #1
1189
+.endr
1190
+    ret
1191
+endfunc
1192
+
1193
+function PFX(cpy1Dto2D_shl_64x64_sve)
1194
+    dup             z0.h, w3
1195
+    mov             x8, #64
1196
+    mov             w12, #64
1197
+.L_init_cpy1Dto2D_shl_64x64:
1198
+    sub             w12, w12, 1
1199
+    mov             x9, #0
1200
+    whilelt         p0.h, x9, x8
1201
+.L_cpy1Dto2D_shl_64x64:
1202
+    ld1h            {z1.h}, p0/z, x1, x9, lsl #1
1203
+    lsl             z1.h, p0/m, z1.h, z0.h
1204
+    st1h            {z1.h}, p0, x0, x9, lsl #1
1205
+    inch            x9
1206
+    whilelt         p0.h, x9, x8
1207
+    b.first         .L_cpy1Dto2D_shl_64x64
1208
+    addvl           x1, x1, #1
1209
+    add             x0, x0, x2, lsl #1
1210
+    cbnz            w12, .L_init_cpy1Dto2D_shl_64x64
1211
+    ret
1212
+endfunc
1213
+
1214
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1215
+
1216
+function PFX(cpy1Dto2D_shr_16x16_sve)
1217
+    rdvl            x9, #1
1218
+    cmp             x9, #16
1219
+    bgt             .vl_gt_16_cpy1Dto2D_shr_16x16
1220
+    cpy1Dto2D_shr_start
1221
+    mov             w12, #4
1222
+.loop_cpy1Dto2D_shr_16:
1223
+    sub             w12, w12, #1
1224
+.rept 4
1225
+    ld1             {v2.8h-v3.8h}, x1, #32
1226
+    sub             v2.8h, v2.8h, v1.8h
1227
+    sub             v3.8h, v3.8h, v1.8h
1228
+    sshl            v2.8h, v2.8h, v0.8h
1229
+    sshl            v3.8h, v3.8h, v0.8h
1230
+    st1             {v2.8h-v3.8h}, x0, x2
1231
+.endr
1232
+    cbnz            w12, .loop_cpy1Dto2D_shr_16
1233
+    ret
1234
+.vl_gt_16_cpy1Dto2D_shr_16x16:
1235
+    dup             z0.h, w3
1236
+    sub             w4, w3, #1
1237
+    dup             z1.h, w4
1238
+    ptrue           p0.h, vl16
1239
+    mov             z2.h, #1
1240
+    lsl             z2.h, p0/m, z2.h, z1.h
1241
+.rept 16
1242
+    ld1d            {z5.d}, p0/z, x1
1243
+    add             x1, x1, #32
1244
+    add             z5.h, p0/m, z5.h, z2.h
1245
+    asr             z5.h, p0/m, z5.h, z0.h
1246
+    st1d            {z5.d}, p0, x0
1247
+    add             x0, x0, x2, lsl #1
1248
+.endr
1249
+    ret
1250
+endfunc
1251
+
1252
+function PFX(cpy1Dto2D_shr_32x32_sve)
1253
+    rdvl            x9, #1
1254
+    cmp             x9, #16
1255
+    bgt             .vl_gt_16_cpy1Dto2D_shr_32x32
1256
+    cpy1Dto2D_shr_start
1257
+    mov             w12, #16
1258
+.loop_cpy1Dto2D_shr_32_sve:
1259
+    sub             w12, w12, #1
1260
+.rept 2
1261
+    ld1             {v2.16b-v5.16b}, x1, #64
1262
+    sub             v2.8h, v2.8h, v1.8h
1263
+    sub             v3.8h, v3.8h, v1.8h
1264
+    sub             v4.8h, v4.8h, v1.8h
1265
+    sub             v5.8h, v5.8h, v1.8h
1266
+    sshl            v2.8h, v2.8h, v0.8h
1267
+    sshl            v3.8h, v3.8h, v0.8h
1268
+    sshl            v4.8h, v4.8h, v0.8h
1269
+    sshl            v5.8h, v5.8h, v0.8h
1270
+    st1             {v2.16b-v5.16b}, x0, x2
1271
+.endr
1272
+    cbnz            w12, .loop_cpy1Dto2D_shr_32_sve
1273
+    ret
1274
+.vl_gt_16_cpy1Dto2D_shr_32x32:
1275
+    dup             z0.h, w3
1276
+    sub             w4, w3, #1
1277
+    dup             z1.h, w4
1278
+    cmp             x9, #48
1279
+    bgt             .vl_gt_48_cpy2Dto1D_shr_32x32
1280
+    ptrue           p0.h, vl16
1281
+    mov             z2.h, #1
1282
+    lsl             z2.h, p0/m, z2.h, z1.h
1283
+.rept 32
1284
+    ld1d            {z5.d}, p0/z, x1
1285
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1286
+    add             x1, x1, #64
1287
+    add             z5.h, p0/m, z5.h, z2.h
1288
+    add             z6.h, p0/m, z6.h, z2.h
1289
+    asr             z5.h, p0/m, z5.h, z0.h
1290
+    asr             z6.h, p0/m, z6.h, z0.h
1291
+    st1d            {z5.d}, p0, x0
1292
+    st1d            {z6.d}, p0, x0, #1, mul vl
1293
+    add             x0, x0, x2, lsl #1
1294
+.endr
1295
+    ret
1296
+.vl_gt_48_cpy1Dto2D_shr_32x32:
1297
+    ptrue           p0.h, vl32
1298
+    mov             z2.h, #1
1299
+    lsl             z2.h, p0/m, z2.h, z1.h
1300
+.rept 32
1301
+    ld1d            {z5.d}, p0/z, x1
1302
+    add             x1, x1, #64
1303
+    add             z5.h, p0/m, z5.h, z2.h
1304
+    asr             z5.h, p0/m, z5.h, z0.h
1305
+    st1d            {z5.d}, p0, x0
1306
+    add             x0, x0, x2, lsl #1
1307
+.endr
1308
+    ret
1309
+endfunc
1310
+
1311
+function PFX(cpy1Dto2D_shr_64x64_sve)
1312
+    dup             z0.h, w3
1313
+    sub             w4, w3, #1
1314
+    dup             z1.h, w4
1315
+    rdvl            x9, #1
1316
+    cmp             x9, #16
1317
+    bgt             .vl_gt_16_cpy1Dto2D_shr_64x64
1318
+    ptrue           p0.h, vl8
1319
+    mov             z2.h, #1
1320
+    lsl             z2.h, p0/m, z2.h, z1.h
1321
+.rept 128
1322
+    ld1d            {z5.d}, p0/z, x1
1323
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1324
+    ld1d            {z7.d}, p0/z, x1, #2, mul vl
1325
+    ld1d            {z8.d}, p0/z, x1, #3, mul vl
1326
+    ld1d            {z9.d}, p0/z, x1, #4, mul vl
1327
+    ld1d            {z10.d}, p0/z, x1, #5, mul vl
1328
+    ld1d            {z11.d}, p0/z, x1, #6, mul vl
1329
+    ld1d            {z12.d}, p0/z, x1, #7, mul vl
1330
+    add             x1, x1, #128
1331
+    add             z5.h, p0/m, z5.h, z2.h
1332
+    add             z6.h, p0/m, z6.h, z2.h
1333
+    add             z7.h, p0/m, z7.h, z2.h
1334
+    add             z8.h, p0/m, z8.h, z2.h
1335
+    add             z9.h, p0/m, z9.h, z2.h
1336
+    add             z10.h, p0/m, z10.h, z2.h
1337
+    add             z11.h, p0/m, z11.h, z2.h
1338
+    add             z12.h, p0/m, z12.h, z2.h
1339
+    asr             z5.h, p0/m, z5.h, z0.h
1340
+    asr             z6.h, p0/m, z6.h, z0.h
1341
+    asr             z7.h, p0/m, z7.h, z0.h
1342
+    asr             z8.h, p0/m, z8.h, z0.h
1343
+    asr             z9.h, p0/m, z9.h, z0.h
1344
+    asr             z10.h, p0/m, z10.h, z0.h
1345
+    asr             z11.h, p0/m, z11.h, z0.h
1346
+    asr             z12.h, p0/m, z12.h, z0.h
1347
+    st1d            {z5.d}, p0, x0
1348
+    st1d            {z6.d}, p0, x0, #1, mul vl
1349
+    st1d            {z7.d}, p0, x0, #2, mul vl
1350
+    st1d            {z8.d}, p0, x0, #3, mul vl
1351
+    st1d            {z9.d}, p0, x0, #4, mul vl
1352
+    st1d            {z10.d}, p0, x0, #5, mul vl
1353
+    st1d            {z11.d}, p0, x0, #6, mul vl
1354
+    st1d            {z12.d}, p0, x0, #7, mul vl
1355
+    add             x0, x0, x2, lsl #1
1356
+.endr
1357
+    ret
1358
+.vl_gt_16_cpy1Dto2D_shr_64x64:
1359
+    cmp             x9, #48
1360
+    bgt             .vl_gt_48_cpy1Dto2D_shr_64x64
1361
+    ptrue           p0.h, vl16
1362
+    mov             z2.h, #1
1363
+    lsl             z2.h, p0/m, z2.h, z1.h
1364
+.rept 128
1365
+    ld1d            {z5.d}, p0/z, x1
1366
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1367
+    ld1d            {z7.d}, p0/z, x1, #2, mul vl
1368
+    ld1d            {z8.d}, p0/z, x1, #3, mul vl
1369
+    add             x1, x1, #128
1370
+    add             z5.h, p0/m, z5.h, z2.h
1371
+    add             z6.h, p0/m, z6.h, z2.h
1372
+    add             z7.h, p0/m, z7.h, z2.h
1373
+    add             z8.h, p0/m, z8.h, z2.h
1374
+    asr             z5.h, p0/m, z5.h, z0.h
1375
+    asr             z6.h, p0/m, z6.h, z0.h
1376
+    asr             z7.h, p0/m, z7.h, z0.h
1377
+    asr             z8.h, p0/m, z8.h, z0.h
1378
+    st1d            {z5.d}, p0, x0
1379
+    st1d            {z6.d}, p0, x0, #1, mul vl
1380
+    st1d            {z7.d}, p0, x0, #2, mul vl
1381
+    st1d            {z8.d}, p0, x0, #3, mul vl
1382
+    add             x0, x0, x2, lsl #1
1383
+.endr
1384
+    ret
1385
+.vl_gt_48_cpy1Dto2D_shr_64x64:
1386
+    cmp             x9, #112
1387
+    bgt             .vl_gt_112_cpy1Dto2D_shr_64x64
1388
+    ptrue           p0.h, vl32
1389
+    mov             z2.h, #1
1390
+    lsl             z2.h, p0/m, z2.h, z1.h
1391
+.rept 128
1392
+    ld1d            {z5.d}, p0/z, x1
1393
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1394
+    add             x1, x1, #128
1395
+    add             z5.h, p0/m, z5.h, z2.h
1396
+    add             z6.h, p0/m, z6.h, z2.h
1397
+    asr             z5.h, p0/m, z5.h, z0.h
1398
+    asr             z6.h, p0/m, z6.h, z0.h
1399
+    st1d            {z5.d}, p0, x0
1400
+    st1d            {z6.d}, p0, x0, #1, mul vl
1401
+    add             x0, x0, x2, lsl #1
1402
+.endr
1403
+    ret
1404
+.vl_gt_112_cpy1Dto2D_shr_64x64:
1405
+    ptrue           p0.h, vl64
1406
+    mov             z2.h, #1
1407
+    lsl             z2.h, p0/m, z2.h, z1.h
1408
+.rept 128
1409
+    ld1d            {z5.d}, p0/z, x1
1410
+    add             x1, x1, #128
1411
+    add             z5.h, p0/m, z5.h, z2.h
1412
+    asr             z5.h, p0/m, z5.h, z0.h
1413
+    st1d            {z5.d}, p0, x0
1414
+    add             x0, x0, x2, lsl #1
1415
+.endr
1416
+    ret
1417
+endfunc
1418
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S Added
1301
 
1
@@ -0,0 +1,1299 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "blockcopy8-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
39
+ *
40
+ * r0   - a
41
+ * r1   - stridea
42
+ * r2   - b
43
+ * r3   - strideb */
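As a rough scalar picture of the blockcopy_sp kernels that follow, assuming pixel is uint8_t (8-bit build) and N is the block size of each specialization; sketch only:

    #include <stdint.h>
    typedef uint8_t pixel;  // assumption: 8-bit pixel build

    // Narrow an N x N block of int16_t samples back to pixels.
    template<int N>
    static void blockcopy_sp_ref(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
    {
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                a[x] = (pixel)b[x];   // truncating narrow, mirroring the xtn/tbl in the assembly
            a += stridea;
            b += strideb;
        }
    }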
44
+function PFX(blockcopy_sp_4x4_neon)
45
+    lsl             x3, x3, #1
46
+.rept 2
47
+    ld1             {v0.8h}, x2, x3
48
+    ld1             {v1.8h}, x2, x3
49
+    xtn             v0.8b, v0.8h
50
+    xtn             v1.8b, v1.8h
51
+    st1             {v0.s}0, x0, x1
52
+    st1             {v1.s}0, x0, x1
53
+.endr
54
+    ret
55
+endfunc
56
+
57
+function PFX(blockcopy_sp_8x8_neon)
58
+    lsl             x3, x3, #1
59
+.rept 4
60
+    ld1             {v0.8h}, x2, x3
61
+    ld1             {v1.8h}, x2, x3
62
+    xtn             v0.8b, v0.8h
63
+    xtn             v1.8b, v1.8h
64
+    st1             {v0.d}0, x0, x1
65
+    st1             {v1.d}0, x0, x1
66
+.endr
67
+    ret
68
+endfunc
69
+
70
+function PFX(blockcopy_sp_16x16_neon)
71
+    lsl             x3, x3, #1
72
+    movrel          x11, xtn_xtn2_table
73
+    ld1             {v31.16b}, x11
74
+.rept 8
75
+    ld1             {v0.8h-v1.8h}, x2, x3
76
+    ld1             {v2.8h-v3.8h}, x2, x3
77
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
78
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
79
+    st1             {v0.16b}, x0, x1
80
+    st1             {v1.16b}, x0, x1
81
+.endr
82
+    ret
83
+endfunc
84
+
85
+function PFX(blockcopy_sp_32x32_neon)
86
+    mov             w12, #4
87
+    lsl             x3, x3, #1
88
+    movrel          x11, xtn_xtn2_table
89
+    ld1             {v31.16b}, x11
90
+.loop_csp32:
91
+    sub             w12, w12, #1
92
+.rept 4
93
+    ld1             {v0.8h-v3.8h}, x2, x3
94
+    ld1             {v4.8h-v7.8h}, x2, x3
95
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
96
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
97
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
98
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
99
+    st1             {v0.16b-v1.16b}, x0, x1
100
+    st1             {v2.16b-v3.16b}, x0, x1
101
+.endr
102
+    cbnz            w12, .loop_csp32
103
+    ret
104
+endfunc
105
+
106
+function PFX(blockcopy_sp_64x64_neon)
107
+    mov             w12, #16
108
+    lsl             x3, x3, #1
109
+    sub             x3, x3, #64
110
+    movrel          x11, xtn_xtn2_table
111
+    ld1             {v31.16b}, x11
112
+.loop_csp64:
113
+    sub             w12, w12, #1
114
+.rept 4
115
+    ld1             {v0.8h-v3.8h}, x2, #64
116
+    ld1             {v4.8h-v7.8h}, x2, x3
117
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
118
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
119
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
120
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
121
+    st1             {v0.16b-v3.16b}, x0, x1
122
+.endr
123
+    cbnz            w12, .loop_csp64
124
+    ret
125
+endfunc
126
+
127
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
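The ps (pixel to short) direction widens instead of narrowing; a hedged scalar sketch under the same assumptions (pixel = uint8_t, N fixed per specialization):

    #include <stdint.h>
    typedef uint8_t pixel;  // assumption: 8-bit pixel build

    template<int N>
    static void blockcopy_ps_ref(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
    {
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                a[x] = (int16_t)b[x];  // zero-extend pixels, as uxtl/uxtl2 do below
            a += stridea;
            b += strideb;
        }
    }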
128
+function PFX(blockcopy_ps_4x4_neon)
129
+    lsl             x1, x1, #1
130
+.rept 2
131
+    ld1             {v0.8b}, x2, x3
132
+    ld1             {v1.8b}, x2, x3
133
+    uxtl            v0.8h, v0.8b
134
+    uxtl            v1.8h, v1.8b
135
+    st1             {v0.4h}, x0, x1
136
+    st1             {v1.4h}, x0, x1
137
+.endr
138
+    ret
139
+endfunc
140
+
141
+function PFX(blockcopy_ps_8x8_neon)
142
+    lsl             x1, x1, #1
143
+.rept 4
144
+    ld1             {v0.8b}, x2, x3
145
+    ld1             {v1.8b}, x2, x3
146
+    uxtl            v0.8h, v0.8b
147
+    uxtl            v1.8h, v1.8b
148
+    st1             {v0.8h}, x0, x1
149
+    st1             {v1.8h}, x0, x1
150
+.endr
151
+    ret
152
+endfunc
153
+
154
+function PFX(blockcopy_ps_16x16_neon)
155
+    lsl             x1, x1, #1
156
+.rept 8
157
+    ld1             {v4.16b}, x2, x3
158
+    ld1             {v5.16b}, x2, x3
159
+    uxtl            v0.8h, v4.8b
160
+    uxtl2           v1.8h, v4.16b
161
+    uxtl            v2.8h, v5.8b
162
+    uxtl2           v3.8h, v5.16b
163
+    st1             {v0.8h-v1.8h}, x0, x1
164
+    st1             {v2.8h-v3.8h}, x0, x1
165
+.endr
166
+    ret
167
+endfunc
168
+
169
+function PFX(blockcopy_ps_32x32_neon)
170
+    lsl             x1, x1, #1
171
+    mov             w12, #4
172
+.loop_cps32:
173
+    sub             w12, w12, #1
174
+.rept 4
175
+    ld1             {v16.16b-v17.16b}, x2, x3
176
+    ld1             {v18.16b-v19.16b}, x2, x3
177
+    uxtl            v0.8h, v16.8b
178
+    uxtl2           v1.8h, v16.16b
179
+    uxtl            v2.8h, v17.8b
180
+    uxtl2           v3.8h, v17.16b
181
+    uxtl            v4.8h, v18.8b
182
+    uxtl2           v5.8h, v18.16b
183
+    uxtl            v6.8h, v19.8b
184
+    uxtl2           v7.8h, v19.16b
185
+    st1             {v0.8h-v3.8h}, x0, x1
186
+    st1             {v4.8h-v7.8h}, x0, x1
187
+.endr
188
+    cbnz            w12, .loop_cps32
189
+    ret
190
+endfunc
191
+
192
+function PFX(blockcopy_ps_64x64_neon)
193
+    lsl             x1, x1, #1
194
+    sub             x1, x1, #64
195
+    mov             w12, #16
196
+.loop_cps64:
197
+    sub             w12, w12, #1
198
+.rept 4
199
+    ld1             {v16.16b-v19.16b}, x2, x3
200
+    uxtl            v0.8h, v16.8b
201
+    uxtl2           v1.8h, v16.16b
202
+    uxtl            v2.8h, v17.8b
203
+    uxtl2           v3.8h, v17.16b
204
+    uxtl            v4.8h, v18.8b
205
+    uxtl2           v5.8h, v18.16b
206
+    uxtl            v6.8h, v19.8b
207
+    uxtl2           v7.8h, v19.16b
208
+    st1             {v0.8h-v3.8h}, x0, #64
209
+    st1             {v4.8h-v7.8h}, x0, x1
210
+.endr
211
+    cbnz            w12, .loop_cps64
212
+    ret
213
+endfunc
214
+
215
+// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
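The short-to-short variant is a plain row-wise copy; sketched here with the same per-size template assumption:

    #include <stdint.h>
    #include <string.h>

    template<int N>
    static void blockcopy_ss_ref(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
    {
        for (int y = 0; y < N; y++)
        {
            memcpy(a, b, N * sizeof(int16_t));  // rows copied verbatim
            a += stridea;
            b += strideb;
        }
    }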
216
+function PFX(blockcopy_ss_4x4_neon)
217
+    lsl             x1, x1, #1
218
+    lsl             x3, x3, #1
219
+.rept 2
220
+    ld1             {v0.8b}, x2, x3
221
+    ld1             {v1.8b}, x2, x3
222
+    st1             {v0.8b}, x0, x1
223
+    st1             {v1.8b}, x0, x1
224
+.endr
225
+    ret
226
+endfunc
227
+
228
+function PFX(blockcopy_ss_8x8_neon)
229
+    lsl             x1, x1, #1
230
+    lsl             x3, x3, #1
231
+.rept 4
232
+    ld1             {v0.8h}, x2, x3
233
+    ld1             {v1.8h}, x2, x3
234
+    st1             {v0.8h}, x0, x1
235
+    st1             {v1.8h}, x0, x1
236
+.endr
237
+    ret
238
+endfunc
239
+
240
+function PFX(blockcopy_ss_16x16_neon)
241
+    lsl             x1, x1, #1
242
+    lsl             x3, x3, #1
243
+.rept 8
244
+    ld1             {v0.8h-v1.8h}, x2, x3
245
+    ld1             {v2.8h-v3.8h}, x2, x3
246
+    st1             {v0.8h-v1.8h}, x0, x1
247
+    st1             {v2.8h-v3.8h}, x0, x1
248
+.endr
249
+    ret
250
+endfunc
251
+
252
+function PFX(blockcopy_ss_32x32_neon)
253
+    lsl             x1, x1, #1
254
+    lsl             x3, x3, #1
255
+    mov             w12, #4
256
+.loop_css32:
257
+    sub             w12, w12, #1
258
+.rept 8
259
+    ld1             {v0.8h-v3.8h}, x2, x3
260
+    st1             {v0.8h-v3.8h}, x0, x1
261
+.endr
262
+    cbnz            w12, .loop_css32
263
+    ret
264
+endfunc
265
+
266
+function PFX(blockcopy_ss_64x64_neon)
267
+    lsl             x1, x1, #1
268
+    sub             x1, x1, #64
269
+    lsl             x3, x3, #1
270
+    sub             x3, x3, #64
271
+    mov             w12, #8
272
+.loop_css64:
273
+    sub             w12, w12, #1
274
+.rept 8
275
+    ld1             {v0.8h-v3.8h}, x2, #64
276
+    ld1             {v4.8h-v7.8h}, x2, x3
277
+    st1             {v0.8h-v3.8h}, x0, #64
278
+    st1             {v4.8h-v7.8h}, x0, x1
279
+.endr
280
+    cbnz            w12, .loop_css64
281
+    ret
282
+endfunc
283
+
284
+/******** Chroma blockcopy********/
285
+function PFX(blockcopy_ss_4x8_neon)
286
+    lsl             x1, x1, #1
287
+    lsl             x3, x3, #1
288
+.rept 4
289
+    ld1             {v0.8b}, x2, x3
290
+    ld1             {v1.8b}, x2, x3
291
+    st1             {v0.8b}, x0, x1
292
+    st1             {v1.8b}, x0, x1
293
+.endr
294
+    ret
295
+endfunc
296
+
297
+function PFX(blockcopy_ss_8x16_neon)
298
+    lsl             x1, x1, #1
299
+    lsl             x3, x3, #1
300
+.rept 8
301
+    ld1             {v0.8h}, x2, x3
302
+    ld1             {v1.8h}, x2, x3
303
+    st1             {v0.8h}, x0, x1
304
+    st1             {v1.8h}, x0, x1
305
+.endr
306
+    ret
307
+endfunc
308
+
309
+function PFX(blockcopy_ss_16x32_neon)
310
+    lsl             x1, x1, #1
311
+    lsl             x3, x3, #1
312
+.rept 16
313
+    ld1             {v0.8h-v1.8h}, x2, x3
314
+    ld1             {v2.8h-v3.8h}, x2, x3
315
+    st1             {v0.8h-v1.8h}, x0, x1
316
+    st1             {v2.8h-v3.8h}, x0, x1
317
+.endr
318
+    ret
319
+endfunc
320
+
321
+function PFX(blockcopy_ss_32x64_neon)
322
+    lsl             x1, x1, #1
323
+    lsl             x3, x3, #1
324
+    mov             w12, #8
325
+.loop_css32x64:
326
+    sub             w12, w12, #1
327
+.rept 8
328
+    ld1             {v0.8h-v3.8h}, x2, x3
329
+    st1             {v0.8h-v3.8h}, x0, x1
330
+.endr
331
+    cbnz            w12, .loop_css32x64
332
+    ret
333
+endfunc
334
+
335
+// chroma blockcopy_ps
336
+function PFX(blockcopy_ps_4x8_neon)
337
+    lsl             x1, x1, #1
338
+.rept 4
339
+    ld1             {v0.8b}, x2, x3
340
+    ld1             {v1.8b}, x2, x3
341
+    uxtl            v0.8h, v0.8b
342
+    uxtl            v1.8h, v1.8b
343
+    st1             {v0.4h}, x0, x1
344
+    st1             {v1.4h}, x0, x1
345
+.endr
346
+    ret
347
+endfunc
348
+
349
+function PFX(blockcopy_ps_8x16_neon)
350
+    lsl             x1, x1, #1
351
+.rept 8
352
+    ld1             {v0.8b}, x2, x3
353
+    ld1             {v1.8b}, x2, x3
354
+    uxtl            v0.8h, v0.8b
355
+    uxtl            v1.8h, v1.8b
356
+    st1             {v0.8h}, x0, x1
357
+    st1             {v1.8h}, x0, x1
358
+.endr
359
+    ret
360
+endfunc
361
+
362
+function PFX(blockcopy_ps_16x32_neon)
363
+    lsl             x1, x1, #1
364
+.rept 16
365
+    ld1             {v4.16b}, x2, x3
366
+    ld1             {v5.16b}, x2, x3
367
+    uxtl            v0.8h, v4.8b
368
+    uxtl2           v1.8h, v4.16b
369
+    uxtl            v2.8h, v5.8b
370
+    uxtl2           v3.8h, v5.16b
371
+    st1             {v0.8h-v1.8h}, x0, x1
372
+    st1             {v2.8h-v3.8h}, x0, x1
373
+.endr
374
+    ret
375
+endfunc
376
+
377
+function PFX(blockcopy_ps_32x64_neon)
378
+    lsl             x1, x1, #1
379
+    mov             w12, #8
380
+.loop_cps32x64:
381
+    sub             w12, w12, #1
382
+.rept 4
383
+    ld1             {v16.16b-v17.16b}, x2, x3
384
+    ld1             {v18.16b-v19.16b}, x2, x3
385
+    uxtl            v0.8h, v16.8b
386
+    uxtl2           v1.8h, v16.16b
387
+    uxtl            v2.8h, v17.8b
388
+    uxtl2           v3.8h, v17.16b
389
+    uxtl            v4.8h, v18.8b
390
+    uxtl2           v5.8h, v18.16b
391
+    uxtl            v6.8h, v19.8b
392
+    uxtl2           v7.8h, v19.16b
393
+    st1             {v0.8h-v3.8h}, x0, x1
394
+    st1             {v4.8h-v7.8h}, x0, x1
395
+.endr
396
+    cbnz            w12, .loop_cps32x64
397
+    ret
398
+endfunc
399
+
400
+// chroma blockcopy_sp
401
+function PFX(blockcopy_sp_4x8_neon)
402
+    lsl             x3, x3, #1
403
+.rept 4
404
+    ld1             {v0.8h}, x2, x3
405
+    ld1             {v1.8h}, x2, x3
406
+    xtn             v0.8b, v0.8h
407
+    xtn             v1.8b, v1.8h
408
+    st1             {v0.s}0, x0, x1
409
+    st1             {v1.s}0, x0, x1
410
+.endr
411
+    ret
412
+endfunc
413
+
414
+function PFX(blockcopy_sp_8x16_neon)
415
+    lsl             x3, x3, #1
416
+.rept 8
417
+    ld1             {v0.8h}, x2, x3
418
+    ld1             {v1.8h}, x2, x3
419
+    xtn             v0.8b, v0.8h
420
+    xtn             v1.8b, v1.8h
421
+    st1             {v0.d}0, x0, x1
422
+    st1             {v1.d}0, x0, x1
423
+.endr
424
+    ret
425
+endfunc
426
+
427
+function PFX(blockcopy_sp_16x32_neon)
428
+    lsl             x3, x3, #1
429
+    movrel          x11, xtn_xtn2_table
430
+    ld1             {v31.16b}, x11
431
+.rept 16
432
+    ld1             {v0.8h-v1.8h}, x2, x3
433
+    ld1             {v2.8h-v3.8h}, x2, x3
434
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
435
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
436
+    st1             {v0.16b}, x0, x1
437
+    st1             {v1.16b}, x0, x1
438
+.endr
439
+    ret
440
+endfunc
441
+
442
+function PFX(blockcopy_sp_32x64_neon)
443
+    mov             w12, #8
444
+    lsl             x3, x3, #1
445
+    movrel          x11, xtn_xtn2_table
446
+    ld1             {v31.16b}, x11
447
+.loop_csp32x64:
448
+    sub             w12, w12, #1
449
+.rept 4
450
+    ld1             {v0.8h-v3.8h}, x2, x3
451
+    ld1             {v4.8h-v7.8h}, x2, x3
452
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
453
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
454
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
455
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
456
+    st1             {v0.16b-v1.16b}, x0, x1
457
+    st1             {v2.16b-v3.16b}, x0, x1
458
+.endr
459
+    cbnz            w12, .loop_csp32x64
460
+    ret
461
+endfunc
462
+
463
+/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
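The pixel-to-pixel copies below also cover non-square shapes (2x4, 12x16, 24x32, ...), so a sketch takes both dimensions as parameters; illustration only:

    #include <stdint.h>
    #include <string.h>
    typedef uint8_t pixel;  // assumption: 8-bit pixel build

    template<int W, int H>
    static void blockcopy_pp_ref(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
    {
        for (int y = 0; y < H; y++)
        {
            memcpy(dst, src, W * sizeof(pixel));  // copy one row of W pixels
            dst += dstStride;
            src += srcStride;
        }
    }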
464
+
465
+function PFX(blockcopy_pp_2x4_neon)
466
+    ldrh            w9, x2
467
+    add             x4, x1, x1
468
+    add             x14, x3, x3
469
+    strh            w9, x0
470
+    ldrh            w10, x2, x3
471
+    add             x5, x4, x1
472
+    add             x15, x14, x3
473
+    strh            w10, x0, x1
474
+    ldrh            w11, x2, x14
475
+    strh            w11, x0, x4
476
+    ldrh            w12, x2, x15
477
+    strh            w12, x0, x5
478
+    ret
479
+endfunc
480
+
481
+.macro blockcopy_pp_2xN_neon h
482
+function PFX(blockcopy_pp_2x\h\()_neon)
483
+    add             x4, x1, x1
484
+    add             x5, x4, x1
485
+    add             x6, x5, x1
486
+
487
+    add             x14, x3, x3
488
+    add             x15, x14, x3
489
+    add             x16, x15, x3
490
+
491
+.rept \h / 4
492
+    ldrh            w9, x2
493
+    strh            w9, x0
494
+    ldrh            w10, x2, x3
495
+    strh            w10, x0, x1
496
+    ldrh            w11, x2, x14
497
+    strh            w11, x0, x4
498
+    ldrh            w12, x2, x15
499
+    strh            w12, x0, x5
500
+    add             x2, x2, x16
501
+    add             x0, x0, x6
502
+.endr
503
+    ret
504
+endfunc
505
+.endm
506
+
507
+blockcopy_pp_2xN_neon 8
508
+blockcopy_pp_2xN_neon 16
509
+
510
+function PFX(blockcopy_pp_4x2_neon)
511
+    ldr             w9, x2
512
+    str             w9, x0
513
+    ldr             w10, x2, x3
514
+    str             w10, x0, x1
515
+    ret
516
+endfunc
517
+
518
+function PFX(blockcopy_pp_4x4_neon)
519
+    ldr             w9, x2
520
+    add             x4, x1, x1
521
+    add             x14, x3, x3
522
+    str             w9, x0
523
+    ldr             w10, x2, x3
524
+    add             x5, x4, x1
525
+    add             x15, x14, x3
526
+    str             w10, x0, x1
527
+    ldr             w11, x2, x14
528
+    str             w11, x0, x4
529
+    ldr             w12, x2, x15
530
+    str             w12, x0, x5
531
+    ret
532
+endfunc
533
+
534
+.macro blockcopy_pp_4xN_neon h
535
+function PFX(blockcopy_pp_4x\h\()_neon)
536
+    add             x4, x1, x1
537
+    add             x5, x4, x1
538
+    add             x6, x5, x1
539
+
540
+    add             x14, x3, x3
541
+    add             x15, x14, x3
542
+    add             x16, x15, x3
543
+
544
+.rept \h / 4
545
+    ldr             w9, x2
546
+    str             w9, x0
547
+    ldr             w10, x2, x3
548
+    str             w10, x0, x1
549
+    ldr             w11, x2, x14
550
+    str             w11, x0, x4
551
+    ldr             w12, x2, x15
552
+    str             w12, x0, x5
553
+    add             x2, x2, x16
554
+    add             x0, x0, x6
555
+.endr
556
+    ret
557
+endfunc
558
+.endm
559
+
560
+blockcopy_pp_4xN_neon 8
561
+blockcopy_pp_4xN_neon 16
562
+blockcopy_pp_4xN_neon 32
563
+
564
+.macro blockcopy_pp_6xN_neon h
565
+function PFX(blockcopy_pp_6x\h\()_neon)
566
+    sub             x1, x1, #4
567
+.rept \h
568
+    ld1             {v0.8b}, x2, x3
569
+    st1             {v0.s}0, x0, #4
570
+    st1             {v0.h}2, x0, x1
571
+.endr
572
+    ret
573
+endfunc
574
+.endm
575
+
576
+blockcopy_pp_6xN_neon 8
577
+blockcopy_pp_6xN_neon 16
578
+
579
+.macro blockcopy_pp_8xN_neon h
580
+function PFX(blockcopy_pp_8x\h\()_neon)
581
+.rept \h
582
+    ld1             {v0.4h}, x2, x3
583
+    st1             {v0.4h}, x0, x1
584
+.endr
585
+    ret
586
+endfunc
587
+.endm
588
+
589
+blockcopy_pp_8xN_neon 2
590
+blockcopy_pp_8xN_neon 4
591
+blockcopy_pp_8xN_neon 6
592
+blockcopy_pp_8xN_neon 8
593
+blockcopy_pp_8xN_neon 12
594
+blockcopy_pp_8xN_neon 16
595
+blockcopy_pp_8xN_neon 32
596
+
597
+function PFX(blockcopy_pp_8x64_neon)
598
+    mov             w12, #4
599
+.loop_pp_8x64:
600
+    sub             w12, w12, #1
601
+.rept 16
602
+    ld1             {v0.4h}, x2, x3
603
+    st1             {v0.4h}, x0, x1
604
+.endr
605
+    cbnz            w12, .loop_pp_8x64
606
+    ret
607
+endfunc
608
+
609
+.macro blockcopy_pp_16xN_neon h
610
+function PFX(blockcopy_pp_16x\h\()_neon)
611
+.rept \h
612
+    ld1             {v0.8h}, x2, x3
613
+    st1             {v0.8h}, x0, x1
614
+.endr
615
+    ret
616
+endfunc
617
+.endm
618
+
619
+blockcopy_pp_16xN_neon 4
620
+blockcopy_pp_16xN_neon 8
621
+blockcopy_pp_16xN_neon 12
622
+blockcopy_pp_16xN_neon 16
623
+
624
+.macro blockcopy_pp_16xN1_neon h
625
+function PFX(blockcopy_pp_16x\h\()_neon)
626
+    mov             w12, #\h / 8
627
+.loop_16x\h\():
628
+.rept 8
629
+    ld1             {v0.8h}, x2, x3
630
+    st1             {v0.8h}, x0, x1
631
+.endr
632
+    sub             w12, w12, #1
633
+    cbnz            w12, .loop_16x\h
634
+    ret
635
+endfunc
636
+.endm
637
+
638
+blockcopy_pp_16xN1_neon 24
639
+blockcopy_pp_16xN1_neon 32
640
+blockcopy_pp_16xN1_neon 64
641
+
642
+function PFX(blockcopy_pp_12x16_neon)
643
+    sub             x1, x1, #8
644
+.rept 16
645
+    ld1             {v0.16b}, x2, x3
646
+    str             d0, x0, #8
647
+    st1             {v0.s}2, x0, x1
648
+.endr
649
+    ret
650
+endfunc
651
+
652
+function PFX(blockcopy_pp_12x32_neon)
653
+    sub             x1, x1, #8
654
+    mov             w12, #4
655
+.loop_pp_12x32:
656
+    sub             w12, w12, #1
657
+.rept 8
658
+    ld1             {v0.16b}, x2, x3
659
+    str             d0, x0, #8
660
+    st1             {v0.s}2, x0, x1
661
+.endr
662
+    cbnz            w12, .loop_pp_12x32
663
+    ret
664
+endfunc
665
+
666
+function PFX(blockcopy_pp_24x32_neon)
667
+    mov             w12, #4
668
+.loop_24x32:
669
+    sub             w12, w12, #1
670
+.rept 8
671
+    ld1             {v0.8b-v2.8b}, x2, x3
672
+    st1             {v0.8b-v2.8b}, x0, x1
673
+.endr
674
+    cbnz            w12, .loop_24x32
675
+    ret
676
+endfunc
677
+
678
+function PFX(blockcopy_pp_24x64_neon)
679
+    mov             w12, #4
680
+.loop_24x64:
681
+    sub             w12, w12, #1
682
+.rept 16
683
+    ld1             {v0.8b-v2.8b}, x2, x3
684
+    st1             {v0.8b-v2.8b}, x0, x1
685
+.endr
686
+    cbnz            w12, .loop_24x64
687
+    ret
688
+endfunc
689
+
690
+function PFX(blockcopy_pp_32x8_neon)
691
+.rept 8
692
+    ld1             {v0.16b-v1.16b}, x2, x3
693
+    st1             {v0.16b-v1.16b}, x0, x1
694
+.endr
695
+    ret
696
+endfunc
697
+
698
+.macro blockcopy_pp_32xN_neon h
699
+function PFX(blockcopy_pp_32x\h\()_neon)
700
+    mov             w12, #\h / 8
701
+.loop_32x\h\():
702
+    sub             w12, w12, #1
703
+.rept 8
704
+    ld1             {v0.16b-v1.16b}, x2, x3
705
+    st1             {v0.16b-v1.16b}, x0, x1
706
+.endr
707
+    cbnz            w12, .loop_32x\h
708
+    ret
709
+endfunc
710
+.endm
711
+
712
+blockcopy_pp_32xN_neon 16
713
+blockcopy_pp_32xN_neon 24
714
+blockcopy_pp_32xN_neon 32
715
+blockcopy_pp_32xN_neon 64
716
+blockcopy_pp_32xN_neon 48
717
+
718
+function PFX(blockcopy_pp_48x64_neon)
719
+    mov             w12, #8
720
+.loop_48x64:
721
+    sub             w12, w12, #1
722
+.rept 8
723
+    ld1             {v0.16b-v2.16b}, x2, x3
724
+    st1             {v0.16b-v2.16b}, x0, x1
725
+.endr
726
+    cbnz            w12, .loop_48x64
727
+    ret
728
+endfunc
729
+
730
+.macro blockcopy_pp_64xN_neon h
731
+function PFX(blockcopy_pp_64x\h\()_neon)
732
+    mov             w12, #\h / 4
733
+.loop_64x\h\():
734
+    sub             w12, w12, #1
735
+.rept 4
736
+    ld1             {v0.16b-v3.16b}, x2, x3
737
+    st1             {v0.16b-v3.16b}, x0, x1
738
+.endr
739
+    cbnz            w12, .loop_64x\h
740
+    ret
741
+endfunc
742
+.endm
743
+
744
+blockcopy_pp_64xN_neon 16
745
+blockcopy_pp_64xN_neon 32
746
+blockcopy_pp_64xN_neon 48
747
+blockcopy_pp_64xN_neon 64
748
+
749
+// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
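A scalar sketch of blockfill_s, assuming N x N blocks of int16_t and a stride counted in elements; the vector code broadcasts val with dup and stores whole rows:

    #include <stdint.h>

    template<int N>
    static void blockfill_s_ref(int16_t* dst, intptr_t dstride, int16_t val)
    {
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                dst[x] = val;   // fill the row with the broadcast value
            dst += dstride;
        }
    }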
750
+function PFX(blockfill_s_4x4_neon)
751
+    dup             v0.4h, w2
752
+    lsl             x1, x1, #1
753
+.rept 4
754
+    st1             {v0.4h}, x0, x1
755
+.endr
756
+    ret
757
+endfunc
758
+
759
+function PFX(blockfill_s_8x8_neon)
760
+    dup             v0.8h, w2
761
+    lsl             x1, x1, #1
762
+.rept 8
763
+    st1             {v0.8h}, x0, x1
764
+.endr
765
+    ret
766
+endfunc
767
+
768
+function PFX(blockfill_s_16x16_neon)
769
+    dup             v0.8h, w2
770
+    mov             v1.16b, v0.16b
771
+    lsl             x1, x1, #1
772
+.rept 16
773
+    stp             q0, q1, x0
774
+    add             x0, x0, x1
775
+.endr
776
+    ret
777
+endfunc
778
+
779
+function PFX(blockfill_s_32x32_neon)
780
+    dup             v0.8h, w2
781
+    mov             v1.16b, v0.16b
782
+    mov             v2.16b, v0.16b
783
+    mov             v3.16b, v0.16b
784
+    lsl             x1, x1, #1
785
+.rept 32
786
+    st1             {v0.8h-v3.8h}, x0, x1
787
+.endr
788
+    ret
789
+endfunc
790
+
791
+function PFX(blockfill_s_64x64_neon)
792
+    dup             v0.8h, w2
793
+    mov             v1.16b, v0.16b
794
+    mov             v2.16b, v0.16b
795
+    mov             v3.16b, v0.16b
796
+    lsl             x1, x1, #1
797
+    sub             x1, x1, #64
798
+.rept 64
799
+    st1             {v0.8h-v3.8h}, x0, #64
800
+    st1             {v0.8h-v3.8h}, x0, x1
801
+.endr
802
+    ret
803
+endfunc
804
+
805
+// uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
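copy_count copies an N x N residual block into the packed coefficient buffer and returns how many entries are non-zero; the NEON versions reach the same count by comparing lanes against zero. A scalar sketch matching the tail loop of copy_count_neon in dct-prim.cpp further down:

    #include <stdint.h>

    template<int N>
    static uint32_t copy_count_ref(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
    {
        uint32_t numSig = 0;   // number of non-zero coefficients copied
        for (int k = 0; k < N; k++)
        {
            for (int j = 0; j < N; j++)
            {
                coeff[j] = residual[j];
                numSig += (residual[j] != 0);
            }
            residual += resiStride;
            coeff += N;
        }
        return numSig;
    }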
806
+function PFX(copy_cnt_4_neon)
807
+    lsl             x2, x2, #1
808
+    movi            v4.8b, #0
809
+.rept 2
810
+    ld1             {v0.8b}, x1, x2
811
+    ld1             {v1.8b}, x1, x2
812
+    stp             d0, d1, x0, #16
813
+    cmeq            v0.4h, v0.4h, #0
814
+    cmeq            v1.4h, v1.4h, #0
815
+    add             v4.4h, v4.4h, v0.4h
816
+    add             v4.4h, v4.4h, v1.4h
817
+.endr
818
+    saddlv          s4, v4.4h
819
+    fmov            w12, s4
820
+    add             w0, w12, #16
821
+    ret
822
+endfunc
823
+
824
+function PFX(copy_cnt_8_neon)
825
+    lsl             x2, x2, #1
826
+    movi            v4.8b, #0
827
+.rept 4
828
+    ld1             {v0.16b}, x1, x2
829
+    ld1             {v1.16b}, x1, x2
830
+    stp             q0, q1, x0, #32
831
+    cmeq            v0.8h, v0.8h, #0
832
+    cmeq            v1.8h, v1.8h, #0
833
+    add             v4.8h, v4.8h, v0.8h
834
+    add             v4.8h, v4.8h, v1.8h
835
+.endr
836
+    saddlv          s4, v4.8h
837
+    fmov            w12, s4
838
+    add             w0, w12, #64
839
+    ret
840
+endfunc
841
+
842
+function PFX(copy_cnt_16_neon)
843
+    lsl             x2, x2, #1
844
+    movi            v4.8b, #0
845
+.rept 16
846
+    ld1             {v0.16b-v1.16b}, x1, x2
847
+    st1             {v0.16b-v1.16b}, x0, #32
848
+    cmeq            v0.8h, v0.8h, #0
849
+    cmeq            v1.8h, v1.8h, #0
850
+    add             v4.8h, v4.8h, v0.8h
851
+    add             v4.8h, v4.8h, v1.8h
852
+.endr
853
+    saddlv          s4, v4.8h
854
+    fmov            w12, s4
855
+    add             w0, w12, #256
856
+    ret
857
+endfunc
858
+
859
+function PFX(copy_cnt_32_neon)
860
+    lsl             x2, x2, #1
861
+    movi            v4.8b, #0
862
+.rept 32
863
+    ld1             {v0.16b-v3.16b}, x1, x2
864
+    st1             {v0.16b-v3.16b}, x0, #64
865
+    cmeq            v0.8h, v0.8h, #0
866
+    cmeq            v1.8h, v1.8h, #0
867
+    cmeq            v2.8h, v2.8h, #0
868
+    cmeq            v3.8h, v3.8h, #0
869
+    add             v0.8h, v0.8h, v1.8h
870
+    add             v2.8h, v2.8h, v3.8h
871
+    add             v4.8h, v4.8h, v0.8h
872
+    add             v4.8h, v4.8h, v2.8h
873
+.endr
874
+    saddlv          s4, v4.8h
875
+    fmov            w12, s4
876
+    add             w0, w12, #1024
877
+    ret
878
+endfunc
879
+
880
+// int  count_nonzero_c(const int16_t* quantCoeff)
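count_nonzero counts the non-zero quantized coefficients of a trSize x trSize block; the vector versions arrive at the same count via lane-wise compares. A scalar sketch of the semantics:

    #include <stdint.h>

    template<int trSize>
    static int count_nonzero_ref(const int16_t* quantCoeff)
    {
        int count = 0;
        for (int i = 0; i < trSize * trSize; i++)
            count += (quantCoeff[i] != 0);
        return count;
    }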
881
+function PFX(count_nonzero_4_neon)
882
+    movi            v16.16b, #1
883
+    movi            v17.16b, #0
884
+    trn1            v16.16b, v16.16b, v17.16b
885
+    ldp             q0, q1, x0
886
+    cmhi            v0.8h, v0.8h, v17.8h
887
+    cmhi            v1.8h, v1.8h, v17.8h
888
+    and             v0.16b, v0.16b, v16.16b
889
+    and             v1.16b, v1.16b, v16.16b
890
+    add             v0.8h, v0.8h, v1.8h
891
+    uaddlv          s0, v0.8h
892
+    fmov            w0, s0
893
+    ret
894
+endfunc
895
+
896
+.macro COUNT_NONZERO_8
897
+    ld1             {v0.16b-v3.16b}, x0, #64
898
+    ld1             {v4.16b-v7.16b}, x0, #64
899
+    cmhi            v0.8h, v0.8h, v17.8h
900
+    cmhi            v1.8h, v1.8h, v17.8h
901
+    cmhi            v2.8h, v2.8h, v17.8h
902
+    cmhi            v3.8h, v3.8h, v17.8h
903
+    cmhi            v4.8h, v4.8h, v17.8h
904
+    cmhi            v5.8h, v5.8h, v17.8h
905
+    cmhi            v6.8h, v6.8h, v17.8h
906
+    cmhi            v7.8h, v7.8h, v17.8h
907
+    and             v0.16b, v0.16b, v16.16b
908
+    and             v1.16b, v1.16b, v16.16b
909
+    and             v2.16b, v2.16b, v16.16b
910
+    and             v3.16b, v3.16b, v16.16b
911
+    and             v4.16b, v4.16b, v16.16b
912
+    and             v5.16b, v5.16b, v16.16b
913
+    and             v6.16b, v6.16b, v16.16b
914
+    and             v7.16b, v7.16b, v16.16b
915
+    add             v0.8h, v0.8h, v1.8h
916
+    add             v2.8h, v2.8h, v3.8h
917
+    add             v4.8h, v4.8h, v5.8h
918
+    add             v6.8h, v6.8h, v7.8h
919
+    add             v0.8h, v0.8h, v2.8h
920
+    add             v4.8h, v4.8h, v6.8h
921
+    add             v0.8h, v0.8h, v4.8h
922
+.endm
923
+
924
+function PFX(count_nonzero_8_neon)
925
+    movi            v16.16b, #1
926
+    movi            v17.16b, #0
927
+    trn1            v16.16b, v16.16b, v17.16b
928
+    COUNT_NONZERO_8
929
+    uaddlv          s0, v0.8h
930
+    fmov            w0, s0
931
+    ret
932
+endfunc
933
+
934
+function PFX(count_nonzero_16_neon)
935
+    movi            v16.16b, #1
936
+    movi            v17.16b, #0
937
+    trn1            v16.16b, v16.16b, v17.16b
938
+    movi            v18.16b, #0
939
+.rept 4
940
+    COUNT_NONZERO_8
941
+    add             v18.16b, v18.16b, v0.16b
942
+.endr
943
+    uaddlv          s0, v18.8h
944
+    fmov            w0, s0
945
+    ret
946
+endfunc
947
+
948
+function PFX(count_nonzero_32_neon)
949
+    movi            v16.16b, #1
950
+    movi            v17.16b, #0
951
+    trn1            v16.16b, v16.16b, v17.16b
952
+    movi            v18.16b, #0
953
+    mov             w12, #16
954
+.loop_count_nonzero_32:
955
+    sub             w12, w12, #1
956
+    COUNT_NONZERO_8
957
+    add             v18.16b, v18.16b, v0.16b
958
+    cbnz            w12, .loop_count_nonzero_32
959
+
960
+    uaddlv          s0, v18.8h
961
+    fmov            w0, s0
962
+    ret
963
+endfunc
964
+
965
+// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
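A scalar sketch of cpy2Dto1D_shl: gather an N x N strided block into a packed buffer while shifting left (srcStride in int16_t units, N fixed per specialization); illustration only:

    #include <stdint.h>

    template<int N>
    static void cpy2Dto1D_shl_ref(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
    {
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
                dst[j] = (int16_t)(src[j] << shift);
            src += srcStride;  // walk the strided source
            dst += N;          // pack destination rows back to back
        }
    }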
966
+.macro cpy2Dto1D_shl_start
967
+    add             x2, x2, x2
968
+    dup             v0.8h, w3
969
+.endm
970
+
971
+function PFX(cpy2Dto1D_shl_4x4_neon)
972
+    cpy2Dto1D_shl_start
973
+    ld1             {v2.d}0, x1, x2
974
+    ld1             {v2.d}1, x1, x2
975
+    ld1             {v3.d}0, x1, x2
976
+    ld1             {v3.d}1, x1, x2
977
+    sshl            v2.8h, v2.8h, v0.8h
978
+    sshl            v3.8h, v3.8h, v0.8h
979
+    st1             {v2.16b-v3.16b}, x0
980
+    ret
981
+endfunc
982
+
983
+function PFX(cpy2Dto1D_shl_8x8_neon)
984
+    cpy2Dto1D_shl_start
985
+.rept 4
986
+    ld1             {v2.16b}, x1, x2
987
+    ld1             {v3.16b}, x1, x2
988
+    sshl            v2.8h, v2.8h, v0.8h
989
+    sshl            v3.8h, v3.8h, v0.8h
990
+    st1             {v2.16b-v3.16b}, x0, #32
991
+.endr
992
+    ret
993
+endfunc
994
+
995
+function PFX(cpy2Dto1D_shl_16x16_neon)
996
+    cpy2Dto1D_shl_start
997
+    mov             w12, #4
998
+.loop_cpy2Dto1D_shl_16:
999
+    sub             w12, w12, #1
1000
+.rept 4
1001
+    ld1             {v2.16b-v3.16b}, x1, x2
1002
+    sshl            v2.8h, v2.8h, v0.8h
1003
+    sshl            v3.8h, v3.8h, v0.8h
1004
+    st1             {v2.16b-v3.16b}, x0, #32
1005
+.endr
1006
+    cbnz            w12, .loop_cpy2Dto1D_shl_16
1007
+    ret
1008
+endfunc
1009
+
1010
+function PFX(cpy2Dto1D_shl_32x32_neon)
1011
+    cpy2Dto1D_shl_start
1012
+    mov             w12, #16
1013
+.loop_cpy2Dto1D_shl_32:
1014
+    sub             w12, w12, #1
1015
+.rept 2
1016
+    ld1             {v2.16b-v5.16b}, x1, x2
1017
+    sshl            v2.8h, v2.8h, v0.8h
1018
+    sshl            v3.8h, v3.8h, v0.8h
1019
+    sshl            v4.8h, v4.8h, v0.8h
1020
+    sshl            v5.8h, v5.8h, v0.8h
1021
+    st1             {v2.16b-v5.16b}, x0, #64
1022
+.endr
1023
+    cbnz            w12, .loop_cpy2Dto1D_shl_32
1024
+    ret
1025
+endfunc
1026
+
1027
+function PFX(cpy2Dto1D_shl_64x64_neon)
1028
+    cpy2Dto1D_shl_start
1029
+    mov             w12, #32
1030
+    sub             x2, x2, #64
1031
+.loop_cpy2Dto1D_shl_64:
1032
+    sub             w12, w12, #1
1033
+.rept 2
1034
+    ld1             {v2.16b-v5.16b}, x1, #64
1035
+    ld1             {v16.16b-v19.16b}, x1, x2
1036
+    sshl            v2.8h, v2.8h, v0.8h
1037
+    sshl            v3.8h, v3.8h, v0.8h
1038
+    sshl            v4.8h, v4.8h, v0.8h
1039
+    sshl            v5.8h, v5.8h, v0.8h
1040
+    sshl            v16.8h, v16.8h, v0.8h
1041
+    sshl            v17.8h, v17.8h, v0.8h
1042
+    sshl            v18.8h, v18.8h, v0.8h
1043
+    sshl            v19.8h, v19.8h, v0.8h
1044
+    st1             {v2.16b-v5.16b}, x0, #64
1045
+    st1             {v16.16b-v19.16b}, x0, #64
1046
+.endr
1047
+    cbnz            w12, .loop_cpy2Dto1D_shl_64
1048
+    ret
1049
+endfunc
1050
+
1051
+// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
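The _shr variant is the same gather but with a rounding right shift instead of a left shift; a sketch assuming shift > 0:

    #include <stdint.h>

    template<int N>
    static void cpy2Dto1D_shr_ref(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
    {
        const int16_t round = (int16_t)(1 << (shift - 1));
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
                dst[j] = (int16_t)((src[j] + round) >> shift);
            src += srcStride;
            dst += N;
        }
    }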
1052
+function PFX(cpy2Dto1D_shr_4x4_neon)
1053
+    cpy2Dto1D_shr_start
1054
+    ld1             {v2.d}0, x1, x2
1055
+    ld1             {v2.d}1, x1, x2
1056
+    ld1             {v3.d}0, x1, x2
1057
+    ld1             {v3.d}1, x1, x2
1058
+    sub             v2.8h, v2.8h, v1.8h
1059
+    sub             v3.8h, v3.8h, v1.8h
1060
+    sshl            v2.8h, v2.8h, v0.8h
1061
+    sshl            v3.8h, v3.8h, v0.8h
1062
+    stp             q2, q3, x0
1063
+    ret
1064
+endfunc
1065
+
1066
+function PFX(cpy2Dto1D_shr_8x8_neon)
1067
+    cpy2Dto1D_shr_start
1068
+.rept 4
1069
+    ld1             {v2.16b}, x1, x2
1070
+    ld1             {v3.16b}, x1, x2
1071
+    sub             v2.8h, v2.8h, v1.8h
1072
+    sub             v3.8h, v3.8h, v1.8h
1073
+    sshl            v2.8h, v2.8h, v0.8h
1074
+    sshl            v3.8h, v3.8h, v0.8h
1075
+    stp             q2, q3, x0, #32
1076
+.endr
1077
+    ret
1078
+endfunc
1079
+
1080
+function PFX(cpy2Dto1D_shr_16x16_neon)
1081
+    cpy2Dto1D_shr_start
1082
+    mov             w12, #4
1083
+.loop_cpy2Dto1D_shr_16:
1084
+    sub             w12, w12, #1
1085
+.rept 4
1086
+    ld1             {v2.8h-v3.8h}, x1, x2
1087
+    sub             v2.8h, v2.8h, v1.8h
1088
+    sub             v3.8h, v3.8h, v1.8h
1089
+    sshl            v2.8h, v2.8h, v0.8h
1090
+    sshl            v3.8h, v3.8h, v0.8h
1091
+    st1             {v2.8h-v3.8h}, x0, #32
1092
+.endr
1093
+    cbnz            w12, .loop_cpy2Dto1D_shr_16
1094
+    ret
1095
+endfunc
1096
+
1097
+function PFX(cpy2Dto1D_shr_32x32_neon)
1098
+    cpy2Dto1D_shr_start
1099
+    mov             w12, #16
1100
+.loop_cpy2Dto1D_shr_32:
1101
+    sub             w12, w12, #1
1102
+.rept 2
1103
+    ld1             {v2.8h-v5.8h}, x1, x2
1104
+    sub             v2.8h, v2.8h, v1.8h
1105
+    sub             v3.8h, v3.8h, v1.8h
1106
+    sub             v4.8h, v4.8h, v1.8h
1107
+    sub             v5.8h, v5.8h, v1.8h
1108
+    sshl            v2.8h, v2.8h, v0.8h
1109
+    sshl            v3.8h, v3.8h, v0.8h
1110
+    sshl            v4.8h, v4.8h, v0.8h
1111
+    sshl            v5.8h, v5.8h, v0.8h
1112
+    st1             {v2.8h-v5.8h}, x0, #64
1113
+.endr
1114
+    cbnz            w12, .loop_cpy2Dto1D_shr_32
1115
+    ret
1116
+endfunc
1117
+
1118
+// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
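cpy1Dto2D_shl is the opposite layout of cpy2Dto1D_shl: scatter a packed buffer into a strided block while shifting left; sketch only, same assumptions as the earlier sketches:

    #include <stdint.h>

    template<int N>
    static void cpy1Dto2D_shl_ref(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
    {
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
                dst[j] = (int16_t)(src[j] << shift);
            src += N;          // packed source rows
            dst += dstStride;  // strided destination rows
        }
    }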
1119
+.macro cpy1Dto2D_shl_start
1120
+    add             x2, x2, x2
1121
+    dup             v0.8h, w3
1122
+.endm
1123
+
1124
+function PFX(cpy1Dto2D_shl_4x4_neon)
1125
+    cpy1Dto2D_shl_start
1126
+    ld1             {v2.16b-v3.16b}, x1
1127
+    sshl            v2.8h, v2.8h, v0.8h
1128
+    sshl            v3.8h, v3.8h, v0.8h
1129
+    st1             {v2.d}0, x0, x2
1130
+    st1             {v2.d}1, x0, x2
1131
+    st1             {v3.d}0, x0, x2
1132
+    st1             {v3.d}1, x0, x2
1133
+    ret
1134
+endfunc
1135
+
1136
+function PFX(cpy1Dto2D_shl_8x8_neon)
1137
+    cpy1Dto2D_shl_start
1138
+.rept 4
1139
+    ld1             {v2.16b-v3.16b}, x1, #32
1140
+    sshl            v2.8h, v2.8h, v0.8h
1141
+    sshl            v3.8h, v3.8h, v0.8h
1142
+    st1             {v2.16b}, x0, x2
1143
+    st1             {v3.16b}, x0, x2
1144
+.endr
1145
+    ret
1146
+endfunc
1147
+
1148
+function PFX(cpy1Dto2D_shl_16x16_neon)
1149
+    cpy1Dto2D_shl_start
1150
+    mov             w12, #4
1151
+.loop_cpy1Dto2D_shl_16:
1152
+    sub             w12, w12, #1
1153
+.rept 4
1154
+    ld1             {v2.16b-v3.16b}, x1, #32
1155
+    sshl            v2.8h, v2.8h, v0.8h
1156
+    sshl            v3.8h, v3.8h, v0.8h
1157
+    st1             {v2.16b-v3.16b}, x0, x2
1158
+.endr
1159
+    cbnz            w12, .loop_cpy1Dto2D_shl_16
1160
+    ret
1161
+endfunc
1162
+
1163
+function PFX(cpy1Dto2D_shl_32x32_neon)
1164
+    cpy1Dto2D_shl_start
1165
+    mov             w12, #16
1166
+.loop_cpy1Dto2D_shl_32:
1167
+    sub             w12, w12, #1
1168
+.rept 2
1169
+    ld1             {v2.16b-v5.16b}, x1, #64
1170
+    sshl            v2.8h, v2.8h, v0.8h
1171
+    sshl            v3.8h, v3.8h, v0.8h
1172
+    sshl            v4.8h, v4.8h, v0.8h
1173
+    sshl            v5.8h, v5.8h, v0.8h
1174
+    st1             {v2.16b-v5.16b}, x0, x2
1175
+.endr
1176
+    cbnz            w12, .loop_cpy1Dto2D_shl_32
1177
+    ret
1178
+endfunc
1179
+
1180
+function PFX(cpy1Dto2D_shl_64x64_neon)
1181
+    cpy1Dto2D_shl_start
1182
+    mov             w12, #32
1183
+    sub             x2, x2, #64
1184
+.loop_cpy1Dto2D_shl_64:
1185
+    sub             w12, w12, #1
1186
+.rept 2
1187
+    ld1             {v2.16b-v5.16b}, x1, #64
1188
+    ld1             {v16.16b-v19.16b}, x1, #64
1189
+    sshl            v2.8h, v2.8h, v0.8h
1190
+    sshl            v3.8h, v3.8h, v0.8h
1191
+    sshl            v4.8h, v4.8h, v0.8h
1192
+    sshl            v5.8h, v5.8h, v0.8h
1193
+    sshl            v16.8h, v16.8h, v0.8h
1194
+    sshl            v17.8h, v17.8h, v0.8h
1195
+    sshl            v18.8h, v18.8h, v0.8h
1196
+    sshl            v19.8h, v19.8h, v0.8h
1197
+    st1             {v2.16b-v5.16b}, x0, #64
1198
+    st1             {v16.16b-v19.16b}, x0, x2
1199
+.endr
1200
+    cbnz            w12, .loop_cpy1Dto2D_shl_64
1201
+    ret
1202
+endfunc
1203
+
1204
+function PFX(cpy1Dto2D_shr_4x4_neon)
1205
+    cpy1Dto2D_shr_start
1206
+    ld1             {v2.16b-v3.16b}, x1
1207
+    sub             v2.8h, v2.8h, v1.8h
1208
+    sub             v3.8h, v3.8h, v1.8h
1209
+    sshl            v2.8h, v2.8h, v0.8h
1210
+    sshl            v3.8h, v3.8h, v0.8h
1211
+    st1             {v2.d}0, x0, x2
1212
+    st1             {v2.d}1, x0, x2
1213
+    st1             {v3.d}0, x0, x2
1214
+    st1             {v3.d}1, x0, x2
1215
+    ret
1216
+endfunc
1217
+
1218
+function PFX(cpy1Dto2D_shr_8x8_neon)
1219
+    cpy1Dto2D_shr_start
1220
+.rept 4
1221
+    ld1             {v2.16b-v3.16b}, x1, #32
1222
+    sub             v2.8h, v2.8h, v1.8h
1223
+    sub             v3.8h, v3.8h, v1.8h
1224
+    sshl            v2.8h, v2.8h, v0.8h
1225
+    sshl            v3.8h, v3.8h, v0.8h
1226
+    st1             {v2.16b}, x0, x2
1227
+    st1             {v3.16b}, x0, x2
1228
+.endr
1229
+    ret
1230
+endfunc
1231
+
1232
+function PFX(cpy1Dto2D_shr_16x16_neon)
1233
+    cpy1Dto2D_shr_start
1234
+    mov             w12, #4
1235
+.loop_cpy1Dto2D_shr_16:
1236
+    sub             w12, w12, #1
1237
+.rept 4
1238
+    ld1             {v2.8h-v3.8h}, x1, #32
1239
+    sub             v2.8h, v2.8h, v1.8h
1240
+    sub             v3.8h, v3.8h, v1.8h
1241
+    sshl            v2.8h, v2.8h, v0.8h
1242
+    sshl            v3.8h, v3.8h, v0.8h
1243
+    st1             {v2.8h-v3.8h}, x0, x2
1244
+.endr
1245
+    cbnz            w12, .loop_cpy1Dto2D_shr_16
1246
+    ret
1247
+endfunc
1248
+
1249
+function PFX(cpy1Dto2D_shr_32x32_neon)
1250
+    cpy1Dto2D_shr_start
1251
+    mov             w12, #16
1252
+.loop_cpy1Dto2D_shr_32:
1253
+    sub             w12, w12, #1
1254
+.rept 2
1255
+    ld1             {v2.16b-v5.16b}, x1, #64
1256
+    sub             v2.8h, v2.8h, v1.8h
1257
+    sub             v3.8h, v3.8h, v1.8h
1258
+    sub             v4.8h, v4.8h, v1.8h
1259
+    sub             v5.8h, v5.8h, v1.8h
1260
+    sshl            v2.8h, v2.8h, v0.8h
1261
+    sshl            v3.8h, v3.8h, v0.8h
1262
+    sshl            v4.8h, v4.8h, v0.8h
1263
+    sshl            v5.8h, v5.8h, v0.8h
1264
+    st1             {v2.16b-v5.16b}, x0, x2
1265
+.endr
1266
+    cbnz            w12, .loop_cpy1Dto2D_shr_32
1267
+    ret
1268
+endfunc
1269
+
1270
+function PFX(cpy1Dto2D_shr_64x64_neon)
1271
+    cpy1Dto2D_shr_start
1272
+    mov             w12, #32
1273
+    sub             x2, x2, #64
1274
+.loop_cpy1Dto2D_shr_64:
1275
+    sub             w12, w12, #1
1276
+.rept 2
1277
+    ld1             {v2.16b-v5.16b}, x1, #64
1278
+    ld1             {v16.16b-v19.16b}, x1, #64
1279
+    sub             v2.8h, v2.8h, v1.8h
1280
+    sub             v3.8h, v3.8h, v1.8h
1281
+    sub             v4.8h, v4.8h, v1.8h
1282
+    sub             v5.8h, v5.8h, v1.8h
1283
+    sub             v16.8h, v16.8h, v1.8h
1284
+    sub             v17.8h, v17.8h, v1.8h
1285
+    sub             v18.8h, v18.8h, v1.8h
1286
+    sub             v19.8h, v19.8h, v1.8h
1287
+    sshl            v2.8h, v2.8h, v0.8h
1288
+    sshl            v3.8h, v3.8h, v0.8h
1289
+    sshl            v4.8h, v4.8h, v0.8h
1290
+    sshl            v5.8h, v5.8h, v0.8h
1291
+    sshl            v16.8h, v16.8h, v0.8h
1292
+    sshl            v17.8h, v17.8h, v0.8h
1293
+    sshl            v18.8h, v18.8h, v0.8h
1294
+    sshl            v19.8h, v19.8h, v0.8h
1295
+    st1             {v2.16b-v5.16b}, x0, #64
1296
+    st1             {v16.16b-v19.16b}, x0, x2
1297
+.endr
1298
+    cbnz            w12, .loop_cpy1Dto2D_shr_64
1299
+    ret
1300
+endfunc
1301
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp Added
950
 
1
@@ -0,0 +1,948 @@
2
+#include "dct-prim.h"
3
+
4
+
5
+#if HAVE_NEON
6
+
7
+#include <arm_neon.h>
8
+
9
+
10
+namespace
11
+{
12
+using namespace X265_NS;
13
+
14
+
15
+static int16x8_t rev16(const int16x8_t a)
16
+{
17
+    static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
18
+    return vqtbx1q_u8(a, a, tbl);
19
+}
20
+
21
+static int32x4_t rev32(const int32x4_t a)
22
+{
23
+    static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
24
+    return vqtbx1q_u8(a, a, tbl);
25
+}
26
+
27
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
28
+{
29
+    int16x4_t s0, s1, s2, s3;
30
+    s0 = vtrn1_s32(x0, x2);
31
+    s1 = vtrn1_s32(x1, x3);
32
+    s2 = vtrn2_s32(x0, x2);
33
+    s3 = vtrn2_s32(x1, x3);
34
+
35
+    x0 = vtrn1_s16(s0, s1);
36
+    x1 = vtrn2_s16(s0, s1);
37
+    x2 = vtrn1_s16(s2, s3);
38
+    x3 = vtrn2_s16(s2, s3);
39
+}
40
+
41
+
42
+
43
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
44
+                           uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
45
+{
46
+
47
+    // This is an optimized function for scanPosLast, which removes the rmw dependency, once integrated into mainline x265, should replace reference implementation
48
+    // For clarity, left the original reference code in comments
49
+    int scanPosLast = 0;
50
+
51
+    uint16_t cSign = 0;
52
+    uint16_t cFlag = 0;
53
+    uint8_t cNum = 0;
54
+
55
+    uint32_t prevcgIdx = 0;
56
+    do
57
+    {
58
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
59
+
60
+        const uint32_t posLast = scan[scanPosLast];
61
+
62
+        const int curCoeff = coeff[posLast];
63
+        const uint32_t isNZCoeff = (curCoeff != 0);
64
+        /*
65
+        NOTE: the new algorithm is complicated, so I keep reference code here
66
+        uint32_t posy   = posLast >> log2TrSize;
67
+        uint32_t posx   = posLast - (posy << log2TrSize);
68
+        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
69
+        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
70
+        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
71
+        */
72
+
73
+        // get L1 sig map
74
+        numSig -= isNZCoeff;
75
+
76
+        if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
77
+        {
78
+            coeffSign[prevcgIdx] = cSign;
79
+            coeffFlag[prevcgIdx] = cFlag;
80
+            coeffNum[prevcgIdx] = cNum;
81
+            cSign = 0;
82
+            cFlag = 0;
83
+            cNum = 0;
84
+        }
85
+        // TODO: optimize by instruction BTS
86
+        cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
87
+        cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
88
+        cNum += (uint8_t)isNZCoeff;
89
+        prevcgIdx = cgIdx;
90
+        scanPosLast++;
91
+    }
92
+    while (numSig > 0);
93
+
94
+    coeffSign[prevcgIdx] = cSign;
95
+    coeffFlag[prevcgIdx] = cFlag;
96
+    coeffNum[prevcgIdx] = cNum;
97
+    return scanPosLast - 1;
98
+}
99
+
100
+
101
+#if (MLS_CG_SIZE == 4)
102
+template<int log2TrSize>
103
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
104
+                                int64_t *totalRdCost, uint32_t blkPos)
105
+{
106
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
107
+                               log2TrSize; /* Represents scaling through forward transform */
108
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
109
+    const uint32_t trSize = 1 << log2TrSize;
110
+
111
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
112
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
113
+    for (int y = 0; y < MLS_CG_SIZE; y++)
114
+    {
115
+        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
116
+        int32x4_t mul = vmull_s16(in, in);
117
+        int64x2_t cost0, cost1;
118
+        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
119
+        cost1 = vshll_high_n_s32(mul, scaleBits);
120
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
121
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
122
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
123
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
124
+        blkPos += trSize;
125
+    }
126
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
127
+    *totalUncodedCost += sum;
128
+    *totalRdCost += sum;
129
+}
130
+
131
+template<int log2TrSize>
132
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
133
+                             int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
134
+{
135
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
136
+                               log2TrSize; /* Represents scaling through forward transform */
137
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
138
+    const uint32_t trSize = 1 << log2TrSize;
139
+    //using preprocessor to bypass clang bug
140
+    const int max = X265_MAX(0, (2 * transformShift + 1));
141
+
142
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
143
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
144
+    int32x4_t vpsy = vdupq_n_s32(*psyScale);
145
+    for (int y = 0; y < MLS_CG_SIZE; y++)
146
+    {
147
+        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
148
+        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
149
+        int64x2_t cost0, cost1;
150
+        cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
151
+        cost1 = vmull_high_s32(signCoef, signCoef);
152
+        cost0 = vshlq_n_s64(cost0, scaleBits);
153
+        cost1 = vshlq_n_s64(cost1, scaleBits);
154
+        int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
155
+        int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
156
+        if (max > 0)
157
+        {
158
+            int64x2_t shift = vdupq_n_s64(-max);
159
+            neg0 = vshlq_s64(neg0, shift);
160
+            neg1 = vshlq_s64(neg1, shift);
161
+        }
162
+        cost0 = vsubq_s64(cost0, neg0);
163
+        cost1 = vsubq_s64(cost1, neg1);
164
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
165
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
166
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
167
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
168
+
169
+        blkPos += trSize;
170
+    }
171
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
172
+    *totalUncodedCost += sum;
173
+    *totalRdCost += sum;
174
+}
175
+
176
+#else
177
+#error "MLS_CG_SIZE must be 4 for neon version"
178
+#endif
179
+
180
+
181
+
182
+template<int trSize>
183
+int  count_nonzero_neon(const int16_t *quantCoeff)
184
+{
185
+    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
186
+    int count = 0;
187
+    int16x8_t vcount = vdupq_n_s16(0);
188
+    const int numCoeff = trSize * trSize;
189
+    int i = 0;
190
+    for (; (i + 8) <= numCoeff; i += 8)
191
+    {
192
+        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
193
+        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
194
+    }
195
+    for (; i < numCoeff; i++)
196
+    {
197
+        count += quantCoeff[i] != 0;
198
+    }
199
+
200
+    return count - vaddvq_s16(vcount);
201
+}
202
+
203
+template<int trSize>
204
+uint32_t copy_count_neon(int16_t *coeff, const int16_t *residual, intptr_t resiStride)
205
+{
206
+    uint32_t numSig = 0;
207
+    int16x8_t vcount = vdupq_n_s16(0);
208
+    for (int k = 0; k < trSize; k++)
209
+    {
210
+        int j = 0;
211
+        for (; (j + 8) <= trSize; j += 8)
212
+        {
213
+            int16x8_t in = *(int16x8_t *)&residual[j];
214
+            *(int16x8_t *)&coeff[j] = in;
215
+            vcount = vaddq_s16(vcount, vtstq_s16(in, in));
216
+        }
217
+        for (; j < trSize; j++)
218
+        {
219
+            coeff[j] = residual[j];
220
+            numSig += (residual[j] != 0);
221
+        }
222
+        residual += resiStride;
223
+        coeff += trSize;
224
+    }
225
+
226
+    return numSig - vaddvq_s16(vcount);
227
+}
228
+
229
+
230
+static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line)
231
+{
232
+    int j, k;
233
+    int32x4_t E[2], O[2];
234
+    int32x4_t EE, EO;
235
+    int32x2_t EEE, EEO;
236
+    const int add = 1 << (shift - 1);
237
+    const int32x4_t _vadd = {add, 0};
238
+
239
+    for (j = 0; j < line; j++)
240
+    {
241
+        int16x8_t in0 = *(int16x8_t *)src;
242
+        int16x8_t in1 = rev16(*(int16x8_t *)&src[8]);
243
+
244
+        E[0] = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
245
+        O[0] = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1));
246
+        E[1] = vaddl_high_s16(in0, in1);
247
+        O[1] = vsubl_high_s16(in0, in1);
248
+
249
+        for (k = 1; k < 16; k += 2)
250
+        {
251
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16[k][0]);
252
+            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16[k][4]);
253
+
254
+            int32x4_t res = _vadd;
255
+            res = vmlaq_s32(res, c0, O[0]);
256
+            res = vmlaq_s32(res, c1, O[1]);
257
+            dst[k * line] = (int16_t)(vaddvq_s32(res) >> shift);
258
+        }
259
+
260
+        /* EE and EO */
261
+        EE = vaddq_s32(E[0], rev32(E[1]));
262
+        EO = vsubq_s32(E[0], rev32(E[1]));
263
+
264
+        for (k = 2; k < 16; k += 4)
265
+        {
266
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16[k][0]);
267
+            int32x4_t res = _vadd;
268
+            res = vmlaq_s32(res, c0, EO);
269
+            dst[k * line] = (int16_t)(vaddvq_s32(res) >> shift);
270
+        }
271
+
272
+        /* EEE and EEO */
273
+        EEE[0] = EE[0] + EE[3];
274
+        EEO[0] = EE[0] - EE[3];
275
+        EEE[1] = EE[1] + EE[2];
276
+        EEO[1] = EE[1] - EE[2];
277
+
278
+        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
279
+        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
280
+        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
281
+        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
282
+
283
+
284
+        src += 16;
285
+        dst++;
286
+    }
287
+}
288
+
289
+
290
+static void partialButterfly32(const int16_t *src, int16_t *dst, int shift, int line)
291
+{
292
+    int j, k;
293
+    const int add = 1 << (shift - 1);
294
+
295
+
296
+    for (j = 0; j < line; j++)
297
+    {
298
+        int32x4_t VE[4], VO0, VO1, VO2, VO3;
299
+        int32x4_t VEE[2], VEO[2];
300
+        int32x4_t VEEE, VEEO;
301
+        int EEEE[2], EEEO[2];
302
+
303
+        int16x8x4_t inputs;
304
+        inputs = *(int16x8x4_t *)&src[0];
305
+        int16x8x4_t in_rev;
306
+
307
+        in_rev.val[1] = rev16(inputs.val[2]);
308
+        in_rev.val[0] = rev16(inputs.val[3]);
309
+
310
+        VE[0] = vaddl_s16(vget_low_s16(inputs.val[0]), vget_low_s16(in_rev.val[0]));
311
+        VE[1] = vaddl_high_s16(inputs.val[0], in_rev.val[0]);
312
+        VO0 = vsubl_s16(vget_low_s16(inputs.val[0]), vget_low_s16(in_rev.val[0]));
313
+        VO1 = vsubl_high_s16(inputs.val[0], in_rev.val[0]);
314
+        VE[2] = vaddl_s16(vget_low_s16(inputs.val[1]), vget_low_s16(in_rev.val[1]));
315
+        VE[3] = vaddl_high_s16(inputs.val[1], in_rev.val[1]);
316
+        VO2 = vsubl_s16(vget_low_s16(inputs.val[1]), vget_low_s16(in_rev.val[1]));
317
+        VO3 = vsubl_high_s16(inputs.val[1], in_rev.val[1]);
318
+
319
+        for (k = 1; k < 32; k += 2)
320
+        {
321
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32k0);
322
+            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32k4);
323
+            int32x4_t c2 = vmovl_s16(*(int16x4_t *)&g_t32k8);
324
+            int32x4_t c3 = vmovl_s16(*(int16x4_t *)&g_t32k12);
325
+            int32x4_t s = vmulq_s32(c0, VO0);
326
+            s = vmlaq_s32(s, c1, VO1);
327
+            s = vmlaq_s32(s, c2, VO2);
328
+            s = vmlaq_s32(s, c3, VO3);
329
+
330
+            dstk * line = (int16_t)((vaddvq_s32(s) + add) >> shift);
331
+
332
+        }
333
+
334
+        int32x4_t rev_VE2;
335
+
336
+
337
+        rev_VE0 = rev32(VE3);
338
+        rev_VE1 = rev32(VE2);
339
+
340
+        /* EE and EO */
341
+        for (k = 0; k < 2; k++)
342
+        {
343
+            VEEk = vaddq_s32(VEk, rev_VEk);
344
+            VEOk = vsubq_s32(VEk, rev_VEk);
345
+        }
346
+        for (k = 2; k < 32; k += 4)
347
+        {
348
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32k0);
349
+            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32k4);
350
+            int32x4_t s = vmulq_s32(c0, VEO0);
351
+            s = vmlaq_s32(s, c1, VEO1);
352
+
353
+            dstk * line = (int16_t)((vaddvq_s32(s) + add) >> shift);
354
+
355
+        }
356
+
357
+        int32x4_t tmp = rev32(VEE1);
358
+        VEEE = vaddq_s32(VEE0, tmp);
359
+        VEEO = vsubq_s32(VEE0, tmp);
360
+        for (k = 4; k < 32; k += 8)
361
+        {
362
+            int32x4_t c = vmovl_s16(*(int16x4_t *)&g_t32k0);
363
+            int32x4_t s = vmulq_s32(c, VEEO);
364
+
365
+            dstk * line = (int16_t)((vaddvq_s32(s) + add) >> shift);
366
+        }
367
+
368
+        /* EEEE and EEEO */
369
+        EEEE[0] = VEEE[0] + VEEE[3];
370
+        EEEO[0] = VEEE[0] - VEEE[3];
371
+        EEEE[1] = VEEE[1] + VEEE[2];
372
+        EEEO[1] = VEEE[1] - VEEE[2];
373
+
374
+        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
375
+        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
376
+        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
377
+        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
378
+
379
+
380
+
381
+        src += 32;
382
+        dst++;
383
+    }
384
+}
385
+
386
+static void partialButterfly8(const int16_t *src, int16_t *dst, int shift, int line)
387
+{
388
+    int j, k;
389
+    int E[4], O[4];
390
+    int EE[2], EO[2];
391
+    int add = 1 << (shift - 1);
392
+
393
+    for (j = 0; j < line; j++)
394
+    {
395
+        /* E and O*/
396
+        for (k = 0; k < 4; k++)
397
+        {
398
+            E[k] = src[k] + src[7 - k];
399
+            O[k] = src[k] - src[7 - k];
400
+        }
401
+
402
+        /* EE and EO */
403
+        EE[0] = E[0] + E[3];
404
+        EO[0] = E[0] - E[3];
405
+        EE[1] = E[1] + E[2];
406
+        EO[1] = E[1] - E[2];
407
+
408
+        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
409
+        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
410
+        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
411
+        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
412
+
413
+        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
414
+        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >>
415
+                                  shift);
416
+        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >>
417
+                                  shift);
418
+        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >>
419
+                                  shift);
420
+
421
+        src += 8;
422
+        dst++;
423
+    }
424
+}
425
+
426
+static void partialButterflyInverse4(const int16_t *src, int16_t *dst, int shift, int line)
427
+{
428
+    int j;
429
+    int E[2], O[2];
430
+    int add = 1 << (shift - 1);
431
+
432
+    for (j = 0; j < line; j++)
433
+    {
434
+        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
435
+        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
436
+        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
437
+        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
438
+        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
439
+
440
+        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
441
+        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
442
+        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
443
+        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
444
+        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
445
+
446
+        src++;
447
+        dst += 4;
448
+    }
449
+}
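The "symmetry" the comments refer to is the usual even/odd butterfly: the four outputs come in mirrored pairs, dst[0]/dst[3] = E[0] +/- O[0] and dst[1]/dst[2] = E[1] +/- O[1], so only the four E/O products are needed instead of a full 4x4 multiply. A small self-contained check against the direct matrix form, assuming the standard HEVC 4-point basis for g_t4 (the real table lives elsewhere in the x265 sources):

#include <cassert>

int main()
{
    const int g_t4[4][4] = { { 64,  64,  64,  64 },
                             { 83,  36, -36, -83 },
                             { 64, -64, -64,  64 },
                             { 36, -83,  83, -36 } };   // assumed HEVC 4-point DCT basis
    const int src[4] = { 100, -7, 13, 5 };              // one column of coefficients (line == 1)

    // Butterfly form, as computed above before rounding and clipping.
    int O0 = g_t4[1][0] * src[1] + g_t4[3][0] * src[3];
    int O1 = g_t4[1][1] * src[1] + g_t4[3][1] * src[3];
    int E0 = g_t4[0][0] * src[0] + g_t4[2][0] * src[2];
    int E1 = g_t4[0][1] * src[0] + g_t4[2][1] * src[2];
    const int bf[4] = { E0 + O0, E1 + O1, E1 - O1, E0 - O0 };

    // Direct (transposed-matrix) form: dst[n] = sum over k of g_t4[k][n] * src[k].
    for (int n = 0; n < 4; n++)
    {
        int direct = 0;
        for (int k = 0; k < 4; k++)
            direct += g_t4[k][n] * src[k];
        assert(direct == bf[n]);
    }
    return 0;
}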
450
+
451
+
452
+
453
+static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
454
+{
455
+#define FMAK(x,l) sl = vmlal_lane_s16(sl,*(int16x4_t*)&src(x)*line,*(int16x4_t *)&g_t16xk,l)
456
+#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&srcx*line,*(int16x4_t *)&g_t16xk,l);
457
+#define ODD3_15(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);
458
+#define EVEN6_14_STEP4(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);
459
+
460
+
461
+    int j, k;
462
+    int32x4_t E8, O8;
463
+    int32x4_t EE4, EO4;
464
+    int32x4_t EEE2, EEO2;
465
+    const int add = 1 << (shift - 1);
466
+
467
+
468
+#pragma unroll(4)
469
+    for (j = 0; j < line; j += 4)
470
+    {
471
+        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
472
+
473
+#pragma unroll(2)
474
+        for (k = 0; k < 2; k++)
475
+        {
476
+            int32x4_t s;
477
+            s = vmull_s16(vdup_n_s16(g_t164k), *(int16x4_t *)&src4 * line);;
478
+            EEOk = vmlal_s16(s, vdup_n_s16(g_t1612k), *(int16x4_t *)&src(12) * line);
479
+            s = vmull_s16(vdup_n_s16(g_t160k), *(int16x4_t *)&src0 * line);;
480
+            EEEk = vmlal_s16(s, vdup_n_s16(g_t168k), *(int16x4_t *)&src(8) * line);
481
+        }
482
+
483
+        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
484
+        EE0 = vaddq_s32(EEE0 , EEO0);
485
+        EE2 = vsubq_s32(EEE1 , EEO1);
486
+        EE1 = vaddq_s32(EEE1 , EEO1);
487
+        EE3 = vsubq_s32(EEE0 , EEO0);
488
+
489
+
490
+#pragma unroll(1)
491
+        for (k = 0; k < 4; k += 4)
492
+        {
493
+            int32x4_t s4;
494
+            s0 = MULK(2, 0);
495
+            s1 = MULK(2, 1);
496
+            s2 = MULK(2, 2);
497
+            s3 = MULK(2, 3);
498
+
499
+            EVEN6_14_STEP4(0);
500
+            EVEN6_14_STEP4(1);
501
+            EVEN6_14_STEP4(2);
502
+            EVEN6_14_STEP4(3);
503
+
504
+            EOk = s0;
505
+            EOk + 1 = s1;
506
+            EOk + 2 = s2;
507
+            EOk + 3 = s3;
508
+        }
509
+
510
+
511
+
512
+        static const int32x4_t min = vdupq_n_s32(-32768);
513
+        static const int32x4_t max = vdupq_n_s32(32767);
514
+        const int32x4_t minus_shift = vdupq_n_s32(-shift);
515
+
516
+#pragma unroll(4)
517
+        for (k = 0; k < 4; k++)
518
+        {
519
+            Ek = vaddq_s32(EEk , EOk);
520
+            Ek + 4 = vsubq_s32(EE3 - k , EO3 - k);
521
+        }
522
+
523
+#pragma unroll(2)
524
+        for (k = 0; k < 8; k += 4)
525
+        {
526
+            int32x4_t s4;
527
+            s0 = MULK(1, 0);
528
+            s1 = MULK(1, 1);
529
+            s2 = MULK(1, 2);
530
+            s3 = MULK(1, 3);
531
+            ODD3_15(0);
532
+            ODD3_15(1);
533
+            ODD3_15(2);
534
+            ODD3_15(3);
535
+            Ok = s0;
536
+            Ok + 1 = s1;
537
+            Ok + 2 = s2;
538
+            Ok + 3 = s3;
539
+            int32x4_t t;
540
+            int16x4_t x0, x1, x2, x3;
541
+
542
+            Ek = vaddq_s32(vdupq_n_s32(add), Ek);
543
+            t = vaddq_s32(Ek, Ok);
544
+            t = vshlq_s32(t, minus_shift);
545
+            t = vmaxq_s32(t, min);
546
+            t = vminq_s32(t, max);
547
+            x0 = vmovn_s32(t);
548
+
549
+            Ek + 1 = vaddq_s32(vdupq_n_s32(add), Ek + 1);
550
+            t = vaddq_s32(Ek + 1, Ok + 1);
551
+            t = vshlq_s32(t, minus_shift);
552
+            t = vmaxq_s32(t, min);
553
+            t = vminq_s32(t, max);
554
+            x1 = vmovn_s32(t);
555
+
556
+            Ek + 2 = vaddq_s32(vdupq_n_s32(add), Ek + 2);
557
+            t = vaddq_s32(Ek + 2, Ok + 2);
558
+            t = vshlq_s32(t, minus_shift);
559
+            t = vmaxq_s32(t, min);
560
+            t = vminq_s32(t, max);
561
+            x2 = vmovn_s32(t);
562
+
563
+            Ek + 3 = vaddq_s32(vdupq_n_s32(add), Ek + 3);
564
+            t = vaddq_s32(Ek + 3, Ok + 3);
565
+            t = vshlq_s32(t, minus_shift);
566
+            t = vmaxq_s32(t, min);
567
+            t = vminq_s32(t, max);
568
+            x3 = vmovn_s32(t);
569
+
570
+            transpose_4x4x16(x0, x1, x2, x3);
571
+            *(int16x4_t *)&orig_dst0 * 16 + k = x0;
572
+            *(int16x4_t *)&orig_dst1 * 16 + k = x1;
573
+            *(int16x4_t *)&orig_dst2 * 16 + k = x2;
574
+            *(int16x4_t *)&orig_dst3 * 16 + k = x3;
575
+        }
576
+
577
+
578
+#pragma unroll(2)
579
+        for (k = 0; k < 8; k += 4)
580
+        {
581
+            int32x4_t t;
582
+            int16x4_t x0, x1, x2, x3;
583
+
584
+            t = vsubq_s32(E7 - k, O7 - k);
585
+            t = vshlq_s32(t, minus_shift);
586
+            t = vmaxq_s32(t, min);
587
+            t = vminq_s32(t, max);
588
+            x0 = vmovn_s32(t);
589
+
590
+            t = vsubq_s32(E6 - k, O6 - k);
591
+            t = vshlq_s32(t, minus_shift);
592
+            t = vmaxq_s32(t, min);
593
+            t = vminq_s32(t, max);
594
+            x1 = vmovn_s32(t);
595
+
596
+            t = vsubq_s32(E5 - k, O5 - k);
597
+
598
+            t = vshlq_s32(t, minus_shift);
599
+            t = vmaxq_s32(t, min);
600
+            t = vminq_s32(t, max);
601
+            x2 = vmovn_s32(t);
602
+
603
+            t = vsubq_s32(E4 - k, O4 - k);
604
+            t = vshlq_s32(t, minus_shift);
605
+            t = vmaxq_s32(t, min);
606
+            t = vminq_s32(t, max);
607
+            x3 = vmovn_s32(t);
608
+
609
+            transpose_4x4x16(x0, x1, x2, x3);
610
+            *(int16x4_t *)&orig_dst0 * 16 + k + 8 = x0;
611
+            *(int16x4_t *)&orig_dst1 * 16 + k + 8 = x1;
612
+            *(int16x4_t *)&orig_dst2 * 16 + k + 8 = x2;
613
+            *(int16x4_t *)&orig_dst3 * 16 + k + 8 = x3;
614
+        }
615
+        orig_dst += 4 * 16;
616
+        src += 4;
617
+    }
618
+
619
+#undef MUL
620
+#undef FMA
621
+#undef FMAK
622
+#undef MULK
623
+#undef ODD3_15
624
+#undef EVEN6_14_STEP4
625
+
626
+
627
+}
628
+
629
+
630
+
631
+static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
632
+{
633
+#define MUL(x) vmull_s16(vdup_n_s16(g_t32xk),*(int16x4_t*)&srcx*line);
634
+#define FMA(x) s = vmlal_s16(s,vdup_n_s16(g_t32xk),*(int16x4_t*)&src(x)*line)
635
+#define FMAK(x,l) sl = vmlal_lane_s16(sl,*(int16x4_t*)&src(x)*line,*(int16x4_t *)&g_t32xk,l)
636
+#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&srcx*line,*(int16x4_t *)&g_t32xk,l);
637
+#define ODD31(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);FMAK(17,k);FMAK(19,k);FMAK(21,k);FMAK(23,k);FMAK(25,k);FMAK(27,k);FMAK(29,k);FMAK(31,k);
638
+
639
+#define ODD15(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);FMAK(18,k);FMAK(22,k);FMAK(26,k);FMAK(30,k);
640
+#define ODD7(k) FMAK(12,k);FMAK(20,k);FMAK(28,k);
641
+
642
+
643
+    int j, k;
644
+    int32x4_t E16, O16;
645
+    int32x4_t EE8, EO8;
646
+    int32x4_t EEE4, EEO4;
647
+    int32x4_t EEEE2, EEEO2;
648
+    int16x4_t dst32;
649
+    int add = 1 << (shift - 1);
650
+
651
+#pragma unroll (8)
652
+    for (j = 0; j < line; j += 4)
653
+    {
654
+#pragma unroll (4)
655
+        for (k = 0; k < 16; k += 4)
656
+        {
657
+            int32x4_t s4;
658
+            s0 = MULK(1, 0);
659
+            s1 = MULK(1, 1);
660
+            s2 = MULK(1, 2);
661
+            s3 = MULK(1, 3);
662
+            ODD31(0);
663
+            ODD31(1);
664
+            ODD31(2);
665
+            ODD31(3);
666
+            Ok = s0;
667
+            Ok + 1 = s1;
668
+            Ok + 2 = s2;
669
+            Ok + 3 = s3;
670
+
671
+
672
+        }
673
+
674
+
675
+#pragma unroll (2)
676
+        for (k = 0; k < 8; k += 4)
677
+        {
678
+            int32x4_t s4;
679
+            s0 = MULK(2, 0);
680
+            s1 = MULK(2, 1);
681
+            s2 = MULK(2, 2);
682
+            s3 = MULK(2, 3);
683
+
684
+            ODD15(0);
685
+            ODD15(1);
686
+            ODD15(2);
687
+            ODD15(3);
688
+
689
+            EOk = s0;
690
+            EOk + 1 = s1;
691
+            EOk + 2 = s2;
692
+            EOk + 3 = s3;
693
+        }
694
+
695
+
696
+        for (k = 0; k < 4; k += 4)
697
+        {
698
+            int32x4_t s4;
699
+            s0 = MULK(4, 0);
700
+            s1 = MULK(4, 1);
701
+            s2 = MULK(4, 2);
702
+            s3 = MULK(4, 3);
703
+
704
+            ODD7(0);
705
+            ODD7(1);
706
+            ODD7(2);
707
+            ODD7(3);
708
+
709
+            EEOk = s0;
710
+            EEOk + 1 = s1;
711
+            EEOk + 2 = s2;
712
+            EEOk + 3 = s3;
713
+        }
714
+
715
+#pragma unroll (2)
716
+        for (k = 0; k < 2; k++)
717
+        {
718
+            int32x4_t s;
719
+            s = MUL(8);
720
+            EEEOk = FMA(24);
721
+            s = MUL(0);
722
+            EEEEk = FMA(16);
723
+        }
724
+        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
725
+        EEE0 = vaddq_s32(EEEE0, EEEO0);
726
+        EEE3 = vsubq_s32(EEEE0, EEEO0);
727
+        EEE1 = vaddq_s32(EEEE1, EEEO1);
728
+        EEE2 = vsubq_s32(EEEE1, EEEO1);
729
+
730
+#pragma unroll (4)
731
+        for (k = 0; k < 4; k++)
732
+        {
733
+            EEk = vaddq_s32(EEEk, EEOk);
734
+            EEk + 4 = vsubq_s32((EEE3 - k), (EEO3 - k));
735
+        }
736
+
737
+#pragma unroll (8)
738
+        for (k = 0; k < 8; k++)
739
+        {
740
+            Ek = vaddq_s32(EEk, EOk);
741
+            Ek + 8 = vsubq_s32((EE7 - k), (EO7 - k));
742
+        }
743
+
744
+        static const int32x4_t min = vdupq_n_s32(-32768);
745
+        static const int32x4_t max = vdupq_n_s32(32767);
746
+
747
+
748
+
749
+#pragma unroll (16)
750
+        for (k = 0; k < 16; k++)
751
+        {
752
+            int32x4_t adde = vaddq_s32(vdupq_n_s32(add), Ek);
753
+            int32x4_t s = vaddq_s32(adde, Ok);
754
+            s = vshlq_s32(s, vdupq_n_s32(-shift));
755
+            s = vmaxq_s32(s, min);
756
+            s = vminq_s32(s, max);
757
+
758
+
759
+
760
+            dstk = vmovn_s32(s);
761
+            adde = vaddq_s32(vdupq_n_s32(add), (E15 - k));
762
+            s  = vsubq_s32(adde, (O15 - k));
763
+            s = vshlq_s32(s, vdupq_n_s32(-shift));
764
+            s = vmaxq_s32(s, min);
765
+            s = vminq_s32(s, max);
766
+
767
+            dstk + 16 = vmovn_s32(s);
768
+        }
769
+
770
+
771
+#pragma unroll (8)
772
+        for (k = 0; k < 32; k += 4)
773
+        {
774
+            int16x4_t x0 = dstk + 0;
775
+            int16x4_t x1 = dstk + 1;
776
+            int16x4_t x2 = dstk + 2;
777
+            int16x4_t x3 = dstk + 3;
778
+            transpose_4x4x16(x0, x1, x2, x3);
779
+            *(int16x4_t *)&orig_dst0 * 32 + k = x0;
780
+            *(int16x4_t *)&orig_dst1 * 32 + k = x1;
781
+            *(int16x4_t *)&orig_dst2 * 32 + k = x2;
782
+            *(int16x4_t *)&orig_dst3 * 32 + k = x3;
783
+        }
784
+        orig_dst += 4 * 32;
785
+        src += 4;
786
+    }
787
+#undef MUL
788
+#undef FMA
789
+#undef FMAK
790
+#undef MULK
791
+#undef ODD31
792
+#undef ODD15
793
+#undef ODD7
794
+
795
+}
796
+
797
+
798
+static void dct8_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
799
+{
800
+    const int shift_1st = 2 + X265_DEPTH - 8;
801
+    const int shift_2nd = 9;
802
+
803
+    ALIGN_VAR_32(int16_t, coef[8 * 8]);
804
+    ALIGN_VAR_32(int16_t, block[8 * 8]);
805
+
806
+    for (int i = 0; i < 8; i++)
807
+    {
808
+        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
809
+    }
810
+
811
+    partialButterfly8(block, coef, shift_1st, 8);
812
+    partialButterfly8(coef, dst, shift_2nd, 8);
813
+}
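The two partialButterfly8 passes implement the separable 2-D transform: rows first, then columns, each followed by its own rounding shift. The shift pattern generalises across the sizes used below (shift_1st grows with the log2 of the transform size and with bit depth, shift_2nd only with the size). A tiny helper, for illustration only, that reproduces the constants used in dct8_neon, dct16_neon and dct32_neon:

#include <cstdio>

// shift_1st = log2(N) - 1 + (bitDepth - 8), shift_2nd = log2(N) + 6
static void printDctShifts(int log2TrSize, int bitDepth)
{
    const int shift1 = log2TrSize - 1 + (bitDepth - 8);
    const int shift2 = log2TrSize + 6;
    std::printf("N=%d depth=%d: shift_1st=%d shift_2nd=%d\n",
                1 << log2TrSize, bitDepth, shift1, shift2);
}

int main()
{
    printDctShifts(3, 8);    // 8x8,   8-bit  -> 2 and 9  (as in dct8_neon)
    printDctShifts(4, 10);   // 16x16, 10-bit -> 5 and 10 (as in dct16_neon)
    printDctShifts(5, 8);    // 32x32, 8-bit  -> 4 and 11 (as in dct32_neon)
    return 0;
}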
814
+
815
+static void dct16_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
816
+{
817
+    const int shift_1st = 3 + X265_DEPTH - 8;
818
+    const int shift_2nd = 10;
819
+
820
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
821
+    ALIGN_VAR_32(int16_t, block[16 * 16]);
822
+
823
+    for (int i = 0; i < 16; i++)
824
+    {
825
+        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
826
+    }
827
+
828
+    partialButterfly16(block, coef, shift_1st, 16);
829
+    partialButterfly16(coef, dst, shift_2nd, 16);
830
+}
831
+
832
+static void dct32_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
833
+{
834
+    const int shift_1st = 4 + X265_DEPTH - 8;
835
+    const int shift_2nd = 11;
836
+
837
+    ALIGN_VAR_32(int16_t, coef[32 * 32]);
838
+    ALIGN_VAR_32(int16_t, block[32 * 32]);
839
+
840
+    for (int i = 0; i < 32; i++)
841
+    {
842
+        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
843
+    }
844
+
845
+    partialButterfly32(block, coef, shift_1st, 32);
846
+    partialButterfly32(coef, dst, shift_2nd, 32);
847
+}
848
+
849
+static void idct4_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
850
+{
851
+    const int shift_1st = 7;
852
+    const int shift_2nd = 12 - (X265_DEPTH - 8);
853
+
854
+    ALIGN_VAR_32(int16_t, coef[4 * 4]);
855
+    ALIGN_VAR_32(int16_t, block[4 * 4]);
856
+
857
+    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
858
+    partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
859
+
860
+    for (int i = 0; i < 4; i++)
861
+    {
862
+        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
863
+    }
864
+}
865
+
866
+static void idct16_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
867
+{
868
+    const int shift_1st = 7;
869
+    const int shift_2nd = 12 - (X265_DEPTH - 8);
870
+
871
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
872
+    ALIGN_VAR_32(int16_t, block[16 * 16]);
873
+
874
+    partialButterflyInverse16_neon(src, coef, shift_1st, 16);
875
+    partialButterflyInverse16_neon(coef, block, shift_2nd, 16);
876
+
877
+    for (int i = 0; i < 16; i++)
878
+    {
879
+        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
880
+    }
881
+}
882
+
883
+static void idct32_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
884
+{
885
+    const int shift_1st = 7;
886
+    const int shift_2nd = 12 - (X265_DEPTH - 8);
887
+
888
+    ALIGN_VAR_32(int16_t, coef[32 * 32]);
889
+    ALIGN_VAR_32(int16_t, block[32 * 32]);
890
+
891
+    partialButterflyInverse32_neon(src, coef, shift_1st, 32);
892
+    partialButterflyInverse32_neon(coef, block, shift_2nd, 32);
893
+
894
+    for (int i = 0; i < 32; i++)
895
+    {
896
+        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
897
+    }
898
+}
899
+
900
+
901
+
902
+}
903
+
904
+namespace X265_NS
905
+{
906
+// x265 private namespace
907
+void setupDCTPrimitives_neon(EncoderPrimitives &p)
908
+{
909
+    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_neon<2>;
910
+    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_neon<3>;
911
+    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_neon<4>;
912
+    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_neon<5>;
913
+    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_neon<2>;
914
+    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_neon<3>;
915
+    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;
916
+    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;
917
+    p.cu[BLOCK_8x8].dct   = dct8_neon;
918
+    p.cu[BLOCK_16x16].dct = dct16_neon;
919
+    p.cu[BLOCK_32x32].dct = dct32_neon;
920
+    p.cu[BLOCK_4x4].idct   = idct4_neon;
921
+    p.cu[BLOCK_16x16].idct = idct16_neon;
922
+    p.cu[BLOCK_32x32].idct = idct32_neon;
923
+    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_neon<4>;
924
+    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_neon<8>;
925
+    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_neon<16>;
926
+    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_neon<32>;
927
+
928
+    p.cu[BLOCK_4x4].copy_cnt   = copy_count_neon<4>;
929
+    p.cu[BLOCK_8x8].copy_cnt   = copy_count_neon<8>;
930
+    p.cu[BLOCK_16x16].copy_cnt = copy_count_neon<16>;
931
+    p.cu[BLOCK_32x32].copy_cnt = copy_count_neon<32>;
932
+    p.cu[BLOCK_4x4].psyRdoQuant_1p = nonPsyRdoQuant_neon<2>;
933
+    p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_neon<2>;
934
+    p.cu[BLOCK_8x8].psyRdoQuant_1p = nonPsyRdoQuant_neon<3>;
935
+    p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_neon<3>;
936
+    p.cu[BLOCK_16x16].psyRdoQuant_1p = nonPsyRdoQuant_neon<4>;
937
+    p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_neon<4>;
938
+    p.cu[BLOCK_32x32].psyRdoQuant_1p = nonPsyRdoQuant_neon<5>;
939
+    p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
940
+
941
+    p.scanPosLast  = scanPosLast_opt;
942
+
943
+}
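setupDCTPrimitives_neon only installs function pointers; callers never reference the NEON symbols directly. A minimal, self-contained model of that dispatch pattern (the struct and function names below are illustrative, not x265's actual EncoderPrimitives layout):

#include <cstdint>
#include <cstdio>

typedef void (*dct_t)(const int16_t *src, int16_t *dst, intptr_t srcStride);

static void dct8_c(const int16_t *, int16_t *, intptr_t)         { std::puts("C path"); }
static void dct8_neon_stub(const int16_t *, int16_t *, intptr_t) { std::puts("NEON path"); }

struct PrimitiveTable { dct_t dct8x8; };

int main()
{
    PrimitiveTable p = { dct8_c };   // C fallbacks are installed first
#if defined(__aarch64__)
    p.dct8x8 = dct8_neon_stub;       // then the NEON setup overrides them, as above
#endif
    int16_t in[64] = { 0 }, out[64] = { 0 };
    p.dct8x8(in, out, 8);            // call site is identical either way
    return 0;
}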
944
+
945
+};
946
+
947
+
948
+
949
+#endif
950
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h Added
21
 
1
@@ -0,0 +1,19 @@
2
+#ifndef __DCT_PRIM_NEON_H__
3
+#define __DCT_PRIM_NEON_H__
4
+
5
+
6
+#include "common.h"
7
+#include "primitives.h"
8
+#include "contexts.h"   // costCoeffNxN_c
9
+#include "threading.h"  // CLZ
10
+
11
+namespace X265_NS
12
+{
13
+// x265 private namespace
14
+void setupDCTPrimitives_neon(EncoderPrimitives &p);
15
+};
16
+
17
+
18
+
19
+#endif
20
+
21
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp Added
997
 
1
@@ -0,0 +1,995 @@
2
+#if HAVE_NEON
3
+
4
+#include "filter-prim.h"
5
+#include <arm_neon.h>
6
+
7
+namespace
8
+{
9
+
10
+using namespace X265_NS;
11
+
12
+
13
+template<int width, int height>
14
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
15
+{
16
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
17
+    int row, col;
18
+    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
19
+    for (row = 0; row < height; row++)
20
+    {
21
+
22
+        for (col = 0; col < width; col += 8)
23
+        {
24
+            int16x8_t in;
25
+
26
+#if HIGH_BIT_DEPTH
27
+            in = *(int16x8_t *)&src[col];
28
+#else
29
+            in = vmovl_u8(*(uint8x8_t *)&src[col]);
30
+#endif
31
+
32
+            int16x8_t tmp = vshlq_n_s16(in, shift);
33
+            tmp = vsubq_s16(tmp, off);
34
+            *(int16x8_t *)&dst[col] = tmp;
35
+
36
+        }
37
+
38
+        src += srcStride;
39
+        dst += dstStride;
40
+    }
41
+}
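Per lane, the conversion above promotes pixels into the interpolation filter's internal 16-bit domain: shift up by IF_INTERNAL_PREC - X265_DEPTH bits, then re-centre around zero with IF_INTERNAL_OFFS. A scalar model, assuming the usual 8-bit input and 14-bit internal precision:

#include <cstdint>

static inline int16_t pixelToShortModel(uint8_t px)
{
    const int internalPrec = 14;                         // IF_INTERNAL_PREC in x265
    const int internalOffs = 1 << (internalPrec - 1);    // IF_INTERNAL_OFFS == 8192
    const int bitDepth     = 8;                          // X265_DEPTH for an 8-bit build
    const int shift        = internalPrec - bitDepth;    // 6
    return (int16_t)((px << shift) - internalOffs);
}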
42
+
43
+
44
+template<int N, int width, int height>
45
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
46
+{
47
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
48
+    int headRoom = IF_FILTER_PREC;
49
+    int offset = (1 << (headRoom - 1));
50
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
51
+    int cStride = 1;
52
+
53
+    src -= (N / 2 - 1) * cStride;
54
+    int16x8_t vc;
55
+    vc = *(int16x8_t *)coeff;
56
+    int16x4_t low_vc = vget_low_s16(vc);
57
+    int16x4_t high_vc = vget_high_s16(vc);
58
+
59
+    const int32x4_t voffset = vdupq_n_s32(offset);
60
+    const int32x4_t vhr = vdupq_n_s32(-headRoom);
61
+
62
+    int row, col;
63
+    for (row = 0; row < height; row++)
64
+    {
65
+        for (col = 0; col < width; col += 8)
66
+        {
67
+            int32x4_t vsum1, vsum2;
68
+
69
+            int16x8_t inputN;
70
+
71
+            for (int i = 0; i < N; i++)
72
+            {
73
+#if HIGH_BIT_DEPTH
74
+                inputi = *(int16x8_t *)&srccol + i;
75
+#else
76
+                inputi = vmovl_u8(*(uint8x8_t *)&srccol + i);
77
+#endif
78
+            }
79
+            vsum1 = voffset;
80
+            vsum2 = voffset;
81
+
82
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input0), low_vc, 0);
83
+            vsum2 = vmlal_high_lane_s16(vsum2, input0, low_vc, 0);
84
+
85
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input1), low_vc, 1);
86
+            vsum2 = vmlal_high_lane_s16(vsum2, input1, low_vc, 1);
87
+
88
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input2), low_vc, 2);
89
+            vsum2 = vmlal_high_lane_s16(vsum2, input2, low_vc, 2);
90
+
91
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input3), low_vc, 3);
92
+            vsum2 = vmlal_high_lane_s16(vsum2, input3, low_vc, 3);
93
+
94
+            if (N == 8)
95
+            {
96
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input4), high_vc, 0);
97
+                vsum2 = vmlal_high_lane_s16(vsum2, input4, high_vc, 0);
98
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input5), high_vc, 1);
99
+                vsum2 = vmlal_high_lane_s16(vsum2, input5, high_vc, 1);
100
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input6), high_vc, 2);
101
+                vsum2 = vmlal_high_lane_s16(vsum2, input6, high_vc, 2);
102
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input7), high_vc, 3);
103
+                vsum2 = vmlal_high_lane_s16(vsum2, input7, high_vc, 3);
104
+
105
+            }
106
+
107
+            vsum1 = vshlq_s32(vsum1, vhr);
108
+            vsum2 = vshlq_s32(vsum2, vhr);
109
+
110
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
111
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
112
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
113
+#if HIGH_BIT_DEPTH
114
+            *(int16x8_t *)&dstcol = vsum;
115
+#else
116
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
117
+            *(uint8x8_t *)&dstcol = vget_low_u8(usum);
118
+#endif
119
+
120
+        }
121
+
122
+        src += srcStride;
123
+        dst += dstStride;
124
+    }
125
+}
126
+
127
+#if HIGH_BIT_DEPTH
128
+
129
+template<int N, int width, int height>
130
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
131
+                          int isRowExt)
132
+{
133
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
134
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
135
+    const int shift = IF_FILTER_PREC - headRoom;
136
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
137
+
138
+    int blkheight = height;
139
+    src -= N / 2 - 1;
140
+
141
+    if (isRowExt)
142
+    {
143
+        src -= (N / 2 - 1) * srcStride;
144
+        blkheight += N - 1;
145
+    }
146
+    int16x8_t vc3 = vld1q_s16(coeff);
147
+    const int32x4_t voffset = vdupq_n_s32(offset);
148
+    const int32x4_t vhr = vdupq_n_s32(-shift);
149
+
150
+    int row, col;
151
+    for (row = 0; row < blkheight; row++)
152
+    {
153
+        for (col = 0; col < width; col += 8)
154
+        {
155
+            int32x4_t vsum, vsum2;
156
+
157
+            int16x8_t inputN;
158
+            for (int i = 0; i < N; i++)
159
+            {
160
+                inputi = vld1q_s16((int16_t *)&srccol + i);
161
+            }
162
+
163
+            vsum = voffset;
164
+            vsum2 = voffset;
165
+
166
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input0), vget_low_s16(vc3), 0);
167
+            vsum2 = vmlal_high_lane_s16(vsum2, input0, vget_low_s16(vc3), 0);
168
+
169
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input1), vget_low_s16(vc3), 1);
170
+            vsum2 = vmlal_high_lane_s16(vsum2, input1, vget_low_s16(vc3), 1);
171
+
172
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input2), vget_low_s16(vc3), 2);
173
+            vsum2 = vmlal_high_lane_s16(vsum2, input2, vget_low_s16(vc3), 2);
174
+
175
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input3), vget_low_s16(vc3), 3);
176
+            vsum2 = vmlal_high_lane_s16(vsum2, input3, vget_low_s16(vc3), 3);
177
+
178
+            if (N == 8)
179
+            {
180
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input4), vget_high_s16(vc3), 0);
181
+                vsum2 = vmlal_high_lane_s16(vsum2, input4, vget_high_s16(vc3), 0);
182
+
183
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input5), vget_high_s16(vc3), 1);
184
+                vsum2 = vmlal_high_lane_s16(vsum2, input5, vget_high_s16(vc3), 1);
185
+
186
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input6), vget_high_s16(vc3), 2);
187
+                vsum2 = vmlal_high_lane_s16(vsum2, input6, vget_high_s16(vc3), 2);
188
+
189
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input7), vget_high_s16(vc3), 3);
190
+                vsum2 = vmlal_high_lane_s16(vsum2, input7, vget_high_s16(vc3), 3);
191
+            }
192
+
193
+            vsum = vshlq_s32(vsum, vhr);
194
+            vsum2 = vshlq_s32(vsum2, vhr);
195
+            *(int16x4_t *)&dstcol = vmovn_u32(vsum);
196
+            *(int16x4_t *)&dstcol+4 = vmovn_u32(vsum2);
197
+        }
198
+
199
+        src += srcStride;
200
+        dst += dstStride;
201
+    }
202
+}
203
+
204
+
205
+#else
206
+
207
+template<int N, int width, int height>
208
+void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
209
+                          int isRowExt)
210
+{
211
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
212
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
213
+    const int shift = IF_FILTER_PREC - headRoom;
214
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
215
+
216
+    int blkheight = height;
217
+    src -= N / 2 - 1;
218
+
219
+    if (isRowExt)
220
+    {
221
+        src -= (N / 2 - 1) * srcStride;
222
+        blkheight += N - 1;
223
+    }
224
+    int16x8_t vc;
225
+    vc = *(int16x8_t *)coeff;
226
+
227
+    const int16x8_t voffset = vdupq_n_s16(offset);
228
+    const int16x8_t vhr = vdupq_n_s16(-shift);
229
+
230
+    int row, col;
231
+    for (row = 0; row < blkheight; row++)
232
+    {
233
+        for (col = 0; col < width; col += 8)
234
+        {
235
+            int16x8_t vsum;
236
+
237
+            int16x8_t inputN;
238
+
239
+            for (int i = 0; i < N; i++)
240
+            {
241
+                inputi = vmovl_u8(*(uint8x8_t *)&srccol + i);
242
+            }
243
+            vsum = voffset;
244
+            vsum = vmlaq_laneq_s16(vsum, (input0), vc, 0);
245
+            vsum = vmlaq_laneq_s16(vsum, (input1), vc, 1);
246
+            vsum = vmlaq_laneq_s16(vsum, (input2), vc, 2);
247
+            vsum = vmlaq_laneq_s16(vsum, (input3), vc, 3);
248
+
249
+
250
+            if (N == 8)
251
+            {
252
+                vsum = vmlaq_laneq_s16(vsum, (input4), vc, 4);
253
+                vsum = vmlaq_laneq_s16(vsum, (input5), vc, 5);
254
+                vsum = vmlaq_laneq_s16(vsum, (input6), vc, 6);
255
+                vsum = vmlaq_laneq_s16(vsum, (input7), vc, 7);
256
+
257
+            }
258
+
259
+            vsum = vshlq_s16(vsum, vhr);
260
+            *(int16x8_t *)&dstcol = vsum;
261
+        }
262
+
263
+        src += srcStride;
264
+        dst += dstStride;
265
+    }
266
+}
267
+
268
+#endif
269
+
270
+
271
+template<int N, int width, int height>
272
+void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
273
+{
274
+    const int16_t *c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
275
+    int shift = IF_FILTER_PREC;
276
+    src -= (N / 2 - 1) * srcStride;
277
+    int16x8_t vc;
278
+    vc = *(int16x8_t *)c;
279
+    int16x4_t low_vc = vget_low_s16(vc);
280
+    int16x4_t high_vc = vget_high_s16(vc);
281
+
282
+    const int32x4_t vhr = vdupq_n_s32(-shift);
283
+
284
+    int row, col;
285
+    for (row = 0; row < height; row++)
286
+    {
287
+        for (col = 0; col < width; col += 8)
288
+        {
289
+            int32x4_t vsum1, vsum2;
290
+
291
+            int16x8_t inputN;
292
+
293
+            for (int i = 0; i < N; i++)
294
+            {
295
+                inputi = *(int16x8_t *)&srccol + i * srcStride;
296
+            }
297
+
298
+            vsum1 = vmull_lane_s16(vget_low_s16(input0), low_vc, 0);
299
+            vsum2 = vmull_high_lane_s16(input0, low_vc, 0);
300
+
301
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input1), low_vc, 1);
302
+            vsum2 = vmlal_high_lane_s16(vsum2, input1, low_vc, 1);
303
+
304
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input2), low_vc, 2);
305
+            vsum2 = vmlal_high_lane_s16(vsum2, input2, low_vc, 2);
306
+
307
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input3), low_vc, 3);
308
+            vsum2 = vmlal_high_lane_s16(vsum2, input3, low_vc, 3);
309
+
310
+            if (N == 8)
311
+            {
312
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input4), high_vc, 0);
313
+                vsum2 = vmlal_high_lane_s16(vsum2, input4, high_vc, 0);
314
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input5), high_vc, 1);
315
+                vsum2 = vmlal_high_lane_s16(vsum2, input5, high_vc, 1);
316
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input6), high_vc, 2);
317
+                vsum2 = vmlal_high_lane_s16(vsum2, input6, high_vc, 2);
318
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input7), high_vc, 3);
319
+                vsum2 = vmlal_high_lane_s16(vsum2, input7, high_vc, 3);
320
+
321
+            }
322
+
323
+            vsum1 = vshlq_s32(vsum1, vhr);
324
+            vsum2 = vshlq_s32(vsum2, vhr);
325
+
326
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
327
+            *(int16x8_t *)&dstcol = vsum;
328
+        }
329
+
330
+        src += srcStride;
331
+        dst += dstStride;
332
+    }
333
+
334
+}
335
+
336
+
337
+#if HIGH_BIT_DEPTH
338
+
339
+template<int N, int width, int height>
340
+void interp_vert_pp_neon(const uint16_t *src, intptr_t srcStride, uint16_t *dst, intptr_t dstStride, int coeffIdx)
341
+{
342
+
343
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
344
+    int shift = IF_FILTER_PREC;
345
+    int offset = 1 << (shift - 1);
346
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
347
+
348
+    src -= (N / 2 - 1) * srcStride;
349
+    int16x8_t vc;
350
+    vc = *(int16x8_t *)c;
351
+    int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
352
+    int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
353
+
354
+    const int32x4_t voffset = vdupq_n_s32(offset);
355
+    const int32x4_t vhr = vdupq_n_s32(-shift);
356
+
357
+    int row, col;
358
+    for (row = 0; row < height; row++)
359
+    {
360
+        for (col = 0; col < width; col += 4)
361
+        {
362
+            int32x4_t vsum;
363
+
364
+            int32x4_t inputN;
365
+
366
+            for (int i = 0; i < N; i++)
367
+            {
368
+                inputi = vmovl_u16(*(uint16x4_t *)&srccol + i * srcStride);
369
+            }
370
+            vsum = voffset;
371
+
372
+            vsum = vmlaq_laneq_s32(vsum, (input0), low_vc, 0);
373
+            vsum = vmlaq_laneq_s32(vsum, (input1), low_vc, 1);
374
+            vsum = vmlaq_laneq_s32(vsum, (input2), low_vc, 2);
375
+            vsum = vmlaq_laneq_s32(vsum, (input3), low_vc, 3);
376
+
377
+            if (N == 8)
378
+            {
379
+                vsum = vmlaq_laneq_s32(vsum, (input4), high_vc, 0);
380
+                vsum = vmlaq_laneq_s32(vsum, (input5), high_vc, 1);
381
+                vsum = vmlaq_laneq_s32(vsum, (input6), high_vc, 2);
382
+                vsum = vmlaq_laneq_s32(vsum, (input7), high_vc, 3);
383
+            }
384
+
385
+            vsum = vshlq_s32(vsum, vhr);
386
+            vsum = vminq_s32(vsum, vdupq_n_s32(maxVal));
387
+            vsum = vmaxq_s32(vsum, vdupq_n_s32(0));
388
+            *(uint16x4_t *)&dstcol = vmovn_u32(vsum);
389
+        }
390
+        src += srcStride;
391
+        dst += dstStride;
392
+    }
393
+}
394
+
395
+
396
+
397
+
398
+#else
399
+
400
+template<int N, int width, int height>
401
+void interp_vert_pp_neon(const uint8_t *src, intptr_t srcStride, uint8_t *dst, intptr_t dstStride, int coeffIdx)
402
+{
403
+
404
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
405
+    int shift = IF_FILTER_PREC;
406
+    int offset = 1 << (shift - 1);
407
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
408
+
409
+    src -= (N / 2 - 1) * srcStride;
410
+    int16x8_t vc;
411
+    vc = *(int16x8_t *)c;
412
+
413
+    const int16x8_t voffset = vdupq_n_s16(offset);
414
+    const int16x8_t vhr = vdupq_n_s16(-shift);
415
+
416
+    int row, col;
417
+    for (row = 0; row < height; row++)
418
+    {
419
+        for (col = 0; col < width; col += 8)
420
+        {
421
+            int16x8_t vsum;
422
+
423
+            int16x8_t inputN;
424
+
425
+            for (int i = 0; i < N; i++)
426
+            {
427
+                inputi = vmovl_u8(*(uint8x8_t *)&srccol + i * srcStride);
428
+            }
429
+            vsum = voffset;
430
+
431
+            vsum = vmlaq_laneq_s16(vsum, (input0), vc, 0);
432
+            vsum = vmlaq_laneq_s16(vsum, (input1), vc, 1);
433
+            vsum = vmlaq_laneq_s16(vsum, (input2), vc, 2);
434
+            vsum = vmlaq_laneq_s16(vsum, (input3), vc, 3);
435
+
436
+            if (N == 8)
437
+            {
438
+                vsum = vmlaq_laneq_s16(vsum, (input4), vc, 4);
439
+                vsum = vmlaq_laneq_s16(vsum, (input5), vc, 5);
440
+                vsum = vmlaq_laneq_s16(vsum, (input6), vc, 6);
441
+                vsum = vmlaq_laneq_s16(vsum, (input7), vc, 7);
442
+
443
+            }
444
+
445
+            vsum = vshlq_s16(vsum, vhr);
446
+
447
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
448
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
449
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
450
+            *(uint8x8_t *)&dstcol = vget_low_u8(usum);
451
+
452
+        }
453
+
454
+        src += srcStride;
455
+        dst += dstStride;
456
+    }
457
+}
458
+
459
+
460
+#endif
461
+
462
+
463
+#if HIGH_BIT_DEPTH
464
+
465
+template<int N, int width, int height>
466
+void interp_vert_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
467
+{
468
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
469
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
470
+    int shift = IF_FILTER_PREC - headRoom;
471
+    int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
472
+    src -= (N / 2 - 1) * srcStride;
473
+
474
+    int16x8_t vc;
475
+    vc = *(int16x8_t *)c;
476
+    int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
477
+    int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
478
+
479
+    const int32x4_t voffset = vdupq_n_s32(offset);
480
+    const int32x4_t vhr = vdupq_n_s32(-shift);
481
+
482
+    int row, col;
483
+    for (row = 0; row < height; row++)
484
+    {
485
+        for (col = 0; col < width; col += 4)
486
+        {
487
+            int16x8_t vsum;
488
+
489
+            int16x8_t inputN;
490
+
491
+            for (int i = 0; i < N; i++)
492
+            {
493
+                inputi = vmovl_u16(*(uint16x4_t *)&srccol + i * srcStride);
494
+            }
495
+            vsum = voffset;
496
+
497
+            vsum = vmlaq_laneq_s32(vsum, (input0), low_vc, 0);
498
+            vsum = vmlaq_laneq_s32(vsum, (input1), low_vc, 1);
499
+            vsum = vmlaq_laneq_s32(vsum, (input2), low_vc, 2);
500
+            vsum = vmlaq_laneq_s32(vsum, (input3), low_vc, 3);
501
+
502
+            if (N == 8)
503
+            {
504
+                int16x8_t  vsum1 = vmulq_laneq_s32((input4), high_vc, 0);
505
+                vsum1 = vmlaq_laneq_s32(vsum1, (input5), high_vc, 1);
506
+                vsum1 = vmlaq_laneq_s32(vsum1, (input6), high_vc, 2);
507
+                vsum1 = vmlaq_laneq_s32(vsum1, (input7), high_vc, 3);
508
+                vsum = vaddq_s32(vsum, vsum1);
509
+            }
510
+
511
+            vsum = vshlq_s32(vsum, vhr);
512
+
513
+            *(uint16x4_t *)&dstcol = vmovn_s32(vsum);
514
+        }
515
+
516
+        src += srcStride;
517
+        dst += dstStride;
518
+    }
519
+}
520
+
521
+#else
522
+
523
+template<int N, int width, int height>
524
+void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
525
+{
526
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
527
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
528
+    int shift = IF_FILTER_PREC - headRoom;
529
+    int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
530
+    src -= (N / 2 - 1) * srcStride;
531
+
532
+    int16x8_t vc;
533
+    vc = *(int16x8_t *)c;
534
+
535
+    const int16x8_t voffset = vdupq_n_s16(offset);
536
+    const int16x8_t vhr = vdupq_n_s16(-shift);
537
+
538
+    int row, col;
539
+    for (row = 0; row < height; row++)
540
+    {
541
+        for (col = 0; col < width; col += 8)
542
+        {
543
+            int16x8_t vsum;
544
+
545
+            int16x8_t inputN;
546
+
547
+            for (int i = 0; i < N; i++)
548
+            {
549
+                inputi = vmovl_u8(*(uint8x8_t *)&srccol + i * srcStride);
550
+            }
551
+            vsum = voffset;
552
+
553
+            vsum = vmlaq_laneq_s16(vsum, (input0), vc, 0);
554
+            vsum = vmlaq_laneq_s16(vsum, (input1), vc, 1);
555
+            vsum = vmlaq_laneq_s16(vsum, (input2), vc, 2);
556
+            vsum = vmlaq_laneq_s16(vsum, (input3), vc, 3);
557
+
558
+            if (N == 8)
559
+            {
560
+                int16x8_t  vsum1 = vmulq_laneq_s16((input4), vc, 4);
561
+                vsum1 = vmlaq_laneq_s16(vsum1, (input5), vc, 5);
562
+                vsum1 = vmlaq_laneq_s16(vsum1, (input6), vc, 6);
563
+                vsum1 = vmlaq_laneq_s16(vsum1, (input7), vc, 7);
564
+                vsum = vaddq_s16(vsum, vsum1);
565
+            }
566
+
567
+            vsum = vshlq_s32(vsum, vhr);
568
+            *(int16x8_t *)&dstcol = vsum;
569
+        }
570
+
571
+        src += srcStride;
572
+        dst += dstStride;
573
+    }
574
+}
575
+
576
+#endif
577
+
578
+
579
+
580
+template<int N, int width, int height>
581
+void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
582
+{
583
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
584
+    int shift = IF_FILTER_PREC + headRoom;
585
+    int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
586
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
587
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
588
+
589
+    src -= (N / 2 - 1) * srcStride;
590
+
591
+    int16x8_t vc;
592
+    vc = *(int16x8_t *)coeff;
593
+    int16x4_t low_vc = vget_low_s16(vc);
594
+    int16x4_t high_vc = vget_high_s16(vc);
595
+
596
+    const int32x4_t voffset = vdupq_n_s32(offset);
597
+    const int32x4_t vhr = vdupq_n_s32(-shift);
598
+
599
+    int row, col;
600
+    for (row = 0; row < height; row++)
601
+    {
602
+        for (col = 0; col < width; col += 8)
603
+        {
604
+            int32x4_t vsum1, vsum2;
605
+
606
+            int16x8_t inputN;
607
+
608
+            for (int i = 0; i < N; i++)
609
+            {
610
+                inputi = *(int16x8_t *)&srccol + i * srcStride;
611
+            }
612
+            vsum1 = voffset;
613
+            vsum2 = voffset;
614
+
615
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input0), low_vc, 0);
616
+            vsum2 = vmlal_high_lane_s16(vsum2, input0, low_vc, 0);
617
+
618
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input1), low_vc, 1);
619
+            vsum2 = vmlal_high_lane_s16(vsum2, input1, low_vc, 1);
620
+
621
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input2), low_vc, 2);
622
+            vsum2 = vmlal_high_lane_s16(vsum2, input2, low_vc, 2);
623
+
624
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input3), low_vc, 3);
625
+            vsum2 = vmlal_high_lane_s16(vsum2, input3, low_vc, 3);
626
+
627
+            if (N == 8)
628
+            {
629
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input4), high_vc, 0);
630
+                vsum2 = vmlal_high_lane_s16(vsum2, input4, high_vc, 0);
631
+
632
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input5), high_vc, 1);
633
+                vsum2 = vmlal_high_lane_s16(vsum2, input5, high_vc, 1);
634
+
635
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input6), high_vc, 2);
636
+                vsum2 = vmlal_high_lane_s16(vsum2, input6, high_vc, 2);
637
+
638
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input7), high_vc, 3);
639
+                vsum2 = vmlal_high_lane_s16(vsum2, input7, high_vc, 3);
640
+            }
641
+
642
+            vsum1 = vshlq_s32(vsum1, vhr);
643
+            vsum2 = vshlq_s32(vsum2, vhr);
644
+
645
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
646
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
647
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
648
+#if HIGH_BIT_DEPTH
649
+            *(int16x8_t *)&dstcol = vsum;
650
+#else
651
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
652
+            *(uint8x8_t *)&dstcol = vget_low_u8(usum);
653
+#endif
654
+
655
+        }
656
+
657
+        src += srcStride;
658
+        dst += dstStride;
659
+    }
660
+}
661
+
662
+
663
+
664
+
665
+
666
+
667
+template<int N, int width, int height>
668
+void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
669
+{
670
+    ALIGN_VAR_32(int16_t, immedwidth * (height + N - 1));
671
+
672
+    interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
673
+    interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
674
+}
675
+
676
+
677
+
678
+}
679
+
680
+
681
+
682
+
683
+namespace X265_NS
684
+{
685
+#if defined(__APPLE__)
686
+#define CHROMA_420(W, H) \
687
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
688
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>;  \
689
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>;  \
690
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;  \
691
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>;  \
692
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
693
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
694
+    
695
+#define CHROMA_FILTER_420(W, H) \
696
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>;
697
+    
698
+#else // defined(__APPLE__)
699
+#define CHROMA_420(W, H) \
700
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
701
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
702
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
703
+    
704
+#define CHROMA_FILTER_420(W, H) \
705
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
706
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>; \
707
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>;  \
708
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>;  \
709
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;
710
+#endif // defined(__APPLE__)
711
+
712
+#if defined(__APPLE__)
713
+#define CHROMA_422(W, H) \
714
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
715
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>;  \
716
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>;  \
717
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;  \
718
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>;  \
719
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
720
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
721
+    
722
+#define CHROMA_FILTER_422(W, H) \
723
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>;
724
+    
725
+#else // defined(__APPLE__)
726
+#define CHROMA_422(W, H) \
727
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
728
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
729
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
730
+    
731
+#define CHROMA_FILTER_422(W, H) \
732
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
733
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>; \
734
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>;  \
735
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>;  \
736
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;
737
+#endif // defined(__APPLE__)
738
+
739
+#if defined(__APPLE__)
740
+#define CHROMA_444(W, H) \
741
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
742
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>;  \
743
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>;  \
744
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;  \
745
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>;  \
746
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
747
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
748
+
749
+#define CHROMA_FILTER_444(W, H) \
750
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>;
751
+    
752
+#else // defined(__APPLE__)
753
+#define CHROMA_444(W, H) \
754
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
755
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
756
+    
757
+#define CHROMA_FILTER_444(W, H) \
758
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
759
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>; \
760
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>;  \
761
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>;  \
762
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;  \
763
+    p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>;
764
+#endif // defined(__APPLE__)
765
+
766
+#if defined(__APPLE__)
767
+#define LUMA(W, H) \
768
+    p.puLUMA_ ## W ## x ## H.luma_hpp     = interp_horiz_pp_neon<8, W, H>; \
769
+    p.puLUMA_ ## W ## x ## H.luma_vpp     = interp_vert_pp_neon<8, W, H>;  \
770
+    p.puLUMA_ ## W ## x ## H.luma_vps     = interp_vert_ps_neon<8, W, H>;  \
771
+    p.puLUMA_ ## W ## x ## H.luma_vsp     = interp_vert_sp_neon<8, W, H>;  \
772
+    p.puLUMA_ ## W ## x ## H.luma_vss     = interp_vert_ss_neon<8, W, H>;  \
773
+    p.puLUMA_ ## W ## x ## H.luma_hvpp    = interp_hv_pp_neon<8, W, H>; \
774
+    p.puLUMA_ ## W ## x ## H.convert_p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
775
+    p.puLUMA_ ## W ## x ## H.convert_p2sALIGNED = filterPixelToShort_neon<W, H>;
776
+    
777
+#else // defined(__APPLE__)
778
+#define LUMA(W, H) \
779
+    p.puLUMA_ ## W ## x ## H.luma_vss     = interp_vert_ss_neon<8, W, H>;  \
780
+    p.puLUMA_ ## W ## x ## H.convert_p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
781
+    p.puLUMA_ ## W ## x ## H.convert_p2sALIGNED = filterPixelToShort_neon<W, H>;
782
+    
783
+#define LUMA_FILTER(W, H) \
784
+    p.puLUMA_ ## W ## x ## H.luma_hpp     = interp_horiz_pp_neon<8, W, H>; \
785
+    p.puLUMA_ ## W ## x ## H.luma_vpp     = interp_vert_pp_neon<8, W, H>;  \
786
+    p.puLUMA_ ## W ## x ## H.luma_vps     = interp_vert_ps_neon<8, W, H>;  \
787
+    p.puLUMA_ ## W ## x ## H.luma_vsp     = interp_vert_sp_neon<8, W, H>;  \
788
+    p.puLUMA_ ## W ## x ## H.luma_hvpp    = interp_hv_pp_neon<8, W, H>;
789
+#endif // defined(__APPLE__)
790
+
791
+void setupFilterPrimitives_neon(EncoderPrimitives &p)
792
+{
793
+
794
+    // All neon functions assume width of multiple of 8, (2,4,12 variants are not optimized)
795
+
796
+    LUMA(8, 8);
797
+    LUMA(8, 4);
798
+    LUMA(16, 16);
799
+    CHROMA_420(8,  8);
800
+    LUMA(16,  8);
801
+    CHROMA_420(8,  4);
802
+    LUMA(8, 16);
803
+    LUMA(16, 12);
804
+    CHROMA_420(8,  6);
805
+    LUMA(16,  4);
806
+    CHROMA_420(8,  2);
807
+    LUMA(32, 32);
808
+    CHROMA_420(16, 16);
809
+    LUMA(32, 16);
810
+    CHROMA_420(16, 8);
811
+    LUMA(16, 32);
812
+    CHROMA_420(8,  16);
813
+    LUMA(32, 24);
814
+    CHROMA_420(16, 12);
815
+    LUMA(24, 32);
816
+    LUMA(32,  8);
817
+    CHROMA_420(16, 4);
818
+    LUMA(8, 32);
819
+    LUMA(64, 64);
820
+    CHROMA_420(32, 32);
821
+    LUMA(64, 32);
822
+    CHROMA_420(32, 16);
823
+    LUMA(32, 64);
824
+    CHROMA_420(16, 32);
825
+    LUMA(64, 48);
826
+    CHROMA_420(32, 24);
827
+    LUMA(48, 64);
828
+    CHROMA_420(24, 32);
829
+    LUMA(64, 16);
830
+    CHROMA_420(32, 8);
831
+    LUMA(16, 64);
832
+    CHROMA_420(8,  32);
833
+    CHROMA_422(8,  16);
834
+    CHROMA_422(8,  8);
835
+    CHROMA_422(8,  12);
836
+    CHROMA_422(8,  4);
837
+    CHROMA_422(16, 32);
838
+    CHROMA_422(16, 16);
839
+    CHROMA_422(8,  32);
840
+    CHROMA_422(16, 24);
841
+    CHROMA_422(16, 8);
842
+    CHROMA_422(32, 64);
843
+    CHROMA_422(32, 32);
844
+    CHROMA_422(16, 64);
845
+    CHROMA_422(32, 48);
846
+    CHROMA_422(24, 64);
847
+    CHROMA_422(32, 16);
848
+    CHROMA_422(8,  64);
849
+    CHROMA_444(8,  8);
850
+    CHROMA_444(8,  4);
851
+    CHROMA_444(16, 16);
852
+    CHROMA_444(16, 8);
853
+    CHROMA_444(8,  16);
854
+    CHROMA_444(16, 12);
855
+    CHROMA_444(16, 4);
856
+    CHROMA_444(32, 32);
857
+    CHROMA_444(32, 16);
858
+    CHROMA_444(16, 32);
859
+    CHROMA_444(32, 24);
860
+    CHROMA_444(24, 32);
861
+    CHROMA_444(32, 8);
862
+    CHROMA_444(8,  32);
863
+    CHROMA_444(64, 64);
864
+    CHROMA_444(64, 32);
865
+    CHROMA_444(32, 64);
866
+    CHROMA_444(64, 48);
867
+    CHROMA_444(48, 64);
868
+    CHROMA_444(64, 16);
869
+    CHROMA_444(16, 64);
870
+
871
+#if defined(__APPLE__) || HIGH_BIT_DEPTH
872
+    p.pu[LUMA_8x4].luma_hps     = interp_horiz_ps_neon<8, 8, 4>;
873
+    p.pu[LUMA_8x8].luma_hps     = interp_horiz_ps_neon<8, 8, 8>;
874
+    p.pu[LUMA_8x16].luma_hps     = interp_horiz_ps_neon<8, 8, 16>;
875
+    p.pu[LUMA_8x32].luma_hps     = interp_horiz_ps_neon<8, 8, 32>;
876
+#endif // HIGH_BIT_DEPTH
877
+
878
+#if !defined(__APPLE__) && HIGH_BIT_DEPTH
879
+    p.pu[LUMA_24x32].luma_hps     = interp_horiz_ps_neon<8, 24, 32>;
880
+#endif // !defined(__APPLE__)
881
+
882
+#if !defined(__APPLE__)
883
+    p.pu[LUMA_32x8].luma_hpp      = interp_horiz_pp_neon<8, 32, 8>;
884
+    p.pu[LUMA_32x16].luma_hpp     = interp_horiz_pp_neon<8, 32, 16>;
885
+    p.pu[LUMA_32x24].luma_hpp     = interp_horiz_pp_neon<8, 32, 24>;
886
+    p.pu[LUMA_32x32].luma_hpp     = interp_horiz_pp_neon<8, 32, 32>;
887
+    p.pu[LUMA_32x64].luma_hpp     = interp_horiz_pp_neon<8, 32, 64>;
888
+    p.pu[LUMA_48x64].luma_hpp     = interp_horiz_pp_neon<8, 48, 64>;
889
+    p.pu[LUMA_64x16].luma_hpp     = interp_horiz_pp_neon<8, 64, 16>;
890
+    p.pu[LUMA_64x32].luma_hpp     = interp_horiz_pp_neon<8, 64, 32>;
891
+    p.pu[LUMA_64x48].luma_hpp     = interp_horiz_pp_neon<8, 64, 48>;
892
+    p.pu[LUMA_64x64].luma_hpp     = interp_horiz_pp_neon<8, 64, 64>;
893
+
894
+    LUMA_FILTER(8, 4);
895
+    LUMA_FILTER(8, 8);
896
+    LUMA_FILTER(8, 16);
897
+    LUMA_FILTER(8, 32);
898
+    LUMA_FILTER(24, 32);
899
+
900
+    LUMA_FILTER(16, 32);
901
+    LUMA_FILTER(32, 16);
902
+    LUMA_FILTER(32, 24);
903
+    LUMA_FILTER(32, 32);
904
+    LUMA_FILTER(32, 64);
905
+    LUMA_FILTER(48, 64);
906
+    LUMA_FILTER(64, 32);
907
+    LUMA_FILTER(64, 48);
908
+    LUMA_FILTER(64, 64);
909
+    
910
+    CHROMA_FILTER_420(24, 32);
911
+    
912
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = interp_horiz_pp_neon<4, 32, 8>;
913
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
914
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = interp_horiz_pp_neon<4, 32, 24>;
915
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
916
+    
917
+    CHROMA_FILTER_422(24, 64);
918
+    
919
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
920
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
921
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = interp_horiz_pp_neon<4, 32, 48>;
922
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = interp_horiz_pp_neon<4, 32, 64>;
923
+    
924
+    CHROMA_FILTER_444(24, 32);
925
+    
926
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp  = interp_horiz_pp_neon<4, 32, 8>;
927
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
928
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = interp_horiz_pp_neon<4, 32, 24>;
929
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
930
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = interp_horiz_pp_neon<4, 32, 64>;
931
+    p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = interp_horiz_pp_neon<4, 48, 64>;
932
+    p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = interp_horiz_pp_neon<4, 64, 16>;
933
+    p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = interp_horiz_pp_neon<4, 64, 32>;
934
+    p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = interp_horiz_pp_neon<4, 64, 48>;
935
+    p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = interp_horiz_pp_neon<4, 64, 64>;
936
+    
937
+    p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss  = interp_vert_ss_neon<4, 16, 4>;
938
+    p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss  = interp_vert_ss_neon<4, 16, 8>;
939
+    p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = interp_vert_ss_neon<4, 16, 12>;
940
+    p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = interp_vert_ss_neon<4, 16, 16>;
941
+    p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = interp_vert_ss_neon<4, 16, 32>;
942
+    p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = interp_vert_ss_neon<4, 16, 64>;
943
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss  = interp_vert_ss_neon<4, 32, 8>;
944
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = interp_vert_ss_neon<4, 32, 16>;
945
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = interp_vert_ss_neon<4, 32, 24>;
946
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = interp_vert_ss_neon<4, 32, 32>;
947
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = interp_vert_ss_neon<4, 32, 64>;
948
+#endif // !defined(__APPLE__)
949
+
950
+    CHROMA_FILTER_420(8, 2);
951
+    CHROMA_FILTER_420(8, 4);
952
+    CHROMA_FILTER_420(8, 6);
953
+    CHROMA_FILTER_420(8, 8);
954
+    CHROMA_FILTER_420(8, 16);
955
+    CHROMA_FILTER_420(8, 32);
956
+    
957
+    CHROMA_FILTER_422(8, 4);
958
+    CHROMA_FILTER_422(8, 8);
959
+    CHROMA_FILTER_422(8, 12);
960
+    CHROMA_FILTER_422(8, 16);
961
+    CHROMA_FILTER_422(8, 32);
962
+    CHROMA_FILTER_422(8, 64);
963
+    
964
+    CHROMA_FILTER_444(8, 4);
965
+    CHROMA_FILTER_444(8, 8);
966
+    CHROMA_FILTER_444(8, 16);
967
+    CHROMA_FILTER_444(8, 32);
968
+    
969
+#if defined(__APPLE__)
970
+    CHROMA_FILTER_420(16, 4);
971
+    CHROMA_FILTER_420(16, 8);
972
+    CHROMA_FILTER_420(16, 12);
973
+    CHROMA_FILTER_420(16, 16);
974
+    CHROMA_FILTER_420(16, 32);
975
+
976
+    CHROMA_FILTER_422(16, 8);
977
+    CHROMA_FILTER_422(16, 16);
978
+    CHROMA_FILTER_422(16, 24);
979
+    CHROMA_FILTER_422(16, 32);
980
+    CHROMA_FILTER_422(16, 64);
981
+    
982
+    CHROMA_FILTER_444(16, 4);
983
+    CHROMA_FILTER_444(16, 8);
984
+    CHROMA_FILTER_444(16, 12);
985
+    CHROMA_FILTER_444(16, 16);
986
+    CHROMA_FILTER_444(16, 32);
987
+    CHROMA_FILTER_444(16, 64);
988
+#endif // defined(__APPLE__)
989
+}
990
+
991
+};
992
+
993
+
994
+#endif
995
+
996
+
997
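Note on the block above: the LUMA()/CHROMA_*() macros are only shorthand for filling the EncoderPrimitives function table. As a minimal sketch of the expansion, assuming the non-Apple definition of LUMA shown earlier, LUMA(16, 16) becomes:

    p.pu[LUMA_16x16].luma_vss     = interp_vert_ss_neon<8, 16, 16>;
    p.pu[LUMA_16x16].convert_p2s[NONALIGNED] = filterPixelToShort_neon<16, 16>;
    p.pu[LUMA_16x16].convert_p2s[ALIGNED] = filterPixelToShort_neon<16, 16>;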
x265_3.6.tar.gz/source/common/aarch64/filter-prim.h Added
23
 
1
@@ -0,0 +1,21 @@
2
+#ifndef _FILTER_PRIM_ARM64_H__
3
+#define _FILTER_PRIM_ARM64_H__
4
+
5
+
6
+#include "common.h"
7
+#include "slicetype.h"      // LOWRES_COST_MASK
8
+#include "primitives.h"
9
+#include "x265.h"
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+void setupFilterPrimitives_neon(EncoderPrimitives &p);
17
+
18
+};
19
+
20
+
21
+#endif
22
+
23
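The header above only exposes the single setup entry point; a minimal usage sketch (initNeonFilters is a hypothetical wrapper, the real call site in the encoder is not part of this diff):

    #include "filter-prim.h"

    void initNeonFilters(X265_NS::EncoderPrimitives &p)
    {
        X265_NS::setupFilterPrimitives_neon(p);   // fill in the NEON filter entries
    }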
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h Added
258
 
1
@@ -0,0 +1,256 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#define FUNCDEF_TU(ret, name, cpu, ...) \
26
+    ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
27
+    ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
28
+    ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
29
+    ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
30
+    ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
31
+
32
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
33
+    ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
34
+    ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
35
+    ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
36
+    ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
37
+    ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
38
+
39
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
40
+    ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
41
+    ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
42
+    ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
43
+    ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
44
+    ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
45
+
46
+#define FUNCDEF_PU(ret, name, cpu, ...) \
47
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
48
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
49
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
50
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
51
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
52
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
53
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
54
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
55
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
56
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
57
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
58
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
59
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
60
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
61
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
62
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
63
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
64
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
65
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
66
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
67
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
68
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
69
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
70
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
71
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
72
+
73
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
74
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
75
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
76
+    ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
77
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
78
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
79
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
80
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
81
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
82
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
83
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
84
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
85
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
86
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
87
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
88
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
89
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
90
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
91
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
92
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
93
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
94
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
95
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
96
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
97
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
98
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
99
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
100
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
101
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
102
+
103
+#define DECLS(cpu) \
104
+    FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
105
+    FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
106
+    FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
107
+    FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
108
+    FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
109
+    FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
110
+    FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
111
+    FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
112
+    FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
113
+    FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
114
+    FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
115
+    FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
116
+    FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
117
+    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
118
+    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
119
+    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
120
+    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
121
+    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
122
+    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
123
+    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
124
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
125
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
126
+    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
127
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
128
+    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
129
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
130
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
131
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
132
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
133
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
134
+    FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
135
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
136
+    FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
137
+    FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
138
+    FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
139
+    FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
140
+    FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
141
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
142
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
143
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
144
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
145
+    FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
146
+    FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
147
+    FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
148
+    FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
149
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
150
+    FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
151
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
152
+    FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
153
+
154
+DECLS(neon);
155
+DECLS(sve);
156
+DECLS(sve2);
157
+
158
+
159
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
160
+
161
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
162
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
163
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
164
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
165
+
166
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
167
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
168
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
169
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
170
+
171
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
172
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
173
+
174
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
175
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
176
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
177
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
178
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
179
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
180
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
181
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
182
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
183
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
184
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
185
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
186
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
187
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
188
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
189
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
190
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
191
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
192
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
193
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
194
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
195
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
196
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
197
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
198
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
199
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
202
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
203
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
204
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
205
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
206
+
207
+int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
208
+int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
209
+int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
210
+int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
211
+int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
212
+int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
213
+int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
214
+
215
+uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
216
+uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
217
+
218
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
219
+void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
220
+
221
+void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
222
+
223
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
224
+int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
225
+void PFX(weight_pp_neon)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
226
+void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
227
+int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
228
+uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
229
+
230
+uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
231
+uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
232
+uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
233
+uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
234
+
235
+void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
236
+void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
237
+
238
+void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
239
+void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
240
+
241
+int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
242
+int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
243
+int x265_pixel_satd_8x12_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
244
+int x265_pixel_satd_32x16_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
245
+int x265_pixel_satd_32x32_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
246
+int x265_pixel_satd_64x48_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
247
+
248
+uint32_t PFX(quant_sve)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
249
+
250
+void x265_dequant_scaling_sve2(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
251
+void x265_dequant_normal_sve2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
252
+
253
+void x265_ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
254
+
255
+int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
256
+void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
257
+int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
258
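The FUNCDEF_* macros above only stamp out one declaration per block size for the assembly symbols. As a sketch, assuming PFX() resolves to the usual x265_ symbol prefix, FUNCDEF_TU(void, cpy2Dto1D_shl, neon, ...) declares:

    void x265_cpy2Dto1D_shl_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
    void x265_cpy2Dto1D_shl_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
    // ...and the 16x16, 32x32 and 64x64 variants, matching the exported assembly symbols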
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp Added
267
 
1
@@ -0,0 +1,265 @@
2
+#include "common.h"
3
+#include "primitives.h"
4
+
5
+
6
+#if 1
7
+#include "arm64-utils.h"
8
+#include <arm_neon.h>
9
+
10
+using namespace X265_NS;
11
+
12
+namespace
13
+{
14
+
15
+
16
+
17
+template<int width>
18
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
19
+{
20
+    int width2 = width << 1;
21
+    // Flip the neighbours in the horizontal case.
22
+    int horMode = dirMode < 18;
23
+    pixel neighbourBuf[129];
24
+    const pixel *srcPix = srcPix0;
25
+
26
+    if (horMode)
27
+    {
28
+        neighbourBuf[0] = srcPix[0];
29
+        //for (int i = 0; i < width << 1; i++)
30
+        //{
31
+        //    neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
32
+        //    neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
33
+        //}
34
+        memcpy(&neighbourBuf[1], &srcPix[width2 + 1], sizeof(pixel) * (width << 1));
35
+        memcpy(&neighbourBuf[width2 + 1], &srcPix[1], sizeof(pixel) * (width << 1));
36
+        srcPix = neighbourBuf;
37
+    }
38
+
39
+    // Intra prediction angle and inverse angle tables.
40
+    const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
41
+    const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
42
+
43
+    // Get the prediction angle.
44
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
45
+    int angle = angleTable[8 + angleOffset];
46
+
47
+    // Vertical Prediction.
48
+    if (!angle)
49
+    {
50
+        for (int y = 0; y < width; y++)
51
+        {
52
+            memcpy(&dst[y * dstStride], srcPix + 1, sizeof(pixel)*width);
53
+        }
54
+        if (bFilter)
55
+        {
56
+            int topLeft = srcPix[0], top = srcPix[1];
57
+            for (int y = 0; y < width; y++)
58
+            {
59
+                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
60
+            }
61
+        }
62
+    }
63
+    else // Angular prediction.
64
+    {
65
+        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
66
+        pixel refBuf[64];
67
+        const pixel *ref;
68
+
69
+        // Use the projected left neighbours and the top neighbours.
70
+        if (angle < 0)
71
+        {
72
+            // Number of neighbours projected.
73
+            int nbProjected = -((width * angle) >> 5) - 1;
74
+            pixel *ref_pix = refBuf + nbProjected + 1;
75
+
76
+            // Project the neighbours.
77
+            int invAngle = invAngleTable[- angleOffset - 1];
78
+            int invAngleSum = 128;
79
+            for (int i = 0; i < nbProjected; i++)
80
+            {
81
+                invAngleSum += invAngle;
82
+                ref_pix[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)];
83
+            }
84
+
85
+            // Copy the top-left and top pixels.
86
+            //for (int i = 0; i < width + 1; i++)
87
+            //ref_pix[-1 + i] = srcPix[i];
88
+
89
+            memcpy(&ref_pix[-1], srcPix, (width + 1)*sizeof(pixel));
90
+            ref = ref_pix;
91
+        }
92
+        else // Use the top and top-right neighbours.
93
+        {
94
+            ref = srcPix + 1;
95
+        }
96
+
97
+        // Pass every row.
98
+        int angleSum = 0;
99
+        for (int y = 0; y < width; y++)
100
+        {
101
+            angleSum += angle;
102
+            int offset = angleSum >> 5;
103
+            int fraction = angleSum & 31;
104
+
105
+            if (fraction) // Interpolate
106
+            {
107
+                if (width >= 8 && sizeof(pixel) == 1)
108
+                {
109
+                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
110
+                    const int16x8_t f1 = vdupq_n_s16(fraction);
111
+                    for (int x = 0; x < width; x += 8)
112
+                    {
113
+                        uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
114
+                        uint8x8_t in1 = *(uint8x8_t *)&ref[offset + x + 1];
115
+                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
116
+                        lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
117
+                        lo = vshrq_n_s16(lo, 5);
118
+                        *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
119
+                    }
120
+                }
121
+                else if (width >= 4 && sizeof(pixel) == 2)
122
+                {
123
+                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
124
+                    const int32x4_t f1 = vdupq_n_s32(fraction);
125
+                    for (int x = 0; x < width; x += 4)
126
+                    {
127
+                        uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
128
+                        uint16x4_t in1 = *(uint16x4_t *)&ref[offset + x + 1];
129
+                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
130
+                        lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
131
+                        lo = vshrq_n_s32(lo, 5);
132
+                        *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
133
+                    }
134
+                }
135
+                else
136
+                {
137
+                    for (int x = 0; x < width; x++)
138
+                    {
139
+                        dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
140
+                    }
141
+                }
142
+            }
143
+            else // Copy.
144
+            {
145
+                memcpy(&dst[y * dstStride], &ref[offset], sizeof(pixel)*width);
146
+            }
147
+        }
148
+    }
149
+
150
+    // Flip for horizontal.
151
+    if (horMode)
152
+    {
153
+        if (width == 8)
154
+        {
155
+            transpose8x8(dst, dst, dstStride, dstStride);
156
+        }
157
+        else if (width == 16)
158
+        {
159
+            transpose16x16(dst, dst, dstStride, dstStride);
160
+        }
161
+        else if (width == 32)
162
+        {
163
+            transpose32x32(dst, dst, dstStride, dstStride);
164
+        }
165
+        else
166
+        {
167
+            for (int y = 0; y < width - 1; y++)
168
+            {
169
+                for (int x = y + 1; x < width; x++)
170
+                {
171
+                    pixel tmp                = dst[y * dstStride + x];
172
+                    dst[y * dstStride + x] = dst[x * dstStride + y];
173
+                    dst[x * dstStride + y] = tmp;
174
+                }
175
+            }
176
+        }
177
+    }
178
+}
179
+
180
+template<int log2Size>
181
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
182
+{
183
+    const int size = 1 << log2Size;
184
+    for (int mode = 2; mode <= 34; mode++)
185
+    {
186
+        pixel *srcPix  = (g_intraFilterFlags[mode] & size ? filtPix  : refPix);
187
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
188
+
189
+        intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
190
+
191
+        // Optimize code don't flip buffer
192
+        bool modeHor = (mode < 18);
193
+
194
+        // transpose the block if this is a horizontal mode
195
+        if (modeHor)
196
+        {
197
+            if (size == 8)
198
+            {
199
+                transpose8x8(out, out, size, size);
200
+            }
201
+            else if (size == 16)
202
+            {
203
+                transpose16x16(out, out, size, size);
204
+            }
205
+            else if (size == 32)
206
+            {
207
+                transpose32x32(out, out, size, size);
208
+            }
209
+            else
210
+            {
211
+                for (int k = 0; k < size - 1; k++)
212
+                {
213
+                    for (int l = k + 1; l < size; l++)
214
+                    {
215
+                        pixel tmp           = out[k * size + l];
216
+                        out[k * size + l] = out[l * size + k];
217
+                        out[l * size + k] = tmp;
218
+                    }
219
+                }
220
+            }
221
+        }
222
+    }
223
+}
224
+}
225
+
226
+namespace X265_NS
227
+{
228
+// x265 private namespace
229
+
230
+void setupIntraPrimitives_neon(EncoderPrimitives &p)
231
+{
232
+    for (int i = 2; i < NUM_INTRA_MODE; i++)
233
+    {
234
+        p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_neon<8>;
235
+        p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_neon<16>;
236
+        p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_neon<32>;
237
+    }
238
+    p.cu[BLOCK_4x4].intra_pred[2] = intra_pred_ang_neon<4>;
239
+    p.cu[BLOCK_4x4].intra_pred[10] = intra_pred_ang_neon<4>;
240
+    p.cu[BLOCK_4x4].intra_pred[18] = intra_pred_ang_neon<4>;
241
+    p.cu[BLOCK_4x4].intra_pred[26] = intra_pred_ang_neon<4>;
242
+    p.cu[BLOCK_4x4].intra_pred[34] = intra_pred_ang_neon<4>;
243
+
244
+    p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_neon<2>;
245
+    p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_neon<3>;
246
+    p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_neon<4>;
247
+    p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
248
+}
249
+
250
+}
251
+
252
+
253
+
254
+#else
255
+
256
+namespace X265_NS
257
+{
258
+// x265 private namespace
259
+void setupIntraPrimitives_neon(EncoderPrimitives &p)
260
+{}
261
+}
262
+
263
+#endif
264
+
265
+
266
+
267
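The NEON paths in the angular predictor above vectorise the same two-tap blend that the scalar fallback uses, (((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5). A standalone scalar sketch of that blend (blendRef is a hypothetical helper, not part of the source):

    // Weighted average of two neighbouring reference samples at 1/32-sample precision.
    static inline pixel blendRef(pixel r0, pixel r1, int fraction)
    {
        return (pixel)(((32 - fraction) * r0 + fraction * r1 + 16) >> 5);
    }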
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.h Added
17
 
1
@@ -0,0 +1,15 @@
2
+#ifndef INTRAPRED_PRIM_H__
3
+
4
+#if defined(__aarch64__)
5
+
6
+namespace X265_NS
7
+{
8
+// x265 private namespace
9
+
10
+void setupIntraPrimitives_neon(EncoderPrimitives &p);
11
+}
12
+
13
+#endif
14
+
15
+#endif
16
+
17
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S Added
1438
 
1
@@ -0,0 +1,1436 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+// Macros below follow these conventions:
29
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
+// - constants in registers: v24, v25, v26, v27, v31
31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
+// - _32b macros output a result in v17.4s
33
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
+
35
+#include "asm.S"
36
+
37
+.arch           armv8-a
38
+
39
+#ifdef __APPLE__
40
+.section __RODATA,__rodata
41
+#else
42
+.section .rodata
43
+#endif
44
+
45
+.align 4
46
+
47
+.macro vextin8 v
48
+    ldp             d6, d7, [x11], #16
49
+.if \v == 0
50
+    // qpel_filter_0 only uses values in v3
51
+    ext             v3.8b, v6.8b, v7.8b, #4
52
+.else
53
+.if \v != 3
54
+    ext             v0.8b, v6.8b, v7.8b, #1
55
+.endif
56
+    ext             v1.8b, v6.8b, v7.8b, #2
57
+    ext             v2.8b, v6.8b, v7.8b, #3
58
+    ext             v3.8b, v6.8b, v7.8b, #4
59
+    ext             v4.8b, v6.8b, v7.8b, #5
60
+    ext             v5.8b, v6.8b, v7.8b, #6
61
+    ext             v6.8b, v6.8b, v7.8b, #7
62
+.endif
63
+.endm
64
+
65
+.macro vextin8_64 v
66
+    ldp             q6, q7, [x11], #32
67
+.if \v == 0
68
+    // qpel_filter_0 only uses values in v3
69
+    ext             v3.16b, v6.16b, v7.16b, #4
70
+.else
71
+.if \v != 3
72
+    // qpel_filter_3 does not use values in v0
73
+    ext             v0.16b, v6.16b, v7.16b, #1
74
+.endif
75
+    ext             v1.16b, v6.16b, v7.16b, #2
76
+    ext             v2.16b, v6.16b, v7.16b, #3
77
+    ext             v3.16b, v6.16b, v7.16b, #4
78
+    ext             v4.16b, v6.16b, v7.16b, #5
79
+    ext             v5.16b, v6.16b, v7.16b, #6
80
+.if \v == 1
81
+    ext             v6.16b, v6.16b, v7.16b, #7
82
+    // qpel_filter_1 does not use v7
83
+.else
84
+    ext             v16.16b, v6.16b, v7.16b, #7
85
+    ext             v7.16b, v6.16b, v7.16b, #8
86
+    mov             v6.16b, v16.16b
87
+.endif
88
+.endif
89
+.endm
90
+
91
+.macro vextin8_chroma v
92
+    ldp             d6, d7, [x11], #16
93
+.if \v == 0
94
+    // qpel_filter_chroma_0 only uses values in v1
95
+    ext             v1.8b, v6.8b, v7.8b, #2
96
+.else
97
+    ext             v0.8b, v6.8b, v7.8b, #1
98
+    ext             v1.8b, v6.8b, v7.8b, #2
99
+    ext             v2.8b, v6.8b, v7.8b, #3
100
+    ext             v3.8b, v6.8b, v7.8b, #4
101
+.endif
102
+.endm
103
+
104
+.macro vextin8_chroma_64 v
105
+    ldp             q16, q17, [x11], #32
106
+.if \v == 0
107
+    // qpel_filter_chroma_0 only uses values in v1
108
+    ext             v1.16b, v16.16b, v17.16b, #2
109
+.else
110
+    ext             v0.16b, v16.16b, v17.16b, #1
111
+    ext             v1.16b, v16.16b, v17.16b, #2
112
+    ext             v2.16b, v16.16b, v17.16b, #3
113
+    ext             v3.16b, v16.16b, v17.16b, #4
114
+.endif
115
+.endm
116
+
117
+.macro qpel_load_32b v
118
+.if \v == 0
119
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
120
+    ld1             {v3.8b}, [x6], x1
121
+.elseif \v == 1 || \v == 2 || \v == 3
122
+.if \v != 3                           // not used in qpel_filter_3
123
+    ld1             {v0.8b}, [x6], x1
124
+.else
125
+    add             x6, x6, x1
126
+.endif
127
+    ld1             {v1.8b}, [x6], x1
128
+    ld1             {v2.8b}, [x6], x1
129
+    ld1             {v3.8b}, [x6], x1
130
+    ld1             {v4.8b}, [x6], x1
131
+    ld1             {v5.8b}, [x6], x1
132
+.if \v != 1                           // not used in qpel_filter_1
133
+    ld1             {v6.8b}, [x6], x1
134
+    ld1             {v7.8b}, [x6]
135
+.else
136
+    ld1             {v6.8b}, [x6]
137
+.endif
138
+.endif
139
+.endm
140
+
141
+.macro qpel_load_64b v
142
+.if \v == 0
143
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
144
+    ld1             {v3.16b}, [x6], x1
145
+.elseif \v == 1 || \v == 2 || \v == 3
146
+.if \v != 3                           // not used in qpel_filter_3
147
+    ld1             {v0.16b}, [x6], x1
148
+.else
149
+    add             x6, x6, x1
150
+.endif
151
+    ld1             {v1.16b}, [x6], x1
152
+    ld1             {v2.16b}, [x6], x1
153
+    ld1             {v3.16b}, [x6], x1
154
+    ld1             {v4.16b}, [x6], x1
155
+    ld1             {v5.16b}, [x6], x1
156
+.if \v != 1                           // not used in qpel_filter_1
157
+    ld1             {v6.16b}, [x6], x1
158
+    ld1             {v7.16b}, [x6]
159
+.else
160
+    ld1             {v6.16b}, [x6]
161
+.endif
162
+.endif
163
+.endm
164
+
165
+.macro qpel_chroma_load_32b v
166
+.if \v == 0
167
+    // qpel_filter_chroma_0 only uses values in v1
168
+    add             x6, x6, x1
169
+    ldr             d1, [x6]
170
+.else
171
+    ld1             {v0.8b}, [x6], x1
172
+    ld1             {v1.8b}, [x6], x1
173
+    ld1             {v2.8b}, [x6], x1
174
+    ld1             {v3.8b}, [x6]
175
+.endif
176
+.endm
177
+
178
+.macro qpel_chroma_load_64b v
179
+.if \v == 0
180
+    // qpel_filter_chroma_0 only uses values in v1
181
+    add             x6, x6, x1
182
+    ldr             q1, [x6]
183
+.else
184
+    ld1             {v0.16b}, [x6], x1
185
+    ld1             {v1.16b}, [x6], x1
186
+    ld1             {v2.16b}, [x6], x1
187
+    ld1             {v3.16b}, [x6]
188
+.endif
189
+.endm
190
+
191
+//          a, b,   c,  d,  e,   f, g,  h
192
+// .hword   0, 0,   0, 64,  0,   0, 0,  0
193
+.macro qpel_start_0
194
+    movi            v24.16b, #64
195
+.endm
196
+
197
+.macro qpel_filter_0_32b
198
+    umull           v17.8h, v3.8b, v24.8b    // 64*d
199
+.endm
200
+
201
+.macro qpel_filter_0_64b
202
+    qpel_filter_0_32b
203
+    umull2          v18.8h, v3.16b, v24.16b  // 64*d
204
+.endm
205
+
206
+.macro qpel_start_0_1
207
+    movi            v24.8h, #64
208
+.endm
209
+
210
+.macro qpel_filter_0_32b_1
211
+    smull           v17.4s, v3.4h, v24.4h    // 64*d0
212
+    smull2          v18.4s, v3.8h, v24.8h    // 64*d1
213
+.endm
214
+
215
+//          a, b,   c,  d,  e,   f, g,  h
216
+// .hword  -1, 4, -10, 58, 17,  -5, 1,  0
217
+.macro qpel_start_1
218
+    movi            v24.16b, #58
219
+    movi            v25.16b, #10
220
+    movi            v26.16b, #17
221
+    movi            v27.16b, #5
222
+.endm
223
+
224
+.macro qpel_filter_1_32b
225
+    umull           v19.8h, v2.8b, v25.8b  // c*10
226
+    umull           v17.8h, v3.8b, v24.8b  // d*58
227
+    umull           v21.8h, v4.8b, v26.8b  // e*17
228
+    umull           v23.8h, v5.8b, v27.8b  // f*5
229
+    sub             v17.8h, v17.8h, v19.8h // d*58 - c*10
230
+    ushll           v18.8h, v1.8b, #2      // b*4
231
+    add             v17.8h, v17.8h, v21.8h // d*58 - c*10 + e*17
232
+    usubl           v21.8h, v6.8b, v0.8b   // g - a
233
+    add             v17.8h, v17.8h, v18.8h // d*58 - c*10 + e*17 + b*4
234
+    sub             v21.8h, v21.8h, v23.8h // g - a - f*5
235
+    add             v17.8h, v17.8h, v21.8h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
236
+.endm
237
+
238
+.macro qpel_filter_1_64b
239
+    qpel_filter_1_32b
240
+    umull2          v20.8h, v2.16b, v25.16b  // c*10
241
+    umull2          v18.8h, v3.16b, v24.16b  // d*58
242
+    umull2          v21.8h, v4.16b, v26.16b  // e*17
243
+    umull2          v23.8h, v5.16b, v27.16b  // f*5
244
+    sub             v18.8h, v18.8h, v20.8h   // d*58 - c*10
245
+    ushll2          v28.8h, v1.16b, #2       // b*4
246
+    add             v18.8h, v18.8h, v21.8h   // d*58 - c*10 + e*17
247
+    usubl2          v21.8h, v6.16b, v0.16b   // g - a
248
+    add             v18.8h, v18.8h, v28.8h   // d*58 - c*10 + e*17 + b*4
249
+    sub             v21.8h, v21.8h, v23.8h   // g - a - f*5
250
+    add             v18.8h, v18.8h, v21.8h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
251
+.endm
252
+
253
+.macro qpel_start_1_1
254
+    movi            v24.8h, #58
255
+    movi            v25.8h, #10
256
+    movi            v26.8h, #17
257
+    movi            v27.8h, #5
258
+.endm
259
+
260
+.macro qpel_filter_1_32b_1
261
+    smull           v17.4s, v3.4h, v24.4h    // 58 * d0
262
+    smull2          v18.4s, v3.8h, v24.8h    // 58 * d1
263
+    smull           v19.4s, v2.4h, v25.4h    // 10 * c0
264
+    smull2          v20.4s, v2.8h, v25.8h    // 10 * c1
265
+    smull           v21.4s, v4.4h, v26.4h    // 17 * e0
266
+    smull2          v22.4s, v4.8h, v26.8h    // 17 * e1
267
+    smull           v23.4s, v5.4h, v27.4h    //  5 * f0
268
+    smull2          v16.4s, v5.8h, v27.8h    //  5 * f1
269
+    sub             v17.4s, v17.4s, v19.4s   // 58 * d0 - 10 * c0
270
+    sub             v18.4s, v18.4s, v20.4s   // 58 * d1 - 10 * c1
271
+    sshll           v19.4s, v1.4h, #2        // 4 * b0
272
+    sshll2          v20.4s, v1.8h, #2        // 4 * b1
273
+    add             v17.4s, v17.4s, v21.4s   // 58 * d0 - 10 * c0 + 17 * e0
274
+    add             v18.4s, v18.4s, v22.4s   // 58 * d1 - 10 * c1 + 17 * e1
275
+    ssubl           v21.4s, v6.4h, v0.4h     // g0 - a0
276
+    ssubl2          v22.4s, v6.8h, v0.8h     // g1 - a1
277
+    add             v17.4s, v17.4s, v19.4s   // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
278
+    add             v18.4s, v18.4s, v20.4s   // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
279
+    sub             v21.4s, v21.4s, v23.4s   // g0 - a0 - 5 * f0
280
+    sub             v22.4s, v22.4s, v16.4s   // g1 - a1 - 5 * f1
281
+    add             v17.4s, v17.4s, v21.4s   // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
282
+    add             v18.4s, v18.4s, v22.4s   // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
283
+.endm
284
+
285
+//          a, b,   c,  d,  e,   f, g,  h
286
+// .hword  -1, 4, -11, 40, 40, -11, 4, -1
287
+.macro qpel_start_2
288
+    movi            v24.8h, #11
289
+    movi            v25.8h, #40
290
+.endm
291
+
292
+.macro qpel_filter_2_32b
293
+    uaddl           v17.8h, v3.8b, v4.8b     // d + e
294
+    uaddl           v19.8h, v2.8b, v5.8b     // c + f
295
+    uaddl           v23.8h, v1.8b, v6.8b     // b + g
296
+    uaddl           v21.8h, v0.8b, v7.8b     // a + h
297
+    mul             v17.8h, v17.8h, v25.8h   // 40 * (d + e)
298
+    mul             v19.8h, v19.8h, v24.8h   // 11 * (c + f)
299
+    shl             v23.8h, v23.8h, #2       // (b + g) * 4
300
+    add             v19.8h, v19.8h, v21.8h   // 11 * (c + f) + a + h
301
+    add             v17.8h, v17.8h, v23.8h   // 40 * (d + e) + (b + g) * 4
302
+    sub             v17.8h, v17.8h, v19.8h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
303
+.endm
304
+
305
+.macro qpel_filter_2_64b
306
+    qpel_filter_2_32b
307
+    uaddl2          v27.8h, v3.16b, v4.16b   // d + e
308
+    uaddl2          v16.8h, v2.16b, v5.16b   // c + f
309
+    uaddl2          v23.8h, v1.16b, v6.16b   // b + g
310
+    uaddl2          v21.8h, v0.16b, v7.16b   // a + h
311
+    mul             v27.8h, v27.8h, v25.8h   // 40 * (d + e)
312
+    mul             v16.8h, v16.8h, v24.8h   // 11 * (c + f)
313
+    shl             v23.8h, v23.8h, #2       // (b + g) * 4
314
+    add             v16.8h, v16.8h, v21.8h   // 11 * (c + f) + a + h
315
+    add             v27.8h, v27.8h, v23.8h   // 40 * (d + e) + (b + g) * 4
316
+    sub             v18.8h, v27.8h, v16.8h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
317
+.endm
318
+
319
+.macro qpel_start_2_1
320
+    movi            v24.4s, #11
321
+    movi            v25.4s, #40
322
+.endm
323
+
324
+.macro qpel_filter_2_32b_1
325
+    saddl           v17.4s, v3.4h, v4.4h     // d0 + e0
326
+    saddl2          v18.4s, v3.8h, v4.8h     // d1 + e1
327
+    saddl           v19.4s, v2.4h, v5.4h     // c0 + f0
328
+    saddl2          v20.4s, v2.8h, v5.8h     // c1 + f1
329
+    mul             v19.4s, v19.4s, v24.4s   // 11 * (c0 + f0)
330
+    mul             v20.4s, v20.4s, v24.4s   // 11 * (c1 + f1)
331
+    saddl           v23.4s, v1.4h, v6.4h     // b0 + g0
332
+    mul             v17.4s, v17.4s, v25.4s   // 40 * (d0 + e0)
333
+    mul             v18.4s, v18.4s, v25.4s   // 40 * (d1 + e1)
334
+    saddl2          v16.4s, v1.8h, v6.8h     // b1 + g1
335
+    saddl           v21.4s, v0.4h, v7.4h     // a0 + h0
336
+    saddl2          v22.4s, v0.8h, v7.8h     // a1 + h1
337
+    shl             v23.4s, v23.4s, #2       // 4*(b0+g0)
338
+    shl             v16.4s, v16.4s, #2       // 4*(b1+g1)
339
+    add             v19.4s, v19.4s, v21.4s   // 11 * (c0 + f0) + a0 + h0
340
+    add             v20.4s, v20.4s, v22.4s   // 11 * (c1 + f1) + a1 + h1
341
+    add             v17.4s, v17.4s, v23.4s   // 40 * (d0 + e0) + 4*(b0+g0)
342
+    add             v18.4s, v18.4s, v16.4s   // 40 * (d1 + e1) + 4*(b1+g1)
343
+    sub             v17.4s, v17.4s, v19.4s   // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
344
+    sub             v18.4s, v18.4s, v20.4s   // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
345
+.endm
346
+
347
+//          a, b,   c,  d,  e,   f, g,  h
348
+// .hword   0, 1,  -5, 17, 58, -10, 4, -1
349
+.macro qpel_start_3
350
+    movi            v24.16b, #17
351
+    movi            v25.16b, #5
352
+    movi            v26.16b, #58
353
+    movi            v27.16b, #10
354
+.endm
355
+
356
+.macro qpel_filter_3_32b
357
+    umull           v19.8h, v2.8b, v25.8b    // c * 5
358
+    umull           v17.8h, v3.8b, v24.8b    // d * 17
359
+    umull           v21.8h, v4.8b, v26.8b    // e * 58
360
+    umull           v23.8h, v5.8b, v27.8b    // f * 10
361
+    sub             v17.8h, v17.8h, v19.8h   // d * 17 - c * 5
362
+    ushll           v19.8h, v6.8b, #2        // g * 4
363
+    add             v17.8h, v17.8h, v21.8h   // d * 17 - c * 5 + e * 58
364
+    usubl           v21.8h, v1.8b, v7.8b     // b - h
365
+    add             v17.8h, v17.8h, v19.8h   // d * 17 - c * 5 + e * 58 + g * 4
366
+    sub             v21.8h, v21.8h, v23.8h   // b - h - f * 10
367
+    add             v17.8h, v17.8h, v21.8h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
368
+.endm
369
+
370
+.macro qpel_filter_3_64b
371
+    qpel_filter_3_32b
372
+    umull2          v16.8h, v2.16b, v25.16b  // c * 5
373
+    umull2          v18.8h, v3.16b, v24.16b  // d * 17
374
+    umull2          v21.8h, v4.16b, v26.16b  // e * 58
375
+    umull2          v23.8h, v5.16b, v27.16b  // f * 10
376
+    sub             v18.8h, v18.8h, v16.8h   // d * 17 - c * 5
377
+    ushll2          v16.8h, v6.16b, #2       // g * 4
378
+    add             v18.8h, v18.8h, v21.8h   // d * 17 - c * 5 + e * 58
379
+    usubl2          v21.8h, v1.16b, v7.16b   // b - h
380
+    add             v18.8h, v18.8h, v16.8h   // d * 17 - c * 5 + e * 58 + g * 4
381
+    sub             v21.8h, v21.8h, v23.8h   // b - h - f * 10
382
+    add             v18.8h, v18.8h, v21.8h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
383
+.endm
384
+
385
+.macro qpel_start_3_1
386
+    movi            v24.8h, #17
387
+    movi            v25.8h, #5
388
+    movi            v26.8h, #58
389
+    movi            v27.8h, #10
390
+.endm
391
+
392
+.macro qpel_filter_3_32b_1
393
+    smull           v17.4s, v3.4h, v24.4h    // 17 * d0
394
+    smull2          v18.4s, v3.8h, v24.8h    // 17 * d1
395
+    smull           v19.4s, v2.4h, v25.4h    //  5 * c0
396
+    smull2          v20.4s, v2.8h, v25.8h    //  5 * c1
397
+    smull           v21.4s, v4.4h, v26.4h    // 58 * e0
398
+    smull2          v22.4s, v4.8h, v26.8h    // 58 * e1
399
+    smull           v23.4s, v5.4h, v27.4h    // 10 * f0
400
+    smull2          v16.4s, v5.8h, v27.8h    // 10 * f1
401
+    sub             v17.4s, v17.4s, v19.4s   // 17 * d0 - 5 * c0
402
+    sub             v18.4s, v18.4s, v20.4s   // 17 * d1 - 5 * c1
403
+    sshll           v19.4s, v6.4h, #2        //  4 * g0
404
+    sshll2          v20.4s, v6.8h, #2        //  4 * g1
405
+    add             v17.4s, v17.4s, v21.4s   // 17 * d0 - 5 * c0 + 58 * e0
406
+    add             v18.4s, v18.4s, v22.4s   // 17 * d1 - 5 * c1 + 58 * e1
407
+    ssubl           v21.4s, v1.4h, v7.4h     // b0 - h0
408
+    ssubl2          v22.4s, v1.8h, v7.8h     // b1 - h1
409
+    add             v17.4s, v17.4s, v19.4s   // 17 * d0 - 5 * c0 + 58 * e0 + 4 * g0
410
+    add             v18.4s, v18.4s, v20.4s   // 17 * d1 - 5 * c1 + 58 * e1 + 4 * g1
411
+    sub             v21.4s, v21.4s, v23.4s   // b0 - h0 - 10 * f0
412
+    sub             v22.4s, v22.4s, v16.4s   // b1 - h1 - 10 * f1
413
+    add             v17.4s, v17.4s, v21.4s   // 17 * d0 - 5 * c0 + 58 * e0 + 4 * g0 + b0 - h0 - 10 * f0
414
+    add             v18.4s, v18.4s, v22.4s   // 17 * d1 - 5 * c1 + 58 * e1 + 4 * g1 + b1 - h1 - 10 * f1
415
+.endm
416
+
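The qpel_* macros above implement the four 8-tap HEVC luma interpolation filters; the taps for each fractional position are the ones listed in the a..h comments. As a reading aid for the vector code, here is a scalar C sketch of the same computation (table and function names are illustrative, not taken from the x265 sources):

    #include <stdint.h>
    #include <stddef.h>

    /* 8-tap luma taps, one row per coeffIdx 0-3 (row 0 is the copy filter). */
    static const int16_t luma_taps[4][8] = {
        {  0, 0,   0, 64,  0,   0, 0,  0 },
        { -1, 4, -10, 58, 17,  -5, 1,  0 },
        { -1, 4, -11, 40, 40, -11, 4, -1 },
        {  0, 1,  -5, 17, 58, -10, 4, -1 },
    };

    /* One raw filter sum: src points at sample 'a', step is the distance
       between taps (1 for horizontal filters, srcStride for vertical). */
    static int32_t luma_filter_sum(const uint8_t *src, ptrdiff_t step, int coeffIdx)
    {
        int32_t sum = 0;
        for (int t = 0; t < 8; t++)
            sum += luma_taps[coeffIdx][t] * src[t * step];
        return sum;   /* offset/shift/narrowing is done by the *_end macros below */
    }

The pp/ps/sp/ss variants further down differ only in how this raw sum is offset, shifted and narrowed.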
417
+.macro qpel_start_chroma_0
418
+    movi            v24.16b, #64
419
+.endm
420
+
421
+.macro qpel_filter_chroma_0_32b
422
+    umull           v17.8h, v1.8b, v24.8b    // 64*b
423
+.endm
424
+
425
+.macro qpel_filter_chroma_0_64b
426
+    umull           v17.8h, v1.8b, v24.8b    // 64*b
427
+    umull2          v18.8h, v1.16b, v24.16b  // 64*b
428
+.endm
429
+
430
+.macro qpel_start_chroma_0_1
431
+    movi            v24.8h, #64
432
+.endm
433
+
434
+.macro qpel_filter_chroma_0_32b_1
435
+    smull           v17.4s, v1.4h, v24.4h    // 64*b0
436
+    smull2          v18.4s, v1.8h, v24.8h    // 64*b1
437
+.endm
438
+
439
+.macro qpel_start_chroma_1
440
+    movi            v24.16b, #58
441
+    movi            v25.16b, #10
442
+.endm
443
+
444
+.macro qpel_filter_chroma_1_32b
445
+    umull           v17.8h, v1.8b, v24.8b    // 58 * b
446
+    umull           v19.8h, v2.8b, v25.8b    // 10 * c
447
+    uaddl           v22.8h, v0.8b, v3.8b     // a + d
448
+    shl             v22.8h, v22.8h, #1       // 2 * (a+d)
449
+    sub             v17.8h, v17.8h, v22.8h   // 58*b - 2*(a+d)
450
+    add             v17.8h, v17.8h, v19.8h   // 58*b-2*(a+d) + 10*c
451
+.endm
452
+
453
+.macro qpel_filter_chroma_1_64b
454
+    umull           v17.8h, v1.8b, v24.8b    // 58 * b
455
+    umull2          v18.8h, v1.16b, v24.16b  // 58 * b
456
+    umull           v19.8h, v2.8b, v25.8b    // 10 * c
457
+    umull2          v20.8h, v2.16b, v25.16b  // 10 * c
458
+    uaddl           v22.8h, v0.8b, v3.8b     // a + d
459
+    uaddl2          v23.8h, v0.16b, v3.16b   // a + d
460
+    shl             v22.8h, v22.8h, #1       // 2 * (a+d)
461
+    shl             v23.8h, v23.8h, #1       // 2 * (a+d)
462
+    sub             v17.8h, v17.8h, v22.8h   // 58*b - 2*(a+d)
463
+    sub             v18.8h, v18.8h, v23.8h   // 58*b - 2*(a+d)
464
+    add             v17.8h, v17.8h, v19.8h   // 58*b-2*(a+d) + 10*c
465
+    add             v18.8h, v18.8h, v20.8h   // 58*b-2*(a+d) + 10*c
466
+.endm
467
+
468
+.macro qpel_start_chroma_1_1
469
+    movi            v24.8h, #58
470
+    movi            v25.8h, #10
471
+.endm
472
+
473
+.macro qpel_filter_chroma_1_32b_1
474
+    smull           v17.4s, v1.4h, v24.4h    // 58 * b0
475
+    smull2          v18.4s, v1.8h, v24.8h    // 58 * b1
476
+    smull           v19.4s, v2.4h, v25.4h    // 10 * c0
477
+    smull2          v20.4s, v2.8h, v25.8h    // 10 * c1
478
+    add             v22.8h, v0.8h, v3.8h     // a + d
479
+    sshll           v21.4s, v22.4h, #1       // 2 * (a0+d0)
480
+    sshll2          v22.4s, v22.8h, #1       // 2 * (a1+d1)
481
+    sub             v17.4s, v17.4s, v21.4s   // 58*b0 - 2*(a0+d0)
482
+    sub             v18.4s, v18.4s, v22.4s   // 58*b1 - 2*(a1+d1)
483
+    add             v17.4s, v17.4s, v19.4s   // 58*b0-2*(a0+d0) + 10*c0
484
+    add             v18.4s, v18.4s, v20.4s   // 58*b1-2*(a1+d1) + 10*c1
485
+.endm
486
+
487
+.macro qpel_start_chroma_2
488
+    movi            v25.16b, #54
489
+.endm
490
+
491
+.macro qpel_filter_chroma_2_32b
492
+    umull           v17.8h, v1.8b, v25.8b    // 54 * b
493
+    ushll           v19.8h, v0.8b, #2        // 4 * a
494
+    ushll           v21.8h, v2.8b, #4        // 16 * c
495
+    ushll           v23.8h, v3.8b, #1        // 2 * d
496
+    add             v17.8h, v17.8h, v21.8h   // 54*b + 16*c
497
+    add             v19.8h, v19.8h, v23.8h   // 4*a + 2*d
498
+    sub             v17.8h, v17.8h, v19.8h   // 54*b+16*c - (4*a+2*d)
499
+.endm
500
+
501
+.macro qpel_filter_chroma_2_64b
502
+    umull           v17.8h, v1.8b, v25.8b    // 54 * b
503
+    umull2          v18.8h, v1.16b, v25.16b  // 54 * b
504
+    ushll           v19.8h, v0.8b, #2        // 4 * a
505
+    ushll2          v20.8h, v0.16b, #2       // 4 * a
506
+    ushll           v21.8h, v2.8b, #4        // 16 * c
507
+    ushll2          v22.8h, v2.16b, #4       // 16 * c
508
+    ushll           v23.8h, v3.8b, #1        // 2 * d
509
+    ushll2          v24.8h, v3.16b, #1       // 2 * d
510
+    add             v17.8h, v17.8h, v21.8h   // 54*b + 16*c
511
+    add             v18.8h, v18.8h, v22.8h   // 54*b + 16*c
512
+    add             v19.8h, v19.8h, v23.8h   // 4*a + 2*d
513
+    add             v20.8h, v20.8h, v24.8h   // 4*a + 2*d
514
+    sub             v17.8h, v17.8h, v19.8h   // 54*b+16*c - (4*a+2*d)
515
+    sub             v18.8h, v18.8h, v20.8h   // 54*b+16*c - (4*a+2*d)
516
+.endm
517
+
518
+.macro qpel_start_chroma_2_1
519
+    movi            v25.8h, #54
520
+.endm
521
+
522
+.macro qpel_filter_chroma_2_32b_1
523
+    smull           v17.4s, v1.4h, v25.4h    // 54 * b0
524
+    smull2          v18.4s, v1.8h, v25.8h    // 54 * b1
525
+    sshll           v19.4s, v0.4h, #2        // 4 * a0
526
+    sshll2          v20.4s, v0.8h, #2        // 4 * a1
527
+    sshll           v21.4s, v2.4h, #4        // 16 * c0
528
+    sshll2          v22.4s, v2.8h, #4        // 16 * c1
529
+    sshll           v23.4s, v3.4h, #1        // 2 * d0
530
+    sshll2          v24.4s, v3.8h, #1        // 2 * d1
531
+    add             v17.4s, v17.4s, v21.4s   // 54*b0 + 16*c0
532
+    add             v18.4s, v18.4s, v22.4s   // 54*b1 + 16*c1
533
+    add             v19.4s, v19.4s, v23.4s   // 4*a0 + 2*d0
534
+    add             v20.4s, v20.4s, v24.4s   // 4*a1 + 2*d1
535
+    sub             v17.4s, v17.4s, v19.4s   // 54*b0+16*c0 - (4*a0+2*d0)
536
+    sub             v18.4s, v18.4s, v20.4s   // 54*b1+16*c1 - (4*a1+2*d1)
537
+.endm
538
+
539
+.macro qpel_start_chroma_3
540
+    movi            v25.16b, #46
541
+    movi            v26.16b, #28
542
+    movi            v27.16b, #6
543
+.endm
544
+
545
+.macro qpel_filter_chroma_3_32b
546
+    umull           v17.8h, v1.8b, v25.8b    // 46 * b
547
+    umull           v19.8h, v2.8b, v26.8b    // 28 * c
548
+    ushll           v21.8h, v3.8b, #2        // 4 * d
549
+    umull           v23.8h, v0.8b, v27.8b    // 6 * a
550
+    add             v17.8h, v17.8h, v19.8h   // 46*b + 28*c
551
+    add             v21.8h, v21.8h, v23.8h   // 4*d + 6*a
552
+    sub             v17.8h, v17.8h, v21.8h   // 46*b+28*c - (4*d+6*a)
553
+.endm
554
+
555
+.macro qpel_filter_chroma_3_64b
556
+    umull           v17.8h, v1.8b, v25.8b    // 46 * b
557
+    umull2          v18.8h, v1.16b, v25.16b  // 46 * b
558
+    umull           v19.8h, v2.8b, v26.8b    // 28 * c
559
+    umull2          v20.8h, v2.16b, v26.16b  // 28 * c
560
+    ushll           v21.8h, v3.8b, #2        // 4 * d
561
+    ushll2          v22.8h, v3.16b, #2       // 4 * d
562
+    umull           v23.8h, v0.8b, v27.8b    // 6 * a
563
+    umull2          v24.8h, v0.16b, v27.16b  // 6 * a
564
+    add             v17.8h, v17.8h, v19.8h   // 46*b + 28*c
565
+    add             v18.8h, v18.8h, v20.8h   // 46*b + 28*c
566
+    add             v21.8h, v21.8h, v23.8h   // 4*d + 6*a
567
+    add             v22.8h, v22.8h, v24.8h   // 4*d + 6*a
568
+    sub             v17.8h, v17.8h, v21.8h   // 46*b+28*c - (4*d+6*a)
569
+    sub             v18.8h, v18.8h, v22.8h   // 46*b+28*c - (4*d+6*a)
570
+.endm
571
+
572
+.macro qpel_start_chroma_3_1
573
+    movi            v25.8h, #46
574
+    movi            v26.8h, #28
575
+    movi            v27.8h, #6
576
+.endm
577
+
578
+.macro qpel_filter_chroma_3_32b_1
579
+    smull           v17.4s, v1.4h, v25.4h    // 46 * b0
580
+    smull2          v18.4s, v1.8h, v25.8h    // 46 * b1
581
+    smull           v19.4s, v2.4h, v26.4h    // 28 * c0
582
+    smull2          v20.4s, v2.8h, v26.8h    // 28 * c1
583
+    sshll           v21.4s, v3.4h, #2        // 4 * d0
584
+    sshll2          v22.4s, v3.8h, #2        // 4 * d1
585
+    smull           v23.4s, v0.4h, v27.4h    // 6 * a0
586
+    smull2          v24.4s, v0.8h, v27.8h    // 6 * a1
587
+    add             v17.4s, v17.4s, v19.4s   // 46*b0 + 28*c0
588
+    add             v18.4s, v18.4s, v20.4s   // 46*b1 + 28*c1
589
+    add             v21.4s, v21.4s, v23.4s   // 4*d0 + 6*a0
590
+    add             v22.4s, v22.4s, v24.4s   // 4*d1 + 6*a1
591
+    sub             v17.4s, v17.4s, v21.4s   // 46*b0+28*c0 - (4*d0+6*a0)
592
+    sub             v18.4s, v18.4s, v22.4s   // 46*b1+28*c1 - (4*d1+6*a1)
593
+.endm
594
+
595
+.macro qpel_start_chroma_4
596
+    movi            v24.8h, #36
597
+.endm
598
+
599
+.macro qpel_filter_chroma_4_32b
600
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
601
+    uaddl           v17.8h, v1.8b, v2.8b     // b + c
602
+    shl             v20.8h, v20.8h, #2       // 4 * (a+d)
603
+    mul             v17.8h, v17.8h, v24.8h   // 36 * (b+c)
604
+    sub             v17.8h, v17.8h, v20.8h   // 36*(b+c) - 4*(a+d)
605
+.endm
606
+
607
+.macro qpel_filter_chroma_4_64b
608
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
609
+    uaddl2          v21.8h, v0.16b, v3.16b   // a + d
610
+    uaddl           v17.8h, v1.8b, v2.8b     // b + c
611
+    uaddl2          v18.8h, v1.16b, v2.16b   // b + c
612
+    shl             v20.8h, v20.8h, #2       // 4 * (a+d)
613
+    shl             v21.8h, v21.8h, #2       // 4 * (a+d)
614
+    mul             v17.8h, v17.8h, v24.8h   // 36 * (b+c)
615
+    mul             v18.8h, v18.8h, v24.8h   // 36 * (b+c)
616
+    sub             v17.8h, v17.8h, v20.8h   // 36*(b+c) - 4*(a+d)
617
+    sub             v18.8h, v18.8h, v21.8h   // 36*(b+c) - 4*(a+d)
618
+.endm
619
+
620
+.macro qpel_start_chroma_4_1
621
+    movi            v24.8h, #36
622
+.endm
623
+
624
+.macro qpel_filter_chroma_4_32b_1
625
+    add             v20.8h, v0.8h, v3.8h     // a + d
626
+    add             v21.8h, v1.8h, v2.8h     // b + c
627
+    smull           v17.4s, v21.4h, v24.4h   // 36 * (b0+c0)
628
+    smull2          v18.4s, v21.8h, v24.8h   // 36 * (b1+c1)
629
+    sshll           v21.4s, v20.4h, #2       // 4 * (a0+d0)
630
+    sshll2          v22.4s, v20.8h, #2       // 4 * (a1+d1)
631
+    sub             v17.4s, v17.4s, v21.4s   // 36*(b0+c0) - 4*(a0+d0)
632
+    sub             v18.4s, v18.4s, v22.4s   // 36*(b1+c1) - 4*(a1+d1)
633
+.endm
634
+
635
+.macro qpel_start_chroma_5
636
+    movi            v25.16b, #28
637
+    movi            v26.16b, #46
638
+    movi            v27.16b, #6
639
+.endm
640
+
641
+.macro qpel_filter_chroma_5_32b
642
+    umull           v17.8h, v1.8b, v25.8b    // 28 * b
643
+    umull           v19.8h, v2.8b, v26.8b    // 46 * c
644
+    ushll           v21.8h, v0.8b, #2        // 4 * a
645
+    umull           v23.8h, v3.8b, v27.8b    // 6 * d
646
+    add             v17.8h, v17.8h, v19.8h   // 28*b + 46*c
647
+    add             v21.8h, v21.8h, v23.8h   // 4*a + 6*d
648
+    sub             v17.8h, v17.8h, v21.8h   // 28*b+46*c - (4*a+6*d)
649
+.endm
650
+
651
+.macro qpel_filter_chroma_5_64b
652
+    umull           v17.8h, v1.8b, v25.8b    // 28 * b
653
+    umull2          v18.8h, v1.16b, v25.16b  // 28 * b
654
+    umull           v19.8h, v2.8b, v26.8b    // 46 * c
655
+    umull2          v20.8h, v2.16b, v26.16b  // 46 * c
656
+    ushll           v21.8h, v0.8b, #2        // 4 * a
657
+    ushll2          v22.8h, v0.16b, #2       // 4 * a
658
+    umull           v23.8h, v3.8b, v27.8b    // 6 * d
659
+    umull2          v24.8h, v3.16b, v27.16b  // 6 * d
660
+    add             v17.8h, v17.8h, v19.8h   // 28*b + 46*c
661
+    add             v18.8h, v18.8h, v20.8h   // 28*b + 46*c
662
+    add             v21.8h, v21.8h, v23.8h   // 4*a + 6*d
663
+    add             v22.8h, v22.8h, v24.8h   // 4*a + 6*d
664
+    sub             v17.8h, v17.8h, v21.8h   // 28*b+46*c - (4*a+6*d)
665
+    sub             v18.8h, v18.8h, v22.8h   // 28*b+46*c - (4*a+6*d)
666
+.endm
667
+
668
+.macro qpel_start_chroma_5_1
669
+    movi            v25.8h, #28
670
+    movi            v26.8h, #46
671
+    movi            v27.8h, #6
672
+.endm
673
+
674
+.macro qpel_filter_chroma_5_32b_1
675
+    smull           v17.4s, v1.4h, v25.4h    // 28 * b0
676
+    smull2          v18.4s, v1.8h, v25.8h    // 28 * b1
677
+    smull           v19.4s, v2.4h, v26.4h    // 46 * c0
678
+    smull2          v20.4s, v2.8h, v26.8h    // 46 * c1
679
+    sshll           v21.4s, v0.4h, #2        // 4 * a0
680
+    sshll2          v22.4s, v0.8h, #2        // 4 * a1
681
+    smull           v23.4s, v3.4h, v27.4h    // 6 * d0
682
+    smull2          v24.4s, v3.8h, v27.8h    // 6 * d1
683
+    add             v17.4s, v17.4s, v19.4s   // 28*b0 + 46*c0
684
+    add             v18.4s, v18.4s, v20.4s   // 28*b1 + 46*c1
685
+    add             v21.4s, v21.4s, v23.4s   // 4*a0 + 6*d0
686
+    add             v22.4s, v22.4s, v24.4s   // 4*a1 + 6*d1
687
+    sub             v17.4s, v17.4s, v21.4s   // 28*b0+46*c0 - (4*a0+6*d0)
688
+    sub             v18.4s, v18.4s, v22.4s   // 28*b1+46*c1 - (4*a1+6*d1)
689
+.endm
690
+
691
+.macro qpel_start_chroma_6
692
+    movi            v25.16b, #54
693
+.endm
694
+
695
+.macro qpel_filter_chroma_6_32b
696
+    umull           v17.8h, v2.8b, v25.8b    // 54 * c
697
+    ushll           v19.8h, v0.8b, #1        // 2 * a
698
+    ushll           v21.8h, v1.8b, #4        // 16 * b
699
+    ushll           v23.8h, v3.8b, #2        // 4 * d
700
+    add             v17.8h, v17.8h, v21.8h   // 54*c + 16*b
701
+    add             v19.8h, v19.8h, v23.8h   // 2*a + 4*d
702
+    sub             v17.8h, v17.8h, v19.8h   // 54*c+16*b - (2*a+4*d)
703
+.endm
704
+
705
+.macro qpel_filter_chroma_6_64b
706
+    umull           v17.8h, v2.8b, v25.8b    // 54 * c
707
+    umull2          v18.8h, v2.16b, v25.16b  // 54 * c
708
+    ushll           v19.8h, v0.8b, #1        // 2 * a
709
+    ushll2          v20.8h, v0.16b, #1       // 2 * a
710
+    ushll           v21.8h, v1.8b, #4        // 16 * b
711
+    ushll2          v22.8h, v1.16b, #4       // 16 * b
712
+    ushll           v23.8h, v3.8b, #2        // 4 * d
713
+    ushll2          v24.8h, v3.16b, #2       // 4 * d
714
+    add             v17.8h, v17.8h, v21.8h   // 54*c + 16*b
715
+    add             v18.8h, v18.8h, v22.8h   // 54*c + 16*b
716
+    add             v19.8h, v19.8h, v23.8h   // 2*a + 4*d
717
+    add             v20.8h, v20.8h, v24.8h   // 2*a + 4*d
718
+    sub             v17.8h, v17.8h, v19.8h   // 54*c+16*b - (2*a+4*d)
719
+    sub             v18.8h, v18.8h, v20.8h   // 54*c+16*b - (2*a+4*d)
720
+.endm
721
+
722
+.macro qpel_start_chroma_6_1
723
+    movi            v25.8h, #54
724
+.endm
725
+
726
+.macro qpel_filter_chroma_6_32b_1
727
+    smull           v17.4s, v2.4h, v25.4h    // 54 * c0
728
+    smull2          v18.4s, v2.8h, v25.8h    // 54 * c1
729
+    sshll           v19.4s, v0.4h, #1        // 2 * a0
730
+    sshll2          v20.4s, v0.8h, #1        // 2 * a1
731
+    sshll           v21.4s, v1.4h, #4        // 16 * b0
732
+    sshll2          v22.4s, v1.8h, #4        // 16 * b1
733
+    sshll           v23.4s, v3.4h, #2        // 4 * d0
734
+    sshll2          v24.4s, v3.8h, #2        // 4 * d1
735
+    add             v17.4s, v17.4s, v21.4s   // 54*c0 + 16*b0
736
+    add             v18.4s, v18.4s, v22.4s   // 54*c1 + 16*b1
737
+    add             v19.4s, v19.4s, v23.4s   // 2*a0 + 4*d0
738
+    add             v20.4s, v20.4s, v24.4s   // 2*a1 + 4*d1
739
+    sub             v17.4s, v17.4s, v19.4s   // 54*c0+16*b0 - (2*a0+4*d0)
740
+    sub             v18.4s, v18.4s, v20.4s   // 54*c1+16*b1 - (2*a1+4*d1)
741
+.endm
742
+
743
+.macro qpel_start_chroma_7
744
+    movi            v24.16b, #58
745
+    movi            v25.16b, #10
746
+.endm
747
+
748
+.macro qpel_filter_chroma_7_32b
749
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
750
+    umull           v17.8h, v2.8b, v24.8b    // 58 * c
751
+    shl             v20.8h, v20.8h, #1       // 2 * (a+d)
752
+    umull           v19.8h, v1.8b, v25.8b    // 10 * b
753
+    sub             v17.8h, v17.8h, v20.8h   // 58*c - 2*(a+d)
754
+    add             v17.8h, v17.8h, v19.8h   // 58*c-2*(a+d) + 10*b
755
+.endm
756
+
757
+.macro qpel_filter_chroma_7_64b
758
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
759
+    uaddl2          v21.8h, v0.16b, v3.16b   // a + d
760
+    umull           v17.8h, v2.8b, v24.8b    // 58 * c
761
+    umull2          v18.8h, v2.16b, v24.16b  // 58 * c
762
+    shl             v20.8h, v20.8h, #1       // 2 * (a+d)
763
+    shl             v21.8h, v21.8h, #1       // 2 * (a+d)
764
+    umull           v22.8h, v1.8b, v25.8b    // 10 * b
765
+    umull2          v23.8h, v1.16b, v25.16b  // 10 * b
766
+    sub             v17.8h, v17.8h, v20.8h   // 58*c - 2*(a+d)
767
+    sub             v18.8h, v18.8h, v21.8h   // 58*c - 2*(a+d)
768
+    add             v17.8h, v17.8h, v22.8h   // 58*c-2*(a+d) + 10*b
769
+    add             v18.8h, v18.8h, v23.8h   // 58*c-2*(a+d) + 10*b
770
+.endm
771
+
772
+.macro qpel_start_chroma_7_1
773
+    movi            v24.8h, #58
774
+    movi            v25.8h, #10
775
+.endm
776
+
777
+.macro qpel_filter_chroma_7_32b_1
778
+    add             v20.8h, v0.8h, v3.8h     // a + d
779
+    smull           v17.4s, v2.4h, v24.4h    // 58 * c0
780
+    smull2          v18.4s, v2.8h, v24.8h    // 58 * c1
781
+    sshll           v21.4s, v20.4h, #1       // 2 * (a0+d0)
782
+    sshll2          v22.4s, v20.8h, #1       // 2 * (a1+d1)
783
+    smull           v19.4s, v1.4h, v25.4h    // 10 * b0
784
+    smull2          v20.4s, v1.8h, v25.8h    // 10 * b1
785
+    sub             v17.4s, v17.4s, v21.4s   // 58*c0 - 2*(a0+d0)
786
+    sub             v18.4s, v18.4s, v22.4s   // 58*c1 - 2*(a1+d1)
787
+    add             v17.4s, v17.4s, v19.4s   // 58*c0-2*(a0+d0) + 10*b0
788
+    add             v18.4s, v18.4s, v20.4s   // 58*c1-2*(a1+d1) + 10*b1
789
+.endm
790
+
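The qpel_*_chroma_* groups above cover the eight 4-tap chroma fractional positions. Reading the constants back out of the macro arithmetic gives the following tap table (a sketch; the array name is illustrative):

    /* 4-tap chroma taps (a, b, c, d) per fractional position 0-7,
       as used by the qpel_filter_chroma_N macros above. */
    static const int16_t chroma_taps[8][4] = {
        {  0, 64,  0,  0 },   /* 64*b                        */
        { -2, 58, 10, -2 },   /* 58*b - 2*(a+d) + 10*c       */
        { -4, 54, 16, -2 },   /* 54*b + 16*c - (4*a + 2*d)   */
        { -6, 46, 28, -4 },   /* 46*b + 28*c - (6*a + 4*d)   */
        { -4, 36, 36, -4 },   /* 36*(b+c) - 4*(a+d)          */
        { -4, 28, 46, -6 },   /* 28*b + 46*c - (4*a + 6*d)   */
        { -2, 16, 54, -4 },   /* 16*b + 54*c - (2*a + 4*d)   */
        { -2, 10, 58, -2 },   /* 10*b + 58*c - 2*(a+d)       */
    };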
791
+.macro vpp_end
792
+    add             v17.8h, v17.8h, v31.8h
793
+    sqshrun         v17.8b, v17.8h, #6
794
+.endm
795
+
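vpp_end is the pp (pixel to pixel) finishing step: the callers below load v31 with 32, so the pair of instructions is a rounded 6-bit shift with unsigned saturation back to 8-bit pixels. In scalar terms (8-bit build assumed, continuing the sketches above):

    /* Equivalent of "add v17, v31" (v31 = 32) followed by "sqshrun #6". */
    static inline uint8_t finish_pp(int32_t sum)
    {
        int32_t v = (sum + 32) >> 6;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }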
796
+.macro FILTER_LUMA_VPP w, h, v
797
+    lsl             x10, x1, #2      // x10 = 4 * x1
798
+    sub             x11, x10, x1     // x11 = 3 * x1
799
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
800
+    mov             x5, #\h
801
+    mov             w12, #32
802
+    dup             v31.8h, w12
803
+    qpel_start_\v
804
+.loop_luma_vpp_\v\()_\w\()x\h:
805
+    mov             x7, x2
806
+    mov             x9, #0
807
+.loop_luma_vpp_w8_\v\()_\w\()x\h:
808
+    add             x6, x0, x9
809
+.if \w == 8 || \w == 24
810
+    qpel_load_32b \v
811
+    qpel_filter_\v\()_32b
812
+    vpp_end
813
+    str             d17, [x7], #8
814
+    add             x9, x9, #8
815
+.elseif \w == 12
816
+    qpel_load_32b \v
817
+    qpel_filter_\v\()_32b
818
+    vpp_end
819
+    str             d17, [x7], #8
820
+    add             x6, x0, #8
821
+    qpel_load_32b \v
822
+    qpel_filter_\v\()_32b
823
+    vpp_end
824
+    fmov            w6, s17
825
+    str             w6, [x7], #4
826
+    add             x9, x9, #12
827
+.else
828
+    qpel_load_64b \v
829
+    qpel_filter_\v\()_64b
830
+    vpp_end
831
+    add             v18.8h, v18.8h, v31.8h
832
+    sqshrun2        v17.16b, v18.8h, #6
833
+    str             q17, [x7], #16
834
+    add             x9, x9, #16
835
+.endif
836
+    cmp             x9, #\w
837
+    blt             .loop_luma_vpp_w8_\v\()_\w\()x\h
838
+    add             x0, x0, x1
839
+    add             x2, x2, x3
840
+    sub             x5, x5, #1
841
+    cbnz            x5, .loop_luma_vpp_\v\()_\w\()x\h
842
+    ret
843
+.endm
844
+
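FILTER_LUMA_VPP expands one function body per block size and fractional position: it rewinds src by three rows, then walks the block row by row, storing 8 pixels per inner step for the 8/24-wide cases, 8 + 4 for width 12, and 16 otherwise. A scalar model of the whole kernel, reusing luma_taps, luma_filter_sum and finish_pp from the sketches above (the name is illustrative; the real entry points follow the interp_vert_pp_c signature quoted later in this diff):

    typedef uint8_t pixel;

    static void interp_vert_pp_ref(const pixel *src, intptr_t srcStride,
                                   pixel *dst, intptr_t dstStride,
                                   int coeffIdx, int width, int height)
    {
        src -= 3 * srcStride;     /* sub x0, x0, x11 with x11 = 3 * srcStride */
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = finish_pp(luma_filter_sum(src + x, srcStride, coeffIdx));
            src += srcStride;
            dst += dstStride;
        }
    }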
845
+.macro vps_end
846
+    sub             v17.8h, v17.8h, v31.8h
847
+.endm
848
+
849
+.macro FILTER_VPS w, h, v
850
+    lsl             x3, x3, #1
851
+    lsl             x10, x1, #2      // x10 = 4 * x1
852
+    sub             x11, x10, x1     // x11 = 3 * x1
853
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
854
+    mov             x5, #\h
855
+    mov             w12, #8192
856
+    dup             v31.8h, w12
857
+    qpel_start_\v
858
+.loop_ps_\v\()_\w\()x\h:
859
+    mov             x7, x2
860
+    mov             x9, #0
861
+.loop_ps_w8_\v\()_\w\()x\h:
862
+    add             x6, x0, x9
863
+.if \w == 8 || \w == 24
864
+    qpel_load_32b \v
865
+    qpel_filter_\v\()_32b
866
+    vps_end
867
+    str             q17, [x7], #16
868
+    add             x9, x9, #8
869
+.elseif \w == 12
870
+    qpel_load_32b \v
871
+    qpel_filter_\v\()_32b
872
+    vps_end
873
+    str             q17, [x7], #16
874
+    add             x6, x0, #8
875
+    qpel_load_32b \v
876
+    qpel_filter_\v\()_32b
877
+    vps_end
878
+    str             d17, [x7], #8
879
+    add             x9, x9, #12
880
+.else
881
+    qpel_load_64b \v
882
+    qpel_filter_\v\()_64b
883
+    vps_end
884
+    sub             v18.8h, v18.8h, v31.8h
885
+    stp             q17, q18, [x7], #32
886
+    add             x9, x9, #16
887
+.endif
888
+    cmp             x9, #\w
889
+    blt             .loop_ps_w8_\v\()_\w\()x\h
890
+    add             x0, x0, x1
891
+    add             x2, x2, x3
892
+    sub             x5, x5, #1
893
+    cbnz            x5, .loop_ps_\v\()_\w\()x\h
894
+    ret
895
+.endm
896
+
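vps_end and FILTER_VPS form the ps (pixel to short) variant: the destination stride is doubled because the output is int16_t, and instead of rounding and clipping, v31 = 8192 is subtracted from the raw filter sum (for 8-bit builds this offset appears to correspond to x265's internal IF_INTERNAL_OFFS). Scalar form, continuing the sketches above:

    /* Equivalent of "sub v17, v31" with v31 = 8192. */
    static inline int16_t finish_ps(int32_t sum)
    {
        return (int16_t)(sum - 8192);
    }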
897
+.macro vsp_end
898
+    add             v17.4s, v17.4s, v31.4s
899
+    add             v18.4s, v18.4s, v31.4s
900
+    sqshrun         v17.4h, v17.4s, #12
901
+    sqshrun2        v17.8h, v18.4s, #12
902
+    sqxtun          v17.8b, v17.8h
903
+.endm
904
+
905
+.macro FILTER_VSP w, h, v
906
+    lsl             x1, x1, #1
907
+    lsl             x10, x1, #2      // x10 = 4 * x1
908
+    sub             x11, x10, x1     // x11 = 3 * x1
909
+    sub             x0, x0, x11
910
+    mov             x5, #\h
911
+    mov             w12, #1
912
+    lsl             w12, w12, #19
913
+    add             w12, w12, #2048
914
+    dup             v31.4s, w12
915
+    mov             x12, #\w
916
+    lsl             x12, x12, #1
917
+    qpel_start_\v\()_1
918
+.loop_luma_vsp_\v\()_\w\()x\h:
919
+    mov             x7, x2
920
+    mov             x9, #0
921
+.loop_luma_vsp_w8_\v\()_\w\()x\h:
922
+    add             x6, x0, x9
923
+    qpel_load_64b \v
924
+    qpel_filter_\v\()_32b_1
925
+    vsp_end
926
+    str             d17, [x7], #8
927
+    add             x9, x9, #16
928
+.if \w == 12
929
+    add             x6, x0, #16
930
+    qpel_load_64b \v
931
+    qpel_filter_\v\()_32b_1
932
+    vsp_end
933
+    str             s17, [x7], #4
934
+    add             x9, x9, #8
935
+.endif
936
+    cmp             x9, x12
937
+    blt             .loop_luma_vsp_w8_\v\()_\w\()x\h
938
+    add             x0, x0, x1
939
+    add             x2, x2, x3
940
+    sub             x5, x5, #1
941
+    cbnz            x5, .loop_luma_vsp_\v\()_\w\()x\h
942
+    ret
943
+.endm
944
+
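vsp_end and FILTER_VSP are the sp (short to pixel) variant: the input rows are int16_t (source stride doubled), accumulation is 32-bit, and the finishing step adds (1 << 19) + 2048 before a saturating 12-bit narrowing shift back to 8-bit pixels; the 1 << 19 term cancels the 8192 offset carried by the ps intermediates once it has been multiplied by the filter gain of 64, and the 2048 is the rounding term for the shift. For in-range sums:

    /* Equivalent of "add v31" (v31 = (1 << 19) + 2048), "sqshrun #12", "sqxtun". */
    static inline uint8_t finish_sp(int32_t sum)
    {
        int32_t v = (sum + (1 << 19) + 2048) >> 12;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }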
945
+.macro vss_end
946
+    sshr            v17.4s, v17.4s, #6
947
+    sshr            v18.4s, v18.4s, #6
948
+    uzp1            v17.8h, v17.8h, v18.8h
949
+.endm
950
+
951
+.macro FILTER_VSS w, h, v
952
+    lsl             x1, x1, #1
953
+    lsl             x10, x1, #2      // x10 = 4 * x1
954
+    sub             x11, x10, x1     // x11 = 3 * x1
955
+    sub             x0, x0, x11
956
+    lsl             x3, x3, #1
957
+    mov             x5, #\h
958
+    mov             x12, #\w
959
+    lsl             x12, x12, #1
960
+    qpel_start_\v\()_1
961
+.loop_luma_vss_\v\()_\w\()x\h:
962
+    mov             x7, x2
963
+    mov             x9, #0
964
+.loop_luma_vss_w8_\v\()_\w\()x\h:
965
+    add             x6, x0, x9
966
+    qpel_load_64b \v
967
+    qpel_filter_\v\()_32b_1
968
+    vss_end
969
+.if \w == 4
970
+    str             s17, [x7], #4
971
+    add             x9, x9, #4
972
+.else
973
+    str             q17, [x7], #16
974
+    add             x9, x9, #16
975
+.if \w == 12
976
+    add             x6, x0, x9
977
+    qpel_load_64b \v
978
+    qpel_filter_\v\()_32b_1
979
+    vss_end
980
+    str             d17, [x7], #8
981
+    add             x9, x9, #8
982
+.endif
983
+.endif
984
+    cmp             x9, x12
985
+    blt             .loop_luma_vss_w8_\v\()_\w\()x\h
986
+    add             x0, x0, x1
987
+    add             x2, x2, x3
988
+    sub             x5, x5, #1
989
+    cbnz            x5, .loop_luma_vss_\v\()_\w\()x\h
990
+    ret
991
+.endm
992
+
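vss_end and FILTER_VSS are the ss (short to short) variant: both strides are doubled and the finishing step is a plain arithmetic shift right by 6, with uzp1 repacking the two 32-bit halves into one vector of 16-bit results; there is no offset and no clipping:

    /* Equivalent of "sshr #6" followed by the uzp1 repack. */
    static inline int16_t finish_ss(int32_t sum)
    {
        return (int16_t)(sum >> 6);
    }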
993
+.macro hpp_end
994
+    add             v17.8h, v17.8h, v31.8h
995
+    sqshrun         v17.8b, v17.8h, #6
996
+.endm
997
+
998
+.macro FILTER_HPP w, h, v
999
+    mov             w6, #\h
1000
+    sub             x3, x3, #\w
1001
+    mov             w12, #32
1002
+    dup             v31.8h, w12
1003
+    qpel_start_\v
1004
+.if \w == 4
1005
+.rept \h
1006
+    mov             x11, x0
1007
+    sub             x11, x11, #4
1008
+    vextin8 \v
1009
+    qpel_filter_\v\()_32b
1010
+    hpp_end
1011
+    str             s17, [x2], #4
1012
+    add             x0, x0, x1
1013
+    add             x2, x2, x3
1014
+.endr
1015
+    ret
1016
+.else
1017
+.loop1_hpp_\v\()_\w\()x\h:
1018
+    mov             x7, #\w
1019
+    mov             x11, x0
1020
+    sub             x11, x11, #4
1021
+.loop2_hpp_\v\()_\w\()x\h:
1022
+    vextin8 \v
1023
+    qpel_filter_\v\()_32b
1024
+    hpp_end
1025
+    str             d17, [x2], #8
1026
+    sub             x11, x11, #8
1027
+    sub             x7, x7, #8
1028
+.if \w == 12
1029
+    vextin8 \v
1030
+    qpel_filter_\v\()_32b
1031
+    hpp_end
1032
+    str             s17, [x2], #4
1033
+    sub             x7, x7, #4
1034
+.endif
1035
+    cbnz            x7, .loop2_hpp_\v\()_\w\()x\h
1036
+    sub             x6, x6, #1
1037
+    add             x0, x0, x1
1038
+    add             x2, x2, x3
1039
+    cbnz            x6, .loop1_hpp_\v\()_\w\()x\h
1040
+    ret
1041
+.endif
1042
+.endm
1043
+
1044
+.macro hps_end
1045
+    sub             v17.8h, v17.8h, v31.8h
1046
+.endm
1047
+
1048
+.macro FILTER_HPS w, h, v
1049
+    sub             x3, x3, #\w
1050
+    lsl             x3, x3, #1
1051
+    mov             w12, #8192
1052
+    dup             v31.8h, w12
1053
+    qpel_start_\v
1054
+.if \w == 4
1055
+.loop_hps_\v\()_\w\()x\h\():
1056
+    mov             x11, x0
1057
+    sub             x11, x11, #4
1058
+    vextin8 \v
1059
+    qpel_filter_\v\()_32b
1060
+    hps_end
1061
+    str             d17, [x2], #8
1062
+    sub             w6, w6, #1
1063
+    add             x0, x0, x1
1064
+    add             x2, x2, x3
1065
+    cbnz            w6, .loop_hps_\v\()_\w\()x\h
1066
+    ret
1067
+.else
1068
+.loop1_hps_\v\()_\w\()x\h\():
1069
+    mov             w7, #\w
1070
+    mov             x11, x0
1071
+    sub             x11, x11, #4
1072
+.loop2_hps_\v\()_\w\()x\h\():
1073
+.if \w == 8 || \w == 12 || \w == 24
1074
+    vextin8 \v
1075
+    qpel_filter_\v\()_32b
1076
+    hps_end
1077
+    str             q17, [x2], #16
1078
+    sub             w7, w7, #8
1079
+    sub             x11, x11, #8
1080
+.if \w == 12
1081
+    vextin8 \v
1082
+    qpel_filter_\v\()_32b
1083
+    hps_end
1084
+    str             d17, [x2], #8
1085
+    sub             w7, w7, #4
1086
+.endif
1087
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
1088
+    vextin8_64 \v
1089
+    qpel_filter_\v\()_64b
1090
+    hps_end
1091
+    sub             v18.8h, v18.8h, v31.8h
1092
+    stp             q17, q18, [x2], #32
1093
+    sub             w7, w7, #16
1094
+    sub             x11, x11, #16
1095
+.endif
1096
+    cbnz            w7, .loop2_hps_\v\()_\w\()x\h
1097
+    sub             w6, w6, #1
1098
+    add             x0, x0, x1
1099
+    add             x2, x2, x3
1100
+    cbnz            w6, .loop1_hps_\v\()_\w\()x\h
1101
+    ret
1102
+.endif
1103
+.endm
1104
+
1105
+.macro FILTER_CHROMA_VPP w, h, v
1106
+    qpel_start_chroma_\v
1107
+    mov             w12, #32
1108
+    dup             v31.8h, w12
1109
+    sub             x0, x0, x1
1110
+    mov             x5, #\h
1111
+.loop_chroma_vpp_\v\()_\w\()x\h:
1112
+    mov             x7, x2
1113
+    mov             x9, #0
1114
+.loop_chroma_vpp_w8_\v\()_\w\()x\h:
1115
+    add             x6, x0, x9
1116
+    qpel_chroma_load_32b \v
1117
+    qpel_filter_chroma_\v\()_32b
1118
+    vpp_end
1119
+    add             x9, x9, #8
1120
+.if \w == 2
1121
+    fmov            w12, s17
1122
+    strh            w12, [x7], #2
1123
+.elseif \w == 4
1124
+    str             s17, [x7], #4
1125
+.elseif \w == 6
1126
+    str             s17, [x7], #4
1127
+    umov            w12, v17.h[2]
1128
+    strh            w12, [x7], #2
1129
+.elseif \w == 12
1130
+    str             d17, [x7], #8
1131
+    add             x6, x0, x9
1132
+    qpel_chroma_load_32b \v
1133
+    qpel_filter_chroma_\v\()_32b
1134
+    vpp_end
1135
+    str             s17, [x7], #4
1136
+    add             x9, x9, #8
1137
+.else
1138
+    str             d17, [x7], #8
1139
+.endif
1140
+    cmp             x9, #\w
1141
+    blt             .loop_chroma_vpp_w8_\v\()_\w\()x\h
1142
+    add             x0, x0, x1
1143
+    add             x2, x2, x3
1144
+    sub             x5, x5, #1
1145
+    cbnz            x5, .loop_chroma_vpp_\v\()_\w\()x\h
1146
+    ret
1147
+.endm
1148
+
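Because the chroma block widths include 2, 6 and 12, FILTER_CHROMA_VPP cannot always store whole 8-byte lanes: 2 pixels go out through strh, 4 through str s17, 6 as 4 + 2 (str s17 plus an strh of lane 2), and 12 as 8 + 4. A purely illustrative C sketch of that tail handling:

    #include <string.h>

    /* Store one row of 'width' result bytes in 8/4/2-byte steps. */
    static void store_partial_row(uint8_t *dst, const uint8_t *out, int width)
    {
        int x = 0;
        for (; x + 8 <= width; x += 8) memcpy(dst + x, out + x, 8);   /* str d17 */
        if (width - x >= 4) { memcpy(dst + x, out + x, 4); x += 4; }  /* str s17 */
        if (width - x >= 2) { memcpy(dst + x, out + x, 2); }          /* strh    */
    }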
1149
+.macro FILTER_CHROMA_VPS w, h, v
1150
+    qpel_start_chroma_\v
1151
+    mov             w12, #8192
1152
+    dup             v31.8h, w12
1153
+    lsl             x3, x3, #1
1154
+    sub             x0, x0, x1
1155
+    mov             x5, #\h
1156
+.loop_vps_\v\()_\w\()x\h:
1157
+    mov             x7, x2
1158
+    mov             x9, #0
1159
+.loop_vps_w8_\v\()_\w\()x\h:
1160
+    add             x6, x0, x9
1161
+    qpel_chroma_load_32b \v
1162
+    qpel_filter_chroma_\v\()_32b
1163
+    vps_end
1164
+    add             x9, x9, #8
1165
+.if \w == 2
1166
+    str             s17, [x7], #4
1167
+.elseif \w == 4
1168
+    str             d17, [x7], #8
1169
+.elseif \w == 6
1170
+    str             d17, [x7], #8
1171
+    st1             {v17.s}[2], [x7], #4
1172
+.elseif \w == 12
1173
+    str             q17, [x7], #16
1174
+    add             x6, x0, x9
1175
+    qpel_chroma_load_32b \v
1176
+    qpel_filter_chroma_\v\()_32b
1177
+    vps_end
1178
+    str             d17, [x7], #8
1179
+    add             x9, x9, #8
1180
+.else
1181
+    str             q17, [x7], #16
1182
+.endif
1183
+    cmp             x9, #\w
1184
+    blt             .loop_vps_w8_\v\()_\w\()x\h
1185
+
1186
+    add             x0, x0, x1
1187
+    add             x2, x2, x3
1188
+    sub             x5, x5, #1
1189
+    cbnz            x5, .loop_vps_\v\()_\w\()x\h
1190
+    ret
1191
+.endm
1192
+
1193
+.macro FILTER_CHROMA_VSP w, h, v
1194
+    lsl             x1, x1, #1
1195
+    sub             x0, x0, x1
1196
+    mov             x5, #\h
1197
+    mov             w12, #1
1198
+    lsl             w12, w12, #19
1199
+    add             w12, w12, #2048
1200
+    dup             v31.4s, w12
1201
+    mov             x12, #\w
1202
+    lsl             x12, x12, #1
1203
+    qpel_start_chroma_\v\()_1
1204
+.loop_vsp_\v\()_\w\()x\h:
1205
+    mov             x7, x2
1206
+    mov             x9, #0
1207
+.loop_vsp_w8_\v\()_\w\()x\h:
1208
+    add             x6, x0, x9
1209
+    qpel_chroma_load_64b \v
1210
+    qpel_filter_chroma_\v\()_32b_1
1211
+    vsp_end
1212
+    add             x9, x9, #16
1213
+.if \w == 4
1214
+    str             s17, [x7], #4
1215
+.elseif \w == 12
1216
+    str             d17, [x7], #8
1217
+    add             x6, x0, x9
1218
+    qpel_chroma_load_64b \v
1219
+    qpel_filter_chroma_\v\()_32b_1
1220
+    vsp_end
1221
+    str             s17, [x7], #4
1222
+    add             x9, x9, #8
1223
+.else
1224
+    str             d17, [x7], #8
1225
+.endif
1226
+    cmp             x9, x12
1227
+    blt             .loop_vsp_w8_\v\()_\w\()x\h
1228
+    add             x0, x0, x1
1229
+    add             x2, x2, x3
1230
+    sub             x5, x5, #1
1231
+    cbnz            x5, .loop_vsp_\v\()_\w\()x\h
1232
+    ret
1233
+.endm
1234
+
1235
+.macro FILTER_CHROMA_VSS w, h, v
1236
+    lsl             x1, x1, #1
1237
+    sub             x0, x0, x1
1238
+    lsl             x3, x3, #1
1239
+    mov             x5, #\h
1240
+    mov             x12, #\w
1241
+    lsl             x12, x12, #1
1242
+    qpel_start_chroma_\v\()_1
1243
+.loop_vss_\v\()_\w\()x\h:
1244
+    mov             x7, x2
1245
+    mov             x9, #0
1246
+.if \w == 4
1247
+.rept 2
1248
+    add             x6, x0, x9
1249
+    qpel_chroma_load_64b \v
1250
+    qpel_filter_chroma_\v\()_32b_1
1251
+    vss_end
1252
+    str             s17, [x7], #4
1253
+    add             x9, x9, #4
1254
+.endr
1255
+.else
1256
+.loop_vss_w8_\v\()_\w\()x\h:
1257
+    add             x6, x0, x9
1258
+    qpel_chroma_load_64b \v
1259
+    qpel_filter_chroma_\v\()_32b_1
1260
+    vss_end
1261
+    str             q17, [x7], #16
1262
+    add             x9, x9, #16
1263
+.if \w == 12
1264
+    add             x6, x0, x9
1265
+    qpel_chroma_load_64b \v
1266
+    qpel_filter_chroma_\v\()_32b_1
1267
+    vss_end
1268
+    str             d17, [x7], #8
1269
+    add             x9, x9, #8
1270
+.endif
1271
+    cmp             x9, x12
1272
+    blt             .loop_vss_w8_\v\()_\w\()x\h
1273
+.endif
1274
+    add             x0, x0, x1
1275
+    add             x2, x2, x3
1276
+    sub             x5, x5, #1
1277
+    cbnz            x5, .loop_vss_\v\()_\w\()x\h
1278
+    ret
1279
+.endm
1280
+
1281
+.macro FILTER_CHROMA_HPP w, h, v
1282
+    qpel_start_chroma_\v
1283
+    mov             w12, #32
1284
+    dup             v31.8h, w12
1285
+    mov             w6, #\h
1286
+    sub             x3, x3, #\w
1287
+.if \w == 2 || \w == 4 || \w == 6 || \w == 12
1288
+.loop4_chroma_hpp_\v\()_\w\()x\h:
1289
+    mov             x11, x0
1290
+    sub             x11, x11, #2
1291
+    vextin8_chroma \v
1292
+    qpel_filter_chroma_\v\()_32b
1293
+    hpp_end
1294
+.if \w == 2
1295
+    fmov            w12, s17
1296
+    strh            w12, [x2], #2
1297
+.elseif \w == 4
1298
+    str             s17, [x2], #4
1299
+.elseif \w == 6
1300
+    str             s17, [x2], #4
1301
+    umov            w12, v17.h[2]
1302
+    strh            w12, [x2], #2
1303
+.elseif \w == 12
1304
+    str             d17, [x2], #8
1305
+    sub             x11, x11, #8
1306
+    vextin8_chroma \v
1307
+    qpel_filter_chroma_\v\()_32b
1308
+    hpp_end
1309
+    str             s17, [x2], #4
1310
+.endif
1311
+    sub             w6, w6, #1
1312
+    add             x0, x0, x1
1313
+    add             x2, x2, x3
1314
+    cbnz            w6, .loop4_chroma_hpp_\v\()_\w\()x\h
1315
+    ret
1316
+.else
1317
+.loop2_chroma_hpp_\v\()_\w\()x\h:
1318
+    mov             x7, #\w
1319
+    lsr             x7, x7, #3
1320
+    mov             x11, x0
1321
+    sub             x11, x11, #2
1322
+.loop3_chroma_hpp_\v\()_\w\()x\h:
1323
+.if \w == 8 || \w == 24
1324
+    vextin8_chroma \v
1325
+    qpel_filter_chroma_\v\()_32b
1326
+    hpp_end
1327
+    str             d17, [x2], #8
1328
+    sub             x7, x7, #1
1329
+    sub             x11, x11, #8
1330
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
1331
+    vextin8_chroma_64 \v
1332
+    qpel_filter_chroma_\v\()_64b
1333
+    hpp_end
1334
+    add             v18.8h, v18.8h, v31.8h
1335
+    sqshrun2        v17.16b, v18.8h, #6
1336
+    str             q17, [x2], #16
1337
+    sub             x7, x7, #2
1338
+    sub             x11, x11, #16
1339
+.endif
1340
+    cbnz            x7, .loop3_chroma_hpp_\v\()_\w\()x\h
1341
+    sub             w6, w6, #1
1342
+    add             x0, x0, x1
1343
+    add             x2, x2, x3
1344
+    cbnz            w6, .loop2_chroma_hpp_\v\()_\w\()x\h
1345
+    ret
1346
+.endif
1347
+.endm
1348
+
1349
+.macro CHROMA_HPS_2_4_6_12 w, v
1350
+    mov             x11, x0
1351
+    sub             x11, x11, #2
1352
+    vextin8_chroma \v
1353
+    qpel_filter_chroma_\v\()_32b
1354
+    hps_end
1355
+    sub             x11, x11, #8
1356
+.if \w == 2
1357
+    str             s17, [x2], #4
1358
+.elseif \w == 4
1359
+    str             d17, [x2], #8
1360
+.elseif \w == 6
1361
+    str             d17, [x2], #8
1362
+    st1             {v17.s}[2], [x2], #4
1363
+.elseif \w == 12
1364
+    str             q17, [x2], #16
1365
+    vextin8_chroma \v
1366
+    qpel_filter_chroma_\v\()_32b
1367
+    sub             v17.8h, v17.8h, v31.8h
1368
+    str             d17, [x2], #8
1369
+.endif
1370
+    add             x0, x0, x1
1371
+    add             x2, x2, x3
1372
+.endm
1373
+
1374
+.macro FILTER_CHROMA_HPS w, h, v
1375
+    qpel_start_chroma_\v
1376
+    mov             w12, #8192
1377
+    dup             v31.8h, w12
1378
+    sub             x3, x3, #\w
1379
+    lsl             x3, x3, #1
1380
+
1381
+.if \w == 2 || \w == 4 || \w == 6 || \w == 12
1382
+    cmp             x5, #0
1383
+    beq             0f
1384
+    sub             x0, x0, x1
1385
+.rept 3
1386
+    CHROMA_HPS_2_4_6_12 \w, \v
1387
+.endr
1388
+0:
1389
+.rept \h
1390
+    CHROMA_HPS_2_4_6_12 \w, \v
1391
+.endr
1392
+    ret
1393
+.else
1394
+    mov             w10, #\h
1395
+    cmp             x5, #0
1396
+    beq             9f
1397
+    sub             x0, x0, x1
1398
+    add             w10, w10, #3
1399
+9:
1400
+    mov             w6, w10
1401
+.loop1_chroma_hps_\v\()_\w\()x\h\():
1402
+    mov             x7, #\w
1403
+    lsr             x7, x7, #3
1404
+    mov             x11, x0
1405
+    sub             x11, x11, #2
1406
+.loop2_chroma_hps_\v\()_\w\()x\h\():
1407
+.if \w == 8 || \w == 24
1408
+    vextin8_chroma \v
1409
+    qpel_filter_chroma_\v\()_32b
1410
+    hps_end
1411
+    str             q17, [x2], #16
1412
+    sub             x7, x7, #1
1413
+    sub             x11, x11, #8
1414
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
1415
+    vextin8_chroma_64 \v
1416
+    qpel_filter_chroma_\v\()_64b
1417
+    hps_end
1418
+    sub             v18.8h, v18.8h, v31.8h
1419
+    stp             q17, q18, [x2], #32
1420
+    sub             x7, x7, #2
1421
+    sub             x11, x11, #16
1422
+.endif
1423
+    cbnz            x7, .loop2_chroma_hps_\v\()_\w\()x\h\()
1424
+    sub             w6, w6, #1
1425
+    add             x0, x0, x1
1426
+    add             x2, x2, x3
1427
+    cbnz            w6, .loop1_chroma_hps_\v\()_\w\()x\h\()
1428
+    ret
1429
+.endif
1430
+.endm
1431
+
1432
+const g_lumaFilter, align=8
1433
+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0
1434
+.word -1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0
1435
+.word -1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1
1436
+.word 0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
1437
+endconst
1438
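g_lumaFilter holds the same four tap rows as the filter comments earlier in this file, but with every coefficient stored twice (16 words, 64 bytes per row). That layout lets the SVE2 4xN kernels in the next file index a row as coeffIdx << 6 and broadcast one {c, c} pair per tap with a single ld1rd. The equivalent C data, for reference:

    /* Same contents as g_lumaFilter: row = coeffIdx, each tap duplicated. */
    static const int32_t g_lumaFilter_ref[4][16] = {
        {  0,  0, 0, 0,   0,   0, 64, 64,  0,  0,   0,   0, 0, 0,  0,  0 },
        { -1, -1, 4, 4, -10, -10, 58, 58, 17, 17,  -5,  -5, 1, 1,  0,  0 },
        { -1, -1, 4, 4, -11, -11, 40, 40, 40, 40, -11, -11, 4, 4, -1, -1 },
        {  0,  0, 1, 1,  -5,  -5, 17, 17, 58, 58, -10, -10, 4, 4, -1, -1 },
    };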
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S Added
1284
 
1
@@ -0,0 +1,1282 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm-sve.S"
40
+#include "ipfilter-common.S"
41
+
42
+.arch armv8-a+sve2
43
+
44
+#ifdef __APPLE__
45
+.section __RODATA,__rodata
46
+#else
47
+.section .rodata
48
+#endif
49
+
50
+.align 4
51
+
52
+.text
53
+
54
+.macro qpel_load_32b_sve2 v
55
+.if \v == 0
56
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
57
+    ld1b            {z3.h}, p0/z, [x6]
58
+    add             x6, x6, x1
59
+.elseif \v == 1 || \v == 2 || \v == 3
60
+.if \v != 3                           // not used in qpel_filter_3
61
+    ld1b            {z0.h}, p0/z, [x6]
62
+    add             x6, x6, x1
63
+.else
64
+    add             x6, x6, x1
65
+.endif
66
+    ld1b            {z1.h}, p0/z, [x6]
67
+    add             x6, x6, x1
68
+    ld1b            {z2.h}, p0/z, [x6]
69
+    add             x6, x6, x1
70
+    ld1b            {z3.h}, p0/z, [x6]
71
+    add             x6, x6, x1
72
+    ld1b            {z4.h}, p0/z, [x6]
73
+    add             x6, x6, x1
74
+    ld1b            {z5.h}, p0/z, [x6]
75
+    add             x6, x6, x1
76
+.if \v != 1                           // not used in qpel_filter_1
77
+    ld1b            {z6.h}, p0/z, [x6]
78
+    add             x6, x6, x1
79
+    ld1b            {z7.h}, p0/z, [x6]
80
+.else
81
+    ld1b            {z6.h}, p0/z, [x6]
82
+.endif
83
+.endif
84
+.endm
85
+
86
+.macro qpel_load_64b_sve2_gt_16 v
87
+.if \v == 0
88
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
89
+    ld1b            {z3.h}, p2/z, [x6]
90
+    add             x6, x6, x1
91
+.elseif \v == 1 || \v == 2 || \v == 3
92
+.if \v != 3                           // not used in qpel_filter_3
93
+    ld1b            {z0.h}, p2/z, [x6]
94
+    add             x6, x6, x1
95
+.else
96
+    add             x6, x6, x1
97
+.endif
98
+    ld1b            {z1.h}, p2/z, [x6]
99
+    add             x6, x6, x1
100
+    ld1b            {z2.h}, p2/z, [x6]
101
+    add             x6, x6, x1
102
+    ld1b            {z3.h}, p2/z, [x6]
103
+    add             x6, x6, x1
104
+    ld1b            {z4.h}, p2/z, [x6]
105
+    add             x6, x6, x1
106
+    ld1b            {z5.h}, p2/z, [x6]
107
+    add             x6, x6, x1
108
+.if \v != 1                           // not used in qpel_filter_1
109
+    ld1b            {z6.h}, p2/z, [x6]
110
+    add             x6, x6, x1
111
+    ld1b            {z7.h}, p2/z, [x6]
112
+.else
113
+    ld1b            {z6.h}, p2/z, [x6]
114
+.endif
115
+.endif
116
+.endm
117
+
118
+.macro qpel_chroma_load_32b_sve2 v
119
+.if \v == 0
120
+    // qpel_filter_chroma_0 only uses values in v1
121
+    add             x6, x6, x1
122
+    ld1b            {z1.h}, p0/z, [x6]
123
+.else
124
+    ld1b            {z0.h}, p0/z, [x6]
125
+    add             x6, x6, x1
126
+    ld1b            {z1.h}, p0/z, [x6]
127
+    add             x6, x6, x1
128
+    ld1b            {z2.h}, p0/z, [x6]
129
+    add             x6, x6, x1
130
+    ld1b            {z3.h}, p0/z, [x6]
131
+.endif
132
+.endm
133
+
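The SVE2 load macros above use ld1b with .h-sized destination elements, so every source byte is zero-extended to a halfword as it is loaded; the filter macros can then work with plain 16-bit mul/mla instead of the NEON umull/umull2 widening pairs. The predicates chosen by the callers (p0 = vl8, p2 = vl16) decide how many lanes each load fills. A scalar model of one such load (names are illustrative, continuing the earlier sketches):

    /* Model of "ld1b {z0.h}, p0/z, [x6]" with an n-lane predicate. */
    static void ld1b_widen_to_h(uint16_t lanes[], const uint8_t *src, int n)
    {
        for (int i = 0; i < n; i++)
            lanes[i] = src[i];   /* zero-extend byte -> halfword on load */
    }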
134
+.macro qpel_start_sve2_0
135
+    mov             z24.h, #64
136
+.endm
137
+
138
+.macro qpel_filter_sve2_0_32b
139
+    mul             z17.h, z3.h, z24.h    // 64*d
140
+.endm
141
+
142
+.macro qpel_filter_sve2_0_64b
143
+    qpel_filter_sve2_0_32b
144
+    mul             z18.h, z11.h, z24.h
145
+.endm
146
+
147
+.macro qpel_start_sve2_1
148
+    mov             z24.h, #58
149
+    mov             z25.h, #10
150
+    mov             z26.h, #17
151
+    mov             z27.h, #5
152
+.endm
153
+
154
+.macro qpel_filter_sve2_1_32b
155
+    mul             z19.h, z2.h, z25.h  // c*10
156
+    mul             z17.h, z3.h, z24.h  // d*58
157
+    mul             z21.h, z4.h, z26.h  // e*17
158
+    mul             z23.h, z5.h, z27.h  // f*5
159
+    sub             z17.h, z17.h, z19.h // d*58 - c*10
160
+    lsl             z18.h, z1.h, #2      // b*4
161
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
+    sub             z21.h, z6.h, z0.h   // g - a
163
+    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
+    sub             z21.h, z21.h, z23.h // g - a - f*5
165
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
+.endm
167
+
168
+.macro qpel_filter_sve2_1_64b
169
+    qpel_filter_sve2_1_32b
170
+    mul             z20.h, z10.h, z25.h  // c*10
171
+    mul             z18.h, z11.h, z24.h  // d*58
172
+    mul             z21.h, z12.h, z26.h  // e*17
173
+    mul             z23.h, z13.h, z27.h  // f*5
174
+    sub             z18.h, z18.h, z20.h   // d*58 - c*10
175
+    lsl             z28.h, z30.h, #2       // b*4
176
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
177
+    sub             z21.h, z14.h, z29.h   // g - a
178
+    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
179
+    sub             z21.h, z21.h, z23.h   // g - a - f*5
180
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
+.endm
182
+
183
+.macro qpel_start_sve2_2
184
+    mov             z24.h, #11
185
+    mov             z25.h, #40
186
+.endm
187
+
188
+.macro qpel_filter_sve2_2_32b
189
+    add             z17.h, z3.h, z4.h     // d + e
190
+    add             z19.h, z2.h, z5.h     // c + f
191
+    add             z23.h, z1.h, z6.h     // b + g
192
+    add             z21.h, z0.h, z7.h     // a + h
193
+    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
194
+    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
195
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
196
+    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
197
+    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
198
+    sub             z17.h, z17.h, z19.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
+.endm
200
+
201
+.macro qpel_filter_sve2_2_64b
202
+    qpel_filter_sve2_2_32b
203
+    add             z27.h, z11.h, z12.h   // d + e
204
+    add             z16.h, z10.h, z13.h   // c + f
205
+    add             z23.h, z30.h, z14.h   // b + g
206
+    add             z21.h, z29.h, z15.h   // a + h
207
+    mul             z27.h, z27.h, z25.h   // 40 * (d + e)
208
+    mul             z16.h, z16.h, z24.h   // 11 * (c + f)
209
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
210
+    add             z16.h, z16.h, z21.h   // 11 * (c + f) + a + h
211
+    add             z27.h, z27.h, z23.h   // 40 * (d + e) + (b + g) * 4
212
+    sub             z18.h, z27.h, z16.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
213
+.endm
214
+
215
+.macro qpel_start_sve2_3
216
+    mov             z24.h, #17
217
+    mov             z25.h, #5
218
+    mov             z26.h, #58
219
+    mov             z27.h, #10
220
+.endm
221
+
222
+.macro qpel_filter_sve2_3_32b
223
+    mul             z19.h, z2.h, z25.h    // c * 5
224
+    mul             z17.h, z3.h, z24.h    // d * 17
225
+    mul             z21.h, z4.h, z26.h    // e * 58
226
+    mul             z23.h, z5.h, z27.h    // f * 10
227
+    sub             z17.h, z17.h, z19.h   // d * 17 - c * 5
228
+    lsl             z19.h, z6.h, #2        // g * 4
229
+    add             z17.h, z17.h, z21.h   // d * 17 - c * 5 + e * 58
230
+    sub             z21.h, z1.h, z7.h     // b - h
231
+    add             z17.h, z17.h, z19.h   // d * 17 - c * 5 + e * 58 + g * 4
232
+    sub             z21.h, z21.h, z23.h   // b - h - f * 10
233
+    add             z17.h, z17.h, z21.h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
234
+.endm
235
+
236
+.macro qpel_filter_sve2_3_64b
237
+    qpel_filter_sve2_3_32b
238
+    mul             z16.h, z10.h, z25.h  // c * 5
239
+    mul             z18.h, z11.h, z24.h  // d * 17
240
+    mul             z21.h, z12.h, z26.h  // e * 58
241
+    mul             z23.h, z13.h, z27.h  // f * 10
242
+    sub             z18.h, z18.h, z16.h   // d * 17 - c * 5
243
+    lsl             z16.h, z14.h, #2       // g * 4
244
+    add             z18.h, z18.h, z21.h   // d * 17 - c * 5 + e * 58
245
+    sub             z21.h, z30.h, z15.h   // b - h
246
+    add             z18.h, z18.h, z16.h   // d * 17 - c * 5 + e * 58 + g * 4
247
+    sub             z21.h, z21.h, z23.h   // b - h - f * 10
248
+    add             z18.h, z18.h, z21.h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
249
+.endm
250
+
251
+.macro qpel_start_chroma_sve2_0
252
+    mov             z29.h, #64
253
+.endm
254
+
255
+.macro qpel_filter_chroma_sve2_0_32b
256
+    mul             z17.h, z1.h, z29.h    // 64*b
257
+.endm
258
+
259
+.macro qpel_start_chroma_sve2_1
260
+    mov             z29.h, #58
261
+    mov             z30.h, #10
262
+.endm
263
+
264
+.macro qpel_filter_chroma_sve2_1_32b
265
+    mul             z17.h, z1.h, z29.h    // 58 * b
266
+    mul             z19.h, z2.h, z30.h    // 10 * c
267
+    add             z22.h, z0.h, z3.h     // a + d
268
+    lsl             z22.h, z22.h, #1       // 2 * (a+d)
269
+    sub             z17.h, z17.h, z22.h   // 58*b - 2*(a+d)
270
+    add             z17.h, z17.h, z19.h   // 58*b-2*(a+d) + 10*c
271
+.endm
272
+
273
+.macro qpel_start_chroma_sve2_2
274
+    mov             z30.h, #54
275
+.endm
276
+
277
+.macro qpel_filter_chroma_sve2_2_32b
278
+    mul             z17.h, z1.h, z30.h    // 54 * b
279
+    lsl             z19.h, z0.h, #2        // 4 * a
280
+    lsl             z21.h, z2.h, #4        // 16 * c
281
+    lsl             z23.h, z3.h, #1        // 2 * d
282
+    add             z17.h, z17.h, z21.h   // 54*b + 16*c
283
+    add             z19.h, z19.h, z23.h   // 4*a + 2*d
284
+    sub             z17.h, z17.h, z19.h   // 54*b+16*c - (4*a+2*d)
285
+.endm
286
+
287
+.macro qpel_start_chroma_sve2_3
288
+    mov             z28.h, #46
289
+    mov             z29.h, #28
290
+    mov             z30.h, #6
291
+.endm
292
+
293
+.macro qpel_filter_chroma_sve2_3_32b
294
+    mul             z17.h, z1.h, z28.h    // 46 * b
295
+    mul             z19.h, z2.h, z29.h    // 28 * c
296
+    lsl             z21.h, z3.h, #2        // 4 * d
297
+    mul             z23.h, z0.h, z30.h    // 6 * a
298
+    add             z17.h, z17.h, z19.h   // 46*b + 28*c
299
+    add             z21.h, z21.h, z23.h   // 4*d + 6*a
300
+    sub             z17.h, z17.h, z21.h   // 46*b+28*c - (4*d+6*a)
301
+.endm
302
+
303
+.macro qpel_start_chroma_sve2_4
304
+    mov             z29.h, #36
305
+.endm
306
+
307
+.macro qpel_filter_chroma_sve2_4_32b
308
+    add             z20.h, z0.h, z3.h     // a + d
309
+    add             z17.h, z1.h, z2.h     // b + c
310
+    lsl             z20.h, z20.h, #2       // 4 * (a+d)
311
+    mul             z17.h, z17.h, z29.h   // 36 * (b+c)
312
+    sub             z17.h, z17.h, z20.h   // 36*(b+c) - 4*(a+d)
313
+.endm
314
+
315
+.macro qpel_start_chroma_sve2_5
316
+    mov             z28.h, #28
317
+    mov             z29.h, #46
318
+    mov             z30.h, #6
319
+.endm
320
+
321
+.macro qpel_filter_chroma_sve2_5_32b
322
+    mul             z17.h, z1.h, z28.h    // 28 * b
323
+    mul             z19.h, z2.h, z29.h    // 46 * c
324
+    lsl             z21.h, z0.h, #2        // 4 * a
325
+    mul             z23.h, z3.h, z30.h    // 6 * d
326
+    add             z17.h, z17.h, z19.h   // 28*b + 46*c
327
+    add             z21.h, z21.h, z23.h   // 4*a + 6*d
328
+    sub             z17.h, z17.h, z21.h   // 28*b+46*c - (4*a+6*d)
329
+.endm
330
+
331
+.macro qpel_start_chroma_sve2_6
332
+    mov             z30.h, #54
333
+.endm
334
+
335
+.macro qpel_filter_chroma_sve2_6_32b
336
+    mul             z17.h, z2.h, z30.h    // 54 * c
337
+    lsl             z19.h, z0.h, #1        // 2 * a
338
+    lsl             z21.h, z1.h, #4        // 16 * b
339
+    lsl             z23.h, z3.h, #2        // 4 * d
340
+    add             z17.h, z17.h, z21.h   // 54*c + 16*b
341
+    add             z19.h, z19.h, z23.h   // 2*a + 4*d
342
+    sub             z17.h, z17.h, z19.h   // 54*c+16*b - (2*a+4*d)
343
+.endm
344
+
345
+.macro qpel_start_chroma_sve2_7
346
+    mov             z29.h, #58
347
+    mov             z30.h, #10
348
+.endm
349
+
350
+.macro qpel_filter_chroma_sve2_7_32b
351
+    add             z20.h, z0.h, z3.h     // a + d
352
+    mul             z17.h, z2.h, z29.h    // 58 * c
353
+    lsl             z20.h, z20.h, #1       // 2 * (a+d)
354
+    mul             z19.h, z1.h, z30.h    // 10 * b
355
+    sub             z17.h, z17.h, z20.h   // 58*c - 2*(a+d)
356
+    add             z17.h, z17.h, z19.h   // 58*c-2*(a+d) + 10*b
357
+.endm
358
+
359
+.macro vpp_end_sve2
360
+    add             z17.h, z17.h, z31.h
361
+    sqshrun         v17.8b, v17.8h, #6
362
+.endm
363
+
364
+.macro FILTER_LUMA_VPP_SVE2 w, h, v
365
+    lsl             x10, x1, #2      // x10 = 4 * x1
366
+    sub             x11, x10, x1     // x11 = 3 * x1
367
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
368
+    mov             x5, #\h
369
+    mov             z31.h, #32
370
+    rdvl            x9, #1
371
+    cmp             x9, #16
372
+    bgt             .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h
373
+    qpel_start_\v
374
+.loop_luma_vpp_sve2_\v\()_\w\()x\h:
375
+    mov             x7, x2
376
+    mov             x9, #0
377
+.loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
378
+    add             x6, x0, x9
379
+.if \w == 8 || \w == 24
380
+    qpel_load_32b \v
381
+    qpel_filter_\v\()_32b
382
+    vpp_end
383
+    str             d17, [x7], #8
384
+    add             x9, x9, #8
385
+.elseif \w == 12
386
+    qpel_load_32b \v
387
+    qpel_filter_\v\()_32b
388
+    vpp_end
389
+    str             d17, [x7], #8
390
+    add             x6, x0, #8
391
+    qpel_load_32b \v
392
+    qpel_filter_\v\()_32b
393
+    vpp_end
394
+    fmov            w6, s17
395
+    str             w6, [x7], #4
396
+    add             x9, x9, #12
397
+.else
398
+    qpel_load_64b \v
399
+    qpel_filter_\v\()_64b
400
+    vpp_end
401
+    add             v18.8h, v18.8h, v31.8h
402
+    sqshrun2        v17.16b, v18.8h, #6
403
+    str             q17, [x7], #16
404
+    add             x9, x9, #16
405
+.endif
406
+    cmp             x9, #\w
407
+    blt             .loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
408
+    add             x0, x0, x1
409
+    add             x2, x2, x3
410
+    sub             x5, x5, #1
411
+    cbnz            x5, .loop_luma_vpp_sve2_\v\()_\w\()x\h
412
+    ret
413
+.vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h:
414
+    ptrue           p0.h, vl8
415
+    ptrue           p2.h, vl16
416
+    qpel_start_sve2_\v
417
+.gt_16_loop_luma_vpp_sve2_\v\()_\w\()x\h:
418
+    mov             x7, x2
419
+    mov             x9, #0
420
+.gt_16_loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
421
+    add             x6, x0, x9
422
+.if \w == 8 || \w == 24
423
+    qpel_load_32b_sve2 \v
424
+    qpel_filter_sve2_\v\()_32b
425
+    vpp_end_sve2
426
+    str             d17, [x7], #8
427
+    add             x9, x9, #8
428
+.elseif \w == 12
429
+    qpel_load_32b_sve2 \v
430
+    qpel_filter_sve2_\v\()_32b
431
+    vpp_end_sve2
432
+    str             d17, [x7], #8
433
+    add             x6, x0, #8
434
+    qpel_load_32b_sve2 \v
435
+    qpel_filter_sve2_\v\()_32b
436
+    vpp_end_sve2
437
+    fmov            w6, s17
438
+    str             w6, [x7], #4
439
+    add             x9, x9, #12
440
+.else
441
+    qpel_load_64b_sve2_gt_16 \v
442
+    qpel_filter_sve2_\v\()_32b
443
+    vpp_end_sve2
444
+    add             z18.h, z18.h, z31.h
445
+    sqshrun2        v17.16b, v18.8h, #6
446
+    str             q17, [x7], #16
447
+    add             x9, x9, #16
448
+.endif
449
+    cmp             x9, #\w
450
+    blt             .gt_16_loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
451
+    add             x0, x0, x1
452
+    add             x2, x2, x3
453
+    sub             x5, x5, #1
454
+    cbnz            x5, .gt_16_loop_luma_vpp_sve2_\v\()_\w\()x\h
455
+    ret
456
+.endm
457
+
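FILTER_LUMA_VPP_SVE2 picks its code path at run time: rdvl x9, #1 returns the SVE vector length in bytes, and only when it is greater than 16 does the code branch to the .vl_gt_16 body that uses the predicated SVE2 loads; on 128-bit implementations it falls through to the original NEON macros. A sketch of that dispatch in C (the function names are stand-ins, not x265 API):

    #include <stdint.h>

    typedef void (*vpp_fn)(const uint8_t *src, intptr_t srcStride,
                           uint8_t *dst, intptr_t dstStride, int coeffIdx);

    /* vl_bytes plays the role of "rdvl x9, #1". */
    static vpp_fn select_vert_pp(int vl_bytes, vpp_fn neon_impl, vpp_fn sve2_impl)
    {
        return (vl_bytes > 16) ? sve2_impl : neon_impl;
    }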
458
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
459
+.macro LUMA_VPP_SVE2 w, h
460
+function x265_interp_8tap_vert_pp_\w\()x\h\()_sve2
461
+    cmp             x4, #0
462
+    b.eq            0f
463
+    cmp             x4, #1
464
+    b.eq            1f
465
+    cmp             x4, #2
466
+    b.eq            2f
467
+    cmp             x4, #3
468
+    b.eq            3f
469
+0:
470
+    FILTER_LUMA_VPP_SVE2 \w, \h, 0
471
+1:
472
+    FILTER_LUMA_VPP_SVE2 \w, \h, 1
473
+2:
474
+    FILTER_LUMA_VPP_SVE2 \w, \h, 2
475
+3:
476
+    FILTER_LUMA_VPP_SVE2 \w, \h, 3
477
+endfunc
478
+.endm
479
+
480
+LUMA_VPP_SVE2 8, 4
481
+LUMA_VPP_SVE2 8, 8
482
+LUMA_VPP_SVE2 8, 16
483
+LUMA_VPP_SVE2 8, 32
484
+LUMA_VPP_SVE2 12, 16
485
+LUMA_VPP_SVE2 16, 4
486
+LUMA_VPP_SVE2 16, 8
487
+LUMA_VPP_SVE2 16, 16
488
+LUMA_VPP_SVE2 16, 32
489
+LUMA_VPP_SVE2 16, 64
490
+LUMA_VPP_SVE2 16, 12
491
+LUMA_VPP_SVE2 24, 32
492
+LUMA_VPP_SVE2 32, 8
493
+LUMA_VPP_SVE2 32, 16
494
+LUMA_VPP_SVE2 32, 32
495
+LUMA_VPP_SVE2 32, 64
496
+LUMA_VPP_SVE2 32, 24
497
+LUMA_VPP_SVE2 48, 64
498
+LUMA_VPP_SVE2 64, 16
499
+LUMA_VPP_SVE2 64, 32
500
+LUMA_VPP_SVE2 64, 64
501
+LUMA_VPP_SVE2 64, 48
502
+
503
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
504
+.macro LUMA_VPS_4xN_SVE2 h
505
+function x265_interp_8tap_vert_ps_4x\h\()_sve2
506
+    lsl             x3, x3, #1
507
+    lsl             x5, x4, #6
508
+    lsl             x4, x1, #2
509
+    sub             x4, x4, x1
510
+    sub             x0, x0, x4
511
+
512
+    mov             z28.s, #8192
513
+    mov             x4, #\h
514
+    movrel          x12, g_lumaFilter
515
+    add             x12, x12, x5
516
+    ptrue           p0.s, vl4
517
+    ld1rd           {z16.d}, p0/z, [x12]
518
+    ld1rd           {z17.d}, p0/z, [x12, #8]
519
+    ld1rd           {z18.d}, p0/z, [x12, #16]
520
+    ld1rd           {z19.d}, p0/z, [x12, #24]
521
+    ld1rd           {z20.d}, p0/z, [x12, #32]
522
+    ld1rd           {z21.d}, p0/z, [x12, #40]
523
+    ld1rd           {z22.d}, p0/z, [x12, #48]
524
+    ld1rd           {z23.d}, p0/z, [x12, #56]
525
+
526
+.loop_vps_sve2_4x\h:
527
+    mov             x6, x0
528
+
529
+    ld1b            {z0.s}, p0/z, [x6]
530
+    add             x6, x6, x1
531
+    ld1b            {z1.s}, p0/z, [x6]
532
+    add             x6, x6, x1
533
+    ld1b            {z2.s}, p0/z, [x6]
534
+    add             x6, x6, x1
535
+    ld1b            {z3.s}, p0/z, [x6]
536
+    add             x6, x6, x1
537
+    ld1b            {z4.s}, p0/z, [x6]
538
+    add             x6, x6, x1
539
+    ld1b            {z5.s}, p0/z, [x6]
540
+    add             x6, x6, x1
541
+    ld1b            {z6.s}, p0/z, [x6]
542
+    add             x6, x6, x1
543
+    ld1b            {z7.s}, p0/z, [x6]
544
+    add             x6, x6, x1
545
+
546
+    mul             z0.s, z0.s, z16.s
547
+    mla             z0.s, p0/m, z1.s, z17.s
548
+    mla             z0.s, p0/m, z2.s, z18.s
549
+    mla             z0.s, p0/m, z3.s, z19.s
550
+    mla             z0.s, p0/m, z4.s, z20.s
551
+    mla             z0.s, p0/m, z5.s, z21.s
552
+    mla             z0.s, p0/m, z6.s, z22.s
553
+    mla             z0.s, p0/m, z7.s, z23.s
554
+
555
+    sub             z0.s, z0.s, z28.s
556
+    sqxtn           v0.4h, v0.4s
557
+    st1             {v0.8b}, [x2], x3
558
+
559
+    add             x0, x0, x1
560
+    sub             x4, x4, #1
561
+    cbnz            x4, .loop_vps_sve2_4x\h
562
+    ret
563
+endfunc
564
+.endm
565
+
566
+LUMA_VPS_4xN_SVE2 4
567
+LUMA_VPS_4xN_SVE2 8
568
+LUMA_VPS_4xN_SVE2 16
569
+
570
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
571
+.macro LUMA_VSP_4xN_SVE2 h
572
+function x265_interp_8tap_vert_sp_4x\h\()_sve2
573
+    lsl             x5, x4, #6
574
+    lsl             x1, x1, #1
575
+    lsl             x4, x1, #2
576
+    sub             x4, x4, x1
577
+    sub             x0, x0, x4
578
+
579
+    mov             w12, #1
580
+    lsl             w12, w12, #19
581
+    add             w12, w12, #2048
582
+    dup             v24.4s, w12
583
+    mov             x4, #\h
584
+    movrel          x12, g_lumaFilter
585
+    add             x12, x12, x5
586
+
587
+    ptrue           p0.s, vl4
588
+    ld1rd           {z16.d}, p0/z, [x12]
589
+    ld1rd           {z17.d}, p0/z, [x12, #8]
590
+    ld1rd           {z18.d}, p0/z, [x12, #16]
591
+    ld1rd           {z19.d}, p0/z, [x12, #24]
592
+    ld1rd           {z20.d}, p0/z, [x12, #32]
593
+    ld1rd           {z21.d}, p0/z, [x12, #40]
594
+    ld1rd           {z22.d}, p0/z, [x12, #48]
595
+    ld1rd           {z23.d}, p0/z, [x12, #56]
596
+
597
+.loop_vsp_sve2_4x\h:
598
+    mov             x6, x0
599
+
600
+    ld1             {v0.8b}, [x6], x1
601
+    ld1             {v1.8b}, [x6], x1
602
+    ld1             {v2.8b}, [x6], x1
603
+    ld1             {v3.8b}, [x6], x1
604
+    ld1             {v4.8b}, [x6], x1
605
+    ld1             {v5.8b}, [x6], x1
606
+    ld1             {v6.8b}, [x6], x1
607
+    ld1             {v7.8b}, [x6], x1
608
+
609
+    sunpklo         z0.s, z0.h
610
+    sunpklo         z1.s, z1.h
611
+    mul             z0.s, z0.s, z16.s
612
+    sunpklo         z2.s, z2.h
613
+    mla             z0.s, p0/m, z1.s, z17.s
614
+    sunpklo         z3.s, z3.h
615
+    mla             z0.s, p0/m, z2.s, z18.s
616
+    sunpklo         z4.s, z4.h
617
+    mla             z0.s, p0/m, z3.s, z19.s
618
+    sunpklo         z5.s, z5.h
619
+    mla             z0.s, p0/m, z4.s, z20.s
620
+    sunpklo         z6.s, z6.h
621
+    mla             z0.s, p0/m, z5.s, z21.s
622
+    sunpklo         z7.s, z7.h
623
+    mla             z0.s, p0/m, z6.s, z22.s
624
+
625
+    mla             z0.s, p0/m, z7.s, z23.s
626
+
627
+    add             z0.s, z0.s, z24.s
628
+    sqshrun         v0.4h, v0.4s, #12
629
+    sqxtun          v0.8b, v0.8h
630
+    st1             {v0.s}[0], [x2], x3
631
+
632
+    add             x0, x0, x1
633
+    sub             x4, x4, #1
634
+    cbnz            x4, .loop_vsp_sve2_4x\h
635
+    ret
636
+endfunc
637
+.endm
638
+
639
+LUMA_VSP_4xN_SVE2 4
640
+LUMA_VSP_4xN_SVE2 8
641
+LUMA_VSP_4xN_SVE2 16
642
+
643
+.macro vps_end_sve2
644
+    sub             z17.h, z17.h, z31.h
645
+.endm
646
+
647
+.macro FILTER_VPS_SVE2 w, h, v
648
+    lsl             x3, x3, #1
649
+    lsl             x10, x1, #2      // x10 = 4 * x1
650
+    sub             x11, x10, x1     // x11 = 3 * x1
651
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
652
+    mov             x5, #\h
653
+    mov             z31.h, #8192
654
+    rdvl            x14, #1
655
+    cmp             x14, #16
656
+    bgt             .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h
657
+    qpel_start_\v
658
+.loop_ps_sve2_\v\()_\w\()x\h:
659
+    mov             x7, x2
660
+    mov             x9, #0
661
+.loop_ps_w8_sve2_\v\()_\w\()x\h:
662
+    add             x6, x0, x9
663
+.if \w == 8 || \w == 24
664
+    qpel_load_32b \v
665
+    qpel_filter_\v\()_32b
666
+    vps_end
667
+    str             q17, [x7], #16
668
+    add             x9, x9, #8
669
+.elseif \w == 12
670
+    qpel_load_32b \v
671
+    qpel_filter_\v\()_32b
672
+    vps_end
673
+    str             q17, [x7], #16
674
+    add             x6, x0, #8
675
+    qpel_load_32b \v
676
+    qpel_filter_\v\()_32b
677
+    vps_end
678
+    str             d17, [x7], #8
679
+    add             x9, x9, #12
680
+.else
681
+    qpel_load_64b \v
682
+    qpel_filter_\v\()_64b
683
+    vps_end
684
+    sub             v18.8h, v18.8h, v31.8h
685
+    stp             q17, q18, [x7], #32
686
+    add             x9, x9, #16
687
+.endif
688
+    cmp             x9, #\w
689
+    blt             .loop_ps_w8_sve2_\v\()_\w\()x\h
690
+    add             x0, x0, x1
691
+    add             x2, x2, x3
692
+    sub             x5, x5, #1
693
+    cbnz            x5, .loop_ps_sve2_\v\()_\w\()x\h
694
+    ret
695
+.vl_gt_16_FILTER_VPS_\v\()_\w\()x\h:
696
+    ptrue           p0.h, vl8
697
+    ptrue           p2.h, vl16
698
+    qpel_start_sve2_\v
699
+.gt_16_loop_ps_sve2_\v\()_\w\()x\h:
700
+    mov             x7, x2
701
+    mov             x9, #0
702
+.gt_16_loop_ps_w8_sve2_\v\()_\w\()x\h:
703
+    add             x6, x0, x9
704
+.if \w == 8 || \w == 24
705
+    qpel_load_32b_sve2 \v
706
+    qpel_filter_sve2_\v\()_32b
707
+    vps_end_sve2
708
+    str             q17, [x7], #16
709
+    add             x9, x9, #8
710
+.elseif \w == 12
711
+    qpel_load_32b_sve2 \v
712
+    qpel_filter_sve2_\v\()_32b
713
+    vps_end_sve2
714
+    str             q17, [x7], #16
715
+    add             x6, x0, #8
716
+    qpel_load_32b_sve2 \v
717
+    qpel_filter_sve2_\v\()_32b
718
+    vps_end_sve2
719
+    str             d17, [x7], #8
720
+    add             x9, x9, #12
721
+.else
722
+    qpel_load_64b_sve2_gt_16 \v
723
+    qpel_filter_sve2_\v\()_32b
724
+    vps_end_sve2
725
+    sub             z18.h, z18.h, z31.h
726
+    stp             q17, q18, [x7], #32
727
+    add             x9, x9, #16
728
+.endif
729
+    cmp             x9, #\w
730
+    blt             .gt_16_loop_ps_w8_sve2_\v\()_\w\()x\h
731
+    add             x0, x0, x1
732
+    add             x2, x2, x3
733
+    sub             x5, x5, #1
734
+    cbnz            x5, .gt_16_loop_ps_sve2_\v\()_\w\()x\h
735
+    ret
736
+.endm
737
+
738
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
739
+.macro LUMA_VPS_SVE2 w, h
740
+function x265_interp_8tap_vert_ps_\w\()x\h\()_sve2
741
+    cmp             x4, #0
742
+    beq             0f
743
+    cmp             x4, #1
744
+    beq             1f
745
+    cmp             x4, #2
746
+    beq             2f
747
+    cmp             x4, #3
748
+    beq             3f
749
+0:
750
+    FILTER_VPS_SVE2 \w, \h, 0
751
+1:
752
+    FILTER_VPS_SVE2 \w, \h, 1
753
+2:
754
+    FILTER_VPS_SVE2 \w, \h, 2
755
+3:
756
+    FILTER_VPS_SVE2 \w, \h, 3
757
+endfunc
758
+.endm
759
+
760
+LUMA_VPS_SVE2 8, 4
761
+LUMA_VPS_SVE2 8, 8
762
+LUMA_VPS_SVE2 8, 16
763
+LUMA_VPS_SVE2 8, 32
764
+LUMA_VPS_SVE2 12, 16
765
+LUMA_VPS_SVE2 16, 4
766
+LUMA_VPS_SVE2 16, 8
767
+LUMA_VPS_SVE2 16, 16
768
+LUMA_VPS_SVE2 16, 32
769
+LUMA_VPS_SVE2 16, 64
770
+LUMA_VPS_SVE2 16, 12
771
+LUMA_VPS_SVE2 24, 32
772
+LUMA_VPS_SVE2 32, 8
773
+LUMA_VPS_SVE2 32, 16
774
+LUMA_VPS_SVE2 32, 32
775
+LUMA_VPS_SVE2 32, 64
776
+LUMA_VPS_SVE2 32, 24
777
+LUMA_VPS_SVE2 48, 64
778
+LUMA_VPS_SVE2 64, 16
779
+LUMA_VPS_SVE2 64, 32
780
+LUMA_VPS_SVE2 64, 64
781
+LUMA_VPS_SVE2 64, 48
782
+
783
+// ***** luma_vss *****
784
+.macro vss_end_sve2
785
+    asr             z17.s, z17.s, #6
786
+    asr             z18.s, z18.s, #6
787
+    uzp1            v17.8h, v17.8h, v18.8h
788
+.endm
789
+
790
+.macro FILTER_VSS_SVE2 w, h, v
791
+    lsl             x1, x1, #1
792
+    lsl             x10, x1, #2      // x10 = 4 * x1
793
+    sub             x11, x10, x1     // x11 = 3 * x1
794
+    sub             x0, x0, x11
795
+    lsl             x3, x3, #1
796
+    mov             x5, #\h
797
+    mov             x12, #\w
798
+    lsl             x12, x12, #1
799
+    qpel_start_\v\()_1
800
+.loop_luma_vss_sve2_\v\()_\w\()x\h:
801
+    mov             x7, x2
802
+    mov             x9, #0
803
+.loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
804
+    add             x6, x0, x9
805
+    qpel_load_64b \v
806
+    qpel_filter_\v\()_32b_1
807
+    vss_end_sve2
808
+.if \w == 4
809
+    str             s17, [x7], #4
810
+    add             x9, x9, #4
811
+.else
812
+    str             q17, [x7], #16
813
+    add             x9, x9, #16
814
+.if \w == 12
815
+    add             x6, x0, x9
816
+    qpel_load_64b \v
817
+    qpel_filter_\v\()_32b_1
818
+    vss_end_sve2
819
+    str             d17, [x7], #8
820
+    add             x9, x9, #8
821
+.endif
822
+.endif
823
+    cmp             x9, x12
824
+    blt             .loop_luma_vss_w8_sve2_\v\()_\w\()x\h
825
+    add             x0, x0, x1
826
+    add             x2, x2, x3
827
+    sub             x5, x5, #1
828
+    cbnz            x5, .loop_luma_vss_sve2_\v\()_\w\()x\h
829
+    ret
830
+.endm
831
+
832
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
833
+.macro LUMA_VSS_SVE2 w, h
834
+function x265_interp_8tap_vert_ss_\w\()x\h\()_sve2
835
+    cmp             x4, #0
836
+    beq             0f
837
+    cmp             x4, #1
838
+    beq             1f
839
+    cmp             x4, #2
840
+    beq             2f
841
+    cmp             x4, #3
842
+    beq             3f
843
+0:
844
+    FILTER_VSS_SVE2 \w, \h, 0
845
+1:
846
+    FILTER_VSS_SVE2 \w, \h, 1
847
+2:
848
+    FILTER_VSS_SVE2 \w, \h, 2
849
+3:
850
+    FILTER_VSS_SVE2 \w, \h, 3
851
+endfunc
852
+.endm
853
+
854
+LUMA_VSS_SVE2 4, 4
855
+LUMA_VSS_SVE2 4, 8
856
+LUMA_VSS_SVE2 4, 16
857
+LUMA_VSS_SVE2 8, 4
858
+LUMA_VSS_SVE2 8, 8
859
+LUMA_VSS_SVE2 8, 16
860
+LUMA_VSS_SVE2 8, 32
861
+LUMA_VSS_SVE2 12, 16
862
+LUMA_VSS_SVE2 16, 4
863
+LUMA_VSS_SVE2 16, 8
864
+LUMA_VSS_SVE2 16, 16
865
+LUMA_VSS_SVE2 16, 32
866
+LUMA_VSS_SVE2 16, 64
867
+LUMA_VSS_SVE2 16, 12
868
+LUMA_VSS_SVE2 32, 8
869
+LUMA_VSS_SVE2 32, 16
870
+LUMA_VSS_SVE2 32, 32
871
+LUMA_VSS_SVE2 32, 64
872
+LUMA_VSS_SVE2 32, 24
873
+LUMA_VSS_SVE2 64, 16
874
+LUMA_VSS_SVE2 64, 32
875
+LUMA_VSS_SVE2 64, 64
876
+LUMA_VSS_SVE2 64, 48
877
+LUMA_VSS_SVE2 24, 32
878
+LUMA_VSS_SVE2 48, 64
879
+
880
+// ***** luma_hps *****
881
+
882
+.macro FILTER_CHROMA_VPP_SVE2 w, h, v
883
+    ptrue           p0.h, vl8
884
+    qpel_start_chroma_sve2_\v
885
+    mov             z31.h, #32
886
+    sub             x0, x0, x1
887
+    mov             x5, #\h
888
+.loop_chroma_vpp_sve2_\v\()_\w\()x\h:
889
+    mov             x7, x2
890
+    mov             x9, #0
891
+.loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
892
+    add             x6, x0, x9
893
+    qpel_chroma_load_32b_sve2 \v
894
+    qpel_filter_chroma_sve2_\v\()_32b
895
+    vpp_end_sve2
896
+    add             x9, x9, #8
897
+.if \w == 2
898
+    fmov            w12, s17
899
+    strh            w12, [x7], #2
900
+.elseif \w == 4
901
+    str             s17, [x7], #4
902
+.elseif \w == 6
903
+    str             s17, [x7], #4
904
+    umov            w12, v17.h[2]
905
+    strh            w12, [x7], #2
906
+.elseif \w == 12
907
+    str             d17, [x7], #8
908
+    add             x6, x0, x9
909
+    qpel_chroma_load_32b_sve2 \v
910
+    qpel_filter_chroma_sve2_\v\()_32b
911
+    vpp_end_sve2
912
+    str             s17, [x7], #4
913
+    add             x9, x9, #8
914
+.else
915
+    str             d17, [x7], #8
916
+.endif
917
+    cmp             x9, #\w
918
+    blt             .loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
919
+    add             x0, x0, x1
920
+    add             x2, x2, x3
921
+    sub             x5, x5, #1
922
+    cbnz            x5, .loop_chroma_vpp_sve2_\v\()_\w\()x\h
923
+    ret
924
+.endm
925
+
926
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
927
+.macro CHROMA_VPP_SVE2 w, h
928
+function x265_interp_4tap_vert_pp_\w\()x\h\()_sve2
929
+    cmp             x4, #0
930
+    beq             0f
931
+    cmp             x4, #1
932
+    beq             1f
933
+    cmp             x4, #2
934
+    beq             2f
935
+    cmp             x4, #3
936
+    beq             3f
937
+    cmp             x4, #4
938
+    beq             4f
939
+    cmp             x4, #5
940
+    beq             5f
941
+    cmp             x4, #6
942
+    beq             6f
943
+    cmp             x4, #7
944
+    beq             7f
945
+0:
946
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 0
947
+1:
948
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 1
949
+2:
950
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 2
951
+3:
952
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 3
953
+4:
954
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 4
955
+5:
956
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 5
957
+6:
958
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 6
959
+7:
960
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 7
961
+endfunc
962
+.endm
963
+
964
+CHROMA_VPP_SVE2 2, 4
965
+CHROMA_VPP_SVE2 2, 8
966
+CHROMA_VPP_SVE2 2, 16
967
+CHROMA_VPP_SVE2 4, 2
968
+CHROMA_VPP_SVE2 4, 4
969
+CHROMA_VPP_SVE2 4, 8
970
+CHROMA_VPP_SVE2 4, 16
971
+CHROMA_VPP_SVE2 4, 32
972
+CHROMA_VPP_SVE2 6, 8
973
+CHROMA_VPP_SVE2 6, 16
974
+CHROMA_VPP_SVE2 8, 2
975
+CHROMA_VPP_SVE2 8, 4
976
+CHROMA_VPP_SVE2 8, 6
977
+CHROMA_VPP_SVE2 8, 8
978
+CHROMA_VPP_SVE2 8, 16
979
+CHROMA_VPP_SVE2 8, 32
980
+CHROMA_VPP_SVE2 8, 12
981
+CHROMA_VPP_SVE2 8, 64
982
+CHROMA_VPP_SVE2 12, 16
983
+CHROMA_VPP_SVE2 12, 32
984
+CHROMA_VPP_SVE2 16, 4
985
+CHROMA_VPP_SVE2 16, 8
986
+CHROMA_VPP_SVE2 16, 12
987
+CHROMA_VPP_SVE2 16, 16
988
+CHROMA_VPP_SVE2 16, 32
989
+CHROMA_VPP_SVE2 16, 64
990
+CHROMA_VPP_SVE2 16, 24
991
+CHROMA_VPP_SVE2 32, 8
992
+CHROMA_VPP_SVE2 32, 16
993
+CHROMA_VPP_SVE2 32, 24
994
+CHROMA_VPP_SVE2 32, 32
995
+CHROMA_VPP_SVE2 32, 64
996
+CHROMA_VPP_SVE2 32, 48
997
+CHROMA_VPP_SVE2 24, 32
998
+CHROMA_VPP_SVE2 24, 64
999
+CHROMA_VPP_SVE2 64, 16
1000
+CHROMA_VPP_SVE2 64, 32
1001
+CHROMA_VPP_SVE2 64, 48
1002
+CHROMA_VPP_SVE2 64, 64
1003
+CHROMA_VPP_SVE2 48, 64
1004
+
1005
+.macro FILTER_CHROMA_VPS_SVE2 w, h, v
1006
+    ptrue           p0.h, vl8
1007
+    qpel_start_chroma_sve2_\v
1008
+    mov             z31.h, #8192
1009
+    lsl             x3, x3, #1
1010
+    sub             x0, x0, x1
1011
+    mov             x5, #\h
1012
+.loop_vps_sve2_\v\()_\w\()x\h:
1013
+    mov             x7, x2
1014
+    mov             x9, #0
1015
+.loop_vps_w8_sve2_\v\()_\w\()x\h:
1016
+    add             x6, x0, x9
1017
+    qpel_chroma_load_32b_sve2 \v
1018
+    qpel_filter_chroma_sve2_\v\()_32b
1019
+    vps_end_sve2
1020
+    add             x9, x9, #8
1021
+.if \w == 2
1022
+    str             s17, [x7], #4
1023
+.elseif \w == 4
1024
+    str             d17, [x7], #8
1025
+.elseif \w == 6
1026
+    str             d17, [x7], #8
1027
+    st1             {v17.s}[2], [x7], #4
1028
+.elseif \w == 12
1029
+    str             q17, [x7], #16
1030
+    add             x6, x0, x9
1031
+    qpel_chroma_load_32b_sve2 \v
1032
+    qpel_filter_chroma_sve2_\v\()_32b
1033
+    vps_end_sve2
1034
+    str             d17, [x7], #8
1035
+    add             x9, x9, #8
1036
+.else
1037
+    str             q17, [x7], #16
1038
+.endif
1039
+    cmp             x9, #\w
1040
+    blt             .loop_vps_w8_sve2_\v\()_\w\()x\h
1041
+
1042
+    add             x0, x0, x1
1043
+    add             x2, x2, x3
1044
+    sub             x5, x5, #1
1045
+    cbnz            x5, .loop_vps_sve2_\v\()_\w\()x\h
1046
+    ret
1047
+.endm
1048
+
1049
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1050
+.macro CHROMA_VPS_SVE2 w, h
1051
+function x265_interp_4tap_vert_ps_\w\()x\h\()_sve2
1052
+    cmp             x4, #0
1053
+    beq             0f
1054
+    cmp             x4, #1
1055
+    beq             1f
1056
+    cmp             x4, #2
1057
+    beq             2f
1058
+    cmp             x4, #3
1059
+    beq             3f
1060
+    cmp             x4, #4
1061
+    beq             4f
1062
+    cmp             x4, #5
1063
+    beq             5f
1064
+    cmp             x4, #6
1065
+    beq             6f
1066
+    cmp             x4, #7
1067
+    beq             7f
1068
+0:
1069
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 0
1070
+1:
1071
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 1
1072
+2:
1073
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 2
1074
+3:
1075
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 3
1076
+4:
1077
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 4
1078
+5:
1079
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 5
1080
+6:
1081
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 6
1082
+7:
1083
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 7
1084
+endfunc
1085
+.endm
1086
+
1087
+CHROMA_VPS_SVE2 2, 4
1088
+CHROMA_VPS_SVE2 2, 8
1089
+CHROMA_VPS_SVE2 2, 16
1090
+CHROMA_VPS_SVE2 4, 2
1091
+CHROMA_VPS_SVE2 4, 4
1092
+CHROMA_VPS_SVE2 4, 8
1093
+CHROMA_VPS_SVE2 4, 16
1094
+CHROMA_VPS_SVE2 4, 32
1095
+CHROMA_VPS_SVE2 6, 8
1096
+CHROMA_VPS_SVE2 6, 16
1097
+CHROMA_VPS_SVE2 8, 2
1098
+CHROMA_VPS_SVE2 8, 4
1099
+CHROMA_VPS_SVE2 8, 6
1100
+CHROMA_VPS_SVE2 8, 8
1101
+CHROMA_VPS_SVE2 8, 16
1102
+CHROMA_VPS_SVE2 8, 32
1103
+CHROMA_VPS_SVE2 8, 12
1104
+CHROMA_VPS_SVE2 8, 64
1105
+CHROMA_VPS_SVE2 12, 16
1106
+CHROMA_VPS_SVE2 12, 32
1107
+CHROMA_VPS_SVE2 16, 4
1108
+CHROMA_VPS_SVE2 16, 8
1109
+CHROMA_VPS_SVE2 16, 12
1110
+CHROMA_VPS_SVE2 16, 16
1111
+CHROMA_VPS_SVE2 16, 32
1112
+CHROMA_VPS_SVE2 16, 64
1113
+CHROMA_VPS_SVE2 16, 24
1114
+CHROMA_VPS_SVE2 32, 8
1115
+CHROMA_VPS_SVE2 32, 16
1116
+CHROMA_VPS_SVE2 32, 24
1117
+CHROMA_VPS_SVE2 32, 32
1118
+CHROMA_VPS_SVE2 32, 64
1119
+CHROMA_VPS_SVE2 32, 48
1120
+CHROMA_VPS_SVE2 24, 32
1121
+CHROMA_VPS_SVE2 24, 64
1122
+CHROMA_VPS_SVE2 64, 16
1123
+CHROMA_VPS_SVE2 64, 32
1124
+CHROMA_VPS_SVE2 64, 48
1125
+CHROMA_VPS_SVE2 64, 64
1126
+CHROMA_VPS_SVE2 48, 64
1127
+
1128
+.macro qpel_start_chroma_sve2_0_1
1129
+    mov             z24.h, #64
1130
+.endm
1131
+
1132
+.macro qpel_start_chroma_sve2_1_1
1133
+    mov             z24.h, #58
1134
+    mov             z25.h, #10
1135
+.endm
1136
+
1137
+.macro qpel_start_chroma_sve2_2_1
1138
+    mov             z25.h, #54
1139
+.endm
1140
+
1141
+.macro qpel_start_chroma_sve2_3_1
1142
+    mov             z25.h, #46
1143
+    mov             z26.h, #28
1144
+    mov             z27.h, #6
1145
+.endm
1146
+
1147
+.macro qpel_start_chroma_sve2_4_1
1148
+    mov             z24.h, #36
1149
+.endm
1150
+
1151
+.macro qpel_start_chroma_sve2_5_1
1152
+    mov             z25.h, #28
1153
+    mov             z26.h, #46
1154
+    mov             z27.h, #6
1155
+.endm
1156
+
1157
+.macro qpel_start_chroma_sve2_6_1
1158
+    mov             z25.h, #54
1159
+.endm
1160
+
1161
+.macro qpel_start_chroma_sve2_7_1
1162
+    mov             z24.h, #58
1163
+    mov             z25.h, #10
1164
+.endm
1165
+
1166
+.macro FILTER_CHROMA_VSS_SVE2 w, h, v
1167
+    lsl             x1, x1, #1
1168
+    sub             x0, x0, x1
1169
+    lsl             x3, x3, #1
1170
+    mov             x5, #\h
1171
+    mov             x12, #\w
1172
+    lsl             x12, x12, #1
1173
+    qpel_start_chroma_sve2_\v\()_1
1174
+.loop_vss_sve2_\v\()_\w\()x\h:
1175
+    mov             x7, x2
1176
+    mov             x9, #0
1177
+.if \w == 4
1178
+.rept 2
1179
+    add             x6, x0, x9
1180
+    qpel_chroma_load_64b \v
1181
+    qpel_filter_chroma_\v\()_32b_1
1182
+    vss_end_sve2
1183
+    str             s17, [x7], #4
1184
+    add             x9, x9, #4
1185
+.endr
1186
+.else
1187
+.loop_vss_w8_sve2_\v\()_\w\()x\h:
1188
+    add             x6, x0, x9
1189
+    qpel_chroma_load_64b \v
1190
+    qpel_filter_chroma_\v\()_32b_1
1191
+    vss_end_sve2
1192
+    str             q17, [x7], #16
1193
+    add             x9, x9, #16
1194
+.if \w == 12
1195
+    add             x6, x0, x9
1196
+    qpel_chroma_load_64b \v
1197
+    qpel_filter_chroma_\v\()_32b_1
1198
+    vss_end_sve2
1199
+    str             d17, [x7], #8
1200
+    add             x9, x9, #8
1201
+.endif
1202
+    cmp             x9, x12
1203
+    blt             .loop_vss_w8_sve2_\v\()_\w\()x\h
1204
+.endif
1205
+    add             x0, x0, x1
1206
+    add             x2, x2, x3
1207
+    sub             x5, x5, #1
1208
+    cbnz            x5, .loop_vss_sve2_\v\()_\w\()x\h
1209
+    ret
1210
+.endm
1211
+
1212
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1213
+.macro CHROMA_VSS_SVE2 w, h
1214
+function x265_interp_4tap_vert_ss_\w\()x\h\()_sve2
1215
+    cmp             x4, #0
1216
+    beq             0f
1217
+    cmp             x4, #1
1218
+    beq             1f
1219
+    cmp             x4, #2
1220
+    beq             2f
1221
+    cmp             x4, #3
1222
+    beq             3f
1223
+    cmp             x4, #4
1224
+    beq             4f
1225
+    cmp             x4, #5
1226
+    beq             5f
1227
+    cmp             x4, #6
1228
+    beq             6f
1229
+    cmp             x4, #7
1230
+    beq             7f
1231
+0:
1232
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 0
1233
+1:
1234
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 1
1235
+2:
1236
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 2
1237
+3:
1238
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 3
1239
+4:
1240
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 4
1241
+5:
1242
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 5
1243
+6:
1244
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 6
1245
+7:
1246
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 7
1247
+endfunc
1248
+.endm
1249
+
1250
+CHROMA_VSS_SVE2 4, 4
1251
+CHROMA_VSS_SVE2 4, 8
1252
+CHROMA_VSS_SVE2 4, 16
1253
+CHROMA_VSS_SVE2 4, 32
1254
+CHROMA_VSS_SVE2 8, 2
1255
+CHROMA_VSS_SVE2 8, 4
1256
+CHROMA_VSS_SVE2 8, 6
1257
+CHROMA_VSS_SVE2 8, 8
1258
+CHROMA_VSS_SVE2 8, 16
1259
+CHROMA_VSS_SVE2 8, 32
1260
+CHROMA_VSS_SVE2 8, 12
1261
+CHROMA_VSS_SVE2 8, 64
1262
+CHROMA_VSS_SVE2 12, 16
1263
+CHROMA_VSS_SVE2 12, 32
1264
+CHROMA_VSS_SVE2 16, 4
1265
+CHROMA_VSS_SVE2 16, 8
1266
+CHROMA_VSS_SVE2 16, 12
1267
+CHROMA_VSS_SVE2 16, 16
1268
+CHROMA_VSS_SVE2 16, 32
1269
+CHROMA_VSS_SVE2 16, 64
1270
+CHROMA_VSS_SVE2 16, 24
1271
+CHROMA_VSS_SVE2 32, 8
1272
+CHROMA_VSS_SVE2 32, 16
1273
+CHROMA_VSS_SVE2 32, 24
1274
+CHROMA_VSS_SVE2 32, 32
1275
+CHROMA_VSS_SVE2 32, 64
1276
+CHROMA_VSS_SVE2 32, 48
1277
+CHROMA_VSS_SVE2 24, 32
1278
+CHROMA_VSS_SVE2 24, 64
1279
+CHROMA_VSS_SVE2 64, 16
1280
+CHROMA_VSS_SVE2 64, 32
1281
+CHROMA_VSS_SVE2 64, 48
1282
+CHROMA_VSS_SVE2 64, 64
1283
+CHROMA_VSS_SVE2 48, 64
1284
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S Added
1056
 
1
@@ -0,0 +1,1054 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm.S"
40
+#include "ipfilter-common.S"
41
+
42
+#ifdef __APPLE__
43
+.section __RODATA,__rodata
44
+#else
45
+.section .rodata
46
+#endif
47
+
48
+.align 4
49
+
50
+.text
51
+
52
+// ***** luma_vpp *****
53
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
+.macro LUMA_VPP_4xN h
55
+function x265_interp_8tap_vert_pp_4x\h\()_neon
56
+    movrel          x10, g_luma_s16
57
+    sub             x0, x0, x1
58
+    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
59
+    lsl             x4, x4, #4
60
+    ldr             q0, [x10, x4]            // q0 = luma interpolate coeff
61
+    dup             v24.8h, v0.h[0]
62
+    dup             v25.8h, v0.h[1]
63
+    trn1            v24.2d, v24.2d, v25.2d
64
+    dup             v26.8h, v0.h[2]
65
+    dup             v27.8h, v0.h[3]
66
+    trn1            v26.2d, v26.2d, v27.2d
67
+    dup             v28.8h, v0.h[4]
68
+    dup             v29.8h, v0.h[5]
69
+    trn1            v28.2d, v28.2d, v29.2d
70
+    dup             v30.8h, v0.h[6]
71
+    dup             v31.8h, v0.h[7]
72
+    trn1            v30.2d, v30.2d, v31.2d
73
+
74
+    // prepare to load 8 lines
75
+    ld1             {v0.s}[0], [x0], x1
76
+    ld1             {v0.s}[1], [x0], x1
77
+    ushll           v0.8h, v0.8b, #0
78
+    ld1             {v1.s}[0], [x0], x1
79
+    ld1             {v1.s}[1], [x0], x1
80
+    ushll           v1.8h, v1.8b, #0
81
+    ld1             {v2.s}[0], [x0], x1
82
+    ld1             {v2.s}[1], [x0], x1
83
+    ushll           v2.8h, v2.8b, #0
84
+    ld1             {v3.s}[0], [x0], x1
85
+    ld1             {v3.s}[1], [x0], x1
86
+    ushll           v3.8h, v3.8b, #0
87
+
88
+    mov             x9, #\h
89
+.loop_4x\h:
90
+    ld1             {v4.s}[0], [x0], x1
91
+    ld1             {v4.s}[1], [x0], x1
92
+    ushll           v4.8h, v4.8b, #0
93
+
94
+    // row0-1
95
+    mul             v16.8h, v0.8h, v24.8h
96
+    ext             v21.16b, v0.16b, v1.16b, #8
97
+    mul             v17.8h, v21.8h, v24.8h
98
+    mov             v0.16b, v1.16b
99
+
100
+    // row2-3
101
+    mla             v16.8h, v1.8h, v26.8h
102
+    ext             v21.16b, v1.16b, v2.16b, #8
103
+    mla             v17.8h, v21.8h, v26.8h
104
+    mov             v1.16b, v2.16b
105
+
106
+    // row4-5
107
+    mla             v16.8h, v2.8h, v28.8h
108
+    ext             v21.16b, v2.16b, v3.16b, #8
109
+    mla             v17.8h, v21.8h, v28.8h
110
+    mov             v2.16b, v3.16b
111
+
112
+    // row6-7
113
+    mla             v16.8h, v3.8h, v30.8h
114
+    ext             v21.16b, v3.16b, v4.16b, #8
115
+    mla             v17.8h, v21.8h, v30.8h
116
+    mov             v3.16b, v4.16b
117
+
118
+    // sum row0-7
119
+    trn1            v20.2d, v16.2d, v17.2d
120
+    trn2            v21.2d, v16.2d, v17.2d
121
+    add             v16.8h, v20.8h, v21.8h
122
+
123
+    sqrshrun        v16.8b,  v16.8h,  #6
124
+    st1             {v16.s}[0], [x2], x3
125
+    st1             {v16.s}[1], [x2], x3
126
+
127
+    sub             x9, x9, #2
128
+    cbnz            x9, .loop_4x\h
129
+    ret
130
+endfunc
131
+.endm
132
+
133
+LUMA_VPP_4xN 4
134
+LUMA_VPP_4xN 8
135
+LUMA_VPP_4xN 16
136
+
137
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
+.macro LUMA_VPP w, h
139
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
+    cmp             x4, #0
141
+    b.eq            0f
142
+    cmp             x4, #1
143
+    b.eq            1f
144
+    cmp             x4, #2
145
+    b.eq            2f
146
+    cmp             x4, #3
147
+    b.eq            3f
148
+0:
149
+    FILTER_LUMA_VPP \w, \h, 0
150
+1:
151
+    FILTER_LUMA_VPP \w, \h, 1
152
+2:
153
+    FILTER_LUMA_VPP \w, \h, 2
154
+3:
155
+    FILTER_LUMA_VPP \w, \h, 3
156
+endfunc
157
+.endm
158
+
159
+LUMA_VPP 8, 4
160
+LUMA_VPP 8, 8
161
+LUMA_VPP 8, 16
162
+LUMA_VPP 8, 32
163
+LUMA_VPP 12, 16
164
+LUMA_VPP 16, 4
165
+LUMA_VPP 16, 8
166
+LUMA_VPP 16, 16
167
+LUMA_VPP 16, 32
168
+LUMA_VPP 16, 64
169
+LUMA_VPP 16, 12
170
+LUMA_VPP 24, 32
171
+LUMA_VPP 32, 8
172
+LUMA_VPP 32, 16
173
+LUMA_VPP 32, 32
174
+LUMA_VPP 32, 64
175
+LUMA_VPP 32, 24
176
+LUMA_VPP 48, 64
177
+LUMA_VPP 64, 16
178
+LUMA_VPP 64, 32
179
+LUMA_VPP 64, 64
180
+LUMA_VPP 64, 48
181
+
182
+// ***** luma_vps *****
183
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
+.macro LUMA_VPS_4xN h
185
+function x265_interp_8tap_vert_ps_4x\h\()_neon
186
+    lsl             x3, x3, #1
187
+    lsl             x5, x4, #6
188
+    lsl             x4, x1, #2
189
+    sub             x4, x4, x1
190
+    sub             x0, x0, x4
191
+
192
+    mov             w6, #8192
193
+    dup             v28.4s, w6
194
+    mov             x4, #\h
195
+    movrel          x12, g_lumaFilter
196
+    add             x12, x12, x5
197
+    ld1r            {v16.2d}, [x12], #8
198
+    ld1r            {v17.2d}, [x12], #8
199
+    ld1r            {v18.2d}, [x12], #8
200
+    ld1r            {v19.2d}, [x12], #8
201
+    ld1r            {v20.2d}, [x12], #8
202
+    ld1r            {v21.2d}, [x12], #8
203
+    ld1r            {v22.2d}, [x12], #8
204
+    ld1r            {v23.2d}, [x12], #8
205
+
206
+.loop_vps_4x\h:
207
+    mov             x6, x0
208
+
209
+    ld1             {v0.s}[0], [x6], x1
210
+    ld1             {v1.s}[0], [x6], x1
211
+    ld1             {v2.s}[0], [x6], x1
212
+    ld1             {v3.s}[0], [x6], x1
213
+    ld1             {v4.s}[0], [x6], x1
214
+    ld1             {v5.s}[0], [x6], x1
215
+    ld1             {v6.s}[0], [x6], x1
216
+    ld1             {v7.s}[0], [x6], x1
217
+    uxtl            v0.8h, v0.8b
218
+    uxtl            v0.4s, v0.4h
219
+
220
+    uxtl            v1.8h, v1.8b
221
+    uxtl            v1.4s, v1.4h
222
+    mul             v0.4s, v0.4s, v16.4s
223
+
224
+    uxtl            v2.8h, v2.8b
225
+    uxtl            v2.4s, v2.4h
226
+    mla             v0.4s, v1.4s, v17.4s
227
+
228
+    uxtl            v3.8h, v3.8b
229
+    uxtl            v3.4s, v3.4h
230
+    mla             v0.4s, v2.4s, v18.4s
231
+
232
+    uxtl            v4.8h, v4.8b
233
+    uxtl            v4.4s, v4.4h
234
+    mla             v0.4s, v3.4s, v19.4s
235
+
236
+    uxtl            v5.8h, v5.8b
237
+    uxtl            v5.4s, v5.4h
238
+    mla             v0.4s, v4.4s, v20.4s
239
+
240
+    uxtl            v6.8h, v6.8b
241
+    uxtl            v6.4s, v6.4h
242
+    mla             v0.4s, v5.4s, v21.4s
243
+
244
+    uxtl            v7.8h, v7.8b
245
+    uxtl            v7.4s, v7.4h
246
+    mla             v0.4s, v6.4s, v22.4s
247
+
248
+    mla             v0.4s, v7.4s, v23.4s
249
+
250
+    sub             v0.4s, v0.4s, v28.4s
251
+    sqxtn           v0.4h, v0.4s
252
+    st1             {v0.8b}, [x2], x3
253
+
254
+    add             x0, x0, x1
255
+    sub             x4, x4, #1
256
+    cbnz            x4, .loop_vps_4x\h
257
+    ret
258
+endfunc
259
+.endm
260
+
261
+LUMA_VPS_4xN 4
262
+LUMA_VPS_4xN 8
263
+LUMA_VPS_4xN 16
264
+
265
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
266
+.macro LUMA_VPS w, h
267
+function x265_interp_8tap_vert_ps_\w\()x\h\()_neon
268
+    cmp             x4, #0
269
+    beq             0f
270
+    cmp             x4, #1
271
+    beq             1f
272
+    cmp             x4, #2
273
+    beq             2f
274
+    cmp             x4, #3
275
+    beq             3f
276
+0:
277
+    FILTER_VPS \w, \h, 0
278
+1:
279
+    FILTER_VPS \w, \h, 1
280
+2:
281
+    FILTER_VPS \w, \h, 2
282
+3:
283
+    FILTER_VPS \w, \h, 3
284
+endfunc
285
+.endm
286
+
287
+LUMA_VPS 8, 4
288
+LUMA_VPS 8, 8
289
+LUMA_VPS 8, 16
290
+LUMA_VPS 8, 32
291
+LUMA_VPS 12, 16
292
+LUMA_VPS 16, 4
293
+LUMA_VPS 16, 8
294
+LUMA_VPS 16, 16
295
+LUMA_VPS 16, 32
296
+LUMA_VPS 16, 64
297
+LUMA_VPS 16, 12
298
+LUMA_VPS 24, 32
299
+LUMA_VPS 32, 8
300
+LUMA_VPS 32, 16
301
+LUMA_VPS 32, 32
302
+LUMA_VPS 32, 64
303
+LUMA_VPS 32, 24
304
+LUMA_VPS 48, 64
305
+LUMA_VPS 64, 16
306
+LUMA_VPS 64, 32
307
+LUMA_VPS 64, 64
308
+LUMA_VPS 64, 48
309
+
310
+// ***** luma_vsp *****
311
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
312
+.macro LUMA_VSP_4xN h
313
+function x265_interp_8tap_vert_sp_4x\h\()_neon
314
+    lsl             x5, x4, #6
315
+    lsl             x1, x1, #1
316
+    lsl             x4, x1, #2
317
+    sub             x4, x4, x1
318
+    sub             x0, x0, x4
319
+
320
+    mov             w12, #1
321
+    lsl             w12, w12, #19
322
+    add             w12, w12, #2048
323
+    dup             v24.4s, w12
324
+    mov             x4, #\h
325
+    movrel          x12, g_lumaFilter
326
+    add             x12, x12, x5
327
+    ld1r            {v16.2d}, [x12], #8
328
+    ld1r            {v17.2d}, [x12], #8
329
+    ld1r            {v18.2d}, [x12], #8
330
+    ld1r            {v19.2d}, [x12], #8
331
+    ld1r            {v20.2d}, [x12], #8
332
+    ld1r            {v21.2d}, [x12], #8
333
+    ld1r            {v22.2d}, [x12], #8
334
+    ld1r            {v23.2d}, [x12], #8
335
+.loop_vsp_4x\h:
336
+    mov             x6, x0
337
+
338
+    ld1             {v0.8b}, [x6], x1
339
+    ld1             {v1.8b}, [x6], x1
340
+    ld1             {v2.8b}, [x6], x1
341
+    ld1             {v3.8b}, [x6], x1
342
+    ld1             {v4.8b}, [x6], x1
343
+    ld1             {v5.8b}, [x6], x1
344
+    ld1             {v6.8b}, [x6], x1
345
+    ld1             {v7.8b}, [x6], x1
346
+
347
+    sshll           v0.4s, v0.4h, #0
348
+    sshll           v1.4s, v1.4h, #0
349
+    mul             v0.4s, v0.4s, v16.4s
350
+    sshll           v2.4s, v2.4h, #0
351
+    mla             v0.4s, v1.4s, v17.4s
352
+    sshll           v3.4s, v3.4h, #0
353
+    mla             v0.4s, v2.4s, v18.4s
354
+    sshll           v4.4s, v4.4h, #0
355
+    mla             v0.4s, v3.4s, v19.4s
356
+    sshll           v5.4s, v5.4h, #0
357
+    mla             v0.4s, v4.4s, v20.4s
358
+    sshll           v6.4s, v6.4h, #0
359
+    mla             v0.4s, v5.4s, v21.4s
360
+    sshll           v7.4s, v7.4h, #0
361
+    mla             v0.4s, v6.4s, v22.4s
362
+
363
+    mla             v0.4s, v7.4s, v23.4s
364
+
365
+    add             v0.4s, v0.4s, v24.4s
366
+    sqshrun         v0.4h, v0.4s, #12
367
+    sqxtun          v0.8b, v0.8h
368
+    st1             {v0.s}[0], [x2], x3
369
+
370
+    add             x0, x0, x1
371
+    sub             x4, x4, #1
372
+    cbnz            x4, .loop_vsp_4x\h
373
+    ret
374
+endfunc
375
+.endm
376
+
377
+LUMA_VSP_4xN 4
378
+LUMA_VSP_4xN 8
379
+LUMA_VSP_4xN 16
380
+
381
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
382
+.macro LUMA_VSP w, h
383
+function x265_interp_8tap_vert_sp_\w\()x\h\()_neon
384
+    cmp             x4, #0
385
+    beq             0f
386
+    cmp             x4, #1
387
+    beq             1f
388
+    cmp             x4, #2
389
+    beq             2f
390
+    cmp             x4, #3
391
+    beq             3f
392
+0:
393
+    FILTER_VSP \w, \h, 0
394
+1:
395
+    FILTER_VSP \w, \h, 1
396
+2:
397
+    FILTER_VSP \w, \h, 2
398
+3:
399
+    FILTER_VSP \w, \h, 3
400
+endfunc
401
+.endm
402
+
403
+LUMA_VSP 8, 4
404
+LUMA_VSP 8, 8
405
+LUMA_VSP 8, 16
406
+LUMA_VSP 8, 32
407
+LUMA_VSP 12, 16
408
+LUMA_VSP 16, 4
409
+LUMA_VSP 16, 8
410
+LUMA_VSP 16, 16
411
+LUMA_VSP 16, 32
412
+LUMA_VSP 16, 64
413
+LUMA_VSP 16, 12
414
+LUMA_VSP 32, 8
415
+LUMA_VSP 32, 16
416
+LUMA_VSP 32, 32
417
+LUMA_VSP 32, 64
418
+LUMA_VSP 32, 24
419
+LUMA_VSP 64, 16
420
+LUMA_VSP 64, 32
421
+LUMA_VSP 64, 64
422
+LUMA_VSP 64, 48
423
+LUMA_VSP 24, 32
424
+LUMA_VSP 48, 64
425
+
426
+// ***** luma_vss *****
427
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
428
+.macro LUMA_VSS w, h
429
+function x265_interp_8tap_vert_ss_\w\()x\h\()_neon
430
+    cmp             x4, #0
431
+    beq             0f
432
+    cmp             x4, #1
433
+    beq             1f
434
+    cmp             x4, #2
435
+    beq             2f
436
+    cmp             x4, #3
437
+    beq             3f
438
+0:
439
+    FILTER_VSS \w, \h, 0
440
+1:
441
+    FILTER_VSS \w, \h, 1
442
+2:
443
+    FILTER_VSS \w, \h, 2
444
+3:
445
+    FILTER_VSS \w, \h, 3
446
+endfunc
447
+.endm
448
+
449
+LUMA_VSS 4, 4
450
+LUMA_VSS 4, 8
451
+LUMA_VSS 4, 16
452
+LUMA_VSS 8, 4
453
+LUMA_VSS 8, 8
454
+LUMA_VSS 8, 16
455
+LUMA_VSS 8, 32
456
+LUMA_VSS 12, 16
457
+LUMA_VSS 16, 4
458
+LUMA_VSS 16, 8
459
+LUMA_VSS 16, 16
460
+LUMA_VSS 16, 32
461
+LUMA_VSS 16, 64
462
+LUMA_VSS 16, 12
463
+LUMA_VSS 32, 8
464
+LUMA_VSS 32, 16
465
+LUMA_VSS 32, 32
466
+LUMA_VSS 32, 64
467
+LUMA_VSS 32, 24
468
+LUMA_VSS 64, 16
469
+LUMA_VSS 64, 32
470
+LUMA_VSS 64, 64
471
+LUMA_VSS 64, 48
472
+LUMA_VSS 24, 32
473
+LUMA_VSS 48, 64
474
+
475
+// ***** luma_hpp *****
476
+// void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
477
+.macro LUMA_HPP w, h
478
+function x265_interp_horiz_pp_\w\()x\h\()_neon
479
+    cmp             x4, #0
480
+    beq             0f
481
+    cmp             x4, #1
482
+    beq             1f
483
+    cmp             x4, #2
484
+    beq             2f
485
+    cmp             x4, #3
486
+    beq             3f
487
+0:
488
+    FILTER_HPP \w, \h, 0
489
+1:
490
+    FILTER_HPP \w, \h, 1
491
+2:
492
+    FILTER_HPP \w, \h, 2
493
+3:
494
+    FILTER_HPP \w, \h, 3
495
+endfunc
496
+.endm
497
+
498
+LUMA_HPP 4, 4
499
+LUMA_HPP 4, 8
500
+LUMA_HPP 4, 16
501
+LUMA_HPP 8, 4
502
+LUMA_HPP 8, 8
503
+LUMA_HPP 8, 16
504
+LUMA_HPP 8, 32
505
+LUMA_HPP 12, 16
506
+LUMA_HPP 16, 4
507
+LUMA_HPP 16, 8
508
+LUMA_HPP 16, 12
509
+LUMA_HPP 16, 16
510
+LUMA_HPP 16, 32
511
+LUMA_HPP 16, 64
512
+LUMA_HPP 24, 32
513
+LUMA_HPP 32, 8
514
+LUMA_HPP 32, 16
515
+LUMA_HPP 32, 24
516
+LUMA_HPP 32, 32
517
+LUMA_HPP 32, 64
518
+LUMA_HPP 48, 64
519
+LUMA_HPP 64, 16
520
+LUMA_HPP 64, 32
521
+LUMA_HPP 64, 48
522
+LUMA_HPP 64, 64
523
+
524
+// ***** luma_hps *****
525
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
526
+.macro LUMA_HPS w, h
527
+function x265_interp_horiz_ps_\w\()x\h\()_neon
528
+    mov             w10, #\h
529
+    cmp             w5, #0
530
+    b.eq            6f
531
+    sub             x0, x0, x1, lsl #2
532
+    add             x0, x0, x1
533
+    add             w10, w10, #7
534
+6:
535
+    mov             w6, w10
536
+    cmp             w4, #0
537
+    b.eq            0f
538
+    cmp             w4, #1
539
+    b.eq            1f
540
+    cmp             w4, #2
541
+    b.eq            2f
542
+    cmp             w4, #3
543
+    b.eq            3f
544
+0:
545
+    FILTER_HPS \w, \h, 0
546
+1:
547
+    FILTER_HPS \w, \h, 1
548
+2:
549
+    FILTER_HPS \w, \h, 2
550
+3:
551
+    FILTER_HPS \w, \h, 3
552
+endfunc
553
+.endm
554
+
555
+LUMA_HPS 4, 4
556
+LUMA_HPS 4, 8
557
+LUMA_HPS 4, 16
558
+LUMA_HPS 8, 4
559
+LUMA_HPS 8, 8
560
+LUMA_HPS 8, 16
561
+LUMA_HPS 8, 32
562
+LUMA_HPS 12, 16
563
+LUMA_HPS 16, 4
564
+LUMA_HPS 16, 8
565
+LUMA_HPS 16, 12
566
+LUMA_HPS 16, 16
567
+LUMA_HPS 16, 32
568
+LUMA_HPS 16, 64
569
+LUMA_HPS 24, 32
570
+LUMA_HPS 32, 8
571
+LUMA_HPS 32, 16
572
+LUMA_HPS 32, 24
573
+LUMA_HPS 32, 32
574
+LUMA_HPS 32, 64
575
+LUMA_HPS 48, 64
576
+LUMA_HPS 64, 16
577
+LUMA_HPS 64, 32
578
+LUMA_HPS 64, 48
579
+LUMA_HPS 64, 64
580
+
581
+// ***** chroma_vpp *****
582
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
583
+.macro CHROMA_VPP w, h
584
+function x265_interp_4tap_vert_pp_\w\()x\h\()_neon
585
+    cmp             x4, #0
586
+    beq             0f
587
+    cmp             x4, #1
588
+    beq             1f
589
+    cmp             x4, #2
590
+    beq             2f
591
+    cmp             x4, #3
592
+    beq             3f
593
+    cmp             x4, #4
594
+    beq             4f
595
+    cmp             x4, #5
596
+    beq             5f
597
+    cmp             x4, #6
598
+    beq             6f
599
+    cmp             x4, #7
600
+    beq             7f
601
+0:
602
+    FILTER_CHROMA_VPP  \w, \h, 0
603
+1:
604
+    FILTER_CHROMA_VPP  \w, \h, 1
605
+2:
606
+    FILTER_CHROMA_VPP  \w, \h, 2
607
+3:
608
+    FILTER_CHROMA_VPP  \w, \h, 3
609
+4:
610
+    FILTER_CHROMA_VPP  \w, \h, 4
611
+5:
612
+    FILTER_CHROMA_VPP  \w, \h, 5
613
+6:
614
+    FILTER_CHROMA_VPP  \w, \h, 6
615
+7:
616
+    FILTER_CHROMA_VPP  \w, \h, 7
617
+endfunc
618
+.endm
619
+
620
+CHROMA_VPP 2, 4
621
+CHROMA_VPP 2, 8
622
+CHROMA_VPP 2, 16
623
+CHROMA_VPP 4, 2
624
+CHROMA_VPP 4, 4
625
+CHROMA_VPP 4, 8
626
+CHROMA_VPP 4, 16
627
+CHROMA_VPP 4, 32
628
+CHROMA_VPP 6, 8
629
+CHROMA_VPP 6, 16
630
+CHROMA_VPP 8, 2
631
+CHROMA_VPP 8, 4
632
+CHROMA_VPP 8, 6
633
+CHROMA_VPP 8, 8
634
+CHROMA_VPP 8, 16
635
+CHROMA_VPP 8, 32
636
+CHROMA_VPP 8, 12
637
+CHROMA_VPP 8, 64
638
+CHROMA_VPP 12, 16
639
+CHROMA_VPP 12, 32
640
+CHROMA_VPP 16, 4
641
+CHROMA_VPP 16, 8
642
+CHROMA_VPP 16, 12
643
+CHROMA_VPP 16, 16
644
+CHROMA_VPP 16, 32
645
+CHROMA_VPP 16, 64
646
+CHROMA_VPP 16, 24
647
+CHROMA_VPP 32, 8
648
+CHROMA_VPP 32, 16
649
+CHROMA_VPP 32, 24
650
+CHROMA_VPP 32, 32
651
+CHROMA_VPP 32, 64
652
+CHROMA_VPP 32, 48
653
+CHROMA_VPP 24, 32
654
+CHROMA_VPP 24, 64
655
+CHROMA_VPP 64, 16
656
+CHROMA_VPP 64, 32
657
+CHROMA_VPP 64, 48
658
+CHROMA_VPP 64, 64
659
+CHROMA_VPP 48, 64
660
+
661
+// ***** chroma_vps *****
662
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
663
+.macro CHROMA_VPS w, h
664
+function x265_interp_4tap_vert_ps_\w\()x\h\()_neon
665
+    cmp             x4, #0
666
+    beq             0f
667
+    cmp             x4, #1
668
+    beq             1f
669
+    cmp             x4, #2
670
+    beq             2f
671
+    cmp             x4, #3
672
+    beq             3f
673
+    cmp             x4, #4
674
+    beq             4f
675
+    cmp             x4, #5
676
+    beq             5f
677
+    cmp             x4, #6
678
+    beq             6f
679
+    cmp             x4, #7
680
+    beq             7f
681
+0:
682
+    FILTER_CHROMA_VPS  \w, \h, 0
683
+1:
684
+    FILTER_CHROMA_VPS  \w, \h, 1
685
+2:
686
+    FILTER_CHROMA_VPS  \w, \h, 2
687
+3:
688
+    FILTER_CHROMA_VPS  \w, \h, 3
689
+4:
690
+    FILTER_CHROMA_VPS  \w, \h, 4
691
+5:
692
+    FILTER_CHROMA_VPS  \w, \h, 5
693
+6:
694
+    FILTER_CHROMA_VPS  \w, \h, 6
695
+7:
696
+    FILTER_CHROMA_VPS  \w, \h, 7
697
+endfunc
698
+.endm
699
+
700
+CHROMA_VPS 2, 4
701
+CHROMA_VPS 2, 8
702
+CHROMA_VPS 2, 16
703
+CHROMA_VPS 4, 2
704
+CHROMA_VPS 4, 4
705
+CHROMA_VPS 4, 8
706
+CHROMA_VPS 4, 16
707
+CHROMA_VPS 4, 32
708
+CHROMA_VPS 6, 8
709
+CHROMA_VPS 6, 16
710
+CHROMA_VPS 8, 2
711
+CHROMA_VPS 8, 4
712
+CHROMA_VPS 8, 6
713
+CHROMA_VPS 8, 8
714
+CHROMA_VPS 8, 16
715
+CHROMA_VPS 8, 32
716
+CHROMA_VPS 8, 12
717
+CHROMA_VPS 8, 64
718
+CHROMA_VPS 12, 16
719
+CHROMA_VPS 12, 32
720
+CHROMA_VPS 16, 4
721
+CHROMA_VPS 16, 8
722
+CHROMA_VPS 16, 12
723
+CHROMA_VPS 16, 16
724
+CHROMA_VPS 16, 32
725
+CHROMA_VPS 16, 64
726
+CHROMA_VPS 16, 24
727
+CHROMA_VPS 32, 8
728
+CHROMA_VPS 32, 16
729
+CHROMA_VPS 32, 24
730
+CHROMA_VPS 32, 32
731
+CHROMA_VPS 32, 64
732
+CHROMA_VPS 32, 48
733
+CHROMA_VPS 24, 32
734
+CHROMA_VPS 24, 64
735
+CHROMA_VPS 64, 16
736
+CHROMA_VPS 64, 32
737
+CHROMA_VPS 64, 48
738
+CHROMA_VPS 64, 64
739
+CHROMA_VPS 48, 64
740
+
741
+// ***** chroma_vsp *****
742
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
743
+.macro CHROMA_VSP w, h
744
+function x265_interp_4tap_vert_sp_\w\()x\h\()_neon
745
+    cmp             x4, #0
746
+    beq             0f
747
+    cmp             x4, #1
748
+    beq             1f
749
+    cmp             x4, #2
750
+    beq             2f
751
+    cmp             x4, #3
752
+    beq             3f
753
+    cmp             x4, #4
754
+    beq             4f
755
+    cmp             x4, #5
756
+    beq             5f
757
+    cmp             x4, #6
758
+    beq             6f
759
+    cmp             x4, #7
760
+    beq             7f
761
+0:
762
+    FILTER_CHROMA_VSP  \w, \h, 0
763
+1:
764
+    FILTER_CHROMA_VSP  \w, \h, 1
765
+2:
766
+    FILTER_CHROMA_VSP  \w, \h, 2
767
+3:
768
+    FILTER_CHROMA_VSP  \w, \h, 3
769
+4:
770
+    FILTER_CHROMA_VSP  \w, \h, 4
771
+5:
772
+    FILTER_CHROMA_VSP  \w, \h, 5
773
+6:
774
+    FILTER_CHROMA_VSP  \w, \h, 6
775
+7:
776
+    FILTER_CHROMA_VSP  \w, \h, 7
777
+endfunc
778
+.endm
779
+
780
+CHROMA_VSP 4, 4
781
+CHROMA_VSP 4, 8
782
+CHROMA_VSP 4, 16
783
+CHROMA_VSP 4, 32
784
+CHROMA_VSP 8, 2
785
+CHROMA_VSP 8, 4
786
+CHROMA_VSP 8, 6
787
+CHROMA_VSP 8, 8
788
+CHROMA_VSP 8, 16
789
+CHROMA_VSP 8, 32
790
+CHROMA_VSP 8, 12
791
+CHROMA_VSP 8, 64
792
+CHROMA_VSP 12, 16
793
+CHROMA_VSP 12, 32
794
+CHROMA_VSP 16, 4
795
+CHROMA_VSP 16, 8
796
+CHROMA_VSP 16, 12
797
+CHROMA_VSP 16, 16
798
+CHROMA_VSP 16, 32
799
+CHROMA_VSP 16, 64
800
+CHROMA_VSP 16, 24
801
+CHROMA_VSP 32, 8
802
+CHROMA_VSP 32, 16
803
+CHROMA_VSP 32, 24
804
+CHROMA_VSP 32, 32
805
+CHROMA_VSP 32, 64
806
+CHROMA_VSP 32, 48
807
+CHROMA_VSP 24, 32
808
+CHROMA_VSP 24, 64
809
+CHROMA_VSP 64, 16
810
+CHROMA_VSP 64, 32
811
+CHROMA_VSP 64, 48
812
+CHROMA_VSP 64, 64
813
+CHROMA_VSP 48, 64
814
+
815
+// ***** chroma_vss *****
816
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
817
+.macro CHROMA_VSS w, h
818
+function x265_interp_4tap_vert_ss_\w\()x\h\()_neon
819
+    cmp             x4, #0
820
+    beq             0f
821
+    cmp             x4, #1
822
+    beq             1f
823
+    cmp             x4, #2
824
+    beq             2f
825
+    cmp             x4, #3
826
+    beq             3f
827
+    cmp             x4, #4
828
+    beq             4f
829
+    cmp             x4, #5
830
+    beq             5f
831
+    cmp             x4, #6
832
+    beq             6f
833
+    cmp             x4, #7
834
+    beq             7f
835
+0:
836
+    FILTER_CHROMA_VSS  \w, \h, 0
837
+1:
838
+    FILTER_CHROMA_VSS  \w, \h, 1
839
+2:
840
+    FILTER_CHROMA_VSS  \w, \h, 2
841
+3:
842
+    FILTER_CHROMA_VSS  \w, \h, 3
843
+4:
844
+    FILTER_CHROMA_VSS  \w, \h, 4
845
+5:
846
+    FILTER_CHROMA_VSS  \w, \h, 5
847
+6:
848
+    FILTER_CHROMA_VSS  \w, \h, 6
849
+7:
850
+    FILTER_CHROMA_VSS  \w, \h, 7
851
+endfunc
852
+.endm
853
+
854
+CHROMA_VSS 4, 4
855
+CHROMA_VSS 4, 8
856
+CHROMA_VSS 4, 16
857
+CHROMA_VSS 4, 32
858
+CHROMA_VSS 8, 2
859
+CHROMA_VSS 8, 4
860
+CHROMA_VSS 8, 6
861
+CHROMA_VSS 8, 8
862
+CHROMA_VSS 8, 16
863
+CHROMA_VSS 8, 32
864
+CHROMA_VSS 8, 12
865
+CHROMA_VSS 8, 64
866
+CHROMA_VSS 12, 16
867
+CHROMA_VSS 12, 32
868
+CHROMA_VSS 16, 4
869
+CHROMA_VSS 16, 8
870
+CHROMA_VSS 16, 12
871
+CHROMA_VSS 16, 16
872
+CHROMA_VSS 16, 32
873
+CHROMA_VSS 16, 64
874
+CHROMA_VSS 16, 24
875
+CHROMA_VSS 32, 8
876
+CHROMA_VSS 32, 16
877
+CHROMA_VSS 32, 24
878
+CHROMA_VSS 32, 32
879
+CHROMA_VSS 32, 64
880
+CHROMA_VSS 32, 48
881
+CHROMA_VSS 24, 32
882
+CHROMA_VSS 24, 64
883
+CHROMA_VSS 64, 16
884
+CHROMA_VSS 64, 32
885
+CHROMA_VSS 64, 48
886
+CHROMA_VSS 64, 64
887
+CHROMA_VSS 48, 64
888
+
889
+// ***** chroma_hpp *****
890
+// void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
891
+.macro CHROMA_HPP w, h
892
+function x265_interp_4tap_horiz_pp_\w\()x\h\()_neon
893
+    cmp             x4, #0
894
+    beq             0f
895
+    cmp             x4, #1
896
+    beq             1f
897
+    cmp             x4, #2
898
+    beq             2f
899
+    cmp             x4, #3
900
+    beq             3f
901
+    cmp             x4, #4
902
+    beq             4f
903
+    cmp             x4, #5
904
+    beq             5f
905
+    cmp             x4, #6
906
+    beq             6f
907
+    cmp             x4, #7
908
+    beq             7f
909
+0:
910
+    FILTER_CHROMA_HPP  \w, \h, 0
911
+1:
912
+    FILTER_CHROMA_HPP  \w, \h, 1
913
+2:
914
+    FILTER_CHROMA_HPP  \w, \h, 2
915
+3:
916
+    FILTER_CHROMA_HPP  \w, \h, 3
917
+4:
918
+    FILTER_CHROMA_HPP  \w, \h, 4
919
+5:
920
+    FILTER_CHROMA_HPP  \w, \h, 5
921
+6:
922
+    FILTER_CHROMA_HPP  \w, \h, 6
923
+7:
924
+    FILTER_CHROMA_HPP  \w, \h, 7
925
+endfunc
926
+.endm
927
+
928
+CHROMA_HPP 2, 4
929
+CHROMA_HPP 2, 8
930
+CHROMA_HPP 2, 16
931
+CHROMA_HPP 4, 2
932
+CHROMA_HPP 4, 4
933
+CHROMA_HPP 4, 8
934
+CHROMA_HPP 4, 16
935
+CHROMA_HPP 4, 32
936
+CHROMA_HPP 6, 8
937
+CHROMA_HPP 6, 16
938
+CHROMA_HPP 8, 2
939
+CHROMA_HPP 8, 4
940
+CHROMA_HPP 8, 6
941
+CHROMA_HPP 8, 8
942
+CHROMA_HPP 8, 12
943
+CHROMA_HPP 8, 16
944
+CHROMA_HPP 8, 32
945
+CHROMA_HPP 8, 64
946
+CHROMA_HPP 12, 16
947
+CHROMA_HPP 12, 32
948
+CHROMA_HPP 16, 4
949
+CHROMA_HPP 16, 8
950
+CHROMA_HPP 16, 12
951
+CHROMA_HPP 16, 16
952
+CHROMA_HPP 16, 24
953
+CHROMA_HPP 16, 32
954
+CHROMA_HPP 16, 64
955
+CHROMA_HPP 24, 32
956
+CHROMA_HPP 24, 64
957
+CHROMA_HPP 32, 8
958
+CHROMA_HPP 32, 16
959
+CHROMA_HPP 32, 24
960
+CHROMA_HPP 32, 32
961
+CHROMA_HPP 32, 48
962
+CHROMA_HPP 32, 64
963
+CHROMA_HPP 48, 64
964
+CHROMA_HPP 64, 16
965
+CHROMA_HPP 64, 32
966
+CHROMA_HPP 64, 48
967
+CHROMA_HPP 64, 64
968
+
969
+// ***** chroma_hps *****
970
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
971
+.macro CHROMA_HPS w, h
972
+function x265_interp_4tap_horiz_ps_\w\()x\h\()_neon
973
+    cmp             x4, #0
974
+    beq             0f
975
+    cmp             x4, #1
976
+    beq             1f
977
+    cmp             x4, #2
978
+    beq             2f
979
+    cmp             x4, #3
980
+    beq             3f
981
+    cmp             x4, #4
982
+    beq             4f
983
+    cmp             x4, #5
984
+    beq             5f
985
+    cmp             x4, #6
986
+    beq             6f
987
+    cmp             x4, #7
988
+    beq             7f
989
+0:
990
+    FILTER_CHROMA_HPS  \w, \h, 0
991
+1:
992
+    FILTER_CHROMA_HPS  \w, \h, 1
993
+2:
994
+    FILTER_CHROMA_HPS  \w, \h, 2
995
+3:
996
+    FILTER_CHROMA_HPS  \w, \h, 3
997
+4:
998
+    FILTER_CHROMA_HPS  \w, \h, 4
999
+5:
1000
+    FILTER_CHROMA_HPS  \w, \h, 5
1001
+6:
1002
+    FILTER_CHROMA_HPS  \w, \h, 6
1003
+7:
1004
+    FILTER_CHROMA_HPS  \w, \h, 7
1005
+endfunc
1006
+.endm
1007
+
1008
+CHROMA_HPS 2, 4
1009
+CHROMA_HPS 2, 8
1010
+CHROMA_HPS 2, 16
1011
+CHROMA_HPS 4, 2
1012
+CHROMA_HPS 4, 4
1013
+CHROMA_HPS 4, 8
1014
+CHROMA_HPS 4, 16
1015
+CHROMA_HPS 4, 32
1016
+CHROMA_HPS 6, 8
1017
+CHROMA_HPS 6, 16
1018
+CHROMA_HPS 8, 2
1019
+CHROMA_HPS 8, 4
1020
+CHROMA_HPS 8, 6
1021
+CHROMA_HPS 8, 8
1022
+CHROMA_HPS 8, 12
1023
+CHROMA_HPS 8, 16
1024
+CHROMA_HPS 8, 32
1025
+CHROMA_HPS 8, 64
1026
+CHROMA_HPS 12, 16
1027
+CHROMA_HPS 12, 32
1028
+CHROMA_HPS 16, 4
1029
+CHROMA_HPS 16, 8
1030
+CHROMA_HPS 16, 12
1031
+CHROMA_HPS 16, 16
1032
+CHROMA_HPS 16, 24
1033
+CHROMA_HPS 16, 32
1034
+CHROMA_HPS 16, 64
1035
+CHROMA_HPS 24, 32
1036
+CHROMA_HPS 24, 64
1037
+CHROMA_HPS 32, 8
1038
+CHROMA_HPS 32, 16
1039
+CHROMA_HPS 32, 24
1040
+CHROMA_HPS 32, 32
1041
+CHROMA_HPS 32, 48
1042
+CHROMA_HPS 32, 64
1043
+CHROMA_HPS 48, 64
1044
+CHROMA_HPS 64, 16
1045
+CHROMA_HPS 64, 32
1046
+CHROMA_HPS 64, 48
1047
+CHROMA_HPS 64, 64
1048
+
1049
+const g_luma_s16, align=8
1050
+//       a, b,   c,  d,  e,   f, g,  h
1051
+.hword   0, 0,   0, 64,  0,   0, 0,  0
1052
+.hword  -1, 4, -10, 58, 17,  -5, 1,  0
1053
+.hword  -1, 4, -11, 40, 40, -11, 4, -1
1054
+.hword   0, 1,  -5, 17, 58, -10, 4, -1
1055
+endconst
1056
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp Added
293
 
1
@@ -0,0 +1,291 @@
2
+#include "loopfilter-prim.h"
3
+
4
+#define PIXEL_MIN 0
5
+
6
+
7
+
8
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
9
+#include<arm_neon.h>
10
+
11
+namespace
12
+{
13
+
14
+
15
+/* get the sign of input variable (TODO: this is a dup, make common) */
16
+static inline int8_t signOf(int x)
17
+{
18
+    return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
19
+}
20
+
21
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
22
+{
23
+    int16x8_t in = vsubl_u8(in0, in1);
24
+    return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
25
+}
26
+
27
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
28
+{
29
+    int x = 0;
30
+    for (; (x + 8) <= endX; x += 8)
31
+    {
32
+        *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
33
+    }
34
+
35
+    for (; x < endX; x++)
36
+    {
37
+        dst[x] = signOf(src1[x] - src2[x]);
38
+    }
39
+}
40
+
41
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
42
+{
43
+
44
+
45
+    int y;
46
+    int8_t signRight, signLeft0;
47
+    int8_t edgeType;
48
+
49
+    for (y = 0; y < 2; y++)
50
+    {
51
+        signLeft0 = signLeft[y];
52
+        int x = 0;
53
+
54
+        if (width >= 8)
55
+        {
56
+            int8x8_t vsignRight;
57
+            int8x8x2_t shifter;
58
+            shifter.val[1][0] = signLeft0;
59
+            static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
60
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
61
+            for (; (x + 8) <= width; x += 8)
62
+            {
63
+                uint8x8_t in = *(uint8x8_t *)&rec[x];
64
+                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
65
+                shifter.val[0] = vneg_s8(vsignRight);
66
+                int8x8_t tmp = shifter.val[0];
67
+                int8x8_t edge = vtbl2_s8(shifter, index);
68
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
69
+                shifter.val[1][0] = tmp[7];
70
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
71
+                t1 = vaddw_u8(t1, in);
72
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
73
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
74
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
75
+            }
76
+            signLeft0 = shifter.val[1][0];
77
+        }
78
+        for (; x < width; x++)
79
+        {
80
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
81
+            edgeType = signRight + signLeft0 + 2;
82
+            signLeft0 = -signRight;
83
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
84
+        }
85
+        rec += stride;
86
+    }
87
+}
88
+
89
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
90
+{
91
+    int x = 0;
92
+    int8_t signDown;
93
+    int edgeType;
94
+
95
+    if (width >= 8)
96
+    {
97
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
98
+        for (; (x + 8) <= width; x += 8)
99
+        {
100
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
101
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
102
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
103
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
104
+            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
105
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
106
+            t1 = vaddw_u8(t1, in0);
107
+            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
108
+        }
109
+    }
110
+    for (; x < width; x++)
111
+    {
112
+        signDown = signOf(rec[x] - rec[x + stride]);
113
+        edgeType = signDown + upBuff1[x] + 2;
114
+        upBuff1[x] = -signDown;
115
+        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
116
+    }
117
+}
118
+
119
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
120
+{
121
+    int y;
122
+    int8_t signDown;
123
+    int edgeType;
124
+
125
+    for (y = 0; y < 2; y++)
126
+    {
127
+        int x = 0;
128
+        if (width >= 8)
129
+        {
130
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
131
+            for (; (x + 8) <= width; x += 8)
132
+            {
133
+                uint8x8_t in0 = *(uint8x8_t *)&rec[x];
134
+                uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
135
+                int8x8_t vsignDown = sign_diff_neon(in0, in1);
136
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
137
+                *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
138
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
139
+                t1 = vaddw_u8(t1, in0);
140
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
141
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
142
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
143
+
144
+            }
145
+        }
146
+        for (; x < width; x++)
147
+        {
148
+            signDown = signOf(rec[x] - rec[x + stride]);
149
+            edgeType = signDown + upBuff1[x] + 2;
150
+            upBuff1[x] = -signDown;
151
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
152
+        }
153
+        rec += stride;
154
+    }
155
+}
156
+
157
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
158
+{
159
+    int x;
160
+
161
+    if (abs(buff1 - bufft) < 16)
162
+    {
163
+        for (x = 0; x < width; x++)
164
+        {
165
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
166
+            int edgeType = signDown + buff1[x] + 2;
167
+            bufft[x + 1] = -signDown;
168
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
169
+        }
170
+    }
171
+    else
172
+    {
173
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
174
+        x = 0;
175
+        for (; (x + 8) <= width; x += 8)
176
+        {
177
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
178
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
179
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
180
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
181
+            *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
182
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
183
+            t1 = vaddw_u8(t1, in0);
184
+            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
185
+            t1 = vminq_s16(t1, vdupq_n_s16(255));
186
+            *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
187
+        }
188
+        for (; x < width; x++)
189
+        {
190
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
190
+            int edgeType = signDown + buff1[x] + 2;
191
+            bufft[x + 1] = -signDown;
192
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
194
+        }
195
+
196
+    }
197
+}
198
+
199
+
200
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
201
+{
202
+    int8_t signDown;
203
+    int8_t edgeType;
204
+    int8x8_t tbl = *(int8x8_t *)offsetEo;
205
+
206
+    int x = startX + 1;
207
+    for (; (x + 8) <= endX; x += 8)
208
+    {
209
+        uint8x8_t in0 = *(uint8x8_t *)&rec[x];
210
+        uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
211
+        int8x8_t vsignDown = sign_diff_neon(in0, in1);
212
+        int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
213
+        *(int8x8_t *)&upBuff1[x - 1] = vneg_s8(vsignDown);
214
+        int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
215
+        t1 = vaddw_u8(t1, in0);
216
+        t1 = vmaxq_s16(t1, vdupq_n_s16(0));
217
+        t1 = vminq_s16(t1, vdupq_n_s16(255));
218
+        *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
219
+
220
+    }
221
+    for (; x < endX; x++)
222
+    {
223
+        signDown = signOf(rec[x] - rec[x + stride]);
224
+        edgeType = signDown + upBuff1[x] + 2;
225
+        upBuff1[x - 1] = -signDown;
226
+        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
227
+    }
228
+}
229
+
230
+static void processSaoCUB0_neon(pixel *rec, const int8_t *offset, int ctuWidth, int ctuHeight, intptr_t stride)
231
+{
232
+#define SAO_BO_BITS 5
233
+    const int boShift = X265_DEPTH - SAO_BO_BITS;
234
+    int x, y;
235
+    int8x8x4_t table;
236
+    table = *(int8x8x4_t *)offset;
237
+
238
+    for (y = 0; y < ctuHeight; y++)
239
+    {
240
+
241
+        for (x = 0; (x + 8) <= ctuWidth; x += 8)
242
+        {
243
+            int8x8_t in = *(int8x8_t *)&rec[x];
244
+            int8x8_t offsets = vtbl4_s8(table, vshr_n_u8(in, boShift));
245
+            int16x8_t tmp = vmovl_s8(offsets);
246
+            tmp = vaddw_u8(tmp, in);
247
+            tmp = vmaxq_s16(tmp, vdupq_n_s16(0));
248
+            tmp = vminq_s16(tmp, vdupq_n_s16(255));
249
+            *(uint8x8_t *)&rec[x] = vmovn_u16(tmp);
250
+        }
251
+        for (; x < ctuWidth; x++)
252
+        {
253
+            rec[x] = x265_clip(rec[x] + offset[rec[x] >> boShift]);
254
+        }
255
+        rec += stride;
256
+    }
257
+}
258
+
259
+}
260
+
261
+
262
+
263
+namespace X265_NS
264
+{
265
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
266
+{
267
+    p.saoCuOrgE0 = processSaoCUE0_neon;
268
+    p.saoCuOrgE1 = processSaoCUE1_neon;
269
+    p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows_neon;
270
+    p.saoCuOrgE2[0] = processSaoCUE2_neon;
271
+    p.saoCuOrgE2[1] = processSaoCUE2_neon;
272
+    p.saoCuOrgE3[0] = processSaoCUE3_neon;
273
+    p.saoCuOrgE3[1] = processSaoCUE3_neon;
274
+    p.saoCuOrgB0 = processSaoCUB0_neon;
275
+    p.sign = calSign_neon;
276
+
277
+}
278
+
279
+
280
+#else //HIGH_BIT_DEPTH
281
+
282
+
283
+namespace X265_NS
284
+{
285
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &)
286
+{
287
+}
288
+
289
+#endif
290
+
291
+
292
+}
293
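All of the edge-offset (E0..E3) routines above vectorize the same per-sample rule that their scalar tail loops spell out: classify the sample by the signs of its differences with two neighbours along the chosen direction, map that class to one of five offsets, add the offset and clip back to pixel range. A compact standalone sketch of that rule for the 8-bit case (names and the explicit clamp are illustrative assumptions, not the encoder's own helpers):

    #include <algorithm>
    #include <cstdint>

    static inline int sgn(int v) { return (v > 0) - (v < 0); }   // -1, 0 or +1

    // One SAO edge-offset sample: a and b are the two neighbours along the
    // filtering direction (horizontal, vertical or one of the diagonals).
    static uint8_t saoEdgeSample(uint8_t cur, uint8_t a, uint8_t b, const int8_t offsetEo[5])
    {
        int edgeType = sgn(cur - a) + sgn(cur - b) + 2;          // 0..4
        return (uint8_t)std::min(255, std::max(0, cur + offsetEo[edgeType]));
    }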
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.h Added
18
 
1
@@ -0,0 +1,16 @@
2
+#ifndef _LOOPFILTER_NEON_H__
3
+#define _LOOPFILTER_NEON_H__
4
+
5
+#include "common.h"
6
+#include "primitives.h"
7
+
8
+#define PIXEL_MIN 0
9
+
10
+namespace X265_NS
11
+{
12
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
13
+
14
+};
15
+
16
+
17
+#endif
18
x265_3.6.tar.gz/source/common/aarch64/mc-a-common.S Added
50
 
1
@@ -0,0 +1,48 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.macro addAvg_start
37
+    lsl             x3, x3, #1
38
+    lsl             x4, x4, #1
39
+    mov             w11, #0x40
40
+    dup             v30.16b, w11
41
+.endm
42
+
43
+.macro addavg_1 v0, v1
44
+    add             \v0\().8h, \v0\().8h, \v1\().8h
45
+    saddl           v16.4s, \v0\().4h, v30.4h
46
+    saddl2          v17.4s, \v0\().8h, v30.8h
47
+    shrn            \v0\().4h, v16.4s, #7
48
+    shrn2           \v0\().8h, v17.4s, #7
49
+.endm
50
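The addavg_1 macro operates on 16-bit intermediate samples: the two sources are summed, the constant prepared by addAvg_start (every byte 0x40, i.e. 0x4040 per halfword) folds the internal-precision offset and the rounding term into a single add, and the result is shifted right by 7 before the callers saturate it to 8-bit with sqxtun. A scalar sketch of that arithmetic for the 8-bit build (the split of 0x4040 into offset and rounding follows x265's usual internal-precision convention and is an assumption here, not something stated in this patch):

    #include <algorithm>
    #include <cstdint>

    // 8-bit build: the int16 sources carry the internal offset already
    // subtracted, so adding 0x4000 restores it before the rounded shift by 7.
    static uint8_t addAvgScalar(int16_t s0, int16_t s1)
    {
        int sum = s0 + s1 + 0x4000 /* 2 * internal offset */ + 0x40 /* rounding */;
        return (uint8_t)std::min(255, std::max(0, sum >> 7));
    }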
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S Added
926
 
1
@@ -0,0 +1,924 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "mc-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_avg_pp_12x16_sve2)
41
+    sub             x1, x1, #4
42
+    sub             x3, x3, #4
43
+    sub             x5, x5, #4
44
+    ptrue           p0.s, vl1
45
+    ptrue           p1.b, vl8
46
+    mov             x11, #4
47
+.rept 16
48
+    ld1w            {z0.s}, p0/z, x2
49
+    ld1b            {z1.b}, p1/z, x2, x11
50
+    ld1w            {z2.s}, p0/z, x4
51
+    ld1b            {z3.b}, p1/z, x4, x11
52
+    add             x2, x2, #4
53
+    add             x2, x2, x3
54
+    add             x4, x4, #4
55
+    add             x4, x4, x5
56
+    urhadd          z0.b, p1/m, z0.b, z2.b
57
+    urhadd          z1.b, p1/m, z1.b, z3.b
58
+    st1b            {z0.b}, p1, x0
59
+    st1b            {z1.b}, p1, x0, x11
60
+    add             x0, x0, #4
61
+    add             x0, x0, x1
62
+.endr
63
+    ret
64
+endfunc
65
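The pixel_avg_pp kernels in this file (and their NEON counterparts) are plain rounded averages: urhadd computes (a + b + 1) >> 1 per unsigned byte lane, so every block size evaluates the same per-pixel expression and only the load/store tiling changes with the vector length. A one-line scalar reference (illustrative only, not the encoder's own C primitive):

    #include <cstdint>

    // rounded average of two 8-bit samples, as urhadd does per lane
    static inline uint8_t avgPixel(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }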
+
66
+function PFX(pixel_avg_pp_24x32_sve2)
67
+    mov             w12, #4
68
+    rdvl            x9, #1
69
+    cmp             x9, #16
70
+    bgt             .vl_gt_16_pixel_avg_pp_24x32
71
+    sub             x1, x1, #16
72
+    sub             x3, x3, #16
73
+    sub             x5, x5, #16
74
+.lpavg_24x32_sve2:
75
+    sub             w12, w12, #1
76
+.rept 8
77
+    ld1             {v0.16b}, x2, #16
78
+    ld1             {v1.8b}, x2, x3
79
+    ld1             {v2.16b}, x4, #16
80
+    ld1             {v3.8b}, x4, x5
81
+    urhadd          v0.16b, v0.16b, v2.16b
82
+    urhadd          v1.8b, v1.8b, v3.8b
83
+    st1             {v0.16b}, x0, #16
84
+    st1             {v1.8b}, x0, x1
85
+.endr
86
+    cbnz            w12, .lpavg_24x32_sve2
87
+    ret
88
+.vl_gt_16_pixel_avg_pp_24x32:
89
+    mov             x10, #24
90
+    mov             x11, #0
91
+    whilelt         p0.b, x11, x10
92
+.vl_gt_16_loop_pixel_avg_pp_24x32:
93
+    sub             w12, w12, #1
94
+.rept 8
95
+    ld1b            {z0.b}, p0/z, x2
96
+    ld1b            {z2.b}, p0/z, x4
97
+    add             x2, x2, x3
98
+    add             x4, x4, x5
99
+    urhadd          z0.b, p0/m, z0.b, z2.b
100
+    st1b            {z0.b}, p0, x0
101
+    add             x0, x0, x1
102
+.endr
103
+    cbnz            w12, .vl_gt_16_loop_pixel_avg_pp_24x32
104
+    ret
105
+endfunc
106
+
107
+.macro pixel_avg_pp_32xN_sve2 h
108
+function PFX(pixel_avg_pp_32x\h\()_sve2)
109
+    rdvl            x9, #1
110
+    cmp             x9, #16
111
+    bgt             .vl_gt_16_pixel_avg_pp_32_\h
112
+.rept \h
113
+    ld1             {v0.16b-v1.16b}, x2, x3
114
+    ld1             {v2.16b-v3.16b}, x4, x5
115
+    urhadd          v0.16b, v0.16b, v2.16b
116
+    urhadd          v1.16b, v1.16b, v3.16b
117
+    st1             {v0.16b-v1.16b}, x0, x1
118
+.endr
119
+    ret
120
+.vl_gt_16_pixel_avg_pp_32_\h:
121
+    ptrue           p0.b, vl32
122
+.rept \h
123
+    ld1b            {z0.b}, p0/z, x2
124
+    ld1b            {z2.b}, p0/z, x4
125
+    add             x2, x2, x3
126
+    add             x4, x4, x5
127
+    urhadd          z0.b, p0/m, z0.b, z2.b
128
+    st1b            {z0.b}, p0, x0
129
+    add             x0, x0, x1
130
+.endr
131
+    ret
132
+endfunc
133
+.endm
134
+
135
+pixel_avg_pp_32xN_sve2 8
136
+pixel_avg_pp_32xN_sve2 16
137
+pixel_avg_pp_32xN_sve2 24
138
+
139
+.macro pixel_avg_pp_32xN1_sve2 h
140
+function PFX(pixel_avg_pp_32x\h\()_sve2)
141
+    rdvl            x9, #1
142
+    cmp             x9, #16
143
+    bgt             .vl_gt_16_pixel_avg_pp_32xN1_\h
144
+    mov             w12, #\h / 8
145
+.lpavg_sve2_32x\h\():
146
+    sub             w12, w12, #1
147
+.rept 8
148
+    ld1             {v0.16b-v1.16b}, x2, x3
149
+    ld1             {v2.16b-v3.16b}, x4, x5
150
+    urhadd          v0.16b, v0.16b, v2.16b
151
+    urhadd          v1.16b, v1.16b, v3.16b
152
+    st1             {v0.16b-v1.16b}, x0, x1
153
+.endr
154
+    cbnz            w12, .lpavg_sve2_32x\h
155
+    ret
156
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
157
+    ptrue           p0.b, vl32
158
+    mov             w12, #\h / 8
159
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
160
+    sub             w12, w12, #1
161
+.rept 8
162
+    ld1b            {z0.b}, p0/z, x2
163
+    ld1b            {z2.b}, p0/z, x4
164
+    add             x2, x2, x3
165
+    add             x4, x4, x5
166
+    urhadd          z0.b, p0/m, z0.b, z2.b
167
+    st1b            {z0.b}, p0, x0
168
+    add             x0, x0, x1
169
+.endr
170
+    cbnz            w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
171
+    ret
172
+endfunc
173
+.endm
174
+
175
+pixel_avg_pp_32xN1_sve2 32
176
+pixel_avg_pp_32xN1_sve2 64
177
+
178
+function PFX(pixel_avg_pp_48x64_sve2)
179
+    rdvl            x9, #1
180
+    cmp             x9, #16
181
+    bgt             .vl_gt_16_pixel_avg_pp_48x64
182
+    mov             w12, #8
183
+.lpavg_48x64_sve2:
184
+    sub             w12, w12, #1
185
+.rept 8
186
+    ld1             {v0.16b-v2.16b}, x2, x3
187
+    ld1             {v3.16b-v5.16b}, x4, x5
188
+    urhadd          v0.16b, v0.16b, v3.16b
189
+    urhadd          v1.16b, v1.16b, v4.16b
190
+    urhadd          v2.16b, v2.16b, v5.16b
191
+    st1             {v0.16b-v2.16b}, x0, x1
192
+.endr
193
+    cbnz            w12, .lpavg_48x64_sve2
194
+    ret
195
+.vl_gt_16_pixel_avg_pp_48x64:
196
+    cmp             x9, #32
197
+    bgt             .vl_gt_32_pixel_avg_pp_48x64
198
+    ptrue           p0.b, vl32
199
+    ptrue           p1.b, vl16
200
+    mov             w12, #8
201
+.vl_eq_32_pixel_avg_pp_48x64:
202
+    sub             w12, w12, #1
203
+.rept 8
204
+    ld1b            {z0.b}, p0/z, x2
205
+    ld1b            {z1.b}, p1/z, x2, #1, mul vl
206
+    ld1b            {z2.b}, p0/z, x4
207
+    ld1b            {z3.b}, p1/z, x4, #1, mul vl
208
+    add             x2, x2, x3
209
+    add             x4, x4, x5
210
+    urhadd          z0.b, p0/m, z0.b, z2.b
211
+    urhadd          z1.b, p1/m, z1.b, z3.b
212
+    st1b            {z0.b}, p0, x0
213
+    st1b            {z1.b}, p1, x0, #1, mul vl
214
+    add             x0, x0, x1
215
+.endr
216
+    cbnz            w12, .vl_eq_32_pixel_avg_pp_48x64
217
+    ret
218
+.vl_gt_32_pixel_avg_pp_48x64:
219
+    mov             x10, #48
220
+    mov             x11, #0
221
+    whilelt         p0.b, x11, x10
222
+    mov             w12, #8
223
+.loop_gt_32_pixel_avg_pp_48x64:
224
+    sub             w12, w12, #1
225
+.rept 8
226
+    ld1b            {z0.b}, p0/z, x2
227
+    ld1b            {z2.b}, p0/z, x4
228
+    add             x2, x2, x3
229
+    add             x4, x4, x5
230
+    urhadd          z0.b, p0/m, z0.b, z2.b
231
+    st1b            {z0.b}, p0, x0
232
+    add             x0, x0, x1
233
+.endr
234
+    cbnz            w12, .loop_gt_32_pixel_avg_pp_48x64
235
+    ret
236
+endfunc
237
+
238
+.macro pixel_avg_pp_64xN_sve2 h
239
+function PFX(pixel_avg_pp_64x\h\()_sve2)
240
+    rdvl            x9, #1
241
+    cmp             x9, #16
242
+    bgt             .vl_gt_16_pixel_avg_pp_64x\h
243
+    mov             w12, #\h / 4
244
+.lpavg_sve2_64x\h\():
245
+    sub             w12, w12, #1
246
+.rept 4
247
+    ld1             {v0.16b-v3.16b}, x2, x3
248
+    ld1             {v4.16b-v7.16b}, x4, x5
249
+    urhadd          v0.16b, v0.16b, v4.16b
250
+    urhadd          v1.16b, v1.16b, v5.16b
251
+    urhadd          v2.16b, v2.16b, v6.16b
252
+    urhadd          v3.16b, v3.16b, v7.16b
253
+    st1             {v0.16b-v3.16b}, x0, x1
254
+.endr
255
+    cbnz            w12, .lpavg_sve2_64x\h
256
+    ret
257
+.vl_gt_16_pixel_avg_pp_64x\h\():
258
+    cmp             x9, #48
259
+    bgt             .vl_gt_48_pixel_avg_pp_64x\h
260
+    ptrue           p0.b, vl32
261
+    mov             w12, #\h / 4
262
+.vl_eq_32_pixel_avg_pp_64x\h\():
263
+    sub             w12, w12, #1
264
+.rept 4
265
+    ld1b            {z0.b}, p0/z, x2
266
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
267
+    ld1b            {z2.b}, p0/z, x4
268
+    ld1b            {z3.b}, p0/z, x4, #1, mul vl
269
+    add             x2, x2, x3
270
+    add             x4, x4, x5
271
+    urhadd          z0.b, p0/m, z0.b, z2.b
272
+    urhadd          z1.b, p0/m, z1.b, z3.b
273
+    st1b            {z0.b}, p0, x0
274
+    st1b            {z1.b}, p0, x0, #1, mul vl
275
+    add             x0, x0, x1
276
+.endr
277
+    cbnz            w12, .vl_eq_32_pixel_avg_pp_64x\h
278
+    ret
279
+.vl_gt_48_pixel_avg_pp_64x\h\():
280
+    ptrue           p0.b, vl64
281
+    mov             w12, #\h / 4
282
+.vl_eq_64_pixel_avg_pp_64x\h\():
283
+    sub             w12, w12, #1
284
+.rept 4
285
+    ld1b            {z0.b}, p0/z, x2
286
+    ld1b            {z2.b}, p0/z, x4
287
+    add             x2, x2, x3
288
+    add             x4, x4, x5
289
+    urhadd          z0.b, p0/m, z0.b, z2.b
290
+    st1b            {z0.b}, p0, x0
291
+    add             x0, x0, x1
292
+.endr
293
+    cbnz            w12, .vl_eq_64_pixel_avg_pp_64x\h
294
+    ret
295
+endfunc
296
+.endm
297
+
298
+pixel_avg_pp_64xN_sve2 16
299
+pixel_avg_pp_64xN_sve2 32
300
+pixel_avg_pp_64xN_sve2 48
301
+pixel_avg_pp_64xN_sve2 64
302
+
303
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
304
+
305
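The SVE2 addAvg paths below reach the same result as the NEON macros by a different route: the plain 16-bit sum is rounded, shifted by 7 and saturated to signed 8-bit with sqrshrnb, which stays in range because the sources still carry the negative internal offset, and the following add of #0x80 re-centres the value into the unsigned 0..255 range. A small scalar check of that equivalence (illustrative; it assumes the same offset convention as the NEON path and sums that fit in an int):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static uint8_t addAvgNeonStyle(int16_t s0, int16_t s1)    // offset folded into one constant
    {
        return (uint8_t)std::min(255, std::max(0, (s0 + s1 + 0x4040) >> 7));
    }

    static uint8_t addAvgSve2Style(int16_t s0, int16_t s1)    // sqrshrnb #7, then add #0x80
    {
        int t = std::min(127, std::max(-128, (s0 + s1 + 0x40) >> 7));
        return (uint8_t)(t + 0x80);
    }

    int main()
    {
        for (int s0 = -8192; s0 <= 8191; s0 += 37)
            for (int s1 = -8192; s1 <= 8191; s1 += 41)
                assert(addAvgNeonStyle((int16_t)s0, (int16_t)s1) ==
                       addAvgSve2Style((int16_t)s0, (int16_t)s1));
        return 0;
    }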
+.macro addAvg_2xN_sve2 h
306
+function PFX(addAvg_2x\h\()_sve2)
307
+    ptrue           p0.s, vl2
308
+    ptrue           p1.h, vl4
309
+    ptrue           p2.h, vl2
310
+.rept \h / 2
311
+    ld1rw           {z0.s}, p0/z, x0
312
+    ld1rw           {z1.s}, p0/z, x1
313
+    add             x0, x0, x3, lsl #1
314
+    add             x1, x1, x4, lsl #1
315
+    ld1rw           {z2.s}, p0/z, x0
316
+    ld1rw           {z3.s}, p0/z, x1
317
+    add             x0, x0, x3, lsl #1
318
+    add             x1, x1, x4, lsl #1
319
+    add             z0.h, p1/m, z0.h, z1.h
320
+    add             z2.h, p1/m, z2.h, z3.h
321
+    sqrshrnb        z0.b, z0.h, #7
322
+    add             z0.b, z0.b, #0x80
323
+    sqrshrnb        z2.b, z2.h, #7
324
+    add             z2.b, z2.b, #0x80
325
+    st1b            {z0.h}, p2, x2
326
+    add             x2, x2, x5
327
+    st1b            {z2.h}, p2, x2
328
+    add             x2, x2, x5
329
+.endr
330
+    ret
331
+endfunc
332
+.endm
333
+
334
+addAvg_2xN_sve2 4
335
+addAvg_2xN_sve2 8
336
+addAvg_2xN_sve2 16
337
+
338
+.macro addAvg_6xN_sve2 h
339
+function PFX(addAvg_6x\h\()_sve2)
340
+    mov             w12, #\h / 2
341
+    ptrue           p0.b, vl16
342
+    ptrue           p2.h, vl6
343
+.loop_sve2_addavg_6x\h\():
344
+    sub             w12, w12, #1
345
+    ld1b            {z0.b}, p0/z, x0
346
+    ld1b            {z1.b}, p0/z, x1
347
+    add             x0, x0, x3, lsl #1
348
+    add             x1, x1, x4, lsl #1
349
+    ld1b            {z2.b}, p0/z, x0
350
+    ld1b            {z3.b}, p0/z, x1
351
+    add             x0, x0, x3, lsl #1
352
+    add             x1, x1, x4, lsl #1
353
+    add             z0.h, p0/m, z0.h, z1.h
354
+    add             z2.h, p0/m, z2.h, z3.h
355
+    sqrshrnb        z0.b, z0.h, #7
356
+    sqrshrnb        z2.b, z2.h, #7
357
+    add             z0.b, z0.b, #0x80
358
+    add             z2.b, z2.b, #0x80
359
+    st1b            {z0.h}, p2, x2
360
+    add             x2, x2, x5
361
+    st1b            {z2.h}, p2, x2
362
+    add             x2, x2, x5
363
+    cbnz            w12, .loop_sve2_addavg_6x\h
364
+    ret
365
+endfunc
366
+.endm
367
+
368
+addAvg_6xN_sve2 8
369
+addAvg_6xN_sve2 16
370
+
371
+.macro addAvg_8xN_sve2 h
372
+function PFX(addAvg_8x\h\()_sve2)
373
+    ptrue           p0.b, vl16
374
+.rept \h / 2
375
+    ld1b            {z0.b}, p0/z, x0
376
+    ld1b            {z1.b}, p0/z, x1
377
+    add             x0, x0, x3, lsl #1
378
+    add             x1, x1, x4, lsl #1
379
+    ld1b            {z2.b}, p0/z, x0
380
+    ld1b            {z3.b}, p0/z, x1
381
+    add             x0, x0, x3, lsl #1
382
+    add             x1, x1, x4, lsl #1
383
+    add             z0.h, p0/m, z0.h, z1.h
384
+    add             z2.h, p0/m, z2.h, z3.h
385
+    sqrshrnb        z0.b, z0.h, #7
386
+    add             z0.b, z0.b, #0x80
387
+    sqrshrnb        z2.b, z2.h, #7
388
+    add             z2.b, z2.b, #0x80
389
+    st1b            {z0.h}, p0, x2
390
+    add             x2, x2, x5
391
+    st1b            {z2.h}, p0, x2
392
+    add             x2, x2, x5
393
+.endr
394
+    ret
395
+endfunc
396
+.endm
397
+
398
+.macro addAvg_8xN1_sve2 h
399
+function PFX(addAvg_8x\h\()_sve2)
400
+    mov             w12, #\h / 2
401
+    ptrue           p0.b, vl16
402
+.loop_sve2_addavg_8x\h\():
403
+    sub             w12, w12, #1
404
+    ld1b            {z0.b}, p0/z, x0
405
+    ld1b            {z1.b}, p0/z, x1
406
+    add             x0, x0, x3, lsl #1
407
+    add             x1, x1, x4, lsl #1
408
+    ld1b            {z2.b}, p0/z, x0
409
+    ld1b            {z3.b}, p0/z, x1
410
+    add             x0, x0, x3, lsl #1
411
+    add             x1, x1, x4, lsl #1
412
+    add             z0.h, p0/m, z0.h, z1.h
413
+    add             z2.h, p0/m, z2.h, z3.h
414
+    sqrshrnb        z0.b, z0.h, #7
415
+    add             z0.b, z0.b, #0x80
416
+    sqrshrnb        z2.b, z2.h, #7
417
+    add             z2.b, z2.b, #0x80
418
+    st1b            {z0.h}, p0, x2
419
+    add             x2, x2, x5
420
+    st1b            {z2.h}, p0, x2
421
+    add             x2, x2, x5
422
+    cbnz            w12, .loop_sve2_addavg_8x\h
423
+    ret
424
+endfunc
425
+.endm
426
+
427
+addAvg_8xN_sve2 2
428
+addAvg_8xN_sve2 4
429
+addAvg_8xN_sve2 6
430
+addAvg_8xN_sve2 8
431
+addAvg_8xN_sve2 12
432
+addAvg_8xN_sve2 16
433
+addAvg_8xN1_sve2 32
434
+addAvg_8xN1_sve2 64
435
+
436
+.macro addAvg_12xN_sve2 h
437
+function PFX(addAvg_12x\h\()_sve2)
438
+    mov             w12, #\h
439
+    rdvl            x9, #1
440
+    cmp             x9, #16
441
+    bgt             .vl_gt_16_addAvg_12x\h
442
+    ptrue           p0.b, vl16
443
+    ptrue           p1.b, vl8
444
+.loop_sve2_addavg_12x\h\():
445
+    sub             w12, w12, #1
446
+    ld1b            {z0.b}, p0/z, x0
447
+    ld1b            {z1.b}, p0/z, x1
448
+    ld1b            {z2.b}, p1/z, x0, #1, mul vl
449
+    ld1b            {z3.b}, p1/z, x1, #1, mul vl
450
+    add             x0, x0, x3, lsl #1
451
+    add             x1, x1, x4, lsl #1
452
+    add             z0.h, p0/m, z0.h, z1.h
453
+    add             z2.h, p1/m, z2.h, z3.h
454
+    sqrshrnb        z0.b, z0.h, #7
455
+    add             z0.b, z0.b, #0x80
456
+    sqrshrnb        z2.b, z2.h, #7
457
+    add             z2.b, z2.b, #0x80
458
+    st1b            {z0.h}, p0, x2
459
+    st1b            {z2.h}, p1, x2, #1, mul vl
460
+    add             x2, x2, x5
461
+    cbnz            w12, .loop_sve2_addavg_12x\h
462
+    ret
463
+.vl_gt_16_addAvg_12x\h\():
464
+    mov             x10, #24
465
+    mov             x11, #0
466
+    whilelt         p0.b, x11, x10
467
+.loop_sve2_gt_16_addavg_12x\h\():
468
+    sub             w12, w12, #1
469
+    ld1b            {z0.b}, p0/z, x0
470
+    ld1b            {z1.b}, p0/z, x1
471
+    add             x0, x0, x3, lsl #1
472
+    add             x1, x1, x4, lsl #1
473
+    add             z0.h, p0/m, z0.h, z1.h
474
+    sqrshrnb        z0.b, z0.h, #7
475
+    add             z0.b, z0.b, #0x80
476
+    sqrshrnb        z2.b, z2.h, #7
477
+    add             z2.b, z2.b, #0x80
478
+    st1b            {z0.h}, p0, x2
479
+    add             x2, x2, x5
480
+    cbnz            w12, .loop_sve2_gt_16_addavg_12x\h
481
+    ret
482
+endfunc
483
+.endm
484
+
485
+addAvg_12xN_sve2 16
486
+addAvg_12xN_sve2 32
487
+
488
+.macro addAvg_16xN_sve2 h
489
+function PFX(addAvg_16x\h\()_sve2)
490
+    mov             w12, #\h
491
+    rdvl            x9, #1
492
+    cmp             x9, #16
493
+    bgt             .vl_gt_16_addAvg_16x\h
494
+    ptrue           p0.b, vl16
495
+.loop_eq_16_sve2_addavg_16x\h\():
496
+    sub             w12, w12, #1
497
+    ld1b            {z0.b}, p0/z, x0
498
+    ld1b            {z1.b}, p0/z, x1
499
+    ld1b            {z2.b}, p0/z, x0, #1, mul vl
500
+    ld1b            {z3.b}, p0/z, x1, #1, mul vl
501
+    add             x0, x0, x3, lsl #1
502
+    add             x1, x1, x4, lsl #1
503
+    add             z0.h, p0/m, z0.h, z1.h
504
+    add             z2.h, p0/m, z2.h, z3.h
505
+    sqrshrnb        z0.b, z0.h, #7
506
+    add             z0.b, z0.b, #0x80
507
+    sqrshrnb        z2.b, z2.h, #7
508
+    add             z2.b, z2.b, #0x80
509
+    st1b            {z0.h}, p0, x2
510
+    st1b            {z2.h}, p0, x2, #1, mul vl
511
+    add             x2, x2, x5
512
+    cbnz            w12, .loop_eq_16_sve2_addavg_16x\h
513
+    ret
514
+.vl_gt_16_addAvg_16x\h\():
515
+    cmp             x9, #32
516
+    bgt             .vl_gt_32_addAvg_16x\h
517
+    ptrue           p0.b, vl32
518
+.loop_gt_16_sve2_addavg_16x\h\():
519
+    sub             w12, w12, #1
520
+    ld1b            {z0.b}, p0/z, x0
521
+    ld1b            {z1.b}, p0/z, x1
522
+    add             x0, x0, x3, lsl #1
523
+    add             x1, x1, x4, lsl #1
524
+    add             z0.h, p0/m, z0.h, z1.h
525
+    sqrshrnb        z0.b, z0.h, #7
526
+    add             z0.b, z0.b, #0x80
527
+    st1b            {z0.h}, p1, x2
528
+    add             x2, x2, x5
529
+    cbnz            w12, .loop_gt_16_sve2_addavg_16x\h
530
+    ret
531
+.vl_gt_32_addAvg_16x\h\():
532
+    mov             x10, #48
533
+    mov             x11, #0
534
+    whilelt         p0.b, x11, x10
535
+.loop_gt_32_sve2_addavg_16x\h\():
536
+    sub             w12, w12, #1
537
+    ld1b            {z0.b}, p0/z, x0
538
+    add             x0, x0, x3, lsl #1
539
+    add             x1, x1, x4, lsl #1
540
+    add             z0.h, p0/m, z0.h, z1.h
541
+    sqrshrnb        z0.b, z0.h, #7
542
+    add             z0.b, z0.b, #0x80
543
+    st1b            {z0.h}, p0, x2
544
+    add             x2, x2, x5
545
+    cbnz            w12, .loop_gt_32_sve2_addavg_16x\h
546
+    ret
547
+endfunc
548
+.endm
549
+
550
+addAvg_16xN_sve2 4
551
+addAvg_16xN_sve2 8
552
+addAvg_16xN_sve2 12
553
+addAvg_16xN_sve2 16
554
+addAvg_16xN_sve2 24
555
+addAvg_16xN_sve2 32
556
+addAvg_16xN_sve2 64
557
+
558
+.macro addAvg_24xN_sve2 h
559
+function PFX(addAvg_24x\h\()_sve2)
560
+    mov             w12, #\h
561
+    rdvl            x9, #1
562
+    cmp             x9, #16
563
+    bgt             .vl_gt_16_addAvg_24x\h
564
+    addAvg_start
565
+.loop_eq_16_sve2_addavg_24x\h\():
566
+    sub             w12, w12, #1
567
+    ld1             {v0.16b-v2.16b}, x0, x3
568
+    ld1             {v3.16b-v5.16b}, x1, x4
569
+    addavg_1        v0, v3
570
+    addavg_1        v1, v4
571
+    addavg_1        v2, v5
572
+    sqxtun          v0.8b, v0.8h
573
+    sqxtun          v1.8b, v1.8h
574
+    sqxtun          v2.8b, v2.8h
575
+    st1             {v0.8b-v2.8b}, x2, x5
576
+    cbnz            w12, .loop_eq_16_sve2_addavg_24x\h
577
+    ret
578
+.vl_gt_16_addAvg_24x\h\():
579
+    cmp             x9, #48
580
+    bgt             .vl_gt_48_addAvg_24x\h
581
+    ptrue           p0.b, vl32
582
+    ptrue           p1.b, vl16
583
+.loop_gt_16_sve2_addavg_24x\h\():
584
+    sub             w12, w12, #1
585
+    ld1b            {z0.b}, p0/z, x0
586
+    ld1b            {z1.b}, p1/z, x0, #1, mul vl
587
+    ld1b            {z2.b}, p0/z, x1
588
+    ld1b            {z3.b}, p1/z, x1, #1, mul vl
589
+    add             x0, x0, x3, lsl #1
590
+    add             x1, x1, x4, lsl #1
591
+    add             z0.h, p0/m, z0.h, z2.h
592
+    add             z1.h, p1/m, z1.h, z3.h
593
+    sqrshrnb        z0.b, z0.h, #7
594
+    add             z0.b, z0.b, #0x80
595
+    sqrshrnb        z1.b, z1.h, #7
596
+    add             z1.b, z1.b, #0x80
597
+    st1b            {z0.h}, p0, x2
598
+    st1b            {z1.h}, p1, x2, #1, mul vl
599
+    add             x2, x2, x5
600
+    cbnz            w12, .loop_gt_16_sve2_addavg_24x\h
601
+    ret
602
+.vl_gt_48_addAvg_24x\h\():
603
+    mov             x10, #48
604
+    mov             x11, #0
605
+    whilelt         p0.b, x11, x10
606
+.loop_gt_48_sve2_addavg_24x\h\():
607
+    sub             w12, w12, #1
608
+    ld1b            {z0.b}, p0/z, x0
609
+    ld1b            {z2.b}, p0/z, x1
610
+    add             x0, x0, x3, lsl #1
611
+    add             x1, x1, x4, lsl #1
612
+    add             z0.h, p0/m, z0.h, z2.h
613
+    sqrshrnb        z0.b, z0.h, #7
614
+    add             z0.b, z0.b, #0x80
615
+    st1b            {z0.h}, p0, x2
616
+    add             x2, x2, x5
617
+    cbnz            w12, .loop_gt_48_sve2_addavg_24x\h
618
+    ret
619
+endfunc
620
+.endm
621
+
622
+addAvg_24xN_sve2 32
623
+addAvg_24xN_sve2 64
624
+
625
+.macro addAvg_32xN_sve2 h
626
+function PFX(addAvg_32x\h\()_sve2)
627
+    mov             w12, #\h
628
+    rdvl            x9, #1
629
+    cmp             x9, #16
630
+    bgt             .vl_gt_16_addAvg_32x\h
631
+    ptrue           p0.b, vl16
632
+.loop_eq_16_sve2_addavg_32x\h\():
633
+    sub             w12, w12, #1
634
+    ld1b            {z0.b}, p0/z, x0
635
+    ld1b            {z1.b}, p0/z, x0, #1, mul vl
636
+    ld1b            {z2.b}, p0/z, x0, #2, mul vl
637
+    ld1b            {z3.b}, p0/z, x0, #3, mul vl
638
+    ld1b            {z4.b}, p0/z, x1
639
+    ld1b            {z5.b}, p0/z, x1, #1, mul vl
640
+    ld1b            {z6.b}, p0/z, x1, #2, mul vl
641
+    ld1b            {z7.b}, p0/z, x1, #3, mul vl
642
+    add             x0, x0, x3, lsl #1
643
+    add             x1, x1, x4, lsl #1
644
+    add             z0.h, p0/m, z0.h, z4.h
645
+    add             z1.h, p0/m, z1.h, z5.h
646
+    add             z2.h, p0/m, z2.h, z6.h
647
+    add             z3.h, p0/m, z3.h, z7.h
648
+    sqrshrnb        z0.b, z0.h, #7
649
+    add             z0.b, z0.b, #0x80
650
+    sqrshrnb        z1.b, z1.h, #7
651
+    add             z1.b, z1.b, #0x80
652
+    sqrshrnb        z2.b, z2.h, #7
653
+    add             z2.b, z2.b, #0x80
654
+    sqrshrnb        z3.b, z3.h, #7
655
+    add             z3.b, z3.b, #0x80
656
+    st1b            {z0.h}, p0, x2
657
+    st1b            {z1.h}, p0, x2, #1, mul vl
658
+    st1b            {z2.h}, p0, x2, #2, mul vl
659
+    st1b            {z3.h}, p0, x2, #3, mul vl
660
+    add             x2, x2, x5
661
+    cbnz            w12, .loop_eq_16_sve2_addavg_32x\h
662
+    ret
663
+.vl_gt_16_addAvg_32x\h\():
664
+    cmp             x9, #48
665
+    bgt             .vl_gt_48_addAvg_32x\h
666
+    ptrue           p0.b, vl32
667
+.loop_gt_eq_32_sve2_addavg_32x\h\():
668
+    sub             w12, w12, #1
669
+    ld1b            {z0.b}, p0/z, x0
670
+    ld1b            {z1.b}, p0/z, x0, #1, mul vl
671
+    ld1b            {z2.b}, p0/z, x1
672
+    ld1b            {z3.b}, p0/z, x1, #1, mul vl
673
+    add             x0, x0, x3, lsl #1
674
+    add             x1, x1, x4, lsl #1
675
+    add             z0.h, p0/m, z0.h, z2.h
676
+    add             z1.h, p0/m, z1.h, z3.h
677
+    sqrshrnb        z0.b, z0.h, #7
678
+    add             z1.b, z1.b, #0x80
679
+    sqrshrnb        z1.b, z1.h, #7
680
+    add             z0.b, z0.b, #0x80
681
+    st1b            {z0.h}, p0, x2
682
+    st1b            {z1.h}, p0, x2, #1, mul vl
683
+    add             x2, x2, x5
684
+    cbnz            w12, .loop_gt_eq_32_sve2_addavg_32x\h
685
+    ret
686
+.vl_gt_48_addAvg_32x\h\():
687
+    ptrue           p0.b, vl64
688
+.loop_eq_64_sve2_addavg_32x\h\():
689
+    sub             w12, w12, #1
690
+    ld1b            {z0.b}, p0/z, x0
691
+    ld1b            {z1.b}, p0/z, x1
692
+    add             x0, x0, x3, lsl #1
693
+    add             x1, x1, x4, lsl #1
694
+    add             z0.h, p0/m, z0.h, z1.h
695
+    sqrshrnb        z0.b, z0.h, #7
696
+    add             z0.b, z0.b, #0x80
697
+    st1b            {z0.h}, p0, x2
698
+    add             x2, x2, x5
699
+    cbnz            w12, .loop_eq_64_sve2_addavg_32x\h
700
+    ret
701
+endfunc
702
+.endm
703
+
704
+addAvg_32xN_sve2 8
705
+addAvg_32xN_sve2 16
706
+addAvg_32xN_sve2 24
707
+addAvg_32xN_sve2 32
708
+addAvg_32xN_sve2 48
709
+addAvg_32xN_sve2 64
710
+
711
+function PFX(addAvg_48x64_sve2)
712
+    mov             w12, #64
713
+    rdvl            x9, #1
714
+    cmp             x9, #16
715
+    bgt             .vl_gt_16_addAvg_48x64
716
+    addAvg_start
717
+    sub             x3, x3, #64
718
+    sub             x4, x4, #64
719
+.loop_eq_16_sve2_addavg_48x64:
720
+    sub             w12, w12, #1
721
+    ld1             {v0.8h-v3.8h}, x0, #64
722
+    ld1             {v4.8h-v7.8h}, x1, #64
723
+    ld1             {v20.8h-v21.8h}, x0, x3
724
+    ld1             {v22.8h-v23.8h}, x1, x4
725
+    addavg_1        v0, v4
726
+    addavg_1        v1, v5
727
+    addavg_1        v2, v6
728
+    addavg_1        v3, v7
729
+    addavg_1        v20, v22
730
+    addavg_1        v21, v23
731
+    sqxtun          v0.8b, v0.8h
732
+    sqxtun2         v0.16b, v1.8h
733
+    sqxtun          v1.8b, v2.8h
734
+    sqxtun2         v1.16b, v3.8h
735
+    sqxtun          v2.8b, v20.8h
736
+    sqxtun2         v2.16b, v21.8h
737
+    st1             {v0.16b-v2.16b}, x2, x5
738
+    cbnz            w12, .loop_eq_16_sve2_addavg_48x64
739
+    ret
740
+.vl_gt_16_addAvg_48x64:
741
+    cmp             x9, #48
742
+    bgt             .vl_gt_48_addAvg_48x64
743
+    ptrue           p0.b, vl32
744
+.loop_gt_eq_32_sve2_addavg_48x64:
745
+    sub             w12, w12, #1
746
+    ld1b            {z0.b}, p0/z, x0
747
+    ld1b            {z1.b}, p0/z, x0, #1, mul vl
748
+    ld1b            {z2.b}, p0/z, x0, #2, mul vl
749
+    ld1b            {z4.b}, p0/z, x1
750
+    ld1b            {z5.b}, p0/z, x1, #1, mul vl
751
+    ld1b            {z6.b}, p0/z, x1, #2, mul vl
752
+    add             x0, x0, x3, lsl #1
753
+    add             x1, x1, x4, lsl #1
754
+    add             z0.h, p0/m, z0.h, z4.h
755
+    add             z1.h, p0/m, z1.h, z5.h
756
+    add             z2.h, p0/m, z2.h, z6.h
757
+    sqrshrnb        z0.b, z0.h, #7
758
+    add             z0.b, z0.b, #0x80
759
+    sqrshrnb        z1.b, z1.h, #7
760
+    add             z1.b, z1.b, #0x80
761
+    sqrshrnb        z2.b, z2.h, #7
762
+    add             z2.b, z2.b, #0x80
763
+    st1b            {z0.h}, p0, x2
764
+    st1b            {z1.h}, p0, x2, #1, mul vl
765
+    st1b            {z2.h}, p0, x2, #2, mul vl
766
+    add             x2, x2, x5
767
+    cbnz            w12, .loop_gt_eq_32_sve2_addavg_48x64
768
+    ret
769
+.vl_gt_48_addAvg_48x64:
770
+    cmp             x9, #112
771
+    bgt             .vl_gt_112_addAvg_48x64
772
+    ptrue           p0.b, vl64
773
+    ptrue           p1.b, vl32
774
+.loop_gt_48_sve2_addavg_48x64:
775
+    sub             w12, w12, #1
776
+    ld1b            {z0.b}, p0/z, x0
777
+    ld1b            {z1.b}, p1/z, x0, #1, mul vl
778
+    ld1b            {z4.b}, p0/z, x1
779
+    ld1b            {z5.b}, p1/z, x1, #1, mul vl
780
+    add             x0, x0, x3, lsl #1
781
+    add             x1, x1, x4, lsl #1
782
+    add             z0.h, p0/m, z0.h, z4.h
783
+    add             z1.h, p1/m, z1.h, z5.h
784
+    sqrshrnb        z0.b, z0.h, #7
785
+    add             z0.b, z0.b, #0x80
786
+    sqrshrnb        z1.b, z1.h, #7
787
+    add             z1.b, z1.b, #0x80
788
+    st1b            {z0.h}, p0, x2
789
+    st1b            {z1.h}, p1, x2, #1, mul vl
790
+    add             x2, x2, x5
791
+    cbnz            w12, .loop_gt_48_sve2_addavg_48x64
792
+    ret
793
+.vl_gt_112_addAvg_48x64:
794
+    mov             x10, #96
795
+    mov             x11, #0
796
+    whilelt         p0.b, x11, x10
797
+.loop_gt_112_sve2_addavg_48x64:
798
+    sub             w12, w12, #1
799
+    ld1b            {z0.b}, p0/z, x0
800
+    ld1b            {z4.b}, p0/z, x1
801
+    add             x0, x0, x3, lsl #1
802
+    add             x1, x1, x4, lsl #1
803
+    add             z0.h, p0/m, z0.h, z4.h
804
+    sqrshrnb        z0.b, z0.h, #7
805
+    add             z0.b, z0.b, #0x80
806
+    st1b            {z0.h}, p0, x2
807
+    add             x2, x2, x5
808
+    cbnz            w12, .loop_gt_112_sve2_addavg_48x64
809
+    ret
810
+endfunc
811
+
812
+.macro addAvg_64xN_sve2 h
813
+function PFX(addAvg_64x\h\()_sve2)
814
+    mov             w12, #\h
815
+    rdvl            x9, #1
816
+    cmp             x9, #16
817
+    bgt             .vl_gt_16_addAvg_64x\h
818
+    addAvg_start
819
+    sub             x3, x3, #64
820
+    sub             x4, x4, #64
821
+.loop_eq_16_sve2_addavg_64x\h\():
822
+    sub             w12, w12, #1
823
+    ld1             {v0.8h-v3.8h}, x0, #64
824
+    ld1             {v4.8h-v7.8h}, x1, #64
825
+    ld1             {v20.8h-v23.8h}, x0, x3
826
+    ld1             {v24.8h-v27.8h}, x1, x4
827
+    addavg_1        v0, v4
828
+    addavg_1        v1, v5
829
+    addavg_1        v2, v6
830
+    addavg_1        v3, v7
831
+    addavg_1        v20, v24
832
+    addavg_1        v21, v25
833
+    addavg_1        v22, v26
834
+    addavg_1        v23, v27
835
+    sqxtun          v0.8b, v0.8h
836
+    sqxtun2         v0.16b, v1.8h
837
+    sqxtun          v1.8b, v2.8h
838
+    sqxtun2         v1.16b, v3.8h
839
+    sqxtun          v2.8b, v20.8h
840
+    sqxtun2         v2.16b, v21.8h
841
+    sqxtun          v3.8b, v22.8h
842
+    sqxtun2         v3.16b, v23.8h
843
+    st1             {v0.16b-v3.16b}, x2, x5
844
+    cbnz            w12, .loop_eq_16_sve2_addavg_64x\h
845
+    ret
846
+.vl_gt_16_addAvg_64x\h\():
847
+    cmp             x9, #48
848
+    bgt             .vl_gt_48_addAvg_64x\h
849
+    ptrue           p0.b, vl32
850
+.loop_gt_eq_32_sve2_addavg_64x\h\():
851
+    sub             w12, w12, #1
852
+    ld1b            {z0.b}, p0/z, x0
853
+    ld1b            {z1.b}, p0/z, x0, #1, mul vl
854
+    ld1b            {z2.b}, p0/z, x0, #2, mul vl
855
+    ld1b            {z3.b}, p0/z, x0, #3, mul vl
856
+    ld1b            {z4.b}, p0/z, x1
857
+    ld1b            {z5.b}, p0/z, x1, #1, mul vl
858
+    ld1b            {z6.b}, p0/z, x1, #2, mul vl
859
+    ld1b            {z7.b}, p0/z, x1, #3, mul vl
860
+    add             x0, x0, x3, lsl #1
861
+    add             x1, x1, x4, lsl #1
862
+    add             z0.h, p0/m, z0.h, z4.h
863
+    add             z1.h, p0/m, z1.h, z5.h
864
+    add             z2.h, p0/m, z2.h, z6.h
865
+    add             z3.h, p0/m, z3.h, z7.h
866
+    sqrshrnb        z0.b, z0.h, #7
867
+    add             z0.b, z0.b, #0x80
868
+    sqrshrnb        z1.b, z1.h, #7
869
+    add             z1.b, z1.b, #0x80
870
+    sqrshrnb        z2.b, z2.h, #7
871
+    add             z2.b, z2.b, #0x80
872
+    sqrshrnb        z3.b, z3.h, #7
873
+    add             z3.b, z3.b, #0x80
874
+    st1b            {z0.h}, p0, x2
875
+    st1b            {z1.h}, p0, x2, #1, mul vl
876
+    st1b            {z2.h}, p0, x2, #2, mul vl
877
+    st1b            {z3.h}, p0, x2, #3, mul vl
878
+    add             x2, x2, x5
879
+    cbnz            w12, .loop_gt_eq_32_sve2_addavg_64x\h
880
+    ret
881
+.vl_gt_48_addAvg_64x\h\():
882
+    cmp             x9, #112
883
+    bgt             .vl_gt_112_addAvg_64x\h
884
+    ptrue           p0.b, vl64
885
+.loop_gt_eq_48_sve2_addavg_64x\h\():
886
+    sub             w12, w12, #1
887
+    ld1b            {z0.b}, p0/z, x0
888
+    ld1b            {z1.b}, p0/z, x0, #1, mul vl
889
+    ld1b            {z4.b}, p0/z, x1
890
+    ld1b            {z5.b}, p0/z, x1, #1, mul vl
891
+    add             x0, x0, x3, lsl #1
892
+    add             x1, x1, x4, lsl #1
893
+    add             z0.h, p0/m, z0.h, z4.h
894
+    add             z1.h, p0/m, z1.h, z5.h
895
+    sqrshrnb        z0.b, z0.h, #7
896
+    add             z0.b, z0.b, #0x80
897
+    sqrshrnb        z1.b, z1.h, #7
898
+    add             z1.b, z1.b, #0x80
899
+    st1b            {z0.h}, p0, x2
900
+    st1b            {z1.h}, p0, x2, #1, mul vl
901
+    add             x2, x2, x5
902
+    cbnz            w12, .loop_gt_eq_48_sve2_addavg_64x\h
903
+    ret
904
+.vl_gt_112_addAvg_64x\h\():
905
+    ptrue           p0.b, vl128
906
+.loop_gt_eq_128_sve2_addavg_64x\h\():
907
+    sub             w12, w12, #1
908
+    ld1b            {z0.b}, p0/z, x0
909
+    ld1b            {z4.b}, p0/z, x1
910
+    add             x0, x0, x3, lsl #1
911
+    add             x1, x1, x4, lsl #1
912
+    add             z0.h, p0/m, z0.h, z4.h
913
+    sqrshrnb        z0.b, z0.h, #7
914
+    add             z0.b, z0.b, #0x80
915
+    st1b            {z0.h}, p0, x2
916
+    add             x2, x2, x5
917
+    cbnz            w12, .loop_gt_eq_128_sve2_addavg_64x\h
918
+    ret
919
+endfunc
920
+.endm
921
+
922
+addAvg_64xN_sve2 16
923
+addAvg_64xN_sve2 32
924
+addAvg_64xN_sve2 48
925
+addAvg_64xN_sve2 64
926
x265_3.5.tar.gz/source/common/aarch64/mc-a.S -> x265_3.6.tar.gz/source/common/aarch64/mc-a.S Changed
534
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -22,15 +23,20 @@
12
  *****************************************************************************/
13
 
14
 #include "asm.S"
15
+#include "mc-a-common.S"
16
 
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
 .section .rodata
21
+#endif
22
 
23
 .align 4
24
 
25
 .text
26
 
27
 .macro pixel_avg_pp_4xN_neon h
28
-function x265_pixel_avg_pp_4x\h\()_neon
29
+function PFX(pixel_avg_pp_4x\h\()_neon)
30
 .rept \h
31
     ld1             {v0.s}0, x2, x3
32
     ld1             {v1.s}0, x4, x5
33
@@ -46,7 +52,7 @@
34
 pixel_avg_pp_4xN_neon 16
35
 
36
 .macro pixel_avg_pp_8xN_neon h
37
-function x265_pixel_avg_pp_8x\h\()_neon
38
+function PFX(pixel_avg_pp_8x\h\()_neon)
39
 .rept \h
40
     ld1             {v0.8b}, x2, x3
41
     ld1             {v1.8b}, x4, x5
42
@@ -61,3 +67,491 @@
43
 pixel_avg_pp_8xN_neon 8
44
 pixel_avg_pp_8xN_neon 16
45
 pixel_avg_pp_8xN_neon 32
46
+
47
+function PFX(pixel_avg_pp_12x16_neon)
48
+    sub             x1, x1, #4
49
+    sub             x3, x3, #4
50
+    sub             x5, x5, #4
51
+.rept 16
52
+    ld1             {v0.s}0, x2, #4
53
+    ld1             {v1.8b}, x2, x3
54
+    ld1             {v2.s}0, x4, #4
55
+    ld1             {v3.8b}, x4, x5
56
+    urhadd          v4.8b, v0.8b, v2.8b
57
+    urhadd          v5.8b, v1.8b, v3.8b
58
+    st1             {v4.s}0, x0, #4
59
+    st1             {v5.8b}, x0, x1
60
+.endr
61
+    ret
62
+endfunc
63
+
64
+.macro pixel_avg_pp_16xN_neon h
65
+function PFX(pixel_avg_pp_16x\h\()_neon)
66
+.rept \h
67
+    ld1             {v0.16b}, x2, x3
68
+    ld1             {v1.16b}, x4, x5
69
+    urhadd          v2.16b, v0.16b, v1.16b
70
+    st1             {v2.16b}, x0, x1
71
+.endr
72
+    ret
73
+endfunc
74
+.endm
75
+
76
+pixel_avg_pp_16xN_neon 4
77
+pixel_avg_pp_16xN_neon 8
78
+pixel_avg_pp_16xN_neon 12
79
+pixel_avg_pp_16xN_neon 16
80
+pixel_avg_pp_16xN_neon 32
81
+
82
+function PFX(pixel_avg_pp_16x64_neon)
83
+    mov             w12, #8
84
+.lpavg_16x64:
85
+    sub             w12, w12, #1
86
+.rept 8
87
+    ld1             {v0.16b}, x2, x3
88
+    ld1             {v1.16b}, x4, x5
89
+    urhadd          v2.16b, v0.16b, v1.16b
90
+    st1             {v2.16b}, x0, x1
91
+.endr
92
+    cbnz            w12, .lpavg_16x64
93
+    ret
94
+endfunc
95
+
96
+function PFX(pixel_avg_pp_24x32_neon)
97
+    sub             x1, x1, #16
98
+    sub             x3, x3, #16
99
+    sub             x5, x5, #16
100
+    mov             w12, #4
101
+.lpavg_24x32:
102
+    sub             w12, w12, #1
103
+.rept 8
104
+    ld1             {v0.16b}, x2, #16
105
+    ld1             {v1.8b}, x2, x3
106
+    ld1             {v2.16b}, x4, #16
107
+    ld1             {v3.8b}, x4, x5
108
+    urhadd          v0.16b, v0.16b, v2.16b
109
+    urhadd          v1.8b, v1.8b, v3.8b
110
+    st1             {v0.16b}, x0, #16
111
+    st1             {v1.8b}, x0, x1
112
+.endr
113
+    cbnz            w12, .lpavg_24x32
114
+    ret
115
+endfunc
116
+
117
+.macro pixel_avg_pp_32xN_neon h
118
+function PFX(pixel_avg_pp_32x\h\()_neon)
119
+.rept \h
120
+    ld1             {v0.16b-v1.16b}, x2, x3
121
+    ld1             {v2.16b-v3.16b}, x4, x5
122
+    urhadd          v0.16b, v0.16b, v2.16b
123
+    urhadd          v1.16b, v1.16b, v3.16b
124
+    st1             {v0.16b-v1.16b}, x0, x1
125
+.endr
126
+    ret
127
+endfunc
128
+.endm
129
+
130
+pixel_avg_pp_32xN_neon 8
131
+pixel_avg_pp_32xN_neon 16
132
+pixel_avg_pp_32xN_neon 24
133
+
134
+.macro pixel_avg_pp_32xN1_neon h
135
+function PFX(pixel_avg_pp_32x\h\()_neon)
136
+    mov             w12, #\h / 8
137
+.lpavg_32x\h\():
138
+    sub             w12, w12, #1
139
+.rept 8
140
+    ld1             {v0.16b-v1.16b}, x2, x3
141
+    ld1             {v2.16b-v3.16b}, x4, x5
142
+    urhadd          v0.16b, v0.16b, v2.16b
143
+    urhadd          v1.16b, v1.16b, v3.16b
144
+    st1             {v0.16b-v1.16b}, x0, x1
145
+.endr
146
+    cbnz            w12, .lpavg_32x\h
147
+    ret
148
+endfunc
149
+.endm
150
+
151
+pixel_avg_pp_32xN1_neon 32
152
+pixel_avg_pp_32xN1_neon 64
153
+
154
+function PFX(pixel_avg_pp_48x64_neon)
155
+    mov             w12, #8
156
+.lpavg_48x64:
157
+    sub             w12, w12, #1
158
+.rept 8
159
+    ld1             {v0.16b-v2.16b}, x2, x3
160
+    ld1             {v3.16b-v5.16b}, x4, x5
161
+    urhadd          v0.16b, v0.16b, v3.16b
162
+    urhadd          v1.16b, v1.16b, v4.16b
163
+    urhadd          v2.16b, v2.16b, v5.16b
164
+    st1             {v0.16b-v2.16b}, x0, x1
165
+.endr
166
+    cbnz            w12, .lpavg_48x64
167
+    ret
168
+endfunc
169
+
170
+.macro pixel_avg_pp_64xN_neon h
171
+function PFX(pixel_avg_pp_64x\h\()_neon)
172
+    mov             w12, #\h / 4
173
+.lpavg_64x\h\():
174
+    sub             w12, w12, #1
175
+.rept 4
176
+    ld1             {v0.16b-v3.16b}, x2, x3
177
+    ld1             {v4.16b-v7.16b}, x4, x5
178
+    urhadd          v0.16b, v0.16b, v4.16b
179
+    urhadd          v1.16b, v1.16b, v5.16b
180
+    urhadd          v2.16b, v2.16b, v6.16b
181
+    urhadd          v3.16b, v3.16b, v7.16b
182
+    st1             {v0.16b-v3.16b}, x0, x1
183
+.endr
184
+    cbnz            w12, .lpavg_64x\h
185
+    ret
186
+endfunc
187
+.endm
188
+
189
+pixel_avg_pp_64xN_neon 16
190
+pixel_avg_pp_64xN_neon 32
191
+pixel_avg_pp_64xN_neon 48
192
+pixel_avg_pp_64xN_neon 64
193
+
194
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
195
+.macro addAvg_2xN h
196
+function PFX(addAvg_2x\h\()_neon)
197
+    addAvg_start
198
+.rept \h / 2
199
+    ldr             w10, x0
200
+    ldr             w11, x1
201
+    add             x0, x0, x3
202
+    add             x1, x1, x4
203
+    ldr             w12, x0
204
+    ldr             w13, x1
205
+    add             x0, x0, x3
206
+    add             x1, x1, x4
207
+    dup             v0.2s, w10
208
+    dup             v1.2s, w11
209
+    dup             v2.2s, w12
210
+    dup             v3.2s, w13
211
+    add             v0.4h, v0.4h, v1.4h
212
+    add             v2.4h, v2.4h, v3.4h
213
+    saddl           v0.4s, v0.4h, v30.4h
214
+    saddl           v2.4s, v2.4h, v30.4h
215
+    shrn            v0.4h, v0.4s, #7
216
+    shrn2           v0.8h, v2.4s, #7
217
+    sqxtun          v0.8b, v0.8h
218
+    st1             {v0.h}0, x2, x5
219
+    st1             {v0.h}2, x2, x5
220
+.endr
221
+    ret
222
+endfunc
223
+.endm
224
+
225
+addAvg_2xN 4
226
+addAvg_2xN 8
227
+addAvg_2xN 16
228
+
229
+.macro addAvg_4xN h
230
+function PFX(addAvg_4x\h\()_neon)
231
+    addAvg_start
232
+.rept \h / 2
233
+    ld1             {v0.8b}, x0, x3
234
+    ld1             {v1.8b}, x1, x4
235
+    ld1             {v2.8b}, x0, x3
236
+    ld1             {v3.8b}, x1, x4
237
+    add             v0.4h, v0.4h, v1.4h
238
+    add             v2.4h, v2.4h, v3.4h
239
+    saddl           v0.4s, v0.4h, v30.4h
240
+    saddl           v2.4s, v2.4h, v30.4h
241
+    shrn            v0.4h, v0.4s, #7
242
+    shrn2           v0.8h, v2.4s, #7
243
+    sqxtun          v0.8b, v0.8h
244
+    st1             {v0.s}0, x2, x5
245
+    st1             {v0.s}1, x2, x5
246
+.endr
247
+    ret
248
+endfunc
249
+.endm
250
+
251
+addAvg_4xN 2
252
+addAvg_4xN 4
253
+addAvg_4xN 8
254
+addAvg_4xN 16
255
+addAvg_4xN 32
256
+
257
+.macro addAvg_6xN h
258
+function PFX(addAvg_6x\h\()_neon)
259
+    addAvg_start
260
+    mov             w12, #\h / 2
261
+    sub             x5, x5, #4
262
+.loop_addavg_6x\h:
263
+    sub             w12, w12, #1
264
+    ld1             {v0.16b}, x0, x3
265
+    ld1             {v1.16b}, x1, x4
266
+    ld1             {v2.16b}, x0, x3
267
+    ld1             {v3.16b}, x1, x4
268
+    add             v0.8h, v0.8h, v1.8h
269
+    add             v2.8h, v2.8h, v3.8h
270
+    saddl           v16.4s, v0.4h, v30.4h
271
+    saddl2          v17.4s, v0.8h, v30.8h
272
+    saddl           v18.4s, v2.4h, v30.4h
273
+    saddl2          v19.4s, v2.8h, v30.8h
274
+    shrn            v0.4h, v16.4s, #7
275
+    shrn2           v0.8h, v17.4s, #7
276
+    shrn            v1.4h, v18.4s, #7
277
+    shrn2           v1.8h, v19.4s, #7
278
+    sqxtun          v0.8b, v0.8h
279
+    sqxtun          v1.8b, v1.8h
280
+    str             s0, x2, #4
281
+    st1             {v0.h}2, x2, x5
282
+    str             s1, x2, #4
283
+    st1             {v1.h}2, x2, x5
284
+    cbnz            w12, .loop_addavg_6x\h
285
+    ret
286
+endfunc
287
+.endm
288
+
289
+addAvg_6xN 8
290
+addAvg_6xN 16
291
+
292
+.macro addAvg_8xN h
293
+function PFX(addAvg_8x\h\()_neon)
294
+    addAvg_start
295
+.rept \h / 2
296
+    ld1             {v0.16b}, x0, x3
297
+    ld1             {v1.16b}, x1, x4
298
+    ld1             {v2.16b}, x0, x3
299
+    ld1             {v3.16b}, x1, x4
300
+    add             v0.8h, v0.8h, v1.8h
301
+    add             v2.8h, v2.8h, v3.8h
302
+    saddl           v16.4s, v0.4h, v30.4h
303
+    saddl2          v17.4s, v0.8h, v30.8h
304
+    saddl           v18.4s, v2.4h, v30.4h
305
+    saddl2          v19.4s, v2.8h, v30.8h
306
+    shrn            v0.4h, v16.4s, #7
307
+    shrn2           v0.8h, v17.4s, #7
308
+    shrn            v1.4h, v18.4s, #7
309
+    shrn2           v1.8h, v19.4s, #7
310
+    sqxtun          v0.8b, v0.8h
311
+    sqxtun          v1.8b, v1.8h
312
+    st1             {v0.8b}, x2, x5
313
+    st1             {v1.8b}, x2, x5
314
+.endr
315
+    ret
316
+endfunc
317
+.endm
318
+
319
+.macro addAvg_8xN1 h
320
+function PFX(addAvg_8x\h\()_neon)
321
+    addAvg_start
322
+    mov             w12, #\h / 2
323
+.loop_addavg_8x\h:
324
+    sub             w12, w12, #1
325
+    ld1             {v0.16b}, x0, x3
326
+    ld1             {v1.16b}, x1, x4
327
+    ld1             {v2.16b}, x0, x3
328
+    ld1             {v3.16b}, x1, x4
329
+    add             v0.8h, v0.8h, v1.8h
330
+    add             v2.8h, v2.8h, v3.8h
331
+    saddl           v16.4s, v0.4h, v30.4h
332
+    saddl2          v17.4s, v0.8h, v30.8h
333
+    saddl           v18.4s, v2.4h, v30.4h
334
+    saddl2          v19.4s, v2.8h, v30.8h
335
+    shrn            v0.4h, v16.4s, #7
336
+    shrn2           v0.8h, v17.4s, #7
337
+    shrn            v1.4h, v18.4s, #7
338
+    shrn2           v1.8h, v19.4s, #7
339
+    sqxtun          v0.8b, v0.8h
340
+    sqxtun          v1.8b, v1.8h
341
+    st1             {v0.8b}, x2, x5
342
+    st1             {v1.8b}, x2, x5
343
+    cbnz            w12, .loop_addavg_8x\h
344
+    ret
345
+endfunc
346
+.endm
347
+
348
+addAvg_8xN 2
349
+addAvg_8xN 4
350
+addAvg_8xN 6
351
+addAvg_8xN 8
352
+addAvg_8xN 12
353
+addAvg_8xN 16
354
+addAvg_8xN1 32
355
+addAvg_8xN1 64
356
+
357
+.macro addAvg_12xN h
358
+function PFX(addAvg_12x\h\()_neon)
359
+    addAvg_start
360
+    sub             x3, x3, #16
361
+    sub             x4, x4, #16
362
+    sub             x5, x5, #8
363
+    mov             w12, #\h
364
+.loop_addAvg_12X\h\():
365
+    sub             w12, w12, #1
366
+    ld1             {v0.16b}, x0, #16
367
+    ld1             {v1.16b}, x1, #16
368
+    ld1             {v2.8b}, x0, x3
369
+    ld1             {v3.8b}, x1, x4
370
+    add             v0.8h, v0.8h, v1.8h
371
+    add             v2.4h, v2.4h, v3.4h
372
+    saddl           v16.4s, v0.4h, v30.4h
373
+    saddl2          v17.4s, v0.8h, v30.8h
374
+    saddl           v18.4s, v2.4h, v30.4h
375
+    shrn            v0.4h, v16.4s, #7
376
+    shrn2           v0.8h, v17.4s, #7
377
+    shrn            v1.4h, v18.4s, #7
378
+    sqxtun          v0.8b, v0.8h
379
+    sqxtun          v1.8b, v1.8h
380
+    st1             {v0.8b}, x2, #8
381
+    st1             {v1.s}0, x2, x5
382
+    cbnz            w12, .loop_addAvg_12X\h
383
+    ret
384
+endfunc
385
+.endm
386
+
387
+addAvg_12xN 16
388
+addAvg_12xN 32
389
+
390
+.macro addAvg_16xN h
391
+function PFX(addAvg_16x\h\()_neon)
392
+    addAvg_start
393
+    mov             w12, #\h
394
+.loop_addavg_16x\h:
395
+    sub             w12, w12, #1
396
+    ld1             {v0.8h-v1.8h}, x0, x3
397
+    ld1             {v2.8h-v3.8h}, x1, x4
398
+    addavg_1        v0, v2
399
+    addavg_1        v1, v3
400
+    sqxtun          v0.8b, v0.8h
401
+    sqxtun2         v0.16b, v1.8h
402
+    st1             {v0.16b}, x2, x5
403
+    cbnz            w12, .loop_addavg_16x\h
404
+    ret
405
+endfunc
406
+.endm
407
+
408
+addAvg_16xN 4
409
+addAvg_16xN 8
410
+addAvg_16xN 12
411
+addAvg_16xN 16
412
+addAvg_16xN 24
413
+addAvg_16xN 32
414
+addAvg_16xN 64
415
+
416
+.macro addAvg_24xN h
417
+function PFX(addAvg_24x\h\()_neon)
418
+    addAvg_start
419
+    mov             w12, #\h
420
+.loop_addavg_24x\h\():
421
+    sub             w12, w12, #1
422
+    ld1             {v0.16b-v2.16b}, x0, x3
423
+    ld1             {v3.16b-v5.16b}, x1, x4
424
+    addavg_1        v0, v3
425
+    addavg_1        v1, v4
426
+    addavg_1        v2, v5
427
+    sqxtun          v0.8b, v0.8h
428
+    sqxtun          v1.8b, v1.8h
429
+    sqxtun          v2.8b, v2.8h
430
+    st1             {v0.8b-v2.8b}, x2, x5
431
+    cbnz            w12, .loop_addavg_24x\h
432
+    ret
433
+endfunc
434
+.endm
435
+
436
+addAvg_24xN 32
437
+addAvg_24xN 64
438
+
439
+.macro addAvg_32xN h
440
+function PFX(addAvg_32x\h\()_neon)
441
+    addAvg_start
442
+    mov             w12, #\h
443
+.loop_addavg_32x\h\():
444
+    sub             w12, w12, #1
445
+    ld1             {v0.8h-v3.8h}, x0, x3
446
+    ld1             {v4.8h-v7.8h}, x1, x4
447
+    addavg_1        v0, v4
448
+    addavg_1        v1, v5
449
+    addavg_1        v2, v6
450
+    addavg_1        v3, v7
451
+    sqxtun          v0.8b, v0.8h
452
+    sqxtun          v1.8b, v1.8h
453
+    sqxtun          v2.8b, v2.8h
454
+    sqxtun          v3.8b, v3.8h
455
+    st1             {v0.8b-v3.8b}, x2, x5
456
+    cbnz            w12, .loop_addavg_32x\h
457
+    ret
458
+endfunc
459
+.endm
460
+
461
+addAvg_32xN 8
462
+addAvg_32xN 16
463
+addAvg_32xN 24
464
+addAvg_32xN 32
465
+addAvg_32xN 48
466
+addAvg_32xN 64
467
+
468
+function PFX(addAvg_48x64_neon)
469
+    addAvg_start
470
+    sub             x3, x3, #64
471
+    sub             x4, x4, #64
472
+    mov             w12, #64
473
+.loop_addavg_48x64:
474
+    sub             w12, w12, #1
475
+    ld1             {v0.8h-v3.8h}, x0, #64
476
+    ld1             {v4.8h-v7.8h}, x1, #64
477
+    ld1             {v20.8h-v21.8h}, x0, x3
478
+    ld1             {v22.8h-v23.8h}, x1, x4
479
+    addavg_1        v0, v4
480
+    addavg_1        v1, v5
481
+    addavg_1        v2, v6
482
+    addavg_1        v3, v7
483
+    addavg_1        v20, v22
484
+    addavg_1        v21, v23
485
+    sqxtun          v0.8b, v0.8h
486
+    sqxtun2         v0.16b, v1.8h
487
+    sqxtun          v1.8b, v2.8h
488
+    sqxtun2         v1.16b, v3.8h
489
+    sqxtun          v2.8b, v20.8h
490
+    sqxtun2         v2.16b, v21.8h
491
+    st1             {v0.16b-v2.16b}, x2, x5
492
+    cbnz            w12, .loop_addavg_48x64
493
+    ret
494
+endfunc
495
+
496
+.macro addAvg_64xN h
497
+function PFX(addAvg_64x\h\()_neon)
498
+    addAvg_start
499
+    mov             w12, #\h
500
+    sub             x3, x3, #64
501
+    sub             x4, x4, #64
502
+.loop_addavg_64x\h\():
503
+    sub             w12, w12, #1
504
+    ld1             {v0.8h-v3.8h}, x0, #64
505
+    ld1             {v4.8h-v7.8h}, x1, #64
506
+    ld1             {v20.8h-v23.8h}, x0, x3
507
+    ld1             {v24.8h-v27.8h}, x1, x4
508
+    addavg_1        v0, v4
509
+    addavg_1        v1, v5
510
+    addavg_1        v2, v6
511
+    addavg_1        v3, v7
512
+    addavg_1        v20, v24
513
+    addavg_1        v21, v25
514
+    addavg_1        v22, v26
515
+    addavg_1        v23, v27
516
+    sqxtun          v0.8b, v0.8h
517
+    sqxtun2         v0.16b, v1.8h
518
+    sqxtun          v1.8b, v2.8h
519
+    sqxtun2         v1.16b, v3.8h
520
+    sqxtun          v2.8b, v20.8h
521
+    sqxtun2         v2.16b, v21.8h
522
+    sqxtun          v3.8b, v22.8h
523
+    sqxtun2         v3.16b, v23.8h
524
+    st1             {v0.16b-v3.16b}, x2, x5
525
+    cbnz            w12, .loop_addavg_64x\h
526
+    ret
527
+endfunc
528
+.endm
529
+
530
+addAvg_64xN 16
531
+addAvg_64xN 32
532
+addAvg_64xN 48
533
+addAvg_64xN 64
534
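For reference, the addAvg_WxH kernels above average two int16_t predictions from the interpolation domain back into pixels: in the 8-bit build the inputs are pre-scaled by 64 and offset by -8192, so the kernels add both offsets back plus a rounding term, shift right by 7 (the shrn #7 above) and saturate to 8 bits (sqxtun). A minimal scalar sketch, assuming x265's usual IF_INTERNAL scaling, with the constants restated here rather than taken from the patch:

    #include <stdint.h>

    static inline uint8_t clip_to_pixel(int v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // Scalar model of addAvg for the 8-bit build (illustrative sketch, not the shipped code).
    static void addAvg_ref(const int16_t* src0, const int16_t* src1, uint8_t* dst,
                           intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                           int width, int height)
    {
        const int shift  = 7;                              // IF_INTERNAL_PREC + 1 - bitDepth = 14 + 1 - 8
        const int offset = (1 << (shift - 1)) + 2 * 8192;  // rounding plus twice IF_INTERNAL_OFFS
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = clip_to_pixel((src0[x] + src1[x] + offset) >> shift);
            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }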
x265_3.6.tar.gz/source/common/aarch64/p2s-common.S Added
104
 
1
@@ -0,0 +1,102 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using the NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+#if HIGH_BIT_DEPTH
39
+# if BIT_DEPTH == 10
40
+#  define P2S_SHIFT 4
41
+# elif BIT_DEPTH == 12
42
+#  define P2S_SHIFT 2
43
+# endif
44
+.macro p2s_start
45
+    add             x3, x3, x3
46
+    add             x1, x1, x1
47
+    movi            v31.8h, #0xe0, lsl #8
48
+.endm
49
+
50
+#else // if !HIGH_BIT_DEPTH
51
+# define P2S_SHIFT 6
52
+.macro p2s_start
53
+    add             x3, x3, x3
54
+    movi            v31.8h, #0xe0, lsl #8
55
+.endm
56
+#endif // HIGH_BIT_DEPTH
57
+
58
+.macro p2s_2x2
59
+#if HIGH_BIT_DEPTH
60
+    ld1             {v0.s}[0], [x0], x1
61
+    ld1             {v0.s}[1], [x0], x1
62
+    shl             v3.8h, v0.8h, #P2S_SHIFT
63
+#else
64
+    ldrh            w10, [x0]
65
+    add             x0, x0, x1
66
+    ldrh            w11, [x0]
67
+    orr             w10, w10, w11, lsl #16
68
+    add             x0, x0, x1
69
+    dup             v0.4s, w10
70
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
71
+#endif
72
+    add             v3.8h, v3.8h, v31.8h
73
+    st1             {v3.s}[0], [x2], x3
74
+    st1             {v3.s}[1], [x2], x3
75
+.endm
76
+
77
+.macro p2s_6x2
78
+#if HIGH_BIT_DEPTH
79
+    ld1             {v0.d}[0], [x0], #8
80
+    ld1             {v1.s}[0], [x0], x1
81
+    ld1             {v0.d}[1], [x0], #8
82
+    ld1             {v1.s}[1], [x0], x1
83
+    shl             v3.8h, v0.8h, #P2S_SHIFT
84
+    shl             v4.8h, v1.8h, #P2S_SHIFT
85
+#else
86
+    ldr             s0, [x0]
87
+    ldrh            w10, [x0, #4]
88
+    add             x0, x0, x1
89
+    ld1             {v0.s}[1], [x0]
90
+    ldrh            w11, [x0, #4]
91
+    add             x0, x0, x1
92
+    orr             w10, w10, w11, lsl #16
93
+    dup             v1.4s, w10
94
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
95
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
96
+#endif
97
+    add             v3.8h, v3.8h, v31.8h
98
+    add             v4.8h, v4.8h, v31.8h
99
+    st1             {v3.d}[0], [x2], #8
100
+    st1             {v4.s}[0], [x2], x3
101
+    st1             {v3.d}[1], [x2], #8
102
+    st1             {v4.s}[1], [x2], x3
103
+.endm
104
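The p2s_2x2/p2s_6x2 macros above (and the filterPixelToShort functions built from them) convert pixels into x265's 14-bit interpolation domain: each sample is shifted left by P2S_SHIFT and re-centred by -8192, the constant kept in v31 (movi v31.8h, #0xe0, lsl #8 is 0xE000, i.e. -8192 as int16). A scalar sketch for the 8-bit build, with the constants restated here for illustration:

    #include <stdint.h>

    // Scalar model of filterPixelToShort (p2s) for the 8-bit build (sketch only).
    static void filterPixelToShort_ref(const uint8_t* src, intptr_t srcStride,
                                       int16_t* dst, intptr_t dstStride,
                                       int width, int height)
    {
        const int shift  = 6;      // IF_INTERNAL_PREC - bitDepth = 14 - 8
        const int offset = 8192;   // IF_INTERNAL_OFFS = 1 << (IF_INTERNAL_PREC - 1)
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << shift) - offset);
            src += srcStride;
            dst += dstStride;
        }
    }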
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S Added
447
 
1
@@ -0,0 +1,445 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "p2s-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+#if HIGH_BIT_DEPTH
41
+# if BIT_DEPTH == 10
42
+#  define P2S_SHIFT 4
43
+# elif BIT_DEPTH == 12
44
+#  define P2S_SHIFT 2
45
+# endif
46
+
47
+.macro p2s_start_sve
48
+    add             x3, x3, x3
49
+    add             x1, x1, x1
50
+    mov             z31.h, #0xe0, lsl #8
51
+.endm
52
+
53
+#else // if !HIGH_BIT_DEPTH
54
+# define P2S_SHIFT 6
55
+.macro p2s_start_sve
56
+    add             x3, x3, x3
57
+    mov             z31.h, #0xe0, lsl #8
58
+.endm
59
+
60
+#endif // HIGH_BIT_DEPTH
61
+
62
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
63
+.macro p2s_2xN_sve h
64
+function PFX(filterPixelToShort_2x\h\()_sve)
65
+    p2s_start_sve
66
+.rept \h / 2
67
+    p2s_2x2
68
+.endr
69
+    ret
70
+endfunc
71
+.endm
72
+
73
+p2s_2xN_sve 4
74
+p2s_2xN_sve 8
75
+p2s_2xN_sve 16
76
+
77
+.macro p2s_6xN_sve h
78
+function PFX(filterPixelToShort_6x\h\()_sve)
79
+    p2s_start_sve
80
+    sub             x3, x3, #8
81
+#if HIGH_BIT_DEPTH
82
+    sub             x1, x1, #8
83
+#endif
84
+.rept \h / 2
85
+    p2s_6x2
86
+.endr
87
+    ret
88
+endfunc
89
+.endm
90
+
91
+p2s_6xN_sve 8
92
+p2s_6xN_sve 16
93
+
94
+function PFX(filterPixelToShort_4x2_sve)
95
+    p2s_start_sve
96
+#if HIGH_BIT_DEPTH
97
+    ptrue           p0.h, vl8
98
+    index           z1.d, #0, x1
99
+    index           z2.d, #0, x3
100
+    ld1d            {z3.d}, p0/z, x0, z1.d
101
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
102
+    add             z3.h, p0/m, z3.h, z31.h
103
+    st1d            {z3.d}, p0, x2, z2.d
104
+#else
105
+    ptrue           p0.h, vl4
106
+    ld1b            {z0.h}, p0/z, x0
107
+    add             x0, x0, x1
108
+    ld1b            {z1.h}, p0/z, x0
109
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
110
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
111
+    add             z0.h, p0/m, z0.h, z31.h
112
+    add             z1.h, p0/m, z1.h, z31.h
113
+    st1h            {z0.h}, p0, x2
114
+    add             x2, x2, x3
115
+    st1h            {z1.h}, p0, x2
116
+#endif
117
+    ret
118
+endfunc
119
+
120
+
121
+.macro p2s_8xN_sve h
122
+function PFX(filterPixelToShort_8x\h\()_sve)
123
+    p2s_start_sve
124
+    ptrue           p0.h, vl8
125
+.rept \h
126
+#if HIGH_BIT_DEPTH
127
+    ld1d            {z0.d}, p0/z, x0
128
+    add             x0, x0, x1
129
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
130
+    add             z0.h, p0/m, z0.h, z31.h
131
+    st1h            {z0.h}, p0, x2
132
+    add             x2, x2, x3
133
+#else
134
+    ld1b            {z0.h}, p0/z, x0
135
+    add             x0, x0, x1
136
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
137
+    add             z0.h, p0/m, z0.h, z31.h
138
+    st1h            {z0.h}, p0, x2
139
+    add             x2, x2, x3
140
+#endif
141
+.endr
142
+    ret
143
+endfunc
144
+.endm
145
+
146
+p2s_8xN_sve 2
147
+
148
+.macro p2s_32xN_sve h
149
+function PFX(filterPixelToShort_32x\h\()_sve)
150
+#if HIGH_BIT_DEPTH
151
+    p2s_start_sve
152
+    rdvl            x9, #1
153
+    cmp             x9, #16
154
+    bgt             .vl_gt_16_filterPixelToShort_high_32x\h
155
+    ptrue           p0.h, vl8
156
+.rept \h
157
+    ld1h            {z0.h}, p0/z, x0
158
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
159
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
160
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
161
+    add             x0, x0, x1
162
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
163
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
164
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
165
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
166
+    add             z0.h, p0/m, z0.h, z31.h
167
+    add             z1.h, p0/m, z1.h, z31.h
168
+    add             z2.h, p0/m, z2.h, z31.h
169
+    add             z3.h, p0/m, z3.h, z31.h
170
+    st1h            {z0.h}, p0, x2
171
+    st1h            {z1.h}, p0, x2, #1, mul vl
172
+    st1h            {z2.h}, p0, x2, #2, mul vl
173
+    st1h            {z3.h}, p0, x2, #3, mul vl
174
+    add             x2, x2, x3
175
+.endr
176
+    ret
177
+.vl_gt_16_filterPixelToShort_high_32x\h\():
178
+    cmp             x9, #48
179
+    bgt             .vl_gt_48_filterPixelToShort_high_32x\h
180
+    ptrue           p0.h, vl16
181
+.rept \h
182
+    ld1h            {z0.h}, p0/z, x0
183
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
184
+    add             x0, x0, x1
185
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
186
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
187
+    add             z0.h, p0/m, z0.h, z31.h
188
+    add             z1.h, p0/m, z1.h, z31.h
189
+    st1h            {z0.h}, p0, x2
190
+    st1h            {z1.h}, p0, x2, #1, mul vl
191
+    add             x2, x2, x3
192
+.endr
193
+    ret
194
+.vl_gt_48_filterPixelToShort_high_32x\h\():
195
+    ptrue           p0.h, vl32
196
+.rept \h
197
+    ld1h            {z0.h}, p0/z, x0
198
+    add             x0, x0, x1
199
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
200
+    add             z0.h, p0/m, z0.h, z31.h
201
+    st1h            {z0.h}, p0, x2
202
+    add             x2, x2, x3
203
+.endr
204
+    ret
205
+#else
206
+    p2s_start
207
+    mov             x9, #\h
208
+.loop_filter_sve_P2S_32x\h:
209
+    sub             x9, x9, #1
210
+    ld1             {v0.16b-v1.16b}, x0, x1
211
+    ushll           v22.8h, v0.8b,  #P2S_SHIFT
212
+    ushll2          v23.8h, v0.16b, #P2S_SHIFT
213
+    ushll           v24.8h, v1.8b,  #P2S_SHIFT
214
+    ushll2          v25.8h, v1.16b, #P2S_SHIFT
215
+    add             v22.8h, v22.8h, v31.8h
216
+    add             v23.8h, v23.8h, v31.8h
217
+    add             v24.8h, v24.8h, v31.8h
218
+    add             v25.8h, v25.8h, v31.8h
219
+    st1             {v22.16b-v25.16b}, x2, x3
220
+    cbnz            x9, .loop_filter_sve_P2S_32x\h
221
+    ret
222
+#endif
223
+endfunc
224
+.endm
225
+
226
+p2s_32xN_sve 8
227
+p2s_32xN_sve 16
228
+p2s_32xN_sve 24
229
+p2s_32xN_sve 32
230
+p2s_32xN_sve 48
231
+p2s_32xN_sve 64
232
+
233
+.macro p2s_64xN_sve h
234
+function PFX(filterPixelToShort_64x\h\()_sve)
235
+#if HIGH_BIT_DEPTH
236
+    p2s_start_sve
237
+    rdvl            x9, #1
238
+    cmp             x9, #16
239
+    bgt             .vl_gt_16_filterPixelToShort_high_64x\h
240
+    ptrue           p0.h, vl8
241
+.rept \h
242
+    ld1h            {z0.h}, p0/z, x0
243
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
244
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
245
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
246
+    ld1h            {z4.h}, p0/z, x0, #4, mul vl
247
+    ld1h            {z5.h}, p0/z, x0, #5, mul vl
248
+    ld1h            {z6.h}, p0/z, x0, #6, mul vl
249
+    ld1h            {z7.h}, p0/z, x0, #7, mul vl
250
+    add             x0, x0, x1
251
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
252
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
253
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
254
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
255
+    lsl             z4.h, p0/m, z4.h, #P2S_SHIFT
256
+    lsl             z5.h, p0/m, z5.h, #P2S_SHIFT
257
+    lsl             z6.h, p0/m, z6.h, #P2S_SHIFT
258
+    lsl             z7.h, p0/m, z7.h, #P2S_SHIFT
259
+    add             z0.h, p0/m, z0.h, z31.h
260
+    add             z1.h, p0/m, z1.h, z31.h
261
+    add             z2.h, p0/m, z2.h, z31.h
262
+    add             z3.h, p0/m, z3.h, z31.h
263
+    add             z4.h, p0/m, z4.h, z31.h
264
+    add             z5.h, p0/m, z5.h, z31.h
265
+    add             z6.h, p0/m, z6.h, z31.h
266
+    add             z7.h, p0/m, z7.h, z31.h
267
+    st1h            {z0.h}, p0, x2
268
+    st1h            {z1.h}, p0, x2, #1, mul vl
269
+    st1h            {z2.h}, p0, x2, #2, mul vl
270
+    st1h            {z3.h}, p0, x2, #3, mul vl
271
+    st1h            {z4.h}, p0, x2, #4, mul vl
272
+    st1h            {z5.h}, p0, x2, #5, mul vl
273
+    st1h            {z6.h}, p0, x2, #6, mul vl
274
+    st1h            {z7.h}, p0, x2, #7, mul vl
275
+    add             x2, x2, x3
276
+.endr
277
+    ret
278
+.vl_gt_16_filterPixelToShort_high_64x\h\():
279
+    cmp             x9, #48
280
+    bgt             .vl_gt_48_filterPixelToShort_high_64x\h
281
+    ptrue           p0.h, vl16
282
+.rept \h
283
+    ld1h            {z0.h}, p0/z, x0
284
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
285
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
286
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
287
+    add             x0, x0, x1
288
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
289
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
290
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
291
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
292
+    add             z0.h, p0/m, z0.h, z31.h
293
+    add             z1.h, p0/m, z1.h, z31.h
294
+    add             z2.h, p0/m, z2.h, z31.h
295
+    add             z3.h, p0/m, z3.h, z31.h
296
+    st1h            {z0.h}, p0, x2
297
+    st1h            {z1.h}, p0, x2, #1, mul vl
298
+    st1h            {z2.h}, p0, x2, #2, mul vl
299
+    st1h            {z3.h}, p0, x2, #3, mul vl
300
+    add             x2, x2, x3
301
+.endr
302
+    ret
303
+.vl_gt_48_filterPixelToShort_high_64x\h\():
304
+    cmp             x9, #112
305
+    bgt             .vl_gt_112_filterPixelToShort_high_64x\h
306
+    ptrue           p0.h, vl32
307
+.rept \h
308
+    ld1h            {z0.h}, p0/z, x0
309
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
310
+    add             x0, x0, x1
311
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
312
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
313
+    add             z0.h, p0/m, z0.h, z31.h
314
+    add             z1.h, p0/m, z1.h, z31.h
315
+    st1h            {z0.h}, p0, x2
316
+    st1h            {z1.h}, p0, x2, #1, mul vl
317
+    add             x2, x2, x3
318
+.endr
319
+    ret
320
+.vl_gt_112_filterPixelToShort_high_64x\h\():
321
+    ptrue           p0.h, vl64
322
+.rept \h
323
+    ld1h            {z0.h}, p0/z, x0
324
+    add             x0, x0, x1
325
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
326
+    add             z0.h, p0/m, z0.h, z31.h
327
+    st1h            {z0.h}, p0, x2
328
+    add             x2, x2, x3
329
+.endr
330
+    ret
331
+#else
332
+    p2s_start
333
+    sub             x3, x3, #64
334
+    mov             x9, #\h
335
+.loop_filter_sve_P2S_64x\h:
336
+    sub             x9, x9, #1
337
+    ld1             {v0.16b-v3.16b}, x0, x1
338
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
339
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
340
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
341
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
342
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
343
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
344
+    ushll           v22.8h, v3.8b,  #P2S_SHIFT
345
+    ushll2          v23.8h, v3.16b, #P2S_SHIFT
346
+    add             v16.8h, v16.8h, v31.8h
347
+    add             v17.8h, v17.8h, v31.8h
348
+    add             v18.8h, v18.8h, v31.8h
349
+    add             v19.8h, v19.8h, v31.8h
350
+    add             v20.8h, v20.8h, v31.8h
351
+    add             v21.8h, v21.8h, v31.8h
352
+    add             v22.8h, v22.8h, v31.8h
353
+    add             v23.8h, v23.8h, v31.8h
354
+    st1             {v16.16b-v19.16b}, x2, #64
355
+    st1             {v20.16b-v23.16b}, x2, x3
356
+    cbnz            x9, .loop_filter_sve_P2S_64x\h
357
+    ret
358
+#endif
359
+endfunc
360
+.endm
361
+
362
+p2s_64xN_sve 16
363
+p2s_64xN_sve 32
364
+p2s_64xN_sve 48
365
+p2s_64xN_sve 64
366
+
367
+function PFX(filterPixelToShort_48x64_sve)
368
+#if HIGH_BIT_DEPTH
369
+    p2s_start_sve
370
+    rdvl            x9, #1
371
+    cmp             x9, #16
372
+    bgt             .vl_gt_16_filterPixelToShort_high_48x64
373
+    ptrue           p0.h, vl8
374
+.rept 64
375
+    ld1h            {z0.h}, p0/z, x0
376
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
377
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
378
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
379
+    ld1h            {z4.h}, p0/z, x0, #4, mul vl
380
+    ld1h            {z5.h}, p0/z, x0, #5, mul vl
381
+    add             x0, x0, x1
382
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
383
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
384
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
385
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
386
+    lsl             z4.h, p0/m, z4.h, #P2S_SHIFT
387
+    lsl             z5.h, p0/m, z5.h, #P2S_SHIFT
388
+    add             z0.h, p0/m, z0.h, z31.h
389
+    add             z1.h, p0/m, z1.h, z31.h
390
+    add             z2.h, p0/m, z2.h, z31.h
391
+    add             z3.h, p0/m, z3.h, z31.h
392
+    add             z4.h, p0/m, z4.h, z31.h
393
+    add             z5.h, p0/m, z5.h, z31.h
394
+    st1h            {z0.h}, p0, x2
395
+    st1h            {z1.h}, p0, x2, #1, mul vl
396
+    st1h            {z2.h}, p0, x2, #2, mul vl
397
+    st1h            {z3.h}, p0, x2, #3, mul vl
398
+    st1h            {z4.h}, p0, x2, #4, mul vl
399
+    st1h            {z5.h}, p0, x2, #5, mul vl
400
+    add             x2, x2, x3
401
+.endr
402
+    ret
403
+.vl_gt_16_filterPixelToShort_high_48x64:
404
+    ptrue           p0.h, vl16
405
+.rept 64
406
+    ld1h            {z0.h}, p0/z, x0
407
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
408
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
409
+    add             x0, x0, x1
410
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
411
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
412
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
413
+    add             z0.h, p0/m, z0.h, z31.h
414
+    add             z1.h, p0/m, z1.h, z31.h
415
+    add             z2.h, p0/m, z2.h, z31.h
416
+    st1h            {z0.h}, p0, x2
417
+    st1h            {z1.h}, p0, x2, #1, mul vl
418
+    st1h            {z2.h}, p0, x2, #2, mul vl
419
+    add             x2, x2, x3
420
+.endr
421
+    ret
422
+#else
423
+    p2s_start
424
+    sub             x3, x3, #64
425
+    mov             x9, #64
426
+.loop_filterP2S_sve_48x64:
427
+    sub            x9, x9, #1
428
+    ld1             {v0.16b-v2.16b}, x0, x1
429
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
430
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
431
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
432
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
433
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
434
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
435
+    add             v16.8h, v16.8h, v31.8h
436
+    add             v17.8h, v17.8h, v31.8h
437
+    add             v18.8h, v18.8h, v31.8h
438
+    add             v19.8h, v19.8h, v31.8h
439
+    add             v20.8h, v20.8h, v31.8h
440
+    add             v21.8h, v21.8h, v31.8h
441
+    st1             {v16.16b-v19.16b}, x2, #64
442
+    st1             {v20.16b-v21.16b}, x2, x3
443
+    cbnz            x9, .loop_filterP2S_sve_48x64
444
+    ret
445
+#endif
446
+endfunc
447
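The SVE variants above read the hardware vector length with rdvl and branch to a loop whose predicate matches it (ptrue p0.h with vl8, vl16, vl32 or vl64), so one binary covers 128- through 512-bit implementations. A hedged sketch of the same dispatch in C, assuming the ACLE intrinsic svcntb() for the vector length in bytes; the thresholds mirror the cmp #16 / #48 / #112 checks in the 64xN functions:

    #include <arm_sve.h>

    // Pick how many halfwords each predicated ld1h covers, matching the
    // rdvl/cmp/bgt ladder used by the filterPixelToShort_*_sve functions.
    static inline int p2s_halfwords_per_vector(void)
    {
        uint64_t vl_bytes = svcntb();      // 16 on a 128-bit part, 32 on 256-bit, ...
        if (vl_bytes <= 16)  return 8;     // ptrue p0.h, vl8  path
        if (vl_bytes <= 48)  return 16;    // ptrue p0.h, vl16 path
        if (vl_bytes <= 112) return 32;    // ptrue p0.h, vl32 path
        return 64;                         // ptrue p0.h, vl64 path
    }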
x265_3.6.tar.gz/source/common/aarch64/p2s.S Added
388
 
1
@@ -0,0 +1,386 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "p2s-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
39
+.macro p2s_2xN h
40
+function PFX(filterPixelToShort_2x\h\()_neon)
41
+    p2s_start
42
+.rept \h / 2
43
+    p2s_2x2
44
+.endr
45
+    ret
46
+endfunc
47
+.endm
48
+
49
+p2s_2xN 4
50
+p2s_2xN 8
51
+p2s_2xN 16
52
+
53
+.macro p2s_6xN h
54
+function PFX(filterPixelToShort_6x\h\()_neon)
55
+    p2s_start
56
+    sub             x3, x3, #8
57
+#if HIGH_BIT_DEPTH
58
+    sub             x1, x1, #8
59
+#endif
60
+.rept \h / 2
61
+    p2s_6x2
62
+.endr
63
+    ret
64
+endfunc
65
+.endm
66
+
67
+p2s_6xN 8
68
+p2s_6xN 16
69
+
70
+function PFX(filterPixelToShort_4x2_neon)
71
+    p2s_start
72
+#if HIGH_BIT_DEPTH
73
+    ld1             {v0.d}0, x0, x1
74
+    ld1             {v0.d}1, x0, x1
75
+    shl             v3.8h, v0.8h, #P2S_SHIFT
76
+#else
77
+    ld1             {v0.s}0, x0, x1
78
+    ld1             {v0.s}1, x0, x1
79
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
80
+#endif
81
+    add             v3.8h, v3.8h, v31.8h
82
+    st1             {v3.d}0, x2, x3
83
+    st1             {v3.d}1, x2, x3
84
+    ret
85
+endfunc
86
+
87
+function PFX(filterPixelToShort_4x4_neon)
88
+    p2s_start
89
+#if HIGH_BIT_DEPTH
90
+    ld1             {v0.d}0, x0, x1
91
+    ld1             {v0.d}1, x0, x1
92
+    shl             v3.8h, v0.8h, #P2S_SHIFT
93
+#else
94
+    ld1             {v0.s}0, x0, x1
95
+    ld1             {v0.s}1, x0, x1
96
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
97
+#endif
98
+    add             v3.8h, v3.8h, v31.8h
99
+    st1             {v3.d}0, x2, x3
100
+    st1             {v3.d}1, x2, x3
101
+#if HIGH_BIT_DEPTH
102
+    ld1             {v1.d}0, x0, x1
103
+    ld1             {v1.d}1, x0, x1
104
+    shl             v4.8h, v1.8h, #P2S_SHIFT
105
+#else
106
+    ld1             {v1.s}0, x0, x1
107
+    ld1             {v1.s}1, x0, x1
108
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
109
+#endif
110
+    add             v4.8h, v4.8h, v31.8h
111
+    st1             {v4.d}0, x2, x3
112
+    st1             {v4.d}1, x2, x3
113
+    ret
114
+endfunc
115
+
116
+.macro p2s_4xN h
117
+function PFX(filterPixelToShort_4x\h\()_neon)
118
+    p2s_start
119
+.rept \h / 2
120
+#if HIGH_BIT_DEPTH
121
+    ld1             {v0.16b}, x0, x1
122
+    shl             v0.8h, v0.8h, #P2S_SHIFT
123
+#else
124
+    ld1             {v0.8b}, x0, x1
125
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
126
+#endif
127
+    add             v2.4h, v0.4h, v31.4h
128
+    st1             {v2.4h}, x2, x3
129
+#if HIGH_BIT_DEPTH
130
+    ld1             {v1.16b}, x0, x1
131
+    shl             v1.8h, v1.8h, #P2S_SHIFT
132
+#else
133
+    ld1             {v1.8b}, x0, x1
134
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
135
+#endif
136
+    add             v3.4h, v1.4h, v31.4h
137
+    st1             {v3.4h}, x2, x3
138
+.endr
139
+    ret
140
+endfunc
141
+.endm
142
+
143
+p2s_4xN 8
144
+p2s_4xN 16
145
+p2s_4xN 32
146
+
147
+.macro p2s_8xN h
148
+function PFX(filterPixelToShort_8x\h\()_neon)
149
+    p2s_start
150
+.rept \h / 2
151
+#if HIGH_BIT_DEPTH
152
+    ld1             {v0.16b}, x0, x1
153
+    ld1             {v1.16b}, x0, x1
154
+    shl             v0.8h, v0.8h, #P2S_SHIFT
155
+    shl             v1.8h, v1.8h, #P2S_SHIFT
156
+#else
157
+    ld1             {v0.8b}, x0, x1
158
+    ld1             {v1.8b}, x0, x1
159
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
160
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
161
+#endif
162
+    add             v2.8h, v0.8h, v31.8h
163
+    st1             {v2.8h}, x2, x3
164
+    add             v3.8h, v1.8h, v31.8h
165
+    st1             {v3.8h}, x2, x3
166
+.endr
167
+    ret
168
+endfunc
169
+.endm
170
+
171
+p2s_8xN 2
172
+p2s_8xN 4
173
+p2s_8xN 6
174
+p2s_8xN 8
175
+p2s_8xN 12
176
+p2s_8xN 16
177
+p2s_8xN 32
178
+p2s_8xN 64
179
+
180
+.macro p2s_12xN h
181
+function PFX(filterPixelToShort_12x\h\()_neon)
182
+    p2s_start
183
+    sub             x3, x3, #16
184
+.rept \h
185
+#if HIGH_BIT_DEPTH
186
+    ld1             {v0.16b-v1.16b}, x0, x1
187
+    shl             v2.8h, v0.8h, #P2S_SHIFT
188
+    shl             v3.8h, v1.8h, #P2S_SHIFT
189
+#else
190
+    ld1             {v0.16b}, x0, x1
191
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
192
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
193
+#endif
194
+    add             v2.8h, v2.8h, v31.8h
195
+    add             v3.8h, v3.8h, v31.8h
196
+    st1             {v2.16b}, x2, #16
197
+    st1             {v3.8b}, x2, x3
198
+.endr
199
+    ret
200
+endfunc
201
+.endm
202
+
203
+p2s_12xN 16
204
+p2s_12xN 32
205
+
206
+.macro p2s_16xN h
207
+function PFX(filterPixelToShort_16x\h\()_neon)
208
+    p2s_start
209
+.rept \h
210
+#if HIGH_BIT_DEPTH
211
+    ld1             {v0.16b-v1.16b}, x0, x1
212
+    shl             v2.8h, v0.8h, #P2S_SHIFT
213
+    shl             v3.8h, v1.8h, #P2S_SHIFT
214
+#else
215
+    ld1             {v0.16b}, x0, x1
216
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
217
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
218
+#endif
219
+    add             v2.8h, v2.8h, v31.8h
220
+    add             v3.8h, v3.8h, v31.8h
221
+    st1             {v2.16b-v3.16b}, x2, x3
222
+.endr
223
+    ret
224
+endfunc
225
+.endm
226
+
227
+p2s_16xN 4
228
+p2s_16xN 8
229
+p2s_16xN 12
230
+p2s_16xN 16
231
+p2s_16xN 24
232
+p2s_16xN 32
233
+p2s_16xN 64
234
+
235
+.macro p2s_24xN h
236
+function PFX(filterPixelToShort_24x\h\()_neon)
237
+    p2s_start
238
+.rept \h
239
+#if HIGH_BIT_DEPTH
240
+    ld1             {v0.16b-v2.16b}, x0, x1
241
+    shl             v3.8h, v0.8h, #P2S_SHIFT
242
+    shl             v4.8h, v1.8h, #P2S_SHIFT
243
+    shl             v5.8h, v2.8h, #P2S_SHIFT
244
+#else
245
+    ld1             {v0.8b-v2.8b}, x0, x1
246
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
247
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
248
+    ushll           v5.8h, v2.8b, #P2S_SHIFT
249
+#endif
250
+    add             v3.8h, v3.8h, v31.8h
251
+    add             v4.8h, v4.8h, v31.8h
252
+    add             v5.8h, v5.8h, v31.8h
253
+    st1             {v3.16b-v5.16b}, x2, x3
254
+.endr
255
+    ret
256
+endfunc
257
+.endm
258
+
259
+p2s_24xN 32
260
+p2s_24xN 64
261
+
262
+.macro p2s_32xN h
263
+function PFX(filterPixelToShort_32x\h\()_neon)
264
+    p2s_start
265
+    mov             x9, #\h
266
+.loop_filterP2S_32x\h:
267
+    sub             x9, x9, #1
268
+#if HIGH_BIT_DEPTH
269
+    ld1             {v0.16b-v3.16b}, x0, x1
270
+    shl             v22.8h, v0.8h, #P2S_SHIFT
271
+    shl             v23.8h, v1.8h, #P2S_SHIFT
272
+    shl             v24.8h, v2.8h, #P2S_SHIFT
273
+    shl             v25.8h, v3.8h, #P2S_SHIFT
274
+#else
275
+    ld1             {v0.16b-v1.16b}, x0, x1
276
+    ushll           v22.8h, v0.8b,  #P2S_SHIFT
277
+    ushll2          v23.8h, v0.16b, #P2S_SHIFT
278
+    ushll           v24.8h, v1.8b,  #P2S_SHIFT
279
+    ushll2          v25.8h, v1.16b, #P2S_SHIFT
280
+#endif
281
+    add             v22.8h, v22.8h, v31.8h
282
+    add             v23.8h, v23.8h, v31.8h
283
+    add             v24.8h, v24.8h, v31.8h
284
+    add             v25.8h, v25.8h, v31.8h
285
+    st1             {v22.16b-v25.16b}, x2, x3
286
+    cbnz            x9, .loop_filterP2S_32x\h
287
+    ret
288
+endfunc
289
+.endm
290
+
291
+p2s_32xN 8
292
+p2s_32xN 16
293
+p2s_32xN 24
294
+p2s_32xN 32
295
+p2s_32xN 48
296
+p2s_32xN 64
297
+
298
+.macro p2s_64xN h
299
+function PFX(filterPixelToShort_64x\h\()_neon)
300
+    p2s_start
301
+#if HIGH_BIT_DEPTH
302
+    sub             x1, x1, #64
303
+#endif
304
+    sub             x3, x3, #64
305
+    mov             x9, #\h
306
+.loop_filterP2S_64x\h:
307
+    sub             x9, x9, #1
308
+#if HIGH_BIT_DEPTH
309
+    ld1             {v0.16b-v3.16b}, x0, #64
310
+    ld1             {v4.16b-v7.16b}, x0, x1
311
+    shl             v16.8h, v0.8h, #P2S_SHIFT
312
+    shl             v17.8h, v1.8h, #P2S_SHIFT
313
+    shl             v18.8h, v2.8h, #P2S_SHIFT
314
+    shl             v19.8h, v3.8h, #P2S_SHIFT
315
+    shl             v20.8h, v4.8h, #P2S_SHIFT
316
+    shl             v21.8h, v5.8h, #P2S_SHIFT
317
+    shl             v22.8h, v6.8h, #P2S_SHIFT
318
+    shl             v23.8h, v7.8h, #P2S_SHIFT
319
+#else
320
+    ld1             {v0.16b-v3.16b}, x0, x1
321
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
322
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
323
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
324
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
325
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
326
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
327
+    ushll           v22.8h, v3.8b,  #P2S_SHIFT
328
+    ushll2          v23.8h, v3.16b, #P2S_SHIFT
329
+#endif
330
+    add             v16.8h, v16.8h, v31.8h
331
+    add             v17.8h, v17.8h, v31.8h
332
+    add             v18.8h, v18.8h, v31.8h
333
+    add             v19.8h, v19.8h, v31.8h
334
+    add             v20.8h, v20.8h, v31.8h
335
+    add             v21.8h, v21.8h, v31.8h
336
+    add             v22.8h, v22.8h, v31.8h
337
+    add             v23.8h, v23.8h, v31.8h
338
+    st1             {v16.16b-v19.16b}, x2, #64
339
+    st1             {v20.16b-v23.16b}, x2, x3
340
+    cbnz            x9, .loop_filterP2S_64x\h
341
+    ret
342
+endfunc
343
+.endm
344
+
345
+p2s_64xN 16
346
+p2s_64xN 32
347
+p2s_64xN 48
348
+p2s_64xN 64
349
+
350
+function PFX(filterPixelToShort_48x64_neon)
351
+    p2s_start
352
+#if HIGH_BIT_DEPTH
353
+    sub             x1, x1, #64
354
+#endif
355
+    sub             x3, x3, #64
356
+    mov             x9, #64
357
+.loop_filterP2S_48x64:
358
+    sub            x9, x9, #1
359
+#if HIGH_BIT_DEPTH
360
+    ld1             {v0.16b-v3.16b}, x0, #64
361
+    ld1             {v4.16b-v5.16b}, x0, x1
362
+    shl             v16.8h, v0.8h, #P2S_SHIFT
363
+    shl             v17.8h, v1.8h, #P2S_SHIFT
364
+    shl             v18.8h, v2.8h, #P2S_SHIFT
365
+    shl             v19.8h, v3.8h, #P2S_SHIFT
366
+    shl             v20.8h, v4.8h, #P2S_SHIFT
367
+    shl             v21.8h, v5.8h, #P2S_SHIFT
368
+#else
369
+    ld1             {v0.16b-v2.16b}, x0, x1
370
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
371
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
372
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
373
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
374
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
375
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
376
+#endif
377
+    add             v16.8h, v16.8h, v31.8h
378
+    add             v17.8h, v17.8h, v31.8h
379
+    add             v18.8h, v18.8h, v31.8h
380
+    add             v19.8h, v19.8h, v31.8h
381
+    add             v20.8h, v20.8h, v31.8h
382
+    add             v21.8h, v21.8h, v31.8h
383
+    st1             {v16.16b-v19.16b}, x2, #64
384
+    st1             {v20.16b-v21.16b}, x2, x3
385
+    cbnz            x9, .loop_filterP2S_48x64
386
+    ret
387
+endfunc
388
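The pixel-prim.cpp file added below reimplements the SATD/SA8D cost primitives with NEON intrinsics. What those kernels compute is the sum of absolute values of a 2-D Hadamard transform of the residual; a scalar 4x4 sketch for reference (illustrative only, x265's own C primitive lives elsewhere and may differ in minor details):

    #include <stdint.h>
    #include <stdlib.h>

    // Scalar reference for a 4x4 SATD: residual, 4-point Hadamard along rows
    // and columns, then a sum of absolute coefficients (halved, x264-style).
    static int satd_4x4_ref(const uint8_t* pix1, intptr_t stride1,
                            const uint8_t* pix2, intptr_t stride2)
    {
        int d[4][4], m[4][4], sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];
        for (int y = 0; y < 4; y++)
        {
            int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
            int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
            m[y][0] = a0 + a2; m[y][2] = a0 - a2;
            m[y][1] = a1 + a3; m[y][3] = a1 - a3;
        }
        for (int x = 0; x < 4; x++)
        {
            int a0 = m[0][x] + m[1][x], a1 = m[0][x] - m[1][x];
            int a2 = m[2][x] + m[3][x], a3 = m[2][x] - m[3][x];
            sum += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
        }
        return sum >> 1;
    }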
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp Added
2061
 
1
@@ -0,0 +1,2059 @@
2
+#include "common.h"
3
+#include "slicetype.h"      // LOWRES_COST_MASK
4
+#include "primitives.h"
5
+#include "x265.h"
6
+
7
+#include "pixel-prim.h"
8
+#include "arm64-utils.h"
9
+#if HAVE_NEON
10
+
11
+#include <arm_neon.h>
12
+
13
+using namespace X265_NS;
14
+
15
+
16
+
17
+namespace
18
+{
19
+
20
+
21
+/* SATD SA8D variants - based on x264 */
22
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
23
+{
24
+    sum = vaddq_s16(a, b);
25
+    sub = vsubq_s16(a, b);
26
+}
27
+
28
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
29
+{
30
+    t1 = vtrn1q_s16(s1, s2);
31
+    t2 = vtrn2q_s16(s1, s2);
32
+}
33
+
34
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
35
+{
36
+    t1 = vtrn1q_s32(s1, s2);
37
+    t2 = vtrn2q_s32(s1, s2);
38
+}
39
+
40
+#if (X265_DEPTH <= 10)
41
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
42
+{
43
+    t1 = vtrn1q_s64(s1, s2);
44
+    t2 = vtrn2q_s64(s1, s2);
45
+}
46
+#endif
47
+
48
+
49
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
50
+                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
51
+{
52
+    SUMSUB_AB(s1, d1, a, b);
53
+    SUMSUB_AB(s2, d2, c, d);
54
+}
55
+
56
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
57
+                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
58
+{
59
+    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
60
+    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
61
+}
62
+
63
+
64
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
65
+
66
+{
67
+
68
+    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
69
+
70
+
71
+    SUMSUB_AB(v16, v17, v0,  v1);
72
+    SUMSUB_AB(v18, v19, v2,  v3);
73
+
74
+    SUMSUB_AB(v4 , v6 , v16, v18);
75
+    SUMSUB_AB(v5 , v7 , v17, v19);
76
+
77
+    v0 = vtrn1q_s16(v4, v5);
78
+    v1 = vtrn2q_s16(v4, v5);
79
+    v2 = vtrn1q_s16(v6, v7);
80
+    v3 = vtrn2q_s16(v6, v7);
81
+
82
+    SUMSUB_AB(v16, v17, v0,  v1);
83
+    SUMSUB_AB(v18, v19, v2,  v3);
84
+
85
+    v0 = vtrn1q_s32(v16, v18);
86
+    v1 = vtrn2q_s32(v16, v18);
87
+    v2 = vtrn1q_s32(v17, v19);
88
+    v3 = vtrn2q_s32(v17, v19);
89
+
90
+    v0 = vabsq_s16(v0);
91
+    v1 = vabsq_s16(v1);
92
+    v2 = vabsq_s16(v2);
93
+    v3 = vabsq_s16(v3);
94
+
95
+    v0 = vmaxq_u16(v0, v1);
96
+    v1 = vmaxq_u16(v2, v3);
97
+
98
+    v0 = vaddq_u16(v0, v1);
99
+    return vaddlvq_u16(v0);
100
+}
101
+
102
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
103
+{
104
+    int16x8_t v2, v3;
105
+    SUMSUB_AB(v2,  v3,  v0,  v1);
106
+
107
+    v0 = vzip1q_s64(v2, v3);
108
+    v1 = vzip2q_s64(v2, v3);
109
+    SUMSUB_AB(v2,  v3,  v0,  v1);
110
+
111
+    v0 = vtrn1q_s16(v2, v3);
112
+    v1 = vtrn2q_s16(v2, v3);
113
+    SUMSUB_AB(v2,  v3,  v0,  v1);
114
+
115
+    v0 = vtrn1q_s32(v2, v3);
116
+    v1 = vtrn2q_s32(v2, v3);
117
+
118
+    v0 = vabsq_s16(v0);
119
+    v1 = vabsq_s16(v1);
120
+    v0 = vmaxq_u16(v0, v1);
121
+
122
+    return vaddlvq_s16(v0);
123
+}
124
+
125
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
126
+                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
127
+{
128
+    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
129
+
130
+    SUMSUB_AB(v16, v18, v0,  v2);
131
+    SUMSUB_AB(v17, v19, v1,  v3);
132
+
133
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
134
+
135
+    transpose_8h(v0,  v1,  v16, v17);
136
+    transpose_8h(v2,  v3,  v18, v19);
137
+    transpose_8h(v4,  v5,  v20, v21);
138
+    transpose_8h(v6,  v7,  v22, v23);
139
+
140
+    SUMSUB_AB(v16, v17, v0,  v1);
141
+    SUMSUB_AB(v18, v19, v2,  v3);
142
+    SUMSUB_AB(v20, v21, v4,  v5);
143
+    SUMSUB_AB(v22, v23, v6,  v7);
144
+
145
+    transpose_4s(v0,  v2,  v16, v18);
146
+    transpose_4s(v1,  v3,  v17, v19);
147
+    transpose_4s(v4,  v6,  v20, v22);
148
+    transpose_4s(v5,  v7,  v21, v23);
149
+
150
+    v0 = vabsq_s16(v0);
151
+    v1 = vabsq_s16(v1);
152
+    v2 = vabsq_s16(v2);
153
+    v3 = vabsq_s16(v3);
154
+    v4 = vabsq_s16(v4);
155
+    v5 = vabsq_s16(v5);
156
+    v6 = vabsq_s16(v6);
157
+    v7 = vabsq_s16(v7);
158
+
159
+    v0 = vmaxq_u16(v0, v2);
160
+    v1 = vmaxq_u16(v1, v3);
161
+    v2 = vmaxq_u16(v4, v6);
162
+    v3 = vmaxq_u16(v5, v7);
163
+
164
+}
165
+
166
+#if HIGH_BIT_DEPTH
167
+
168
+#if (X265_DEPTH > 10)
169
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
170
+{
171
+    t1 = vtrn1q_s64(s1, s2);
172
+    t2 = vtrn2q_s64(s1, s2);
173
+}
174
+
175
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
176
+{
177
+    sum = vaddq_s32(a, b);
178
+    sub = vsubq_s32(a, b);
179
+}
180
+
181
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
182
+        const int16x8_t a, const int16x8_t b)
183
+{
184
+    suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
185
+    sumh = vaddl_high_s16(a, b);
186
+    subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
187
+    subh = vsubl_high_s16(a, b);
188
+}
189
+
190
+#endif
191
+
192
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
193
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
194
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
195
+{
196
+    uint16x8_t r0, r1, r2, r3;
197
+    uint16x8_t t0, t1, t2, t3;
198
+    int16x8_t v16, v17;
199
+    int16x8_t v18, v19;
200
+
201
+    r0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
202
+    r1 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
203
+    r2 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
204
+    r3 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
205
+
206
+    t0 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
207
+    t1 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
208
+    t2 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
209
+    t3 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
210
+
211
+    v16 = vsubq_u16(r0, t0);
212
+    v17 = vsubq_u16(r1, t1);
213
+    v18 = vsubq_u16(r2, t2);
214
+    v19 = vsubq_u16(r3, t3);
215
+
216
+    r0 = *(uint16x8_t *)(pix1 + 4 * stride_pix1);
217
+    r1 = *(uint16x8_t *)(pix1 + 5 * stride_pix1);
218
+    r2 = *(uint16x8_t *)(pix1 + 6 * stride_pix1);
219
+    r3 = *(uint16x8_t *)(pix1 + 7 * stride_pix1);
220
+
221
+    t0 = *(uint16x8_t *)(pix2 + 4 * stride_pix2);
222
+    t1 = *(uint16x8_t *)(pix2 + 5 * stride_pix2);
223
+    t2 = *(uint16x8_t *)(pix2 + 6 * stride_pix2);
224
+    t3 = *(uint16x8_t *)(pix2 + 7 * stride_pix2);
225
+
226
+    v20 = vsubq_u16(r0, t0);
227
+    v21 = vsubq_u16(r1, t1);
228
+    v22 = vsubq_u16(r2, t2);
229
+    v23 = vsubq_u16(r3, t3);
230
+
231
+    SUMSUB_AB(v0,  v1,  v16, v17);
232
+    SUMSUB_AB(v2,  v3,  v18, v19);
233
+
234
+}
235
+
236
+
237
+
238
+
239
+static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
240
+                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
241
+{
242
+    uint8x16_t r0, r1, r2, r3;
243
+    uint8x16_t t0, t1, t2, t3;
244
+    int16x8_t v16, v17, v20, v21;
245
+    int16x8_t v18, v19, v22, v23;
246
+
247
+    r0 = *(int16x8_t *)(pix1 + 0 * stride_pix1);
248
+    r1 = *(int16x8_t *)(pix1 + 1 * stride_pix1);
249
+    r2 = *(int16x8_t *)(pix1 + 2 * stride_pix1);
250
+    r3 = *(int16x8_t *)(pix1 + 3 * stride_pix1);
251
+
252
+    t0 = *(int16x8_t *)(pix2 + 0 * stride_pix2);
253
+    t1 = *(int16x8_t *)(pix2 + 1 * stride_pix2);
254
+    t2 = *(int16x8_t *)(pix2 + 2 * stride_pix2);
255
+    t3 = *(int16x8_t *)(pix2 + 3 * stride_pix2);
256
+
257
+
258
+    v16 = vsubq_u16((r0), (t0));
259
+    v17 = vsubq_u16((r1), (t1));
260
+    v18 = vsubq_u16((r2), (t2));
261
+    v19 = vsubq_u16((r3), (t3));
262
+
263
+    r0 = *(int16x8_t *)(pix1 + 0 * stride_pix1 + 8);
264
+    r1 = *(int16x8_t *)(pix1 + 1 * stride_pix1 + 8);
265
+    r2 = *(int16x8_t *)(pix1 + 2 * stride_pix1 + 8);
266
+    r3 = *(int16x8_t *)(pix1 + 3 * stride_pix1 + 8);
267
+
268
+    t0 = *(int16x8_t *)(pix2 + 0 * stride_pix2 + 8);
269
+    t1 = *(int16x8_t *)(pix2 + 1 * stride_pix2 + 8);
270
+    t2 = *(int16x8_t *)(pix2 + 2 * stride_pix2 + 8);
271
+    t3 = *(int16x8_t *)(pix2 + 3 * stride_pix2 + 8);
272
+
273
+
274
+    v20 = vsubq_u16(r0, t0);
275
+    v21 = vsubq_u16(r1, t1);
276
+    v22 = vsubq_u16(r2, t2);
277
+    v23 = vsubq_u16(r3, t3);
278
+
279
+    SUMSUB_AB(v0,  v1,  v16, v17);
280
+    SUMSUB_AB(v2,  v3,  v18, v19);
281
+
282
+    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
283
+
284
+}
285
+
286
+
287
+int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
288
+{
289
+    uint64x2_t t0, t1, r0, r1;
290
+    t0[0] = *(uint64_t *)(pix1 + 0 * stride_pix1);
291
+    t1[0] = *(uint64_t *)(pix1 + 1 * stride_pix1);
292
+    t0[1] = *(uint64_t *)(pix1 + 2 * stride_pix1);
293
+    t1[1] = *(uint64_t *)(pix1 + 3 * stride_pix1);
294
+
295
+    r0[0] = *(uint64_t *)(pix2 + 0 * stride_pix1);
296
+    r1[0] = *(uint64_t *)(pix2 + 1 * stride_pix2);
297
+    r0[1] = *(uint64_t *)(pix2 + 2 * stride_pix2);
298
+    r1[1] = *(uint64_t *)(pix2 + 3 * stride_pix2);
299
+
300
+    return _satd_4x4_neon(vsubq_u16(t0, r0), vsubq_u16(r1, t1));
301
+}
302
+
303
+
304
+
305
+
306
+
307
+
308
+int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
309
+{
310
+    uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
311
+
312
+    i0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
313
+    i1 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
314
+    i2 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
315
+    i3 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
316
+    i4 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
317
+    i5 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
318
+    i6 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
319
+    i7 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
320
+
321
+    int16x8_t v0 = vsubq_u16(i0, i1);
322
+    int16x8_t v1 = vsubq_u16(i2, i3);
323
+    int16x8_t v2 = vsubq_u16(i4, i5);
324
+    int16x8_t v3 = vsubq_u16(i6, i7);
325
+
326
+    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
327
+}
328
+
329
+
330
+int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
331
+{
332
+    int32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
333
+    int16x8_t v0, v1, v2, v3;
334
+    for (int offset = 0; offset <= 12; offset += 4) {
335
+        _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1, pix2 + offset * stride_pix2, stride_pix2, v0, v1, v2, v3);
336
+        v30 = vpadalq_u16(v30, v0);
337
+        v30 = vpadalq_u16(v30, v1);
338
+        v31 = vpadalq_u16(v31, v2);
339
+        v31 = vpadalq_u16(v31, v3);
340
+    }
341
+    return vaddvq_s32(vaddq_s32(v30, v31));
342
+
343
+}
344
+
345
+#else       //HIGH_BIT_DEPTH
346
+
347
+static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
348
+                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
349
+{
350
+    uint8x16_t r0, r1, r2, r3;
351
+    uint8x16_t t0, t1, t2, t3;
352
+    int16x8_t v16, v17, v20, v21;
353
+    int16x8_t v18, v19, v22, v23;
354
+
355
+    r0 = *(uint8x16_t *)(pix1 + 0 * stride_pix1);
356
+    r1 = *(uint8x16_t *)(pix1 + 1 * stride_pix1);
357
+    r2 = *(uint8x16_t *)(pix1 + 2 * stride_pix1);
358
+    r3 = *(uint8x16_t *)(pix1 + 3 * stride_pix1);
359
+
360
+    t0 = *(uint8x16_t *)(pix2 + 0 * stride_pix2);
361
+    t1 = *(uint8x16_t *)(pix2 + 1 * stride_pix2);
362
+    t2 = *(uint8x16_t *)(pix2 + 2 * stride_pix2);
363
+    t3 = *(uint8x16_t *)(pix2 + 3 * stride_pix2);
364
+
365
+
366
+
367
+    v16 = vsubl_u8(vget_low_u8(r0), vget_low_u8(t0));
368
+    v20 = vsubl_high_u8(r0, t0);
369
+    v17 = vsubl_u8(vget_low_u8(r1), vget_low_u8(t1));
370
+    v21 = vsubl_high_u8(r1, t1);
371
+    v18 = vsubl_u8(vget_low_u8(r2), vget_low_u8(t2));
372
+    v22 = vsubl_high_u8(r2, t2);
373
+    v19 = vsubl_u8(vget_low_u8(r3), vget_low_u8(t3));
374
+    v23 = vsubl_high_u8(r3, t3);
375
+
376
+    SUMSUB_AB(v0,  v1,  v16, v17);
377
+    SUMSUB_AB(v2,  v3,  v18, v19);
378
+
379
+    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
380
+
381
+}
382
+
383
+
384
+static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
385
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
386
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
387
+{
388
+    uint8x8_t r0, r1, r2, r3;
389
+    uint8x8_t t0, t1, t2, t3;
390
+    int16x8_t v16, v17;
391
+    int16x8_t v18, v19;
392
+
393
+    r0 = *(uint8x8_t *)(pix1 + 0 * stride_pix1);
394
+    r1 = *(uint8x8_t *)(pix1 + 1 * stride_pix1);
395
+    r2 = *(uint8x8_t *)(pix1 + 2 * stride_pix1);
396
+    r3 = *(uint8x8_t *)(pix1 + 3 * stride_pix1);
397
+
398
+    t0 = *(uint8x8_t *)(pix2 + 0 * stride_pix2);
399
+    t1 = *(uint8x8_t *)(pix2 + 1 * stride_pix2);
400
+    t2 = *(uint8x8_t *)(pix2 + 2 * stride_pix2);
401
+    t3 = *(uint8x8_t *)(pix2 + 3 * stride_pix2);
402
+
403
+    v16 = vsubl_u8(r0, t0);
404
+    v17 = vsubl_u8(r1, t1);
405
+    v18 = vsubl_u8(r2, t2);
406
+    v19 = vsubl_u8(r3, t3);
407
+
408
+    r0 = *(uint8x8_t *)(pix1 + 4 * stride_pix1);
409
+    r1 = *(uint8x8_t *)(pix1 + 5 * stride_pix1);
410
+    r2 = *(uint8x8_t *)(pix1 + 6 * stride_pix1);
411
+    r3 = *(uint8x8_t *)(pix1 + 7 * stride_pix1);
412
+
413
+    t0 = *(uint8x8_t *)(pix2 + 4 * stride_pix2);
414
+    t1 = *(uint8x8_t *)(pix2 + 5 * stride_pix2);
415
+    t2 = *(uint8x8_t *)(pix2 + 6 * stride_pix2);
416
+    t3 = *(uint8x8_t *)(pix2 + 7 * stride_pix2);
417
+
418
+    v20 = vsubl_u8(r0, t0);
419
+    v21 = vsubl_u8(r1, t1);
420
+    v22 = vsubl_u8(r2, t2);
421
+    v23 = vsubl_u8(r3, t3);
422
+
423
+
424
+    SUMSUB_AB(v0,  v1,  v16, v17);
425
+    SUMSUB_AB(v2,  v3,  v18, v19);
426
+
427
+}
428
+
429
+int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
430
+{
431
+    uint32x2_t t0, t1, r0, r1;
432
+    t0[0] = *(uint32_t *)(pix1 + 0 * stride_pix1);
433
+    t1[0] = *(uint32_t *)(pix1 + 1 * stride_pix1);
434
+    t0[1] = *(uint32_t *)(pix1 + 2 * stride_pix1);
435
+    t1[1] = *(uint32_t *)(pix1 + 3 * stride_pix1);
436
+
437
+    r0[0] = *(uint32_t *)(pix2 + 0 * stride_pix1);
438
+    r1[0] = *(uint32_t *)(pix2 + 1 * stride_pix2);
439
+    r0[1] = *(uint32_t *)(pix2 + 2 * stride_pix2);
440
+    r1[1] = *(uint32_t *)(pix2 + 3 * stride_pix2);
441
+
442
+    return _satd_4x4_neon(vsubl_u8(t0, r0), vsubl_u8(r1, t1));
443
+}
444
+
445
+
446
+int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
447
+{
448
+    uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
449
+
450
+    i0 = *(uint8x8_t *)(pix1 + 0 * stride_pix1);
451
+    i1 = *(uint8x8_t *)(pix2 + 0 * stride_pix2);
452
+    i2 = *(uint8x8_t *)(pix1 + 1 * stride_pix1);
453
+    i3 = *(uint8x8_t *)(pix2 + 1 * stride_pix2);
454
+    i4 = *(uint8x8_t *)(pix1 + 2 * stride_pix1);
455
+    i5 = *(uint8x8_t *)(pix2 + 2 * stride_pix2);
456
+    i6 = *(uint8x8_t *)(pix1 + 3 * stride_pix1);
457
+    i7 = *(uint8x8_t *)(pix2 + 3 * stride_pix2);
458
+
459
+    int16x8_t v0 = vsubl_u8(i0, i1);
460
+    int16x8_t v1 = vsubl_u8(i2, i3);
461
+    int16x8_t v2 = vsubl_u8(i4, i5);
462
+    int16x8_t v3 = vsubl_u8(i6, i7);
463
+
464
+    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
465
+}
466
+
467
+int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
468
+{
469
+    int16x8_t v30, v31;
470
+    int16x8_t v0, v1, v2, v3;
471
+
472
+    _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
473
+    v30 = vaddq_s16(v0, v1);
474
+    v31 = vaddq_s16(v2, v3);
475
+
476
+    _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
477
+    v0 = vaddq_s16(v0, v1);
478
+    v1 = vaddq_s16(v2, v3);
479
+    v30 = vaddq_s16(v30, v0);
480
+    v31 = vaddq_s16(v31, v1);
481
+
482
+    _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
483
+    v0 = vaddq_s16(v0, v1);
484
+    v1 = vaddq_s16(v2, v3);
485
+    v30 = vaddq_s16(v30, v0);
486
+    v31 = vaddq_s16(v31, v1);
487
+
488
+    _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
489
+    v0 = vaddq_s16(v0, v1);
490
+    v1 = vaddq_s16(v2, v3);
491
+    v30 = vaddq_s16(v30, v0);
492
+    v31 = vaddq_s16(v31, v1);
493
+
494
+    int32x4_t sum0 = vpaddlq_u16(v30);
495
+    int32x4_t sum1 = vpaddlq_u16(v31);
496
+    sum0 = vaddq_s32(sum0, sum1);
497
+    return vaddvq_s32(sum0);
498
+
499
+}
500
+#endif      //HIGH_BIT_DEPTH
501
+
502
+
503
+static inline void _sa8d_8x8_neon_end(int16x8_t &v0, int16x8_t &v1, int16x8_t v2, int16x8_t v3,
504
+                                      int16x8_t v20, int16x8_t v21, int16x8_t v22, int16x8_t v23)
505
+{
506
+    int16x8_t v16, v17, v18, v19;
507
+    int16x8_t v4, v5, v6, v7;
508
+
509
+    SUMSUB_AB(v16, v18, v0,  v2);
510
+    SUMSUB_AB(v17, v19, v1,  v3);
511
+
512
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
513
+
514
+    SUMSUB_AB(v0,  v16, v16, v20);
515
+    SUMSUB_AB(v1,  v17, v17, v21);
516
+    SUMSUB_AB(v2,  v18, v18, v22);
517
+    SUMSUB_AB(v3,  v19, v19, v23);
518
+
519
+    transpose_8h(v20, v21, v16, v17);
520
+    transpose_8h(v4,  v5,  v0,  v1);
521
+    transpose_8h(v22, v23, v18, v19);
522
+    transpose_8h(v6,  v7,  v2,  v3);
523
+
524
+#if (X265_DEPTH <= 10)
525
+
526
+    int16x8_t v24, v25;
527
+
528
+    SUMSUB_AB(v2,  v3,  v20, v21);
529
+    SUMSUB_AB(v24, v25, v4,  v5);
530
+    SUMSUB_AB(v0,  v1,  v22, v23);
531
+    SUMSUB_AB(v4,  v5,  v6,  v7);
532
+
533
+    transpose_4s(v20, v22, v2,  v0);
534
+    transpose_4s(v21, v23, v3,  v1);
535
+    transpose_4s(v16, v18, v24, v4);
536
+    transpose_4s(v17, v19, v25, v5);
537
+
538
+    SUMSUB_AB(v0,  v2,  v20, v22);
539
+    SUMSUB_AB(v1,  v3,  v21, v23);
540
+    SUMSUB_AB(v4,  v6,  v16, v18);
541
+    SUMSUB_AB(v5,  v7,  v17, v19);
542
+
543
+    transpose_2d(v16, v20,  v0,  v4);
544
+    transpose_2d(v17, v21,  v1,  v5);
545
+    transpose_2d(v18, v22,  v2,  v6);
546
+    transpose_2d(v19, v23,  v3,  v7);
547
+
548
+
549
+    v16 = vabsq_s16(v16);
550
+    v17 = vabsq_s16(v17);
551
+    v18 = vabsq_s16(v18);
552
+    v19 = vabsq_s16(v19);
553
+    v20 = vabsq_s16(v20);
554
+    v21 = vabsq_s16(v21);
555
+    v22 = vabsq_s16(v22);
556
+    v23 = vabsq_s16(v23);
557
+
558
+    v16 = vmaxq_u16(v16, v20);
559
+    v17 = vmaxq_u16(v17, v21);
560
+    v18 = vmaxq_u16(v18, v22);
561
+    v19 = vmaxq_u16(v19, v23);
562
+
563
+#if HIGH_BIT_DEPTH
564
+    v0 = vpaddlq_u16(v16);
565
+    v1 = vpaddlq_u16(v17);
566
+    v0 = vpadalq_u16(v0, v18);
567
+    v1 = vpadalq_u16(v1, v19);
568
+
569
+#else //HIGH_BIT_DEPTH
570
+
571
+    v0 = vaddq_u16(v16, v17);
572
+    v1 = vaddq_u16(v18, v19);
573
+
574
+#endif //HIGH_BIT_DEPTH
575
+
576
+#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32; each int16x8 is up-converted to 2 int32x4 (low and high)
577
+
578
+    int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
579
+    int32x4_t v22l, v22h, v23l, v23h;
580
+    int32x4_t v4l, v4h, v5l, v5h;
581
+    int32x4_t v6l, v6h, v7l, v7h;
582
+    int32x4_t v16l, v16h, v17l, v17h;
583
+    int32x4_t v18l, v18h, v19l, v19h;
584
+    int32x4_t v20l, v20h, v21l, v21h;
585
+
586
+    ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
587
+    ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
588
+
589
+    v22l = vmovl_s16(vget_low_s16(v22));
590
+    v22h = vmovl_high_s16(v22);
591
+    v23l = vmovl_s16(vget_low_s16(v23));
592
+    v23h = vmovl_high_s16(v23);
593
+
594
+    ISUMSUB_AB(v0l,  v1l,  v22l, v23l);
595
+    ISUMSUB_AB(v0h,  v1h,  v22h, v23h);
596
+
597
+    v6l = vmovl_s16(vget_low_s16(v6));
598
+    v6h = vmovl_high_s16(v6);
599
+    v7l = vmovl_s16(vget_low_s16(v7));
600
+    v7h = vmovl_high_s16(v7);
601
+
602
+    ISUMSUB_AB(v4l,  v5l,  v6l,  v7l);
603
+    ISUMSUB_AB(v4h,  v5h,  v6h,  v7h);
604
+
605
+    transpose_2d(v20l, v22l, v2l,  v0l);
606
+    transpose_2d(v21l, v23l, v3l,  v1l);
607
+    transpose_2d(v16l, v18l, v24l, v4l);
608
+    transpose_2d(v17l, v19l, v25l, v5l);
609
+
610
+    transpose_2d(v20h, v22h, v2h,  v0h);
611
+    transpose_2d(v21h, v23h, v3h,  v1h);
612
+    transpose_2d(v16h, v18h, v24h, v4h);
613
+    transpose_2d(v17h, v19h, v25h, v5h);
614
+
615
+    ISUMSUB_AB(v0l,  v2l,  v20l, v22l);
616
+    ISUMSUB_AB(v1l,  v3l,  v21l, v23l);
617
+    ISUMSUB_AB(v4l,  v6l,  v16l, v18l);
618
+    ISUMSUB_AB(v5l,  v7l,  v17l, v19l);
619
+
620
+    ISUMSUB_AB(v0h,  v2h,  v20h, v22h);
621
+    ISUMSUB_AB(v1h,  v3h,  v21h, v23h);
622
+    ISUMSUB_AB(v4h,  v6h,  v16h, v18h);
623
+    ISUMSUB_AB(v5h,  v7h,  v17h, v19h);
624
+
625
+    v16l = v0l;
626
+    v16h = v4l;
627
+    v20l = v0h;
628
+    v20h = v4h;
629
+
630
+    v17l = v1l;
631
+    v17h = v5l;
632
+    v21l = v1h;
633
+    v21h = v5h;
634
+
635
+    v18l = v2l;
636
+    v18h = v6l;
637
+    v22l = v2h;
638
+    v22h = v6h;
639
+
640
+    v19l = v3l;
641
+    v19h = v7l;
642
+    v23l = v3h;
643
+    v23h = v7h;
644
+
645
+    v16l = vabsq_s32(v16l);
646
+    v17l = vabsq_s32(v17l);
647
+    v18l = vabsq_s32(v18l);
648
+    v19l = vabsq_s32(v19l);
649
+    v20l = vabsq_s32(v20l);
650
+    v21l = vabsq_s32(v21l);
651
+    v22l = vabsq_s32(v22l);
652
+    v23l = vabsq_s32(v23l);
653
+
654
+    v16h = vabsq_s32(v16h);
655
+    v17h = vabsq_s32(v17h);
656
+    v18h = vabsq_s32(v18h);
657
+    v19h = vabsq_s32(v19h);
658
+    v20h = vabsq_s32(v20h);
659
+    v21h = vabsq_s32(v21h);
660
+    v22h = vabsq_s32(v22h);
661
+    v23h = vabsq_s32(v23h);
662
+
663
+    v16l = vmaxq_u32(v16l, v20l);
664
+    v17l = vmaxq_u32(v17l, v21l);
665
+    v18l = vmaxq_u32(v18l, v22l);
666
+    v19l = vmaxq_u32(v19l, v23l);
667
+
668
+    v16h = vmaxq_u32(v16h, v20h);
669
+    v17h = vmaxq_u32(v17h, v21h);
670
+    v18h = vmaxq_u32(v18h, v22h);
671
+    v19h = vmaxq_u32(v19h, v23h);
672
+
673
+    v16l = vaddq_u32(v16l, v16h);
674
+    v17l = vaddq_u32(v17l, v17h);
675
+    v18l = vaddq_u32(v18l, v18h);
676
+    v19l = vaddq_u32(v19l, v19h);
677
+
678
+    v0 = vaddq_u32(v16l, v17l);
679
+    v1 = vaddq_u32(v18l, v19l);
680
+
681
+
682
+#endif
683
+
684
+}
685
+
686
+
687
+
688
+static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
689
+                                  int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
690
+{
691
+
692
+    int16x8_t v20, v21, v22, v23;
693
+    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
694
+    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
695
+
696
+}
697
+
698
+
699
+
700
+int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
701
+{
702
+    int16x8_t v30, v31;
703
+    int16x8_t v0, v1, v2, v3;
704
+
705
+    _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
706
+#if !(HIGH_BIT_DEPTH)
707
+    v30 = vaddq_u16(v0, v1);
708
+    v31 = vaddq_u16(v2, v3);
709
+
710
+    uint16x8_t sum = vaddq_u16(v30, v31);
711
+    return vaddvq_s32(vpaddlq_u16(sum));
712
+#else
713
+
714
+    v30 = vaddq_u16(v0, v1);
715
+    v31 = vaddq_u16(v2, v3);
716
+
717
+    int32x4_t sum = vpaddlq_u16(v30);
718
+    sum = vpadalq_u16(sum, v31);
719
+    return vaddvq_s32(sum);
720
+#endif
721
+}
722
+
723
+
724
+int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
725
+{
726
+    int16x8_t v0, v1, v2, v3;
727
+    int16x8_t v20, v21, v22, v23;
728
+
729
+    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
730
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
731
+
732
+#if HIGH_BIT_DEPTH
733
+    int32x4_t s = vaddq_u32(v0, v1);
734
+    return (vaddvq_u32(s) + 1) >> 1;
735
+#else
736
+    return (vaddlvq_s16(vaddq_u16(v0, v1)) + 1) >> 1;
737
+#endif
738
+}
739
+
740
+
741
+
742
+
743
+
744
+int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
745
+{
746
+    int16x8_t v0, v1, v2, v3;
747
+    int16x8_t v20, v21, v22, v23;
748
+    int32x4_t v30, v31;
749
+
750
+    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
751
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
752
+
753
+#if !(HIGH_BIT_DEPTH)
754
+    v30 = vpaddlq_u16(v0);
755
+    v31 = vpaddlq_u16(v1);
756
+#else
757
+    v30 = vaddq_s32(v0, v1);
758
+#endif
759
+
760
+    _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
761
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
762
+
763
+#if !(HIGH_BIT_DEPTH)
764
+    v30 = vpadalq_u16(v30, v0);
765
+    v31 = vpadalq_u16(v31, v1);
766
+#else
767
+    v31 = vaddq_s32(v0, v1);
768
+#endif
769
+
770
+
771
+    _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
772
+                 v23);
773
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
774
+
775
+#if !(HIGH_BIT_DEPTH)
776
+    v30 = vpadalq_u16(v30, v0);
777
+    v31 = vpadalq_u16(v31, v1);
778
+#else
779
+    v30 = vaddq_s32(v30, v0);
780
+    v31 = vaddq_s32(v31, v1);
781
+#endif
782
+
783
+    _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
784
+                 v22, v23);
785
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
786
+
787
+#if !(HIGH_BIT_DEPTH)
788
+    v30 = vpadalq_u16(v30, v0);
789
+    v31 = vpadalq_u16(v31, v1);
790
+#else
791
+    v30 = vaddq_s32(v30, v0);
792
+    v31 = vaddq_s32(v31, v1);
793
+#endif
794
+
795
+    v30 = vaddq_u32(v30, v31);
796
+
797
+    return (vaddvq_u32(v30) + 1) >> 1;
798
+}
799
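For reference, the 16x16 routine above sums the un-rounded costs of its four 8x8 quadrants and applies the (+1) >> 1 normalisation once at the end. A scalar sketch of that structure (the 8x8 cost is taken as a function pointer purely to keep the sketch self-contained; it is not part of this patch):

    #include <cstdint>

    // Mirrors how pixel_sa8d_16x16_neon walks the four 8x8 quadrants and
    // rounds once at the end; sa8d8x8 must return the un-rounded 8x8 cost.
    static int sa8d_16x16_ref(const uint8_t *pix1, intptr_t s1,
                              const uint8_t *pix2, intptr_t s2,
                              int (*sa8d8x8)(const uint8_t *, intptr_t,
                                             const uint8_t *, intptr_t))
    {
        int sum = sa8d8x8(pix1,              s1, pix2,              s2)
                + sa8d8x8(pix1 + 8,          s1, pix2 + 8,          s2)
                + sa8d8x8(pix1 + 8 * s1,     s1, pix2 + 8 * s2,     s2)
                + sa8d8x8(pix1 + 8 * s1 + 8, s1, pix2 + 8 * s2 + 8, s2);
        return (sum + 1) >> 1;
    }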
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+template<int size>
808
+void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
809
+{
810
+    for (int y = 0; y < size; y++)
811
+    {
812
+        int x = 0;
813
+        int16x8_t v = vdupq_n_s16(val);
814
+        for (; (x + 8) <= size; x += 8)
815
+        {
816
+            *(int16x8_t *)&dst[y * dstride + x] = v;
817
+        }
818
+        for (; x < size; x++)
819
+        {
820
+            dst[y * dstride + x] = val;
821
+        }
822
+    }
823
+}
824
+
825
+template<int lx, int ly>
826
+int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
827
+{
828
+    int sum = 0;
829
+
830
+
831
+    for (int y = 0; y < ly; y++)
832
+    {
833
+#if HIGH_BIT_DEPTH
834
+        int x = 0;
835
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
836
+        for (; (x + 8) <= lx; x += 8)
837
+        {
838
+            uint16x8_t p1 = *(uint16x8_t *)&pix1[x];
839
+            uint16x8_t p2 = *(uint16x8_t *)&pix2[x];
840
+            vsum16_1 = vabaq_s16(vsum16_1, p1, p2);
841
+
842
+        }
843
+        if (lx & 4)
844
+        {
845
+            uint16x4_t p1 = *(uint16x4_t *)&pix1[x];
846
+            uint16x4_t p2 = *(uint16x4_t *)&pix2[x];
847
+            sum += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
848
+            x += 4;
849
+        }
850
+        if (lx >= 4)
851
+        {
852
+            sum += vaddlvq_s16(vsum16_1);
853
+        }
854
+
855
+#else
856
+
857
+        int x = 0;
858
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
859
+        uint16x8_t vsum16_2 = vdupq_n_u16(0);
860
+
861
+        for (; (x + 16) <= lx; x += 16)
862
+        {
863
+            uint8x16_t p1 = *(uint8x16_t *)&pix1[x];
864
+            uint8x16_t p2 = *(uint8x16_t *)&pix2[x];
865
+            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p2));
866
+            vsum16_2 = vabal_high_u8(vsum16_2, p1, p2);
867
+        }
868
+        if (lx & 8)
869
+        {
870
+            uint8x8_t p1 = *(uint8x8_t *)&pix1[x];
871
+            uint8x8_t p2 = *(uint8x8_t *)&pix2[x];
872
+            vsum16_1 = vabal_u8(vsum16_1, p1, p2);
873
+            x += 8;
874
+        }
875
+        if (lx & 4)
876
+        {
877
+            uint32x2_t p1 = vdup_n_u32(0);
878
+            p1[0] = *(uint32_t *)&pix1[x];
879
+            uint32x2_t p2 = vdup_n_u32(0);
880
+            p2[0] = *(uint32_t *)&pix2[x];
881
+            vsum16_1 = vabal_u8(vsum16_1, p1, p2);
882
+            x += 4;
883
+        }
884
+        if (lx >= 16)
885
+        {
886
+            vsum16_1 = vaddq_u16(vsum16_1, vsum16_2);
887
+        }
888
+        if (lx >= 4)
889
+        {
890
+            sum += vaddvq_u16(vsum16_1);
891
+        }
892
+
893
+#endif
894
+        if (lx & 3) for (; x < lx; x++)
895
+            {
896
+                sum += abs(pix1[x] - pix2[x]);
897
+            }
898
+
899
+        pix1 += stride_pix1;
900
+        pix2 += stride_pix2;
901
+    }
902
+
903
+    return sum;
904
+}
905
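Whatever vector width the loops above pick for a given lx and bit depth, the value returned is the plain sum of absolute differences. A scalar sketch of the 8-bit case for comparison:

    #include <cstdint>
    #include <cstdlib>

    // Scalar sketch of what sad_pp_neon<lx, ly> computes on the 8-bit path.
    static int sad_ref(const uint8_t *pix1, intptr_t stride1,
                       const uint8_t *pix2, intptr_t stride2, int lx, int ly)
    {
        int sum = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
                sum += abs(pix1[x] - pix2[x]);
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }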
+
906
+template<int lx, int ly>
907
+void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, intptr_t frefstride,
908
+                 int32_t *res)
909
+{
910
+    res[0] = 0;
911
+    res[1] = 0;
912
+    res[2] = 0;
913
+    for (int y = 0; y < ly; y++)
914
+    {
915
+        int x = 0;
916
+        uint16x8_t vsum16_0 = vdupq_n_u16(0);
917
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
918
+        uint16x8_t vsum16_2 = vdupq_n_u16(0);
919
+#if HIGH_BIT_DEPTH
920
+        for (; (x + 8) <= lx; x += 8)
921
+        {
922
+            uint16x8_t p1 = *(uint16x8_t *)&pix1[x];
923
+            uint16x8_t p2 = *(uint16x8_t *)&pix2[x];
924
+            uint16x8_t p3 = *(uint16x8_t *)&pix3[x];
925
+            uint16x8_t p4 = *(uint16x8_t *)&pix4[x];
926
+            vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
927
+            vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
928
+            vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
929
+
930
+        }
931
+        if (lx & 4)
932
+        {
933
+            uint16x4_t p1 = *(uint16x4_t *)&pix1[x];
934
+            uint16x4_t p2 = *(uint16x4_t *)&pix2[x];
935
+            uint16x4_t p3 = *(uint16x4_t *)&pix3[x];
936
+            uint16x4_t p4 = *(uint16x4_t *)&pix4[x];
937
+            res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
938
+            res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p3));
939
+            res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p4));
940
+            x += 4;
941
+        }
942
+        if (lx >= 4)
943
+        {
944
+            res[0] += vaddlvq_s16(vsum16_0);
945
+            res[1] += vaddlvq_s16(vsum16_1);
946
+            res[2] += vaddlvq_s16(vsum16_2);
947
+        }
948
+#else
949
+
950
+        for (; (x + 16) <= lx; x += 16)
951
+        {
952
+            uint8x16_t p1 = *(uint8x16_t *)&pix1[x];
953
+            uint8x16_t p2 = *(uint8x16_t *)&pix2[x];
954
+            uint8x16_t p3 = *(uint8x16_t *)&pix3[x];
955
+            uint8x16_t p4 = *(uint8x16_t *)&pix4[x];
956
+            vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
957
+            vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
958
+            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
959
+            vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
960
+            vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
961
+            vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
962
+        }
963
+        if (lx & 8)
964
+        {
965
+            uint8x8_t p1 = *(uint8x8_t *)&pix1[x];
966
+            uint8x8_t p2 = *(uint8x8_t *)&pix2[x];
967
+            uint8x8_t p3 = *(uint8x8_t *)&pix3[x];
968
+            uint8x8_t p4 = *(uint8x8_t *)&pix4[x];
969
+            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
970
+            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
971
+            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
972
+            x += 8;
973
+        }
974
+        if (lx & 4)
975
+        {
976
+            uint32x2_t p1 = vdup_n_u32(0);
977
+            p1[0] = *(uint32_t *)&pix1[x];
978
+            uint32x2_t p2 = vdup_n_u32(0);
979
+            p2[0] = *(uint32_t *)&pix2[x];
980
+            uint32x2_t p3 = vdup_n_u32(0);
981
+            p3[0] = *(uint32_t *)&pix3[x];
982
+            uint32x2_t p4 = vdup_n_u32(0);
983
+            p4[0] = *(uint32_t *)&pix4[x];
984
+            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
985
+            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
986
+            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
987
+            x += 4;
988
+        }
989
+        if (lx >= 4)
990
+        {
991
+            res[0] += vaddvq_u16(vsum16_0);
992
+            res[1] += vaddvq_u16(vsum16_1);
993
+            res[2] += vaddvq_u16(vsum16_2);
994
+        }
995
+
996
+#endif
997
+        if (lx & 3) for (; x < lx; x++)
998
+            {
999
+                res[0] += abs(pix1[x] - pix2[x]);
1000
+                res[1] += abs(pix1[x] - pix3[x]);
1001
+                res[2] += abs(pix1[x] - pix4[x]);
1002
+            }
1003
+
1004
+        pix1 += FENC_STRIDE;
1005
+        pix2 += frefstride;
1006
+        pix3 += frefstride;
1007
+        pix4 += frefstride;
1008
+    }
1009
+}
1010
+
1011
+template<int lx, int ly>
1012
+void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
1013
+                 intptr_t frefstride, int32_t *res)
1014
+{
1015
+    int32x4_t result = {0};
1016
+    for (int y = 0; y < ly; y++)
1017
+    {
1018
+        int x = 0;
1019
+        uint16x8_t vsum16_0 = vdupq_n_u16(0);
1020
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
1021
+        uint16x8_t vsum16_2 = vdupq_n_u16(0);
1022
+        uint16x8_t vsum16_3 = vdupq_n_u16(0);
1023
+#if HIGH_BIT_DEPTH
1024
+        for (; (x + 16) <= lx; x += 16)
1025
+        {
1026
+            uint16x8x2_t p1 = vld1q_u16_x2(&pix1[x]);
1027
+            uint16x8x2_t p2 = vld1q_u16_x2(&pix2[x]);
1028
+            uint16x8x2_t p3 = vld1q_u16_x2(&pix3[x]);
1029
+            uint16x8x2_t p4 = vld1q_u16_x2(&pix4[x]);
1030
+            uint16x8x2_t p5 = vld1q_u16_x2(&pix5[x]);
1031
+            vsum16_0 = vabaq_s16(vsum16_0, p1.val[0], p2.val[0]);
1032
+            vsum16_1 = vabaq_s16(vsum16_1, p1.val[0], p3.val[0]);
1033
+            vsum16_2 = vabaq_s16(vsum16_2, p1.val[0], p4.val[0]);
1034
+            vsum16_3 = vabaq_s16(vsum16_3, p1.val[0], p5.val[0]);
1035
+            vsum16_0 = vabaq_s16(vsum16_0, p1.val[1], p2.val[1]);
1036
+            vsum16_1 = vabaq_s16(vsum16_1, p1.val[1], p3.val[1]);
1037
+            vsum16_2 = vabaq_s16(vsum16_2, p1.val[1], p4.val[1]);
1038
+            vsum16_3 = vabaq_s16(vsum16_3, p1.val[1], p5.val[1]);
1039
+        }
1040
+        if (lx & 8)
1041
+        {
1042
+            uint16x8_t p1 = *(uint16x8_t *)&pix1[x];
1043
+            uint16x8_t p2 = *(uint16x8_t *)&pix2[x];
1044
+            uint16x8_t p3 = *(uint16x8_t *)&pix3[x];
1045
+            uint16x8_t p4 = *(uint16x8_t *)&pix4[x];
1046
+            uint16x8_t p5 = *(uint16x8_t *)&pix5[x];
1047
+            vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
1048
+            vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
1049
+            vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
1050
+            vsum16_3 = vabaq_s16(vsum16_3, p1, p5);
1051
+            x += 8;
1052
+        }
1053
+        if (lx & 4)
1054
+        {
1055
+            /* This is equivalent to getting the absolute difference of pix1[x] with each of
1056
+             * pix2 - pix5, then summing across the vector (4 values each) and adding the
1057
+             * result to result. */
1058
+            uint16x8_t p1 = vreinterpretq_s16_u64(
1059
+                    vld1q_dup_u64((uint64_t *)&pix1[x]));
1060
+            uint16x8_t p2_3 = vcombine_s16(*(uint16x4_t *)&pix2[x], *(uint16x4_t *)&pix3[x]);
1061
+            uint16x8_t p4_5 = vcombine_s16(*(uint16x4_t *)&pix4[x], *(uint16x4_t *)&pix5[x]);
1062
+
1063
+            uint16x8_t a = vabdq_u16(p1, p2_3);
1064
+            uint16x8_t b = vabdq_u16(p1, p4_5);
1065
+
1066
+            result = vpadalq_s16(result, vpaddq_s16(a, b));
1067
+            x += 4;
1068
+        }
1069
+        if (lx >= 4)
1070
+        {
1071
+            /* This is equivalent to adding across each of the sum vectors and then adding
1072
+             * to result. */
1073
+            uint16x8_t a = vpaddq_s16(vsum16_0, vsum16_1);
1074
+            uint16x8_t b = vpaddq_s16(vsum16_2, vsum16_3);
1075
+            uint16x8_t c = vpaddq_s16(a, b);
1076
+            result = vpadalq_s16(result, c);
1077
+        }
1078
+
1079
+#else
1080
+
1081
+        for (; (x + 16) <= lx; x += 16)
1082
+        {
1083
+            uint8x16_t p1 = *(uint8x16_t *)&pix1[x];
1084
+            uint8x16_t p2 = *(uint8x16_t *)&pix2[x];
1085
+            uint8x16_t p3 = *(uint8x16_t *)&pix3[x];
1086
+            uint8x16_t p4 = *(uint8x16_t *)&pix4[x];
1087
+            uint8x16_t p5 = *(uint8x16_t *)&pix5[x];
1088
+            vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
1089
+            vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
1090
+            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
1091
+            vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
1092
+            vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
1093
+            vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
1094
+            vsum16_3 = vabal_u8(vsum16_3, vget_low_u8(p1), vget_low_u8(p5));
1095
+            vsum16_3 = vabal_high_u8(vsum16_3, p1, p5);
1096
+        }
1097
+        if (lx & 8)
1098
+        {
1099
+            uint8x8_t p1 = *(uint8x8_t *)&pix1[x];
1100
+            uint8x8_t p2 = *(uint8x8_t *)&pix2[x];
1101
+            uint8x8_t p3 = *(uint8x8_t *)&pix3[x];
1102
+            uint8x8_t p4 = *(uint8x8_t *)&pix4[x];
1103
+            uint8x8_t p5 = *(uint8x8_t *)&pix5[x];
1104
+            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
1105
+            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
1106
+            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
1107
+            vsum16_3 = vabal_u8(vsum16_3, p1, p5);
1108
+            x += 8;
1109
+        }
1110
+        if (lx & 4)
1111
+        {
1112
+            uint8x16_t p1 = vreinterpretq_u32_u8(
1113
+                vld1q_dup_u32((uint32_t *)&pix1[x]));
1114
+
1115
+            uint32x4_t p_x4;
1116
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix2[x], p_x4, 0);
1117
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix3[x], p_x4, 1);
1118
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix4[x], p_x4, 2);
1119
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix5[x], p_x4, 3);
1120
+
1121
+            uint16x8_t sum = vabdl_u8(vget_low_u8(p1), vget_low_u8(p_x4));
1122
+            uint16x8_t sum2 = vabdl_high_u8(p1, p_x4);
1123
+
1124
+            uint16x8_t a = vpaddq_u16(sum, sum2);
1125
+            result = vpadalq_u16(result, a);
1126
+        }
1127
+        if (lx >= 4)
1128
+        {
1129
+            result[0] += vaddvq_u16(vsum16_0);
1130
+            result[1] += vaddvq_u16(vsum16_1);
1131
+            result[2] += vaddvq_u16(vsum16_2);
1132
+            result[3] += vaddvq_u16(vsum16_3);
1133
+        }
1134
+
1135
+#endif
1136
+        if (lx & 3) for (; x < lx; x++)
1137
+        {
1138
+            result[0] += abs(pix1[x] - pix2[x]);
1139
+            result[1] += abs(pix1[x] - pix3[x]);
1140
+            result[2] += abs(pix1[x] - pix4[x]);
1141
+            result[3] += abs(pix1[x] - pix5[x]);
1142
+        }
1143
+
1144
+        pix1 += FENC_STRIDE;
1145
+        pix2 += frefstride;
1146
+        pix3 += frefstride;
1147
+        pix4 += frefstride;
1148
+        pix5 += frefstride;
1149
+    }
1150
+    vst1q_s32(res, result);
1151
+}
1152
+
1153
+
1154
+template<int lx, int ly, class T1, class T2>
1155
+sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
1156
+{
1157
+    sse_t sum = 0;
1158
+
1159
+    int32x4_t vsum1 = vdupq_n_s32(0);
1160
+    int32x4_t vsum2 = vdupq_n_s32(0);
1161
+    for (int y = 0; y < ly; y++)
1162
+    {
1163
+        int x = 0;
1164
+        for (; (x + 8) <= lx; x += 8)
1165
+        {
1166
+            int16x8_t tmp;
1167
+            if (sizeof(T1) == 2 && sizeof(T2) == 2)
1168
+            {
1169
+                tmp = vsubq_s16(*(int16x8_t *)&pix1[x], *(int16x8_t *)&pix2[x]);
1170
+            }
1171
+            else if (sizeof(T1) == 1 && sizeof(T2) == 1)
1172
+            {
1173
+                tmp = vsubl_u8(*(uint8x8_t *)&pix1[x], *(uint8x8_t *)&pix2[x]);
1174
+            }
1175
+            else
1176
+            {
1177
+                X265_CHECK(false, "unsupported sse");
1178
+            }
1179
+            vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
1180
+            vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
1181
+        }
1182
+        for (; x < lx; x++)
1183
+        {
1184
+            int tmp = pix1[x] - pix2[x];
1185
+            sum += (tmp * tmp);
1186
+        }
1187
+
1188
+        if (sizeof(T1) == 2 && sizeof(T2) == 2)
1189
+        {
1190
+            int32x4_t vsum = vaddq_u32(vsum1, vsum2);
1191
+            sum += vaddvq_u32(vsum);
1192
+            vsum1 = vsum2 = vdupq_n_u16(0);
1193
+        }
1194
+
1195
+        pix1 += stride_pix1;
1196
+        pix2 += stride_pix2;
1197
+    }
1198
+    int32x4_t vsum = vaddq_u32(vsum1, vsum2);
1199
+
1200
+    return sum + vaddvq_u32(vsum);
1201
+}
1202
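As with the SAD helpers, the quantity computed is the ordinary sum of squared differences; only the widening strategy depends on the pixel types. A scalar sketch of the 8-bit pixel/pixel case:

    #include <cstdint>

    // Scalar sketch of sse_neon<lx, ly, pixel, pixel> on the 8-bit path.
    static uint64_t sse_ref(const uint8_t *pix1, intptr_t stride1,
                            const uint8_t *pix2, intptr_t stride2, int lx, int ly)
    {
        uint64_t sum = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                int d = pix1[x] - pix2[x];
                sum += (uint64_t)(d * d);
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }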
+
1203
+
1204
+template<int bx, int by>
1205
+void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
1206
+{
1207
+    for (int y = 0; y < by; y++)
1208
+    {
1209
+        int x = 0;
1210
+        for (; (x + 8) <= bx; x += 8)
1211
+        {
1212
+#if HIGH_BIT_DEPTH
1213
+            *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
1214
+#else
1215
+            *(int16x8_t *)&a[x] = vmovl_u8(*(int8x8_t *)&b[x]);
1216
+#endif
1217
+        }
1218
+        for (; x < bx; x++)
1219
+        {
1220
+            a[x] = (int16_t)b[x];
1221
+        }
1222
+
1223
+        a += stridea;
1224
+        b += strideb;
1225
+    }
1226
+}
1227
+
1228
+
1229
+template<int bx, int by>
1230
+void blockcopy_pp_neon(pixel *a, intptr_t stridea, const pixel *b, intptr_t strideb)
1231
+{
1232
+    for (int y = 0; y < by; y++)
1233
+    {
1234
+        int x = 0;
1235
+#if HIGH_BIT_DEPTH
1236
+        for (; (x + 8) <= bx; x += 8)
1237
+        {
1238
+            *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
1239
+        }
1240
+        if (bx & 4)
1241
+        {
1242
+            *(uint64_t *)&a[x] = *(uint64_t *)&b[x];
1243
+            x += 4;
1244
+        }
1245
+#else
1246
+        for (; (x + 16) <= bx; x += 16)
1247
+        {
1248
+            *(uint8x16_t *)&a[x] = *(uint8x16_t *)&b[x];
1249
+        }
1250
+        if (bx & 8)
1251
+        {
1252
+            *(uint8x8_t *)&a[x] = *(uint8x8_t *)&b[x];
1253
+            x += 8;
1254
+        }
1255
+        if (bx & 4)
1256
+        {
1257
+            *(uint32_t *)&a[x] = *(uint32_t *)&b[x];
1258
+            x += 4;
1259
+        }
1260
+#endif
1261
+        for (; x < bx; x++)
1262
+        {
1263
+            a[x] = b[x];
1264
+        }
1265
+
1266
+        a += stridea;
1267
+        b += strideb;
1268
+    }
1269
+}
1270
+
1271
+
1272
+template<int bx, int by>
1273
+void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixel *b1, intptr_t sstride0,
1274
+                       intptr_t sstride1)
1275
+{
1276
+    for (int y = 0; y < by; y++)
1277
+    {
1278
+        int x = 0;
1279
+        for (; (x + 8) <= bx; x += 8)
1280
+        {
1281
+#if HIGH_BIT_DEPTH
1282
+            *(int16x8_t *)&a[x] = vsubq_s16(*(int16x8_t *)&b0[x], *(int16x8_t *)&b1[x]);
1283
+#else
1284
+            *(int16x8_t *)&a[x] = vsubl_u8(*(uint8x8_t *)&b0[x], *(uint8x8_t *)&b1[x]);
1285
+#endif
1286
+        }
1287
+        for (; x < bx; x++)
1288
+        {
1289
+            a[x] = (int16_t)(b0[x] - b1[x]);
1290
+        }
1291
+
1292
+        b0 += sstride0;
1293
+        b1 += sstride1;
1294
+        a += dstride;
1295
+    }
1296
+}
1297
+
1298
+template<int bx, int by>
1299
+void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_t *b1, intptr_t sstride0,
1300
+                       intptr_t sstride1)
1301
+{
1302
+    for (int y = 0; y < by; y++)
1303
+    {
1304
+        int x = 0;
1305
+        for (; (x + 8) <= bx; x += 8)
1306
+        {
1307
+            int16x8_t t;
1308
+            int16x8_t b1e = *(int16x8_t *)&b1[x];
1309
+            int16x8_t b0e;
1310
+#if HIGH_BIT_DEPTH
1311
+            b0e = *(int16x8_t *)&b0[x];
1312
+            t = vaddq_s16(b0e, b1e);
1313
+            t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
1314
+            t = vmaxq_s16(t, vdupq_n_s16(0));
1315
+            *(int16x8_t *)&a[x] = t;
1316
+#else
1317
+            b0e = vmovl_u8(*(uint8x8_t *)&b0[x]);
1318
+            t = vaddq_s16(b0e, b1e);
1319
+            *(uint8x8_t *)&a[x] = vqmovun_s16(t);
1320
+#endif
1321
+        }
1322
+        for (; x < bx; x++)
1323
+        {
1324
+            a[x] = (int16_t)x265_clip(b0[x] + b1[x]);
1325
+        }
1326
+
1327
+        b0 += sstride0;
1328
+        b1 += sstride1;
1329
+        a += dstride;
1330
+    }
1331
+}
1332
+
1333
+template<int bx, int by>
1334
+void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t src0Stride, intptr_t src1Stride,
1335
+                 intptr_t dstStride)
1336
+{
1337
+
1338
+    const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
1339
+    const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
1340
+
1341
+    const int32x4_t addon = vdupq_n_s32(offset);
1342
+    for (int y = 0; y < by; y++)
1343
+    {
1344
+        int x = 0;
1345
+
1346
+        for (; (x + 8) <= bx; x += 8)
1347
+        {
1348
+            int16x8_t in0 = *(int16x8_t *)&src0[x];
1350
+            int16x8_t in1 = *(int16x8_t *)&src1[x];
1350
+            int32x4_t t1 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
1351
+            int32x4_t t2 = vaddl_high_s16(in0, in1);
1352
+            t1 = vaddq_s32(t1, addon);
1353
+            t2 = vaddq_s32(t2, addon);
1354
+            t1 = vshrq_n_s32(t1, shiftNum);
1355
+            t2 = vshrq_n_s32(t2, shiftNum);
1356
+            int16x8_t t = vuzp1q_s16(t1, t2);
1357
+#if HIGH_BIT_DEPTH
1358
+            t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
1359
+            t = vmaxq_s16(t, vdupq_n_s16(0));
1360
+            *(int16x8_t *)&dst[x] = t;
1361
+#else
1362
+            *(uint8x8_t *)&dst[x] = vqmovun_s16(t);
1363
+#endif
1364
+        }
1365
+        for (; x < bx; x += 2)
1366
+        {
1367
+            dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
1368
+            dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
1369
+        }
1370
+
1371
+        src0 += src0Stride;
1372
+        src1 += src1Stride;
1373
+        dst  += dstStride;
1374
+    }
1375
+}
1376
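The rounding above comes from x265's 14-bit bi-prediction intermediate format. Assuming the usual constants IF_INTERNAL_PREC = 14 and IF_INTERNAL_OFFS = 1 << 13, the 8-bit case reduces to shiftNum = 7 and offset = 16448; a per-pixel sketch:

    #include <algorithm>
    #include <cstdint>

    // Scalar sketch of the addAvg rounding for X265_DEPTH == 8, assuming
    // IF_INTERNAL_PREC = 14 and IF_INTERNAL_OFFS = 1 << 13.
    static inline uint8_t add_avg_ref(int16_t s0, int16_t s1)
    {
        const int shiftNum = 14 + 1 - 8;                           // 7
        const int offset = (1 << (shiftNum - 1)) + 2 * (1 << 13);  // 16448
        int v = (s0 + s1 + offset) >> shiftNum;
        return (uint8_t)std::min(255, std::max(0, v));
    }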
+
1377
+template<int lx, int ly>
1378
+void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t sstride0, const pixel *src1,
1379
+                      intptr_t sstride1, int)
1380
+{
1381
+    for (int y = 0; y < ly; y++)
1382
+    {
1383
+        int x = 0;
1384
+        for (; (x + 8) <= lx; x += 8)
1385
+        {
1386
+#if HIGH_BIT_DEPTH
1387
+            uint16x8_t in0 = *(uint16x8_t *)&src0[x];
1388
+            uint16x8_t in1 = *(uint16x8_t *)&src1[x];
1389
+            uint16x8_t t = vrhaddq_u16(in0, in1);
1390
+            *(uint16x8_t *)&dst[x] = t;
1391
+#else
1392
+            int16x8_t in0 = vmovl_u8(*(uint8x8_t *)&src0[x]);
1393
+            int16x8_t in1 = vmovl_u8(*(uint8x8_t *)&src1[x]);
1394
+            int16x8_t t = vrhaddq_s16(in0, in1);
1395
+            *(uint8x8_t *)&dst[x] = vmovn_u16(t);
1396
+#endif
1397
+        }
1398
+        for (; x < lx; x++)
1399
+        {
1400
+            dst[x] = (src0[x] + src1[x] + 1) >> 1;
1401
+        }
1402
+
1403
+        src0 += sstride0;
1404
+        src1 += sstride1;
1405
+        dst += dstride;
1406
+    }
1407
+}
1408
+
1409
+
1410
+template<int size>
1411
+void cpy1Dto2D_shl_neon(int16_t *dst, const int16_t *src, intptr_t dstStride, int shift)
1412
+{
1413
+    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
1414
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
1415
+    X265_CHECK(shift >= 0, "invalid shift\n");
1416
+
1417
+    for (int i = 0; i < size; i++)
1418
+    {
1419
+        int j = 0;
1420
+        for (; (j + 8) <= size; j += 8)
1421
+        {
1422
+            *(int16x8_t *)&dst[j] = vshlq_s16(*(int16x8_t *)&src[j], vdupq_n_s16(shift));
1423
+        }
1424
+        for (; j < size; j++)
1425
+        {
1426
+            dst[j] = src[j] << shift;
1427
+        }
1428
+        src += size;
1429
+        dst += dstStride;
1430
+    }
1431
+}
1432
+
1433
+
1434
+template<int size>
1435
+uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
1436
+{
1437
+    uint32_t sum = 0, sqr = 0;
1438
+
1439
+    int32x4_t vsqr = vdupq_n_s32(0);
1440
+    for (int y = 0; y < size; y++)
1441
+    {
1442
+        int x = 0;
1443
+        int16x8_t vsum = vdupq_n_s16(0);
1444
+        for (; (x + 8) <= size; x += 8)
1445
+        {
1446
+            int16x8_t in;
1447
+            in = vmovl_u8(*(uint8x8_t *)&pix[x]);
1448
+            vsum = vaddq_u16(vsum, in);
1449
+            vsqr = vmlal_s16(vsqr, vget_low_s16(in), vget_low_s16(in));
1450
+            vsqr = vmlal_high_s16(vsqr, in, in);
1451
+        }
1452
+        for (; x < size; x++)
1453
+        {
1454
+            sum += pix[x];
1455
+            sqr += pix[x] * pix[x];
1456
+        }
1457
+        sum += vaddvq_s16(vsum);
1458
+
1459
+        pix += i_stride;
1460
+    }
1461
+    sqr += vaddvq_u32(vsqr);
1462
+    return sum + ((uint64_t)sqr << 32);
1463
+}
1464
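The 64-bit return value packs the pixel sum into the low 32 bits and the sum of squares into the high 32 bits. A sketch of how a caller could unpack it into a variance (how x265 itself combines the two halves may differ):

    #include <cstdint>

    // Illustrative unpacking of pixel_var_neon<size>'s packed return value.
    static inline double variance_from_packed(uint64_t packed, int size)
    {
        uint32_t sum = (uint32_t)packed;
        uint32_t sqr = (uint32_t)(packed >> 32);
        double n = (double)size * (double)size;
        double mean = (double)sum / n;
        return (double)sqr / n - mean * mean;
    }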
+
1465
+template<int blockSize>
1466
+void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, intptr_t stride)
1467
+{
1468
+    for (int y = 0; y < blockSize; y++)
1469
+    {
1470
+        int x = 0;
1471
+        for (; (x + 8) < blockSize; x += 8)
1472
+        {
1473
+            int16x8_t vfenc, vpred;
1474
+#if HIGH_BIT_DEPTH
1475
+            vfenc = *(int16x8_t *)&fenc[x];
1476
+            vpred = *(int16x8_t *)&pred[x];
1477
+#else
1478
+            vfenc = vmovl_u8(*(uint8x8_t *)&fenc[x]);
1479
+            vpred = vmovl_u8(*(uint8x8_t *)&pred[x]);
1480
+#endif
1481
+            *(int16x8_t *)&residual[x] = vsubq_s16(vfenc, vpred);
1482
+        }
1483
+        for (; x < blockSize; x++)
1484
+        {
1485
+            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
1486
+        }
1487
+        fenc += stride;
1488
+        residual += stride;
1489
+        pred += stride;
1490
+    }
1491
+}
1492
+
1493
+template<int size>
1494
+int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
1495
+{
1496
+    static pixel zeroBuf[8] /* = { 0 } */;
1497
+
1498
+    if (size)
1499
+    {
1500
+        int dim = 1 << (size + 2);
1501
+        uint32_t totEnergy = 0;
1502
+        for (int i = 0; i < dim; i += 8)
1503
+        {
1504
+            for (int j = 0; j < dim; j += 8)
1505
+            {
1506
+                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
1507
+                int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
1508
+                                   (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
1509
+                int reconEnergy =  pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
1510
+                                   (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
1511
+
1512
+                totEnergy += abs(sourceEnergy - reconEnergy);
1513
+            }
1514
+        }
1515
+        return totEnergy;
1516
+    }
1517
+    else
1518
+    {
1519
+        /* 4x4 is too small for sa8d */
1520
+        int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf,
1521
+                           0) >> 2);
1522
+        int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf,
1523
+                          0) >> 2);
1524
+        return abs(sourceEnergy - reconEnergy);
1525
+    }
1526
+}
1527
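In other words, each block's AC energy is estimated as its Hadamard cost against a zero block minus a scaled DC estimate from SAD, and the psy cost is the accumulated absolute difference between the source and reconstruction energies; schematically:

    // energy(B)      = sa8d(B, zero) - (sad(B, zero) >> 2)
    // psyCost(S, R)  = sum over co-located blocks of |energy(S_blk) - energy(R_blk)|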
+
1528
+
1529
+template<int w, int h>
1530
+// Calculate sa8d in blocks of 8x8
1531
+int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
1532
+{
1533
+    int cost = 0;
1534
+
1535
+    for (int y = 0; y < h; y += 8)
1536
+        for (int x = 0; x < w; x += 8)
1537
+        {
1538
+            cost += pixel_sa8d_8x8_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
1539
+        }
1540
+
1541
+    return cost;
1542
+}
1543
+
1544
+template<int w, int h>
1545
+// Calculate sa8d in blocks of 16x16
1546
+int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
1547
+{
1548
+    int cost = 0;
1549
+
1550
+    for (int y = 0; y < h; y += 16)
1551
+        for (int x = 0; x < w; x += 16)
1552
+        {
1553
+            cost += pixel_sa8d_16x16_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
1554
+        }
1555
+
1556
+    return cost;
1557
+}
1558
+
1559
+template<int size>
1560
+void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, int shift)
1561
+{
1562
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
1563
+    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
1564
+    X265_CHECK(shift >= 0, "invalid shift\n");
1565
+
1566
+    for (int i = 0; i < size; i++)
1567
+    {
1568
+        for (int j = 0; j < size; j++)
1569
+        {
1570
+            dst[j] = src[j] << shift;
1571
+        }
1572
+
1573
+        src += srcStride;
1574
+        dst += size;
1575
+    }
1576
+}
1577
+
1578
+
1579
+template<int w, int h>
1580
+// calculate satd in blocks of 4x4
1581
+int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
1582
+{
1583
+    int satd = 0;
1584
+
1585
+    for (int row = 0; row < h; row += 4)
1586
+        for (int col = 0; col < w; col += 4)
1587
+            satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1588
+                                        pix2 + row * stride_pix2 + col, stride_pix2);
1589
+
1590
+    return satd;
1591
+}
1592
+
1593
+template<int w, int h>
1594
+// calculate satd in blocks of 8x4
1595
+int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
1596
+{
1597
+    int satd = 0;
1598
+
1599
+    if (((w | h) & 15) == 0)
1600
+    {
1601
+        for (int row = 0; row < h; row += 16)
1602
+            for (int col = 0; col < w; col += 16)
1603
+                satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1604
+                                              pix2 + row * stride_pix2 + col, stride_pix2);
1605
+
1606
+    }
1607
+    else if (((w | h) & 7) == 0)
1608
+    {
1609
+        for (int row = 0; row < h; row += 8)
1610
+            for (int col = 0; col < w; col += 8)
1611
+                satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1612
+                                            pix2 + row * stride_pix2 + col, stride_pix2);
1613
+
1614
+    }
1615
+    else
1616
+    {
1617
+        for (int row = 0; row < h; row += 4)
1618
+            for (int col = 0; col < w; col += 8)
1619
+                satd += pixel_satd_8x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1620
+                                            pix2 + row * stride_pix2 + col, stride_pix2);
1621
+    }
1622
+
1623
+    return satd;
1624
+}
1625
+
1626
+
1627
+template<int blockSize>
1628
+void transpose_neon(pixel *dst, const pixel *src, intptr_t stride)
1629
+{
1630
+    for (int k = 0; k < blockSize; k++)
1631
+        for (int l = 0; l < blockSize; l++)
1632
+        {
1633
+            dst[k * blockSize + l] = src[l * stride + k];
1634
+        }
1635
+}
1636
+
1637
+
1638
+template<>
1639
+void transpose_neon<8>(pixel *dst, const pixel *src, intptr_t stride)
1640
+{
1641
+    transpose8x8(dst, src, 8, stride);
1642
+}
1643
+
1644
+template<>
1645
+void transpose_neon<16>(pixel *dst, const pixel *src, intptr_t stride)
1646
+{
1647
+    transpose16x16(dst, src, 16, stride);
1648
+}
1649
+
1650
+template<>
1651
+void transpose_neon<32>(pixel *dst, const pixel *src, intptr_t stride)
1652
+{
1653
+    transpose32x32(dst, src, 32, stride);
1654
+}
1655
+
1656
+
1657
+template<>
1658
+void transpose_neon<64>(pixel *dst, const pixel *src, intptr_t stride)
1659
+{
1660
+    transpose32x32(dst, src, 64, stride);
1661
+    transpose32x32(dst + 32 * 64 + 32, src + 32 * stride + 32, 64, stride);
1662
+    transpose32x32(dst + 32 * 64, src + 32, 64, stride);
1663
+    transpose32x32(dst + 32, src + 32 * stride, 64, stride);
1664
+}
1665
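The 64x64 specialisation builds the result from four 32x32 quadrant transposes, swapping the two off-diagonal quadrants. In (row, column) offsets, with dst using a fixed stride of 64 and src using 'stride':

    // dst(0, 0)   <- transpose of src(0, 0)     dst(0, 32)  <- transpose of src(32, 0)
    // dst(32, 0)  <- transpose of src(0, 32)    dst(32, 32) <- transpose of src(32, 32)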
+
1666
+
1667
+template<int size>
1668
+sse_t pixel_ssd_s_neon(const int16_t *a, intptr_t dstride)
1669
+{
1670
+    sse_t sum = 0;
1671
+
1672
+
1673
+    int32x4_t vsum = vdupq_n_s32(0);
1674
+
1675
+    for (int y = 0; y < size; y++)
1676
+    {
1677
+        int x = 0;
1678
+
1679
+        for (; (x + 8) <= size; x += 8)
1680
+        {
1681
+            int16x8_t in = *(int16x8_t *)&a[x];
1682
+            vsum = vmlal_s16(vsum, vget_low_s16(in), vget_low_s16(in));
1683
+            vsum = vmlal_high_s16(vsum, (in), (in));
1684
+        }
1685
+        for (; x < size; x++)
1686
+        {
1687
+            sum += a[x] * a[x];
1688
+        }
1689
+
1690
+        a += dstride;
1691
+    }
1692
+    return sum + vaddvq_s32(vsum);
1693
+}
1694
+
1695
+
1696
+};
1697
+
1698
+
1699
+
1700
+
1701
+namespace X265_NS
1702
+{
1703
+
1704
+
1705
+void setupPixelPrimitives_neon(EncoderPrimitives &p)
1706
+{
1707
+#define LUMA_PU(W, H) \
1708
+    p.puLUMA_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1709
+    p.puLUMA_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1710
+    p.puLUMA_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>; \
1711
+    p.puLUMA_ ## W ## x ## H.sad = sad_pp_neon<W, H>; \
1712
+    p.puLUMA_ ## W ## x ## H.sad_x3 = sad_x3_neon<W, H>; \
1713
+    p.puLUMA_ ## W ## x ## H.sad_x4 = sad_x4_neon<W, H>; \
1714
+    p.puLUMA_ ## W ## x ## H.pixelavg_ppNONALIGNED = pixelavg_pp_neon<W, H>; \
1715
+    p.puLUMA_ ## W ## x ## H.pixelavg_ppALIGNED = pixelavg_pp_neon<W, H>;
1716
+
1717
+#if !(HIGH_BIT_DEPTH)
1718
+#define LUMA_PU_S(W, H) \
1719
+    p.puLUMA_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1720
+    p.puLUMA_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1721
+    p.puLUMA_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>;
1722
+#else // !(HIGH_BIT_DEPTH)
1723
+#define LUMA_PU_S(W, H) \
1724
+    p.puLUMA_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1725
+    p.puLUMA_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1726
+    p.puLUMA_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>; \
1727
+    p.puLUMA_ ## W ## x ## H.sad_x3 = sad_x3_neon<W, H>; \
1728
+    p.puLUMA_ ## W ## x ## H.sad_x4 = sad_x4_neon<W, H>; \
1729
+    p.puLUMA_ ## W ## x ## H.pixelavg_ppNONALIGNED = pixelavg_pp_neon<W, H>; \
1730
+    p.puLUMA_ ## W ## x ## H.pixelavg_ppALIGNED = pixelavg_pp_neon<W, H>;
1731
+#endif // !(HIGH_BIT_DEPTH)
1732
+
1733
+#define LUMA_CU(W, H) \
1734
+    p.cuBLOCK_ ## W ## x ## H.sub_ps        = pixel_sub_ps_neon<W, H>; \
1735
+    p.cuBLOCK_ ## W ## x ## H.add_psNONALIGNED    = pixel_add_ps_neon<W, H>; \
1736
+    p.cuBLOCK_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>; \
1737
+    p.cuBLOCK_ ## W ## x ## H.copy_pp       = blockcopy_pp_neon<W, H>; \
1738
+    p.cuBLOCK_ ## W ## x ## H.copy_ps       = blockcopy_ps_neon<W, H>; \
1739
+    p.cuBLOCK_ ## W ## x ## H.copy_pp       = blockcopy_pp_neon<W, H>; \
1740
+    p.cuBLOCK_ ## W ## x ## H.cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
1741
+    p.cuBLOCK_ ## W ## x ## H.cpy1Dto2D_shlNONALIGNED = cpy1Dto2D_shl_neon<W>; \
1742
+    p.cuBLOCK_ ## W ## x ## H.cpy1Dto2D_shlALIGNED = cpy1Dto2D_shl_neon<W>; \
1743
+    p.cuBLOCK_ ## W ## x ## H.psy_cost_pp   = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
1744
+    p.cuBLOCK_ ## W ## x ## H.transpose     = transpose_neon<W>;
1745
+
1746
+
1747
+    LUMA_PU_S(4, 4);
1748
+    LUMA_PU_S(8, 8);
1749
+    LUMA_PU(16, 16);
1750
+    LUMA_PU(32, 32);
1751
+    LUMA_PU(64, 64);
1752
+    LUMA_PU_S(4, 8);
1753
+    LUMA_PU_S(8, 4);
1754
+    LUMA_PU(16,  8);
1755
+    LUMA_PU_S(8, 16);
1756
+    LUMA_PU(16, 12);
1757
+    LUMA_PU(12, 16);
1758
+    LUMA_PU(16,  4);
1759
+    LUMA_PU_S(4, 16);
1760
+    LUMA_PU(32, 16);
1761
+    LUMA_PU(16, 32);
1762
+    LUMA_PU(32, 24);
1763
+    LUMA_PU(24, 32);
1764
+    LUMA_PU(32,  8);
1765
+    LUMA_PU_S(8, 32);
1766
+    LUMA_PU(64, 32);
1767
+    LUMA_PU(32, 64);
1768
+    LUMA_PU(64, 48);
1769
+    LUMA_PU(48, 64);
1770
+    LUMA_PU(64, 16);
1771
+    LUMA_PU(16, 64);
1772
+    
1773
+#if defined(__APPLE__)
1774
+    p.puLUMA_4x4.sad = sad_pp_neon<4, 4>;
1775
+    p.puLUMA_4x8.sad = sad_pp_neon<4, 8>;
1776
+    p.puLUMA_4x16.sad = sad_pp_neon<4, 16>;
1777
+#endif // defined(__APPLE__)
1778
+    p.puLUMA_8x4.sad = sad_pp_neon<8, 4>;
1779
+    p.puLUMA_8x8.sad = sad_pp_neon<8, 8>;
1780
+    p.puLUMA_8x16.sad = sad_pp_neon<8, 16>;
1781
+    p.puLUMA_8x32.sad = sad_pp_neon<8, 32>;
1782
+
1783
+#if !(HIGH_BIT_DEPTH)
1784
+    p.puLUMA_4x4.sad_x3 = sad_x3_neon<4, 4>;
1785
+    p.puLUMA_4x4.sad_x4 = sad_x4_neon<4, 4>;
1786
+    p.puLUMA_4x8.sad_x3 = sad_x3_neon<4, 8>;
1787
+    p.puLUMA_4x8.sad_x4 = sad_x4_neon<4, 8>;
1788
+    p.puLUMA_4x16.sad_x3 = sad_x3_neon<4, 16>;
1789
+    p.puLUMA_4x16.sad_x4 = sad_x4_neon<4, 16>;
1790
+#endif // !(HIGH_BIT_DEPTH)
1791
+
1792
+    p.puLUMA_4x4.satd   = pixel_satd_4x4_neon;
1793
+    p.puLUMA_8x4.satd   = pixel_satd_8x4_neon;
1794
+    
1795
+    p.puLUMA_8x8.satd   = satd8_neon<8, 8>;
1796
+    p.puLUMA_16x16.satd = satd8_neon<16, 16>;
1797
+    p.puLUMA_16x8.satd  = satd8_neon<16, 8>;
1798
+    p.puLUMA_8x16.satd  = satd8_neon<8, 16>;
1799
+    p.puLUMA_16x12.satd = satd8_neon<16, 12>;
1800
+    p.puLUMA_16x4.satd  = satd8_neon<16, 4>;
1801
+    p.puLUMA_32x32.satd = satd8_neon<32, 32>;
1802
+    p.puLUMA_32x16.satd = satd8_neon<32, 16>;
1803
+    p.puLUMA_16x32.satd = satd8_neon<16, 32>;
1804
+    p.puLUMA_32x24.satd = satd8_neon<32, 24>;
1805
+    p.puLUMA_24x32.satd = satd8_neon<24, 32>;
1806
+    p.puLUMA_32x8.satd  = satd8_neon<32, 8>;
1807
+    p.puLUMA_8x32.satd  = satd8_neon<8, 32>;
1808
+    p.puLUMA_64x64.satd = satd8_neon<64, 64>;
1809
+    p.puLUMA_64x32.satd = satd8_neon<64, 32>;
1810
+    p.puLUMA_32x64.satd = satd8_neon<32, 64>;
1811
+    p.puLUMA_64x48.satd = satd8_neon<64, 48>;
1812
+    p.puLUMA_48x64.satd = satd8_neon<48, 64>;
1813
+    p.puLUMA_64x16.satd = satd8_neon<64, 16>;
1814
+    p.puLUMA_16x64.satd = satd8_neon<16, 64>;
1815
+
1816
+#if HIGH_BIT_DEPTH
1817
+    p.puLUMA_4x8.satd   = satd4_neon<4, 8>;
1818
+    p.puLUMA_4x16.satd  = satd4_neon<4, 16>;
1819
+#endif // HIGH_BIT_DEPTH
1820
+
1821
+#if !defined(__APPLE__) || HIGH_BIT_DEPTH
1822
+    p.puLUMA_12x16.satd = satd4_neon<12, 16>;
1823
+#endif // !defined(__APPLE__)
1824
+
1825
+
1826
+    LUMA_CU(4, 4);
1827
+    LUMA_CU(8, 8);
1828
+    LUMA_CU(16, 16);
1829
+    LUMA_CU(32, 32);
1830
+    LUMA_CU(64, 64);
1831
+    
1832
+#if !(HIGH_BIT_DEPTH)
1833
+    p.cuBLOCK_8x8.var   = pixel_var_neon<8>;
1834
+    p.cuBLOCK_16x16.var = pixel_var_neon<16>;
1835
+#if defined(__APPLE__)
1836
+    p.cuBLOCK_32x32.var   = pixel_var_neon<32>;
1837
+    p.cuBLOCK_64x64.var = pixel_var_neon<64>;
1838
+#endif // defined(__APPLE__)
1839
+#endif // !(HIGH_BIT_DEPTH)
1840
+
1841
+    p.cuBLOCK_16x16.blockfill_sNONALIGNED = blockfill_s_neon<16>; 
1842
+    p.cuBLOCK_16x16.blockfill_sALIGNED    = blockfill_s_neon<16>;
1843
+    p.cuBLOCK_32x32.blockfill_sNONALIGNED = blockfill_s_neon<32>; 
1844
+    p.cuBLOCK_32x32.blockfill_sALIGNED    = blockfill_s_neon<32>;
1845
+    p.cuBLOCK_64x64.blockfill_sNONALIGNED = blockfill_s_neon<64>; 
1846
+    p.cuBLOCK_64x64.blockfill_sALIGNED    = blockfill_s_neon<64>;
1847
+
1848
+
1849
+    p.cuBLOCK_4x4.calcresidualNONALIGNED    = getResidual_neon<4>;
1850
+    p.cuBLOCK_4x4.calcresidualALIGNED       = getResidual_neon<4>;
1851
+    p.cuBLOCK_8x8.calcresidualNONALIGNED    = getResidual_neon<8>;
1852
+    p.cuBLOCK_8x8.calcresidualALIGNED       = getResidual_neon<8>;
1853
+    p.cuBLOCK_16x16.calcresidualNONALIGNED  = getResidual_neon<16>;
1854
+    p.cuBLOCK_16x16.calcresidualALIGNED     = getResidual_neon<16>;
1855
+    
1856
+#if defined(__APPLE__)
1857
+    p.cuBLOCK_32x32.calcresidualNONALIGNED  = getResidual_neon<32>;
1858
+    p.cuBLOCK_32x32.calcresidualALIGNED     = getResidual_neon<32>;
1859
+#endif // defined(__APPLE__)
1860
+
1861
+    p.cuBLOCK_4x4.sa8d   = pixel_satd_4x4_neon;
1862
+    p.cuBLOCK_8x8.sa8d   = pixel_sa8d_8x8_neon;
1863
+    p.cuBLOCK_16x16.sa8d = pixel_sa8d_16x16_neon;
1864
+    p.cuBLOCK_32x32.sa8d = sa8d16<32, 32>;
1865
+    p.cuBLOCK_64x64.sa8d = sa8d16<64, 64>;
1866
+
1867
+
1868
+#define CHROMA_PU_420(W, H) \
1869
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.addAvgNONALIGNED  = addAvg_neon<W, H>;         \
1870
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.addAvgALIGNED  = addAvg_neon<W, H>;         \
1871
+    p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1872
+
1873
+
1874
+    CHROMA_PU_420(4, 4);
1875
+    CHROMA_PU_420(8, 8);
1876
+    CHROMA_PU_420(16, 16);
1877
+    CHROMA_PU_420(32, 32);
1878
+    CHROMA_PU_420(4, 2);
1879
+    CHROMA_PU_420(8, 4);
1880
+    CHROMA_PU_420(4, 8);
1881
+    CHROMA_PU_420(8, 6);
1882
+    CHROMA_PU_420(6, 8);
1883
+    CHROMA_PU_420(8, 2);
1884
+    CHROMA_PU_420(2, 8);
1885
+    CHROMA_PU_420(16, 8);
1886
+    CHROMA_PU_420(8,  16);
1887
+    CHROMA_PU_420(16, 12);
1888
+    CHROMA_PU_420(12, 16);
1889
+    CHROMA_PU_420(16, 4);
1890
+    CHROMA_PU_420(4,  16);
1891
+    CHROMA_PU_420(32, 16);
1892
+    CHROMA_PU_420(16, 32);
1893
+    CHROMA_PU_420(32, 24);
1894
+    CHROMA_PU_420(24, 32);
1895
+    CHROMA_PU_420(32, 8);
1896
+    CHROMA_PU_420(8,  32);
1897
+
1898
+
1899
+
1900
+    p.chromaX265_CSP_I420.puCHROMA_420_2x2.satd   = NULL;
1901
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd   = pixel_satd_4x4_neon;
1902
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd   = satd8_neon<8, 8>;
1903
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = satd8_neon<16, 16>;
1904
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = satd8_neon<32, 32>;
1905
+
1906
+    p.chromaX265_CSP_I420.puCHROMA_420_4x2.satd   = NULL;
1907
+    p.chromaX265_CSP_I420.puCHROMA_420_2x4.satd   = NULL;
1908
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd   = pixel_satd_8x4_neon;
1909
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd  = satd8_neon<16, 8>;
1910
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd  = satd8_neon<8, 16>;
1911
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = satd8_neon<32, 16>;
1912
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = satd8_neon<16, 32>;
1913
+
1914
+    p.chromaX265_CSP_I420.puCHROMA_420_8x6.satd   = NULL;
1915
+    p.chromaX265_CSP_I420.puCHROMA_420_6x8.satd   = NULL;
1916
+    p.chromaX265_CSP_I420.puCHROMA_420_8x2.satd   = NULL;
1917
+    p.chromaX265_CSP_I420.puCHROMA_420_2x8.satd   = NULL;
1918
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = satd4_neon<16, 12>;
1919
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd  = satd4_neon<16, 4>;
1920
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = satd8_neon<32, 24>;
1921
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = satd8_neon<24, 32>;
1922
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd  = satd8_neon<32, 8>;
1923
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd  = satd8_neon<8, 32>;
1924
+    
1925
+#if HIGH_BIT_DEPTH
1926
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd   = satd4_neon<4, 8>;
1927
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd  = satd4_neon<4, 16>;
1928
+#endif // HIGH_BIT_DEPTH
1929
+
1930
+#if !defined(__APPLE__) || HIGH_BIT_DEPTH
1931
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = satd4_neon<12, 16>;
1932
+#endif // !defined(__APPLE__)
1933
+
1934
+
1935
+#define CHROMA_CU_420(W, H) \
1936
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.sse_pp  = sse_neon<W, H, pixel, pixel>; \
1937
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1938
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
1939
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>;  \
1940
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
1941
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
1942
+    
1943
+#define CHROMA_CU_S_420(W, H) \
1944
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1945
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
1946
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>;  \
1947
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
1948
+    p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
1949
+
1950
+
1951
+    CHROMA_CU_S_420(4, 4)
1952
+    CHROMA_CU_420(8, 8)
1953
+    CHROMA_CU_420(16, 16)
1954
+    CHROMA_CU_420(32, 32)
1955
+
1956
+
1957
+    p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d   = p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd;
1958
+    p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = sa8d8<8, 8>;
1959
+    p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = sa8d16<16, 16>;
1960
+    p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = sa8d16<32, 32>;
1961
+
1962
+
1963
+#define CHROMA_PU_422(W, H) \
1964
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.addAvgNONALIGNED  = addAvg_neon<W, H>;         \
1965
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.addAvgALIGNED  = addAvg_neon<W, H>;         \
1966
+    p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1967
+
1968
+
1969
+    CHROMA_PU_422(4, 8);
1970
+    CHROMA_PU_422(8, 16);
1971
+    CHROMA_PU_422(16, 32);
1972
+    CHROMA_PU_422(32, 64);
1973
+    CHROMA_PU_422(4, 4);
1974
+    CHROMA_PU_422(2, 8);
1975
+    CHROMA_PU_422(8, 8);
1976
+    CHROMA_PU_422(4, 16);
1977
+    CHROMA_PU_422(8, 12);
1978
+    CHROMA_PU_422(6, 16);
1979
+    CHROMA_PU_422(8, 4);
1980
+    CHROMA_PU_422(2, 16);
1981
+    CHROMA_PU_422(16, 16);
1982
+    CHROMA_PU_422(8, 32);
1983
+    CHROMA_PU_422(16, 24);
1984
+    CHROMA_PU_422(12, 32);
1985
+    CHROMA_PU_422(16, 8);
1986
+    CHROMA_PU_422(4,  32);
1987
+    CHROMA_PU_422(32, 32);
1988
+    CHROMA_PU_422(16, 64);
1989
+    CHROMA_PU_422(32, 48);
1990
+    CHROMA_PU_422(24, 64);
1991
+    CHROMA_PU_422(32, 16);
1992
+    CHROMA_PU_422(8,  64);
1993
+
1994
+
1995
+    p.chromaX265_CSP_I422.puCHROMA_422_2x4.satd   = NULL;
1996
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd  = satd8_neon<8, 16>;
1997
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = satd8_neon<16, 32>;
1998
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = satd8_neon<32, 64>;
1999
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd   = pixel_satd_4x4_neon;
2000
+    p.chromaX265_CSP_I422.puCHROMA_422_2x8.satd   = NULL;
2001
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd   = satd8_neon<8, 8>;
2002
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = satd8_neon<16, 16>;
2003
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd  = satd8_neon<8, 32>;
2004
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = satd8_neon<32, 32>;
2005
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = satd8_neon<16, 64>;
2006
+    p.chromaX265_CSP_I422.puCHROMA_422_6x16.satd  = NULL;
2007
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd   = satd4_neon<8, 4>;
2008
+    p.chromaX265_CSP_I422.puCHROMA_422_2x16.satd  = NULL;
2009
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd  = satd8_neon<16, 8>;
2010
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = satd8_neon<32, 16>;
2011
+    
2012
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd  = satd4_neon<8, 12>;
2013
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd  = satd8_neon<8, 64>;
2014
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = satd4_neon<12, 32>;
2015
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = satd8_neon<16, 24>;
2016
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = satd8_neon<24, 64>;
2017
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = satd8_neon<32, 48>;
2018
+
2019
+#if HIGH_BIT_DEPTH
2020
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd   = satd4_neon<4, 8>;
2021
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd  = satd4_neon<4, 16>;
2022
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd  = satd4_neon<4, 32>;
2023
+#endif // HIGH_BIT_DEPTH
2024
+
2025
+
2026
+#define CHROMA_CU_422(W, H) \
2027
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.sse_pp  = sse_neon<W, H, pixel, pixel>;  \
2028
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
2029
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
2030
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
2031
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
2032
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
2033
+
2034
+#define CHROMA_CU_S_422(W, H) \
2035
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
2036
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
2037
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
2038
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
2039
+    p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
2040
+    
2041
+    
2042
+    CHROMA_CU_S_422(4, 8)
2043
+    CHROMA_CU_422(8, 16)
2044
+    CHROMA_CU_422(16, 32)
2045
+    CHROMA_CU_422(32, 64)
2046
+
2047
+    p.chromaX265_CSP_I422.cuBLOCK_8x8.sa8d   = p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd;
2048
+    p.chromaX265_CSP_I422.cuBLOCK_16x16.sa8d = sa8d8<8, 16>;
2049
+    p.chromaX265_CSP_I422.cuBLOCK_32x32.sa8d = sa8d16<16, 32>;
2050
+    p.chromaX265_CSP_I422.cuBLOCK_64x64.sa8d = sa8d16<32, 64>;
2051
+
2052
+
2053
+}
2054
+
2055
+
2056
+}
2057
+
2058
+
2059
+#endif
2060
+
2061
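Once setupPixelPrimitives_neon() has populated the function table, these routines are reached through the same EncoderPrimitives entries as the C and x86 versions. A hypothetical call-site sketch (the buffer names and reference stride are placeholders, not taken from this patch):

    // Illustrative only: score a 16x16 luma PU through the primitive table.
    static int lumaSad16x16(const X265_NS::EncoderPrimitives &p,
                            const pixel *fenc, const pixel *fref, intptr_t refStride)
    {
        return p.pu[LUMA_16x16].sad(fenc, FENC_STRIDE, fref, refStride);
    }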
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.h Added
25
 
1
@@ -0,0 +1,23 @@
2
+#ifndef PIXEL_PRIM_NEON_H__
3
+#define PIXEL_PRIM_NEON_H__
4
+
5
+#include "common.h"
6
+#include "slicetype.h"      // LOWRES_COST_MASK
7
+#include "primitives.h"
8
+#include "x265.h"
9
+
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+
17
+void setupPixelPrimitives_neon(EncoderPrimitives &p);
18
+
19
+
20
+}
21
+
22
+
23
+#endif
24
+
25
x265_3.6.tar.gz/source/common/aarch64/pixel-util-common.S Added
86
 
1
@@ -0,0 +1,84 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.macro pixel_var_start
39
+    movi            v0.16b, #0
40
+    movi            v1.16b, #0
41
+    movi            v2.16b, #0
42
+    movi            v3.16b, #0
43
+.endm
44
+
45
+.macro pixel_var_1 v
46
+    uaddw           v0.8h, v0.8h, \v\().8b
47
+    umull           v30.8h, \v\().8b, \v\().8b
48
+    uaddw2          v1.8h, v1.8h, \v\().16b
49
+    umull2          v31.8h, \v\().16b, \v\().16b
50
+    uadalp          v2.4s, v30.8h
51
+    uadalp          v3.4s, v31.8h
52
+.endm
53
+
54
+.macro pixel_var_end
55
+    uaddlv          s0, v0.8h
56
+    uaddlv          s1, v1.8h
57
+    add             v2.4s, v2.4s, v3.4s
58
+    fadd            s0, s0, s1
59
+    uaddlv          d2, v2.4s
60
+    fmov            w0, s0
61
+    fmov            x2, d2
62
+    orr             x0, x0, x2, lsl #32
63
+.endm
64
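The pixel_var_* macros above keep a running pixel sum in v0/v1 and a running sum of squares in v2/v3, then pack both into one 64-bit return value. A scalar C++ sketch of the same contract (the _sketch name is illustrative, not x265's):

    #include <cstdint>
    #include <cstddef>

    // Low 32 bits: sum of pixels; high 32 bits: sum of squared pixels,
    // matching the final "orr x0, x0, x2, lsl #32" in pixel_var_end.
    static uint64_t pixel_var_sketch(const uint8_t* pix, ptrdiff_t stride, int size)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < size; y++, pix += stride)
            for (int x = 0; x < size; x++)
            {
                sum += pix[x];
                sqr += (uint32_t)pix[x] * pix[x];
            }
        return sum | ((uint64_t)sqr << 32);
    }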
+
65
+.macro ssimDist_start
66
+    movi            v0.16b, #0
67
+    movi            v1.16b, #0
68
+.endm
69
+
70
+.macro ssimDist_end
71
+    uaddlv          d0, v0.4s
72
+    uaddlv          d1, v1.4s
73
+    str             d0, [x6]
74
+    str             d1, [x4]
75
+.endm
76
+
77
+.macro normFact_start
78
+    movi            v0.16b, #0
79
+.endm
80
+
81
+.macro normFact_end
82
+    uaddlv          d0, v0.4s
83
+    str             d0, [x3]
84
+.endm
85
+
86
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S Added
375
 
1
@@ -0,0 +1,373 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sub_ps_8x16_sve)
41
+    lsl             x1, x1, #1
42
+    ptrue           p0.h, vl8
43
+.rept 8
44
+    ld1b            {z0.h}, p0/z, [x2]
45
+    ld1b            {z1.h}, p0/z, [x3]
46
+    add             x2, x2, x4
47
+    add             x3, x3, x5
48
+    ld1b            {z2.h}, p0/z, [x2]
49
+    ld1b            {z3.h}, p0/z, [x3]
50
+    add             x2, x2, x4
51
+    add             x3, x3, x5
52
+    sub             z4.h, z0.h, z1.h
53
+    sub             z5.h, z2.h, z3.h
54
+    st1             {v4.8h}, [x0], x1
55
+    st1             {v5.8h}, [x0], x1
56
+.endr
57
+    ret
58
+endfunc
59
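pixel_sub_ps produces int16 residuals (org minus pred); the destination stride is doubled up front because it is counted in halfwords. A scalar sketch under the shape of x265's primitive signature (names are illustrative):

    #include <cstdint>
    #include <cstddef>

    static void pixel_sub_ps_sketch(int16_t* dst, ptrdiff_t dstStride,
                                    const uint8_t* org, const uint8_t* pred,
                                    ptrdiff_t orgStride, ptrdiff_t predStride,
                                    int w, int h)       // 8x16 in the function above
    {
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
                dst[x] = (int16_t)(org[x] - pred[x]);   // widen; no clipping needed
            dst += dstStride;
            org += orgStride;
            pred += predStride;
        }
    }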
+
60
+//******* satd *******
61
+.macro satd_4x4_sve
62
+    ld1b            {z0.h}, p0/z, [x0]
63
+    ld1b            {z2.h}, p0/z, [x2]
64
+    add             x0, x0, x1
65
+    add             x2, x2, x3
66
+    ld1b            {z1.h}, p0/z, [x0]
67
+    ld1b            {z3.h}, p0/z, [x2]
68
+    add             x0, x0, x1
69
+    add             x2, x2, x3
70
+    ld1b            {z4.h}, p0/z, [x0]
71
+    ld1b            {z6.h}, p0/z, [x2]
72
+    add             x0, x0, x1
73
+    add             x2, x2, x3
74
+    ld1b            {z5.h}, p0/z, [x0]
75
+    ld1b            {z7.h}, p0/z, [x2]
76
+    add             x0, x0, x1
77
+    add             x2, x2, x3
78
+
79
+    sub             z0.h, z0.h, z2.h
80
+    sub             z1.h, z1.h, z3.h
81
+    sub             z2.h, z4.h, z6.h
82
+    sub             z3.h, z5.h, z7.h
83
+
84
+    add             z4.h, z0.h, z2.h
85
+    add             z5.h, z1.h, z3.h
86
+    sub             z6.h, z0.h, z2.h
87
+    sub             z7.h, z1.h, z3.h
88
+
89
+    add             z0.h, z4.h, z5.h
90
+    sub             z1.h, z4.h, z5.h
91
+
92
+    add             z2.h, z6.h, z7.h
93
+    sub             z3.h, z6.h, z7.h
94
+
95
+    trn1            z4.h, z0.h, z2.h
96
+    trn2            z5.h, z0.h, z2.h
97
+
98
+    trn1            z6.h, z1.h, z3.h
99
+    trn2            z7.h, z1.h, z3.h
100
+
101
+    add             z0.h, z4.h, z5.h
102
+    sub             z1.h, z4.h, z5.h
103
+
104
+    add             z2.h, z6.h, z7.h
105
+    sub             z3.h, z6.h, z7.h
106
+
107
+    trn1            z4.s, z0.s, z1.s
108
+    trn2            z5.s, z0.s, z1.s
109
+
110
+    trn1            z6.s, z2.s, z3.s
111
+    trn2            z7.s, z2.s, z3.s
112
+
113
+    abs             z4.h, p0/m, z4.h
114
+    abs             z5.h, p0/m, z5.h
115
+    abs             z6.h, p0/m, z6.h
116
+    abs             z7.h, p0/m, z7.h
117
+
118
+    smax            z4.h, p0/m, z4.h, z5.h
119
+    smax            z6.h, p0/m, z6.h, z7.h
120
+
121
+    add             z0.h, z4.h, z6.h
122
+
123
+    uaddlp          v0.2s, v0.4h
124
+    uaddlp          v0.1d, v0.2s
125
+.endm
126
+
127
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
128
+function PFX(pixel_satd_4x4_sve)
129
+    ptrue           p0.h, vl4
130
+    satd_4x4_sve
131
+    fmov            x0, d0
132
+    ret
133
+endfunc
134
+
135
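For orientation, a plain scalar version of the 4x4 SATD computed above: difference, 2-D 4-point Hadamard, sum of absolute values, halved. x265's actual C reference packs two sums per machine word, so treat this as a simplified sketch:

    #include <cstdint>
    #include <cstddef>
    #include <cstdlib>

    static int satd4x4_sketch(const uint8_t* p1, ptrdiff_t s1,
                              const uint8_t* p2, ptrdiff_t s2)
    {
        int m[4][4];
        for (int i = 0; i < 4; i++)            // residual block
            for (int j = 0; j < 4; j++)
                m[i][j] = p1[i * s1 + j] - p2[i * s2 + j];
        for (int i = 0; i < 4; i++)            // horizontal Hadamard pass
        {
            int a = m[i][0] + m[i][1], b = m[i][0] - m[i][1];
            int c = m[i][2] + m[i][3], d = m[i][2] - m[i][3];
            m[i][0] = a + c; m[i][1] = b + d;
            m[i][2] = a - c; m[i][3] = b - d;
        }
        int sum = 0;
        for (int j = 0; j < 4; j++)            // vertical pass, accumulate |coeff|
        {
            int a = m[0][j] + m[1][j], b = m[0][j] - m[1][j];
            int c = m[2][j] + m[3][j], d = m[2][j] - m[3][j];
            sum += abs(a + c) + abs(b + d) + abs(a - c) + abs(b - d);
        }
        return sum >> 1;                       // conventional SATD normalization
    }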
+function PFX(pixel_satd_8x4_sve)
136
+    ptrue           p0.h, vl4
137
+    mov             x4, x0
138
+    mov             x5, x2
139
+    satd_4x4_sve
140
+    add             x0, x4, #4
141
+    add             x2, x5, #4
142
+    umov            x6, v0.d[0]
143
+    satd_4x4_sve
144
+    umov            x0, v0.d[0]
145
+    add             x0, x0, x6
146
+    ret
147
+endfunc
148
+
149
+function PFX(pixel_satd_8x12_sve)
150
+    ptrue           p0.h, vl4
151
+    mov             x4, x0
152
+    mov             x5, x2
153
+    mov             x7, #0
154
+    satd_4x4_sve
155
+    umov            x6, v0.d[0]
156
+    add             x7, x7, x6
157
+    add             x0, x4, #4
158
+    add             x2, x5, #4
159
+    satd_4x4_sve
160
+    umov            x6, v0.d[0]
161
+    add             x7, x7, x6
162
+.rept 2
163
+    sub             x0, x0, #4
164
+    sub             x2, x2, #4
165
+    mov             x4, x0
166
+    mov             x5, x2
167
+    satd_4x4_sve
168
+    umov            x6, v0.d[0]
169
+    add             x7, x7, x6
170
+    add             x0, x4, #4
171
+    add             x2, x5, #4
172
+    satd_4x4_sve
173
+    umov            x6, v0.d[0]
174
+    add             x7, x7, x6
175
+.endr
176
+    mov             x0, x7
177
+    ret
178
+endfunc
179
+
180
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
181
+    mov             x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
182
+    ld1b            {z0.h}, p0/z, [x0]
183
+    ld1b            {z1.h}, p0/z, [x0, x11]
184
+    ld1b            {z2.h}, p0/z, [x2]
185
+    ld1b            {z3.h}, p0/z, [x2, x11]
186
+    add             x0, x0, x1
187
+    add             x2, x2, x3
188
+    ld1b            {z4.h}, p0/z, [x0]
189
+    ld1b            {z5.h}, p0/z, [x0, x11]
190
+    ld1b            {z6.h}, p0/z, [x2]
191
+    ld1b            {z7.h}, p0/z, [x2, x11]
192
+    add             x0, x0, x1
193
+    add             x2, x2, x3
194
+    ld1b            {z29.h}, p0/z, [x0]
195
+    ld1b            {z9.h}, p0/z, [x0, x11]
196
+    ld1b            {z10.h}, p0/z, [x2]
197
+    ld1b            {z11.h}, p0/z, [x2, x11]
198
+    add             x0, x0, x1
199
+    add             x2, x2, x3
200
+    ld1b            {z12.h}, p0/z, [x0]
201
+    ld1b            {z13.h}, p0/z, [x0, x11]
202
+    ld1b            {z14.h}, p0/z, [x2]
203
+    ld1b            {z15.h}, p0/z, [x2, x11]
204
+    add             x0, x0, x1
205
+    add             x2, x2, x3
206
+
207
+    sub             \v0\().h, z0.h, z2.h
208
+    sub             \v4\().h, z1.h, z3.h
209
+    sub             \v1\().h, z4.h, z6.h
210
+    sub             \v5\().h, z5.h, z7.h
211
+    sub             \v2\().h, z29.h, z10.h
212
+    sub             \v6\().h, z9.h, z11.h
213
+    sub             \v3\().h, z12.h, z14.h
214
+    sub             \v7\().h, z13.h, z15.h
215
+.endm
216
+
217
+// one vertical hadamard pass and two horizontal
218
+function PFX(satd_8x4v_8x8h_sve), export=0
219
+    HADAMARD4_V     z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
220
+    HADAMARD4_V     z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
221
+    trn4            z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
222
+    trn4            z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
223
+    SUMSUB_ABCD     z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
224
+    SUMSUB_ABCD     z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
225
+    trn4            z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
226
+    trn4            z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
227
+    ABS8_SVE        z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
228
+    smax            z0.h, p0/m, z0.h, z2.h
229
+    smax            z1.h, p0/m, z1.h, z3.h
230
+    smax            z4.h, p0/m, z4.h, z6.h
231
+    smax            z5.h, p0/m, z5.h, z7.h
232
+    ret
233
+endfunc
234
+
235
+function PFX(satd_16x4_sve), export=0
236
+    LOAD_DIFF_16x4_sve  z16, z17, z18, z19, z20, z21, z22, z23
237
+    b                    PFX(satd_8x4v_8x8h_sve)
238
+endfunc
239
+
240
+.macro pixel_satd_32x8_sve
241
+    mov             x4, x0
242
+    mov             x5, x2
243
+.rept 2
244
+    bl              PFX(satd_16x4_sve)
245
+    add             z30.h, z30.h, z0.h
246
+    add             z31.h, z31.h, z1.h
247
+    add             z30.h, z30.h, z4.h
248
+    add             z31.h, z31.h, z5.h
249
+.endr
250
+    add             x0, x4, #16
251
+    add             x2, x5, #16
252
+.rept 2
253
+    bl              PFX(satd_16x4_sve)
254
+    add             z30.h, z30.h, z0.h
255
+    add             z31.h, z31.h, z1.h
256
+    add             z30.h, z30.h, z4.h
257
+    add             z31.h, z31.h, z5.h
258
+.endr
259
+.endm
260
+
261
+.macro satd_32x16_sve
262
+    movi            v30.2d, #0
263
+    movi            v31.2d, #0
264
+    pixel_satd_32x8_sve
265
+    sub             x0, x0, #16
266
+    sub             x2, x2, #16
267
+    pixel_satd_32x8_sve
268
+    add             z0.h, z30.h, z31.h
269
+    uaddlv          s0, v0.8h
270
+    mov             w6, v0.s[0]
271
+.endm
272
+
273
+function PFX(pixel_satd_32x16_sve)
274
+    ptrue           p0.h, vl8
275
+    mov             x10, x30
276
+    satd_32x16_sve
277
+    mov             x0, x6
278
+    ret             x10
279
+endfunc
280
+
281
+function PFX(pixel_satd_32x32_sve)
282
+    ptrue           p0.h, vl8
283
+    mov             x10, x30
284
+    mov             x7, #0
285
+    satd_32x16_sve
286
+    sub             x0, x0, #16
287
+    sub             x2, x2, #16
288
+    add             x7, x7, x6
289
+    satd_32x16_sve
290
+    add             x0, x7, x6
291
+    ret             x10
292
+endfunc
293
+
294
+.macro satd_64x16_sve
295
+    mov             x8, x0
296
+    mov             x9, x2
297
+    satd_32x16_sve
298
+    add             x7, x7, x6
299
+    add             x0, x8, #32
300
+    add             x2, x9, #32
301
+    satd_32x16_sve
302
+    add             x7, x7, x6
303
+.endm
304
+
305
+function PFX(pixel_satd_64x48_sve)
306
+    ptrue           p0.h, vl8
307
+    mov             x10, x30
308
+    mov             x7, #0
309
+.rept 2
310
+    satd_64x16_sve
311
+    sub             x0, x0, #48
312
+    sub             x2, x2, #48
313
+.endr
314
+    satd_64x16_sve
315
+    mov             x0, x7
316
+    ret             x10
317
+endfunc
318
+
319
+/********* quant ***********/
320
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
321
+// No need to fully use sve instructions for this function
322
+function PFX(quant_sve)
323
+    mov             w9, #1
324
+    lsl             w9, w9, w4
325
+    mov             z0.s, w9
326
+    neg             w9, w4
327
+    mov             z1.s, w9
328
+    add             w9, w9, #8
329
+    mov             z2.s, w9
330
+    mov             z3.s, w5
331
+
332
+    lsr             w6, w6, #2
333
+    eor             z4.d, z4.d, z4.d
334
+    eor             w10, w10, w10
335
+    eor             z17.d, z17.d, z17.d
336
+
337
+.loop_quant_sve:
338
+    ld1             {v18.4h}, [x0], #8
339
+    ld1             {v7.4s}, [x1], #16
340
+    sxtl            v6.4s, v18.4h
341
+
342
+    cmlt            v5.4s, v6.4s, #0
343
+
344
+    abs             v6.4s, v6.4s
345
+
346
+
347
+    mul             v6.4s, v6.4s, v7.4s
348
+
349
+    add             v7.4s, v6.4s, v3.4s
350
+    sshl            v7.4s, v7.4s, v1.4s
351
+
352
+    mls             v6.4s, v7.4s, v0.s[0]
353
+    sshl            v16.4s, v6.4s, v2.4s
354
+    st1             {v16.4s}, [x2], #16
355
+
356
+    // numsig
357
+    cmeq            v16.4s, v7.4s, v17.4s
358
+    add             v4.4s, v4.4s, v16.4s
359
+    add             w10, w10, #4
360
+
361
+    // level *= sign
362
+    eor             z16.d, z7.d, z5.d
363
+    sub             v16.4s, v16.4s, v5.4s
364
+    sqxtn           v5.4h, v16.4s
365
+    st1             {v5.4h}, [x3], #8
366
+
367
+    subs            w6, w6, #1
368
+    b.ne             .loop_quant_sve
369
+
370
+    addv            s4, v4.4s
371
+    mov             w9, v4.s[0]
372
+    add             w0, w10, w9
373
+    ret
374
+endfunc
375
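The scalar contract behind the quant kernel above, sketched from the signature in its comment (rounding and clipping follow the usual HEVC quantizer; this is an approximation for reading the assembly, not the verbatim x265 reference):

    #include <cstdint>
    #include <cstdlib>

    static uint32_t quant_sketch(const int16_t* coef, const int32_t* quantCoeff,
                                 int32_t* deltaU, int16_t* qCoef,
                                 int qBits, int add, int numCoeff)
    {
        uint32_t numSig = 0;                       // returned: count of nonzero levels
        for (int i = 0; i < numCoeff; i++)
        {
            int sign  = coef[i] < 0 ? -1 : 1;
            int64_t t = (int64_t)abs(coef[i]) * quantCoeff[i];
            int level = (int)((t + add) >> qBits);
            deltaU[i] = (int32_t)((t - ((int64_t)level << qBits)) >> (qBits - 8));
            if (level)
                numSig++;
            level *= sign;
            if (level < -32768) level = -32768;    // clip to int16 (sqxtn in the asm)
            if (level >  32767) level =  32767;
            qCoef[i] = (int16_t)level;
        }
        return numSig;
    }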
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S Added
1688
 
1
@@ -0,0 +1,1686 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
41
+function PFX(pixel_var_8x8_sve2)
42
+    ptrue           p0.h, vl8
43
+    ld1b            {z0.h}, p0/z, [x0]
44
+    add             x0, x0, x1
45
+    mul             z31.h, z0.h, z0.h
46
+    uaddlp          v1.4s, v31.8h
47
+.rept 7
48
+    ld1b            {z4.h}, p0/z, [x0]
49
+    add             x0, x0, x1
50
+    add             z0.h, z0.h, z4.h
51
+    mul             z31.h, z4.h, z4.h
52
+    uadalp          z1.s, p0/m, z31.h
53
+.endr
54
+    uaddlv          s0, v0.8h
55
+    uaddlv          d1, v1.4s
56
+    fmov            w0, s0
57
+    fmov            x1, d1
58
+    orr             x0, x0, x1, lsl #32
59
+    ret
60
+endfunc
61
+
62
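Every SVE2 function below opens with the same runtime dispatch: "rdvl x9, #1" reads the vector length in bytes, then execution branches either to a 128-bit (NEON-shaped) path or to the ".vl_gt_16_*" and wider paths. The equivalent probe from C++, using the ACLE intrinsic svcntb() (illustrative; requires compiling for an SVE target):

    #include <arm_sve.h>

    // True when SVE registers are wider than NEON's 128 bits, i.e. the case
    // the ".vl_gt_16_*" labels handle.
    static bool sveWiderThanNeon()
    {
        return svcntb() > 16;   // svcntb() = bytes per vector, like "rdvl #1"
    }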
+function PFX(pixel_var_16x16_sve2)
63
+    rdvl            x9, #1
64
+    cmp             x9, #16
65
+    bgt             .vl_gt_16_pixel_var_16x16
66
+    pixel_var_start
67
+    mov             w12, #16
68
+.loop_var_16_sve2:
69
+    sub             w12, w12, #1
70
+    ld1             {v4.16b}, [x0], x1
71
+    pixel_var_1 v4
72
+    cbnz            w12, .loop_var_16_sve2
73
+    pixel_var_end
74
+    ret
75
+.vl_gt_16_pixel_var_16x16:
76
+    ptrue           p0.h, vl16
77
+    mov             z0.d, #0
+    mov             z1.d, #0                // z1 accumulates the squares below; it must start at zero
78
+.rept 16
79
+    ld1b            {z4.h}, p0/z, [x0]
80
+    add             x0, x0, x1
81
+    add             z0.h, z0.h, z4.h
82
+    mul             z30.h, z4.h, z4.h
83
+    uadalp          z1.s, p0/m, z30.h
84
+.endr
85
+    uaddv           d0, p0, z0.h
86
+    uaddv           d1, p0, z1.s
87
+    fmov            w0, s0
88
+    fmov            x1, d1
89
+    orr             x0, x0, x1, lsl #32
90
+    ret
91
+endfunc
92
+
93
+function PFX(pixel_var_32x32_sve2)
94
+    rdvl            x9, #1
95
+    cmp             x9, #16
96
+    bgt             .vl_gt_16_pixel_var_32x32
97
+    pixel_var_start
98
+    mov             w12, #32
99
+.loop_var_32_sve2:
100
+    sub             w12, w12, #1
101
+    ld1             {v4.16b-v5.16b}, x0, x1
102
+    pixel_var_1 v4
103
+    pixel_var_1 v5
104
+    cbnz            w12, .loop_var_32_sve2
105
+    pixel_var_end
106
+    ret
107
+.vl_gt_16_pixel_var_32x32:
108
+    cmp             x9, #48
109
+    bgt             .vl_gt_48_pixel_var_32x32
110
+    ptrue           p0.b, vl32
111
+    mov             z0.d, #0
112
+    mov             z1.d, #0
113
+.rept 32
114
+    ld1b            {z4.b}, p0/z, x0
115
+    add             x0, x0, x1
116
+    uaddwb          z0.h, z0.h, z4.b
117
+    uaddwt          z0.h, z0.h, z4.b
118
+    umullb          z28.h, z4.b, z4.b
119
+    umullt          z29.h, z4.b, z4.b
120
+    uadalp          z1.s, p0/m, z28.h
121
+    uadalp          z1.s, p0/m, z29.h
122
+.endr
123
+    uaddv           d0, p0, z0.h
124
+    uaddv           d1, p0, z1.s
125
+    fmov            w0, s0
126
+    fmov            x1, d1
127
+    orr             x0, x0, x1, lsl #32
128
+    ret
129
+.vl_gt_48_pixel_var_32x32:
130
+    ptrue           p0.h, vl32
131
+    mov             z0.d, #0
132
+    mov             z1.d, #0
133
+.rept 32
134
+    ld1b            {z4.h}, p0/z, x0
135
+    add             x0, x0, x1
136
+    add             z0.h, z0.h, z4.h
137
+    mul             z28.h, z4.h, z4.h
138
+    uadalp          z1.s, p0/m, z28.h
139
+.endr
140
+    uaddv           d0, p0, z0.h
141
+    uaddv           d1, p0, z1.s
142
+    fmov            w0, s0
143
+    fmov            x1, d1
144
+    orr             x0, x0, x1, lsl #32
145
+    ret
146
+endfunc
147
+
148
+function PFX(pixel_var_64x64_sve2)
149
+    rdvl            x9, #1
150
+    cmp             x9, #16
151
+    bgt             .vl_gt_16_pixel_var_64x64
152
+    pixel_var_start
153
+    mov             w12, #64
154
+.loop_var_64_sve2:
155
+    sub             w12, w12, #1
156
+    ld1             {v4.16b-v7.16b}, x0, x1
157
+    pixel_var_1 v4
158
+    pixel_var_1 v5
159
+    pixel_var_1 v6
160
+    pixel_var_1 v7
161
+    cbnz            w12, .loop_var_64_sve2
162
+    pixel_var_end
163
+    ret
164
+.vl_gt_16_pixel_var_64x64:
165
+    cmp             x9, #48
166
+    bgt             .vl_gt_48_pixel_var_64x64
167
+    ptrue           p0.b, vl32
168
+    mov             z0.d, #0
169
+    mov             z2.d, #0
170
+.rept 64
171
+    ld1b            {z4.b}, p0/z, x0
172
+    ld1b            {z5.b}, p0/z, x0, #1, mul vl
173
+    add             x0, x0, x1
174
+    uaddwb          z0.h, z0.h, z4.b
175
+    uaddwt          z0.h, z0.h, z4.b
176
+    uaddwb          z0.h, z0.h, z5.b
177
+    uaddwt          z0.h, z0.h, z5.b
178
+    umullb          z24.h, z4.b, z4.b
179
+    umullt          z25.h, z4.b, z4.b
180
+    umullb          z26.h, z5.b, z5.b
181
+    umullt          z27.h, z5.b, z5.b
182
+    uadalp          z2.s, p0/m, z24.h
183
+    uadalp          z2.s, p0/m, z25.h
184
+    uadalp          z2.s, p0/m, z26.h
185
+    uadalp          z2.s, p0/m, z27.h
186
+.endr
187
+    uaddv           d0, p0, z0.h
188
+    uaddv           d1, p0, z2.s
189
+    fmov            w0, s0
190
+    fmov            x1, d1
191
+    orr             x0, x0, x1, lsl #32
192
+    ret
193
+.vl_gt_48_pixel_var_64x64:
194
+    cmp             x9, #112
195
+    bgt             .vl_gt_112_pixel_var_64x64
196
+    ptrue           p0.b, vl64
197
+    mov             z0.d, #0
198
+    mov             z2.d, #0                // the loop accumulates squares into z2, not z1
199
+.rept 64
200
+    ld1b            {z4.b}, p0/z, x0
201
+    add             x0, x0, x1
202
+    uaddwb          z0.h, z0.h, z4.b
203
+    uaddwt          z0.h, z0.h, z4.b
204
+    umullb          z24.h, z4.b, z4.b
205
+    umullt          z25.h, z4.b, z4.b
206
+    uadalp          z2.s, p0/m, z24.h
207
+    uadalp          z2.s, p0/m, z25.h
208
+.endr
209
+    uaddv           d0, p0, z0.h
210
+    uaddv           d1, p0, z2.s
211
+    fmov            w0, s0
212
+    fmov            x1, d1
213
+    orr             x0, x0, x1, lsl #32
214
+    ret
215
+.vl_gt_112_pixel_var_64x64:
216
+    ptrue           p0.h, vl64
217
+    mov             z0.d, #0
218
+    mov             z1.d, #0
219
+.rept 64
220
+    ld1b            {z4.h}, p0/z, x0
221
+    add             x0, x0, x1
222
+    add             z0.h, z0.h, z4.h
223
+    mul             z24.h, z4.h, z4.h
224
+    uadalp          z1.s, p0/m, z24.h
225
+.endr
226
+    uaddv           d0, p0, z0.h
227
+    uaddv           d1, p0, z1.s
228
+    fmov            w0, s0
229
+    fmov            x1, d1
230
+    orr             x0, x0, x1, lsl #32
231
+    ret
232
+endfunc
233
+
234
+function PFX(getResidual16_sve2)
235
+    rdvl            x9, #1
236
+    cmp             x9, #16
237
+    bgt             .vl_gt_16_getResidual16
238
+    lsl             x4, x3, #1
239
+.rept 8
240
+    ld1             {v0.16b}, x0, x3
241
+    ld1             {v1.16b}, x1, x3
242
+    ld1             {v2.16b}, x0, x3
243
+    ld1             {v3.16b}, x1, x3
244
+    usubl           v4.8h, v0.8b, v1.8b
245
+    usubl2          v5.8h, v0.16b, v1.16b
246
+    usubl           v6.8h, v2.8b, v3.8b
247
+    usubl2          v7.8h, v2.16b, v3.16b
248
+    st1             {v4.8h-v5.8h}, x2, x4
249
+    st1             {v6.8h-v7.8h}, x2, x4
250
+.endr
251
+    ret
252
+.vl_gt_16_getResidual16:
253
+    ptrue           p0.h, vl16
254
+.rept 16
255
+    ld1b            {z0.h}, p0/z, x0
256
+    ld1b            {z2.h}, p0/z, x1
257
+    add             x0, x0, x3
258
+    add             x1, x1, x3
259
+    sub             z4.h, z0.h, z2.h
260
+    st1h            {z4.h}, p0, x2
261
+    add             x2, x2, x3, lsl #1
262
+.endr
263
+    ret
264
+endfunc
265
+
266
+function PFX(getResidual32_sve2)
267
+    rdvl            x9, #1
268
+    cmp             x9, #16
269
+    bgt             .vl_gt_16_getResidual32
270
+    lsl             x4, x3, #1
271
+    mov             w12, #4
272
+.loop_residual_32:
273
+    sub             w12, w12, #1
274
+.rept 4
275
+    ld1             {v0.16b-v1.16b}, x0, x3
276
+    ld1             {v2.16b-v3.16b}, x1, x3
277
+    ld1             {v4.16b-v5.16b}, x0, x3
278
+    ld1             {v6.16b-v7.16b}, x1, x3
279
+    usubl           v16.8h, v0.8b, v2.8b
280
+    usubl2          v17.8h, v0.16b, v2.16b
281
+    usubl           v18.8h, v1.8b, v3.8b
282
+    usubl2          v19.8h, v1.16b, v3.16b
283
+    usubl           v20.8h, v4.8b, v6.8b
284
+    usubl2          v21.8h, v4.16b, v6.16b
285
+    usubl           v22.8h, v5.8b, v7.8b
286
+    usubl2          v23.8h, v5.16b, v7.16b
287
+    st1             {v16.8h-v19.8h}, x2, x4
288
+    st1             {v20.8h-v23.8h}, x2, x4
289
+.endr
290
+    cbnz            w12, .loop_residual_32
291
+    ret
292
+.vl_gt_16_getResidual32:
293
+    cmp             x9, #48
294
+    bgt             .vl_gt_48_getResidual32
295
+    ptrue           p0.b, vl32
296
+.rept 32
297
+    ld1b            {z0.b}, p0/z, x0
298
+    ld1b            {z2.b}, p0/z, x1
299
+    add             x0, x0, x3
300
+    add             x1, x1, x3
301
+    usublb          z4.h, z0.b, z2.b
302
+    usublt          z5.h, z0.b, z2.b
303
+    st2h            {z4.h, z5.h}, p0, x2
304
+    add             x2, x2, x3, lsl #1
305
+.endr
306
+    ret
307
+.vl_gt_48_getResidual32:
308
+    ptrue           p0.h, vl32
309
+.rept 32
310
+    ld1b            {z0.h}, p0/z, x0
311
+    ld1b            {z4.h}, p0/z, x1
312
+    add             x0, x0, x3
313
+    add             x1, x1, x3
314
+    sub             z8.h, z0.h, z4.h
315
+    st1h            {z8.h}, p0, x2
316
+    add             x2, x2, x3, lsl #1
317
+.endr
318
+    ret
319
+endfunc
320
+
321
+function PFX(pixel_sub_ps_32x32_sve2)
322
+    rdvl            x9, #1
323
+    cmp             x9, #16
324
+    bgt             .vl_gt_16_pixel_sub_ps_32x32
325
+    lsl             x1, x1, #1
326
+    mov             w12, #4
327
+.loop_sub_ps_32_sve2:
328
+    sub             w12, w12, #1
329
+.rept 4
330
+    ld1             {v0.16b-v1.16b}, x2, x4
331
+    ld1             {v2.16b-v3.16b}, x3, x5
332
+    ld1             {v4.16b-v5.16b}, x2, x4
333
+    ld1             {v6.16b-v7.16b}, x3, x5
334
+    usubl           v16.8h, v0.8b, v2.8b
335
+    usubl2          v17.8h, v0.16b, v2.16b
336
+    usubl           v18.8h, v1.8b, v3.8b
337
+    usubl2          v19.8h, v1.16b, v3.16b
338
+    usubl           v20.8h, v4.8b, v6.8b
339
+    usubl2          v21.8h, v4.16b, v6.16b
340
+    usubl           v22.8h, v5.8b, v7.8b
341
+    usubl2          v23.8h, v5.16b, v7.16b
342
+    st1             {v16.8h-v19.8h}, x0, x1
343
+    st1             {v20.8h-v23.8h}, x0, x1
344
+.endr
345
+    cbnz            w12, .loop_sub_ps_32_sve2
346
+    ret
347
+.vl_gt_16_pixel_sub_ps_32x32:
348
+    cmp             x9, #48
349
+    bgt             .vl_gt_48_pixel_sub_ps_32x32
350
+    ptrue           p0.b, vl32
351
+    mov             w12, #8
352
+.vl_gt_16_loop_sub_ps_32_sve2:
353
+    sub             w12, w12, #1
354
+.rept 4
355
+    ld1b            {z0.b}, p0/z, x2
356
+    ld1b            {z2.b}, p0/z, x3
357
+    add             x2, x2, x4
358
+    add             x3, x3, x5
359
+    usublb          z16.h, z0.b, z2.b
360
+    usublt          z17.h, z0.b, z2.b
361
+    st2h            {z16.h, z17.h}, p0, x0
362
+    add             x0, x0, x1, lsl #1
363
+.endr
364
+    cbnz            w12, .vl_gt_16_loop_sub_ps_32_sve2
365
+    ret
366
+.vl_gt_48_pixel_sub_ps_32x32:
367
+    ptrue           p0.h, vl32
368
+    mov             w12, #8
369
+.vl_gt_48_loop_sub_ps_32_sve2:
370
+    sub             w12, w12, #1
371
+.rept 4
372
+    ld1b            {z0.h}, p0/z, x2
373
+    ld1b            {z4.h}, p0/z, x3
374
+    add             x2, x2, x4
375
+    add             x3, x3, x5
376
+    sub             z8.h, z0.h, z4.h
377
+    st1h            {z8.h}, p0, x0
378
+    add             x0, x0, x1, lsl #1
379
+.endr
380
+    cbnz            w12, .vl_gt_48_loop_sub_ps_32_sve2
381
+    ret
382
+endfunc
383
+
384
+function PFX(pixel_sub_ps_64x64_sve2)
385
+    rdvl            x9, #1
386
+    cmp             x9, #16
387
+    bgt             .vl_gt_16_pixel_sub_ps_64x64
388
+    lsl             x1, x1, #1
389
+    sub             x1, x1, #64
390
+    mov             w12, #16
391
+.loop_sub_ps_64_sve2:
392
+    sub             w12, w12, #1
393
+.rept 4
394
+    ld1             {v0.16b-v3.16b}, x2, x4
395
+    ld1             {v4.16b-v7.16b}, x3, x5
396
+    usubl           v16.8h, v0.8b, v4.8b
397
+    usubl2          v17.8h, v0.16b, v4.16b
398
+    usubl           v18.8h, v1.8b, v5.8b
399
+    usubl2          v19.8h, v1.16b, v5.16b
400
+    usubl           v20.8h, v2.8b, v6.8b
401
+    usubl2          v21.8h, v2.16b, v6.16b
402
+    usubl           v22.8h, v3.8b, v7.8b
403
+    usubl2          v23.8h, v3.16b, v7.16b
404
+    st1             {v16.8h-v19.8h}, x0, #64
405
+    st1             {v20.8h-v23.8h}, x0, x1
406
+.endr
407
+    cbnz            w12, .loop_sub_ps_64_sve2
408
+    ret
409
+.vl_gt_16_pixel_sub_ps_64x64:
410
+    cmp             x9, #48
411
+    bgt             .vl_gt_48_pixel_sub_ps_64x64
412
413
+    ptrue           p0.b, vl32
414
+    mov             w12, #16
415
+.vl_gt_16_loop_sub_ps_64_sve2:
416
+    sub             w12, w12, #1
417
+.rept 4
418
+    ld1b            {z0.b}, p0/z, x2
419
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
420
+    ld1b            {z4.b}, p0/z, x3
421
+    ld1b            {z5.b}, p0/z, x3, #1, mul vl
422
+    add             x2, x2, x4
423
+    add             x3, x3, x5
424
+    usublb          z16.h, z0.b, z4.b
425
+    usublt          z17.h, z0.b, z4.b
426
+    usublb          z18.h, z1.b, z5.b
427
+    usublt          z19.h, z1.b, z5.b
428
+    st2h            {z16.h, z17.h}, p0, x0
429
+    st2h            {z18.h, z19.h}, p0, x0, #2, mul vl
430
+    add             x0, x0, x1, lsl #1
431
+.endr
432
+    cbnz            w12, .vl_gt_16_loop_sub_ps_64_sve2
433
+    ret
434
+.vl_gt_48_pixel_sub_ps_64x64:
435
+    cmp             x9, #112
436
+    bgt             .vl_gt_112_pixel_sub_ps_64x64
437
+    ptrue           p0.b, vl64
438
+    mov             w12, #16
439
+.vl_gt_48_loop_sub_ps_64_sve2:
440
+    sub             w12, w12, #1
441
+.rept 4
442
+    ld1b            {z0.b}, p0/z, x2
443
+    ld1b            {z4.b}, p0/z, x3
444
+    add             x2, x2, x4
445
+    add             x3, x3, x5
446
+    usublb          z16.h, z0.b, z4.b
447
+    usublt          z17.h, z0.b, z4.b
448
+    st2h            {z16.h, z17.h}, p0, x0
449
+    add             x0, x0, x1, lsl #1
450
+.endr
451
+    cbnz            w12, .vl_gt_48_loop_sub_ps_64_sve2
452
+    ret
453
+.vl_gt_112_pixel_sub_ps_64x64:
454
+    ptrue           p0.h, vl64
455
+    mov             w12, #16
456
+.vl_gt_112_loop_sub_ps_64_sve2:
457
+    sub             w12, w12, #1
458
+.rept 4
459
+    ld1b            {z0.h}, p0/z, x2
460
+    ld1b            {z8.h}, p0/z, x3
461
+    add             x2, x2, x4
462
+    add             x3, x3, x5
463
+    sub             z16.h, z0.h, z8.h
464
+    st1h            {z16.h}, p0, x0
465
+    add             x0, x0, x1, lsl #1
466
+.endr
467
+    cbnz            w12, .vl_gt_112_loop_sub_ps_64_sve2
468
+    ret
469
+endfunc
470
+
471
+function PFX(pixel_sub_ps_32x64_sve2)
472
+    rdvl            x9, #1
473
+    cmp             x9, #16
474
+    bgt             .vl_gt_16_pixel_sub_ps_32x64
475
+    lsl             x1, x1, #1
476
+    mov             w12, #8
477
+.loop_sub_ps_32x64_sve2:
478
+    sub             w12, w12, #1
479
+.rept 4
480
+    ld1             {v0.16b-v1.16b}, x2, x4
481
+    ld1             {v2.16b-v3.16b}, x3, x5
482
+    ld1             {v4.16b-v5.16b}, x2, x4
483
+    ld1             {v6.16b-v7.16b}, x3, x5
484
+    usubl           v16.8h, v0.8b, v2.8b
485
+    usubl2          v17.8h, v0.16b, v2.16b
486
+    usubl           v18.8h, v1.8b, v3.8b
487
+    usubl2          v19.8h, v1.16b, v3.16b
488
+    usubl           v20.8h, v4.8b, v6.8b
489
+    usubl2          v21.8h, v4.16b, v6.16b
490
+    usubl           v22.8h, v5.8b, v7.8b
491
+    usubl2          v23.8h, v5.16b, v7.16b
492
+    st1             {v16.8h-v19.8h}, x0, x1
493
+    st1             {v20.8h-v23.8h}, x0, x1
494
+.endr
495
+    cbnz            w12, .loop_sub_ps_32x64_sve2
496
+    ret
497
+.vl_gt_16_pixel_sub_ps_32x64:
498
+    cmp             x9, #48
499
+    bgt             .vl_gt_48_pixel_sub_ps_32x64
500
+    ptrue           p0.b, vl32
501
+    mov             w12, #8
502
+.vl_gt_16_loop_sub_ps_32x64_sve2:
503
+    sub             w12, w12, #1
504
+.rept 8
505
+    ld1b            {z0.b}, p0/z, x2
506
+    ld1b            {z2.b}, p0/z, x3
507
+    add             x2, x2, x4
508
+    add             x3, x3, x5
509
+    usublb          z16.h, z0.b, z2.b
510
+    usublt          z17.h, z0.b, z2.b
511
+    st2h            {z16.h, z17.h}, p0, x0
512
+    add             x0, x0, x1, lsl #1
513
+.endr
514
+    cbnz            w12, .vl_gt_16_loop_sub_ps_32x64_sve2
515
+    ret
516
+.vl_gt_48_pixel_sub_ps_32x64:
517
+    ptrue           p0.h, vl32
518
+    mov             w12, #8
519
+.vl_gt_48_loop_sub_ps_32x64_sve2:
520
+    sub             w12, w12, #1
521
+.rept 8
522
+    ld1b            {z0.h}, p0/z, x2
523
+    ld1b            {z4.h}, p0/z, x3
524
+    add             x2, x2, x4
525
+    add             x3, x3, x5
526
+    sub             z8.h, z0.h, z4.h
527
+    st1h            {z8.h}, p0, x0
528
+    add             x0, x0, x1, lsl #1
529
+.endr
530
+    cbnz            w12, .vl_gt_48_loop_sub_ps_32x64_sve2
531
+    ret
532
+endfunc
533
+
534
+function PFX(pixel_add_ps_4x4_sve2)
535
+    ptrue           p0.h, vl8
536
+    ptrue           p1.h, vl4
537
+.rept 4
538
+    ld1b            {z0.h}, p0/z, [x2]
539
+    ld1h            {z2.h}, p1/z, [x3]
540
+    add             x2, x2, x4
541
+    add             x3, x3, x5, lsl #1
542
+    add             z4.h, z0.h, z2.h
543
+    sqxtunb         z4.b, z4.h
544
+    st1b            {z4.h}, p1, [x0]
545
+    add             x0, x0, x1
546
+.endr
547
+    ret
548
+endfunc
549
+
550
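pixel_add_ps is the reconstruction step: pred plus int16 residual, saturated back to the 8-bit pixel range (the sqxtunb above performs the unsigned saturating narrow). A scalar sketch with illustrative names:

    #include <cstdint>
    #include <cstddef>

    static void pixel_add_ps_sketch(uint8_t* dst, ptrdiff_t dstStride,
                                    const uint8_t* pred, const int16_t* resi,
                                    ptrdiff_t predStride, ptrdiff_t resiStride,
                                    int w, int h)
    {
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
            {
                int v = pred[x] + resi[x];
                dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // saturate
            }
            dst  += dstStride;
            pred += predStride;
            resi += resiStride;
        }
    }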
+function PFX(pixel_add_ps_8x8_sve2)
551
+    ptrue           p0.h, vl8
552
+.rept 8
553
+    ld1b            {z0.h}, p0/z, x2
554
+    ld1h            {z2.h}, p0/z, x3
555
+    add             x2, x2, x4
556
+    add             x3, x3, x5, lsl #1
557
+    add             z4.h, z0.h, z2.h
558
+    sqxtunb         z4.b, z4.h
559
+    st1b            {z4.h}, p0, x0
560
+    add             x0, x0, x1
561
+.endr
562
+    ret
563
+endfunc
564
+
565
+.macro pixel_add_ps_16xN_sve2 h
566
+function PFX(pixel_add_ps_16x\h\()_sve2)
567
+    rdvl            x9, #1
568
+    cmp             x9, #16
569
+    bgt             .vl_gt_16_pixel_add_ps_16x\h
570
+    ptrue           p0.b, vl16
571
+.rept \h
572
+    ld1b            {z0.h}, p0/z, x2
573
+    ld1b            {z1.h}, p0/z, x2, #1, mul vl
574
+    ld1h            {z2.h}, p0/z, x3
575
+    ld1h            {z3.h}, p0/z, x3, #1, mul vl
576
+    add             x2, x2, x4
577
+    add             x3, x3, x5, lsl #1
578
+    add             z24.h, z0.h, z2.h
579
+    add             z25.h, z1.h, z3.h
580
+    sqxtunb         z6.b, z24.h
581
+    sqxtunb         z7.b, z25.h
582
+    st1b            {z6.h}, p0, x0
583
+    st1b            {z7.h}, p0, x0, #1, mul vl
584
+    add             x0, x0, x1
585
+.endr
586
+    ret
587
+.vl_gt_16_pixel_add_ps_16x\h\():
588
+    ptrue           p0.b, vl32
589
+.rept \h
590
+    ld1b            {z0.h}, p0/z, x2
591
+    ld1h            {z2.h}, p0/z, x3
592
+    add             x2, x2, x4
593
+    add             x3, x3, x5, lsl #1
594
+    add             z24.h, z0.h, z2.h
595
+    sqxtunb         z6.b, z24.h
596
+    st1b            {z6.h}, p0, x0
597
+    add             x0, x0, x1
598
+.endr
599
+    ret
600
+endfunc
601
+.endm
602
+
603
+pixel_add_ps_16xN_sve2 16
604
+pixel_add_ps_16xN_sve2 32
605
+
606
+.macro pixel_add_ps_32xN_sve2 h
607
+ function PFX(pixel_add_ps_32x\h\()_sve2)
608
+    rdvl            x9, #1
609
+    cmp             x9, #16
610
+    bgt             .vl_gt_16_pixel_add_ps_32x\h
611
+    lsl             x5, x5, #1
612
+    mov             w12, #\h / 4
613
+.loop_add_ps__sve2_32x\h\():
614
+    sub             w12, w12, #1
615
+.rept 4
616
+    ld1             {v0.16b-v1.16b}, x2, x4
617
+    ld1             {v16.8h-v19.8h}, x3, x5
618
+    uxtl            v4.8h, v0.8b
619
+    uxtl2           v5.8h, v0.16b
620
+    uxtl            v6.8h, v1.8b
621
+    uxtl2           v7.8h, v1.16b
622
+    add             v24.8h, v4.8h, v16.8h
623
+    add             v25.8h, v5.8h, v17.8h
624
+    add             v26.8h, v6.8h, v18.8h
625
+    add             v27.8h, v7.8h, v19.8h
626
+    sqxtun          v4.8b, v24.8h
627
+    sqxtun2         v4.16b, v25.8h
628
+    sqxtun          v5.8b, v26.8h
629
+    sqxtun2         v5.16b, v27.8h
630
+    st1             {v4.16b-v5.16b}, x0, x1
631
+.endr
632
+    cbnz            w12, .loop_add_ps__sve2_32x\h
633
+    ret
634
+.vl_gt_16_pixel_add_ps_32x\h\():
635
+    cmp             x9, #48
636
+    bgt             .vl_gt_48_pixel_add_ps_32x\h
637
+    ptrue           p0.b, vl32
638
+.rept \h
639
+    ld1b            {z0.h}, p0/z, x2
640
+    ld1b            {z1.h}, p0/z, x2, #1, mul vl
641
+    ld1h            {z4.h}, p0/z, x3
642
+    ld1h            {z5.h}, p0/z, x3, #1, mul vl
643
+    add             x2, x2, x4
644
+    add             x3, x3, x5, lsl #1
645
+    add             z24.h, z0.h, z4.h
646
+    add             z25.h, z1.h, z5.h
647
+    sqxtunb         z6.b, z24.h
648
+    sqxtunb         z7.b, z25.h
649
+    st1b            {z6.h}, p0, x0
650
+    st1b            {z7.h}, p0, x0, #1, mul vl
651
+    add             x0, x0, x1
652
+.endr
653
+    ret
654
+.vl_gt_48_pixel_add_ps_32x\h\():
655
+    ptrue           p0.b, vl64
656
+.rept \h
657
+    ld1b            {z0.h}, p0/z, x2
658
+    ld1h            {z4.h}, p0/z, x3
659
+    add             x2, x2, x4
660
+    add             x3, x3, x5, lsl #1
661
+    add             z24.h, z0.h, z4.h
662
+    sqxtunb         z6.b, z24.h
663
+    st1b            {z6.h}, p0, x0
664
+    add             x0, x0, x1
665
+.endr
666
+    ret
667
+endfunc
668
+.endm
669
+
670
+pixel_add_ps_32xN_sve2 32
671
+pixel_add_ps_32xN_sve2 64
672
+
673
+function PFX(pixel_add_ps_64x64_sve2)
674
+    rdvl            x9, #1
675
+    cmp             x9, #16
676
+    bgt             .vl_gt_16_pixel_add_ps_64x64
677
+    ptrue           p0.b, vl16
678
+.rept 64
679
+    ld1b            {z0.h}, p0/z, x2
680
+    ld1b            {z1.h}, p0/z, x2, #1, mul vl
681
+    ld1b            {z2.h}, p0/z, x2, #2, mul vl
682
+    ld1b            {z3.h}, p0/z, x2, #3, mul vl
683
+    ld1b            {z4.h}, p0/z, x2, #4 ,mul vl
684
+    ld1b            {z5.h}, p0/z, x2, #5, mul vl
685
+    ld1b            {z6.h}, p0/z, x2, #6, mul vl
686
+    ld1b            {z7.h}, p0/z, x2, #7, mul vl
687
+    ld1h            {z8.h}, p0/z, x3
688
+    ld1h            {z9.h}, p0/z, x3, #1, mul vl
689
+    ld1h            {z10.h}, p0/z, x3, #2, mul vl
690
+    ld1h            {z11.h}, p0/z, x3, #3, mul vl
691
+    ld1h            {z12.h}, p0/z, x3, #4, mul vl
692
+    ld1h            {z13.h}, p0/z, x3, #5, mul vl
693
+    ld1h            {z14.h}, p0/z, x3, #6, mul vl
694
+    ld1h            {z15.h}, p0/z, x3, #7, mul vl
695
+    add             x2, x2, x4
696
+    add             x3, x3, x5, lsl #1
697
+    add             z24.h, z0.h, z8.h
698
+    add             z25.h, z1.h, z9.h
699
+    add             z26.h, z2.h, z10.h
700
+    add             z27.h, z3.h, z11.h
701
+    add             z28.h, z4.h, z12.h
702
+    add             z29.h, z5.h, z13.h
703
+    add             z30.h, z6.h, z14.h
704
+    add             z31.h, z7.h, z15.h
705
+    sqxtunb         z6.b, z24.h
706
+    sqxtunb         z7.b, z25.h
707
+    sqxtunb         z8.b, z26.h
708
+    sqxtunb         z9.b, z27.h
709
+    sqxtunb         z10.b, z28.h
710
+    sqxtunb         z11.b, z29.h
711
+    sqxtunb         z12.b, z30.h
712
+    sqxtunb         z13.b, z31.h
713
+    st1b            {z6.h}, p0, x0
714
+    st1b            {z7.h}, p0, x0, #1, mul vl
715
+    st1b            {z8.h}, p0, x0, #2, mul vl
716
+    st1b            {z9.h}, p0, x0, #3, mul vl
717
+    st1b            {z10.h}, p0, x0, #4, mul vl
718
+    st1b            {z11.h}, p0, x0, #5, mul vl
719
+    st1b            {z12.h}, p0, x0, #6, mul vl
720
+    st1b            {z13.h}, p0, x0, #7, mul vl
721
+    add             x0, x0, x1
722
+.endr
723
+    ret
724
+.vl_gt_16_pixel_add_ps_64x64:
725
+    cmp             x9, #48
726
+    bgt             .vl_gt_48_pixel_add_ps_64x64
727
+    ptrue           p0.b, vl32
728
+.rept 64
729
+    ld1b            {z0.h}, p0/z, x2
730
+    ld1b            {z1.h}, p0/z, x2, #1, mul vl
731
+    ld1b            {z2.h}, p0/z, x2, #2, mul vl
732
+    ld1b            {z3.h}, p0/z, x2, #3, mul vl
733
+    ld1h            {z8.h}, p0/z, x3
734
+    ld1h            {z9.h}, p0/z, x3, #1, mul vl
735
+    ld1h            {z10.h}, p0/z, x3, #2, mul vl
736
+    ld1h            {z11.h}, p0/z, x3, #3, mul vl
737
+    add             x2, x2, x4
738
+    add             x3, x3, x5, lsl #1
739
+    add             z24.h, z0.h, z8.h
740
+    add             z25.h, z1.h, z9.h
741
+    add             z26.h, z2.h, z10.h
742
+    add             z27.h, z3.h, z11.h
743
+    sqxtunb         z6.b, z24.h
744
+    sqxtunb         z7.b, z25.h
745
+    sqxtunb         z8.b, z26.h
746
+    sqxtunb         z9.b, z27.h
747
+    st1b            {z6.h}, p0, x0
748
+    st1b            {z7.h}, p0, x0, #1, mul vl
749
+    st1b            {z8.h}, p0, x0, #2, mul vl
750
+    st1b            {z9.h}, p0, x0, #3, mul vl
751
+    add             x0, x0, x1
752
+.endr
753
+    ret
754
+.vl_gt_48_pixel_add_ps_64x64:
755
+    cmp             x9, #112
756
+    bgt             .vl_gt_112_pixel_add_ps_64x64
757
+    ptrue           p0.b, vl64
758
+.rept 64
759
+    ld1b            {z0.h}, p0/z, x2
760
+    ld1b            {z1.h}, p0/z, x2, #1, mul vl
761
+    ld1h            {z8.h}, p0/z, x3
762
+    ld1h            {z9.h}, p0/z, x3, #1, mul vl
763
+    add             x2, x2, x4
764
+    add             x3, x3, x5, lsl #1
765
+    add             z24.h, z0.h, z8.h
766
+    add             z25.h, z1.h, z9.h
767
+    sqxtunb         z6.b, z24.h
768
+    sqxtunb         z7.b, z25.h
769
+    st1b            {z6.h}, p0, x0
770
+    st1b            {z7.h}, p0, x0, #1, mul vl
771
+    add             x0, x0, x1
772
+.endr
773
+    ret
774
+.vl_gt_112_pixel_add_ps_64x64:
775
+    ptrue           p0.b, vl128
776
+.rept 64
777
+    ld1b            {z0.h}, p0/z, x2
778
+    ld1h            {z8.h}, p0/z, x3
779
+    add             x2, x2, x4
780
+    add             x3, x3, x5, lsl #1
781
+    add             z24.h, z0.h, z8.h
782
+    sqxtunb         z6.b, z24.h
783
+    st1b            {z6.h}, p0, x0
784
+    add             x0, x0, x1
785
+.endr
786
+    ret
787
+endfunc
788
+
789
+// Chroma add_ps
790
+function PFX(pixel_add_ps_4x8_sve2)
791
+    ptrue           p0.h,vl4
792
+.rept 8
793
+    ld1b            {z0.h}, p0/z, x2
794
+    ld1h            {z2.h}, p0/z, x3
795
+    add             x2, x2, x4
796
+    add             x3, x3, x5, lsl #1
797
+    add             z4.h, z0.h, z2.h
798
+    sqxtunb         z4.b, z4.h
799
+    st1b            {z4.h}, p0, x0
800
+    add             x0, x0, x1
801
+.endr
802
+    ret
803
+endfunc
804
+
805
+function PFX(pixel_add_ps_8x16_sve2)
806
+    ptrue           p0.h,vl8
807
+.rept 16
808
+    ld1b            {z0.h}, p0/z, x2
809
+    ld1h            {z2.h}, p0/z, x3
810
+    add             x2, x2, x4
811
+    add             x3, x3, x5, lsl #1
812
+    add             z4.h, z0.h, z2.h
813
+    sqxtunb         z4.b, z4.h
814
+    st1b            {z4.h}, p0, x0
815
+    add             x0, x0, x1
816
+.endr
817
+    ret
818
+endfunc
819
+
820
+// void scale1D_128to64(pixel *dst, const pixel *src)
821
+function PFX(scale1D_128to64_sve2)
822
+    rdvl            x9, #1
823
+    cmp             x9, #16
824
+    bgt             .vl_gt_16_scale1D_128to64
825
+    ptrue           p0.b, vl16
826
+.rept 2
827
+    ld2b            {z0.b, z1.b}, p0/z, x1
828
+    ld2b            {z2.b, z3.b}, p0/z, x1, #2, mul vl
829
+    ld2b            {z4.b, z5.b}, p0/z, x1, #4, mul vl
830
+    ld2b            {z6.b, z7.b}, p0/z, x1, #6, mul vl
831
+    add             x1, x1, #128
832
+    urhadd          z0.b, p0/m, z0.b, z1.b
833
+    urhadd          z2.b, p0/m, z2.b, z3.b
834
+    urhadd          z4.b, p0/m, z4.b, z5.b
835
+    urhadd          z6.b, p0/m, z6.b, z7.b
836
+    st1b            {z0.b}, p0, x0
837
+    st1b            {z2.b}, p0, x0, #1, mul vl
838
+    st1b            {z4.b}, p0, x0, #2, mul vl
839
+    st1b            {z6.b}, p0, x0, #3, mul vl
840
+    add             x0, x0, #64
841
+.endr
842
+    ret
843
+.vl_gt_16_scale1D_128to64:
844
+    cmp             x9, #48
845
+    bgt             .vl_gt_48_scale1D_128to64
846
+    ptrue           p0.b, vl32
847
+.rept 2
848
+    ld2b            {z0.b, z1.b}, p0/z, x1
849
+    ld2b            {z2.b, z3.b}, p0/z, x1, #2, mul vl
850
+    add             x1, x1, #128
851
+    urhadd          z0.b, p0/m, z0.b, z1.b
852
+    urhadd          z2.b, p0/m, z2.b, z3.b
853
+    st1b            {z0.b}, p0, x0
854
+    st1b            {z2.b}, p0, x0, #1, mul vl
855
+    add             x0, x0, #64
856
+.endr
857
+    ret
858
+.vl_gt_48_scale1D_128to64:
859
+    ptrue           p0.b, vl64
860
+.rept 2
861
+    ld2b            {z0.b, z1.b}, p0/z, x1
862
+    add             x1, x1, #128
863
+    urhadd          z0.b, p0/m, z0.b, z1.b
864
+    st1b            {z0.b}, p0, x0
865
+    add             x0, x0, #64
866
+.endr
867
+    ret
868
+endfunc
869
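scale1D_128to64 halves a 128-pixel row by rounding-averaging adjacent pairs; ld2b de-interleaves even/odd pixels and urhadd averages them with rounding. Scalar sketch:

    #include <cstdint>

    static void scale1D_128to64_sketch(uint8_t* dst, const uint8_t* src)
    {
        for (int i = 0; i < 64; i++)           // dst[i] = (src[2i] + src[2i+1] + 1) / 2
            dst[i] = (uint8_t)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
    }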
+
870
+/***** dequant_scaling*****/
871
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
872
+function PFX(dequant_scaling_sve2)
873
+    ptrue           p0.h, vl8
874
+    add             x5, x5, #4              // shift + 4
875
+    lsr             x3, x3, #3              // num / 8
876
+    cmp             x5, x4
877
+    blt             .dequant_skip_sve2
878
+
879
+    mov             x12, #1
880
+    sub             x6, x5, x4          // shift - per
881
+    sub             x6, x6, #1          // shift - per - 1
882
+    lsl             x6, x12, x6         // 1 << shift - per - 1 (add)
883
+    mov             z0.s, w6
884
+    sub             x7, x4, x5          // per - shift
885
+    mov             z3.s, w7
886
+
887
+.dequant_loop1_sve2:
888
+    ld1h            {z19.h}, p0/z, [x0]
889
+    ld1w            {z2.s}, p0/z, [x1]
890
+    add             x1, x1, #16
891
+    ld1w            {z20.s}, p0/z, [x1]
892
+    add             x0, x0, #16
893
+    add             x1, x1, #16
894
+
895
+    sub             x3, x3, #1
896
+    sunpklo         z1.s, z19.h
897
+    sunpkhi         z19.s, z19.h
898
+
899
+    mul             z1.s, z1.s, z2.s // quantCoef * deQuantCoef
900
+    mul             z19.s, z19.s, z20.s
901
+    add             z1.s, z1.s, z0.s // quantCoef * deQuantCoef + add
902
+    add             z19.s, z19.s, z0.s
903
+
904
+    // No equivalent instructions in SVE2 for sshl
905
+    // as sqshl has double latency
906
+    sshl            v1.4s, v1.4s, v3.4s
907
+    sshl            v19.4s, v19.4s, v3.4s
908
+
909
+    sqxtnb          z16.h, z1.s
910
+    sqxtnb          z17.h, z19.s
911
+    st1h            {z16.s}, p0, [x2]
912
+    st1h            {z17.s}, p0, [x2, #1, mul vl]
913
+    add             x2, x2, #16
914
+    cbnz            x3, .dequant_loop1_sve2
915
+    ret
916
+
917
+.dequant_skip_sve2:
918
+    sub             x6, x4, x5          // per - shift
919
+    mov             z0.h, w6
920
+
921
+.dequant_loop2_sve2:
922
+    ld1h            {z19.h}, p0/z, [x0]
923
+    ld1w            {z2.s}, p0/z, [x1]
924
+    add             x1, x1, #16
925
+    ld1w            {z20.s}, p0/z, [x1]
926
+    add             x0, x0, #16
927
+    add             x1, x1, #16
928
+
929
+
930
+    sub             x3, x3, #1
931
+    sunpklo         z1.s, z19.h
932
+    sunpkhi         z19.s, z19.h
933
+
934
+    mul             z1.s, z1.s, z2.s // quantCoef * deQuantCoef
935
+    mul             z19.s, z19.s, z20.s
936
+
937
+    // Keeping NEON instructions here in order to have
938
+    // one sqshl later
939
+    sqxtn           v16.4h, v1.4s       // x265_clip3
940
+    sqxtn2          v16.8h, v19.4s
941
+
942
+    sqshl           z16.h, p0/m, z16.h, z0.h // coefQ << per - shift
943
+    st1h            {z16.h}, p0, [x2]
944
+    add             x2, x2, #16
945
+    cbnz            x3, .dequant_loop2_sve2
946
+    ret
947
+endfunc
948
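The two branches of dequant_scaling mirror this scalar logic: with shift+4 greater than per, the coefficient product is rounded and shifted right; otherwise it is clipped and shifted left. A hedged scalar sketch:

    #include <cstdint>

    static int16_t clip16(int64_t v)           // x265_clip3(-32768, 32767, v)
    {
        return (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
    }

    static void dequant_scaling_sketch(const int16_t* quantCoef, const int32_t* deQuantCoef,
                                       int16_t* coef, int num, int per, int shift)
    {
        shift += 4;
        if (shift > per)
        {
            int64_t add = (int64_t)1 << (shift - per - 1);
            for (int n = 0; n < num; n++)
                coef[n] = clip16(((int64_t)quantCoef[n] * deQuantCoef[n] + add) >> (shift - per));
        }
        else
        {
            for (int n = 0; n < num; n++)
                coef[n] = clip16((int64_t)clip16((int64_t)quantCoef[n] * deQuantCoef[n]) << (per - shift));
        }
    }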
+
949
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
950
+function PFX(dequant_normal_sve2)
951
+    lsr             w2, w2, #4              // num / 16
952
+    neg             w4, w4
953
+    mov             z0.h, w3
954
+    mov             z1.s, w4
955
+    rdvl            x9, #1
956
+    cmp             x9, #16
957
+    bgt             .vl_gt_16_dequant_normal_sve2
958
+.dqn_loop1_sve2:
959
+    ld1             {v2.8h, v3.8h}, [x0], #32
960
+    smull           v16.4s, v2.4h, v0.4h
961
+    smull2          v17.4s, v2.8h, v0.8h
962
+    smull           v18.4s, v3.4h, v0.4h
963
+    smull2          v19.4s, v3.8h, v0.8h
964
+
965
+    srshl           v16.4s, v16.4s, v1.4s
966
+    srshl           v17.4s, v17.4s, v1.4s
967
+    srshl           v18.4s, v18.4s, v1.4s
968
+    srshl           v19.4s, v19.4s, v1.4s
969
+
970
+    sqxtn           v2.4h, v16.4s
971
+    sqxtn2          v2.8h, v17.4s
972
+    sqxtn           v3.4h, v18.4s
973
+    sqxtn2          v3.8h, v19.4s
974
+
975
+    sub             w2, w2, #1
976
+    st1             {v2.8h, v3.8h}, [x1], #32
977
+    cbnz            w2, .dqn_loop1_sve2
978
+    ret
979
+.vl_gt_16_dequant_normal_sve2:
980
+    ptrue           p0.h, vl16
981
+.gt_16_dqn_loop1_sve2:
982
+    ld1h            {z2.h}, p0/z, [x0]
983
+    add             x0, x0, #32
984
+    smullb          z16.s, z2.h, z0.h
985
+    smullt          z17.s, z2.h, z0.h
986
+
987
+    srshl           z16.s, p0/m, z16.s, z1.s
988
+    srshl           z17.s, p0/m, z17.s, z1.s
989
+
990
+    sqxtnb          z2.h, z16.s
991
+    sqxtnt          z2.h, z17.s
992
+    
993
+    sub             w2, w2, #1
994
+    st1h            {z2.h}, p0, [x1]
995
+    add             x1, x1, #32
996
+    cbnz            w2, .gt_16_dqn_loop1_sve2
997
+    ret
998
+
999
+endfunc
1000
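dequant_normal is the flat-scaling variant: multiply by one scale, rounding-shift right (the srshl by a negated shift above), then saturate to int16 (sqxtn/sqxtnb). Scalar sketch:

    #include <cstdint>

    static void dequant_normal_sketch(const int16_t* quantCoef, int16_t* coef,
                                      int num, int scale, int shift)
    {
        int32_t add = 1 << (shift - 1);        // rounding term
        for (int n = 0; n < num; n++)
        {
            int32_t v = (quantCoef[n] * scale + add) >> shift;
            coef[n] = (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
        }
    }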
+
1001
+// void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24)
1002
+function PFX(ssim_4x4x2_core_sve2)
1003
+    ptrue           p0.b, vl16
1004
+    movi            v30.2d, #0
1005
+    movi            v31.2d, #0
1006
+
1007
+    ld1b            {z0.h}, p0/z, [x0]
1008
+    add             x0, x0, x1
1009
+    ld1b            {z1.h}, p0/z, [x0]
1010
+    add             x0, x0, x1
1011
+    ld1b            {z2.h}, p0/z, [x0]
1012
+    add             x0, x0, x1
1013
+    ld1b            {z3.h}, p0/z, [x0]
1014
+    add             x0, x0, x1
1015
+
1016
+    ld1b            {z4.h}, p0/z, [x2]
1017
+    add             x2, x2, x3
1018
+    ld1b            {z5.h}, p0/z, [x2]
1019
+    add             x2, x2, x3
1020
+    ld1b            {z6.h}, p0/z, [x2]
1021
+    add             x2, x2, x3
1022
+    ld1b            {z7.h}, p0/z, [x2]
1023
+    add             x2, x2, x3
1024
+
1025
+    mul             z16.h, z0.h, z0.h
1026
+    mul             z17.h, z1.h, z1.h
1027
+    mul             z18.h, z2.h, z2.h
1028
+    uaddlp          v30.4s, v16.8h
1029
+
1030
+    mul             z19.h, z3.h, z3.h
1031
+    mul             z20.h, z4.h, z4.h
1032
+    mul             z21.h, z5.h, z5.h
1033
+    uadalp          v30.4s, v17.8h
1034
+
1035
+    mul             z22.h, z6.h, z6.h
1036
+    mul             z23.h, z7.h, z7.h
1037
+    mul             z24.h, z0.h, z4.h
1038
+    uadalp          v30.4s, v18.8h
1039
+
1040
+    mul             z25.h, z1.h, z5.h
1041
+    mul             z26.h, z2.h, z6.h
1042
+    mul             z27.h, z3.h, z7.h
1043
+    uadalp          v30.4s, v19.8h
1044
+
1045
+    add             z28.h, z0.h, z1.h
1046
+    add             z29.h, z4.h, z5.h
1047
+    uadalp          v30.4s, v20.8h
1048
+    uaddlp          v31.4s, v24.8h
1049
+
1050
+    add             z28.h, z28.h, z2.h
1051
+    add             z29.h, z29.h, z6.h
1052
+    uadalp          v30.4s, v21.8h
1053
+    uadalp          v31.4s, v25.8h
1054
+
1055
+    add             z28.h, z28.h, z3.h
1056
+    add             z29.h, z29.h, z7.h
1057
+    uadalp          v30.4s, v22.8h
1058
+    uadalp          v31.4s, v26.8h
1059
+
1060
+    // Better use NEON instructions here
1061
+    uaddlp          v28.4s, v28.8h
1062
+    uaddlp          v29.4s, v29.8h
1063
+    uadalp          v30.4s, v23.8h
1064
+    uadalp          v31.4s, v27.8h
1065
+
1066
+    addp            v28.4s, v28.4s, v28.4s
1067
+    addp            v29.4s, v29.4s, v29.4s
1068
+    addp            v30.4s, v30.4s, v30.4s
1069
+    addp            v31.4s, v31.4s, v31.4s
1070
+
1071
+    st4             {v28.2s, v29.2s, v30.2s, v31.2s}, [x4]
1072
+    ret
1073
+endfunc
1074
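ssim_4x4x2_core gathers the four SSIM partial sums for two horizontally adjacent 4x4 blocks at once; the final st4 interleaves the sums of a, b, a^2+b^2 and a*b into the sums[2][4] output. Scalar sketch:

    #include <cstdint>
    #include <cstddef>

    static void ssim_4x4x2_core_sketch(const uint8_t* pix1, ptrdiff_t stride1,
                                       const uint8_t* pix2, ptrdiff_t stride2,
                                       int sums[2][4])
    {
        for (int z = 0; z < 2; z++)            // two adjacent 4x4 blocks
        {
            int s1 = 0, s2 = 0, ss = 0, s12 = 0;
            for (int y = 0; y < 4; y++)
                for (int x = 0; x < 4; x++)
                {
                    int a = pix1[y * stride1 + x];
                    int b = pix2[y * stride2 + x];
                    s1  += a;
                    s2  += b;
                    ss  += a * a + b * b;
                    s12 += a * b;
                }
            sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
            pix1 += 4;
            pix2 += 4;
        }
    }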
+
1075
+// void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
1076
+.macro ssimDist_start_sve2
1077
+    mov             z0.d, #0
1078
+    mov             z1.d, #0
1079
+.endm
1080
+
1081
+.macro ssimDist_1_sve2  z0 z1 z2 z3
1082
+    sub             z16.s, \z0\().s, \z2\().s
1083
+    sub             z17.s, \z1\().s, \z3\().s
1084
+    mul             z18.s, \z0\().s, \z0\().s
1085
+    mul             z19.s, \z1\().s, \z1\().s
1086
+    mul             z20.s, z16.s, z16.s
1087
+    mul             z21.s, z17.s, z17.s
1088
+    add             z0.s, z0.s, z18.s
1089
+    add             z0.s, z0.s, z19.s
1090
+    add             z1.s, z1.s, z20.s
1091
+    add             z1.s, z1.s, z21.s
1092
+.endm
1093
+
1094
+.macro ssimDist_end_sve2
1095
+    uaddv           d0, p0, z0.s
1096
+    uaddv           d1, p0, z1.s
1097
+    str             d0, [x6]
1098
+    str             d1, [x4]
1099
+.endm
1100
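The ssimDist kernels accumulate two 64-bit sums per block: the source energy (sum of squared fenc pixels, stored through x6, the ac_k argument) and the source/recon SSD (stored through x4, ssBlock). A scalar sketch of that accumulation, ignoring the high-bit-depth shift argument:

    #include <cstdint>
    #include <cstddef>

    static void ssimDist_sketch(const uint8_t* fenc, ptrdiff_t fStride,
                                const uint8_t* recon, ptrdiff_t rStride,
                                int size, uint64_t* ssBlock, uint64_t* ac_k)
    {
        uint64_t ac = 0, ssd = 0;
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
            {
                int s = fenc[y * fStride + x];
                int r = recon[y * rStride + x];
                ac  += (uint64_t)(s * s);
                ssd += (uint64_t)((s - r) * (s - r));
            }
        *ac_k    = ac;    // "str d0, [x6]" in ssimDist_end
        *ssBlock = ssd;   // "str d1, [x4]"
    }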
+
1101
+function PFX(ssimDist4_sve2)
1102
+    ssimDist_start
1103
+    ptrue           p0.s, vl4
1104
+.rept 4
1105
+    ld1b            {z4.s}, p0/z, [x0]
1106
+    add             x0, x0, x1
1107
+    ld1b            {z5.s}, p0/z, [x2]
1108
+    add             x2, x2, x3
1109
+    sub             z2.s, z4.s, z5.s
1110
+    mul             z3.s, z4.s, z4.s
1111
+    mul             z2.s, z2.s, z2.s
1112
+    add             z0.s, z0.s, z3.s
1113
+    add             z1.s, z1.s, z2.s
1114
+.endr
1115
+    ssimDist_end
1116
+    ret
1117
+endfunc
1118
+
1119
+function PFX(ssimDist8_sve2)
1120
+    rdvl            x9, #1
1121
+    cmp             x9, #16
1122
+    bgt             .vl_gt_16_ssimDist8
1123
+    ssimDist_start
1124
+    ptrue           p0.s, vl4
1125
+.rept 8
1126
+    ld1b            {z4.s}, p0/z, [x0]
1127
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
1128
+    add             x0, x0, x1
1129
+    ld1b            {z6.s}, p0/z, [x2]
1130
+    ld1b            {z7.s}, p0/z, [x2, #1, mul vl]
1131
+    add             x2, x2, x3
1132
+    ssimDist_1_sve2 z4, z5, z6, z7
1133
+.endr
1134
+    ssimDist_end
1135
+    ret
1136
+.vl_gt_16_ssimDist8:
1137
+    ssimDist_start_sve2
1138
+    ptrue           p0.s, vl8
1139
+.rept 8
1140
+    ld1b            {z4.s}, p0/z, [x0]
1141
+    add             x0, x0, x1
1142
+    ld1b            {z6.s}, p0/z, [x2]
1143
+    add             x2, x2, x3
1144
+    sub             z20.s, z4.s, z6.s
1145
+    mul             z16.s, z4.s, z4.s
1146
+    mul             z18.s, z20.s, z20.s
1147
+    add             z0.s, z0.s, z16.s
1148
+    add             z1.s, z1.s, z18.s
1149
+.endr
1150
+    ssimDist_end_sve2
1151
+    ret
1152
+endfunc
1153
+
1154
+function PFX(ssimDist16_sve2)
1155
+    mov             w12, #16
1156
+    rdvl            x9, #1
1157
+    cmp             x9, #16
1158
+    bgt             .vl_gt_16_ssimDist16
1159
+    ssimDist_start
1160
+    ptrue           p0.s, vl4
1161
+.loop_ssimDist16_sve2:
1162
+    sub             w12, w12, #1
1163
+    ld1b            {z4.s}, p0/z, x0
1164
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1165
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1166
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1167
+    add             x0, x0, x1
1168
+    ld1b            {z8.s}, p0/z, x2
1169
+    ld1b            {z9.s}, p0/z, x2, #1, mul vl
1170
+    ld1b            {z10.s}, p0/z, x2, #2, mul vl
1171
+    ld1b            {z11.s}, p0/z, x2, #3, mul vl
1172
+    add             x2, x2, x3
1173
+    ssimDist_1_sve2 z4, z5, z8, z9
1174
+    ssimDist_1_sve2 z6, z7, z10, z11
1175
+    cbnz            w12, .loop_ssimDist16_sve2
1176
+    ssimDist_end
1177
+    ret
1178
+.vl_gt_16_ssimDist16:
1179
+    cmp             x9, #48
1180
+    bgt             .vl_gt_48_ssimDist16
1181
+    ssimDist_start_sve2
1182
+    ptrue           p0.s, vl8
1183
+.vl_gt_16_loop_ssimDist16_sve2:
1184
+    sub             w12, w12, #1
1185
+    ld1b            {z4.s}, p0/z, x0
1186
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1187
+    add             x0, x0, x1
1188
+    ld1b            {z8.s}, p0/z, x2
1189
+    ld1b            {z9.s}, p0/z, x2, #1, mul vl
1190
+    add             x2, x2, x3
1191
+    ssimDist_1_sve2 z4, z5, z8, z9
1192
+    cbnz            w12, .vl_gt_16_loop_ssimDist16_sve2
1193
+    ssimDist_end_sve2
1194
+    ret
1195
+.vl_gt_48_ssimDist16:
1196
+    ssimDist_start_sve2
1197
+    ptrue           p0.s, vl16
1198
+.vl_gt_48_loop_ssimDist16_sve2:
1199
+    sub             w12, w12, #1
1200
+    ld1b            {z4.s}, p0/z, x0
1201
+    add             x0, x0, x1
1202
+    ld1b            {z8.s}, p0/z, x2
1203
+    add             x2, x2, x3
1204
+    sub             z20.s, z4.s, z8.s
1205
+    mul             z16.s, z4.s, z4.s
1206
+    mul             z18.s, z20.s, z20.s
1207
+    add             z0.s, z0.s, z16.s
1208
+    add             z1.s, z1.s, z18.s
1209
+    cbnz            w12, .vl_gt_48_loop_ssimDist16_sve2
1210
+    ssimDist_end_sve2
1211
+    ret
1212
+endfunc
1213
+
1214
+function PFX(ssimDist32_sve2)
1215
+    mov             w12, #32
1216
+    rdvl            x9, #1
1217
+    cmp             x9, #16
1218
+    bgt             .vl_gt_16_ssimDist32
1219
+    ssimDist_start
1220
+    ptrue           p0.s, vl4
1221
+.loop_ssimDist32_sve2:
1222
+    sub             w12, w12, #1
1223
+    ld1b            {z2.s}, p0/z, x0
1224
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1225
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1226
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1227
+    ld1b            {z6.s}, p0/z, x0, #4, mul vl
1228
+    ld1b            {z7.s}, p0/z, x0, #5, mul vl
1229
+    ld1b            {z8.s}, p0/z, x0, #6, mul vl
1230
+    ld1b            {z9.s}, p0/z, x0, #7, mul vl
1231
+    add             x0, x0, x1
1232
+    ld1b            {z10.s}, p0/z, x2
1233
+    ld1b            {z11.s}, p0/z, x2, #1, mul vl
1234
+    ld1b            {z12.s}, p0/z, x2, #2, mul vl
1235
+    ld1b            {z13.s}, p0/z, x2, #3, mul vl
1236
+    ld1b            {z14.s}, p0/z, x2, #4, mul vl
1237
+    ld1b            {z15.s}, p0/z, x2, #5, mul vl
1238
+    ld1b            {z30.s}, p0/z, x2, #6, mul vl
1239
+    ld1b            {z31.s}, p0/z, x2, #7, mul vl
1240
+    add             x2, x2, x3
1241
+    ssimDist_1_sve2 z2, z3, z10, z11
1242
+    ssimDist_1_sve2 z4, z5, z12, z13
1243
+    ssimDist_1_sve2 z6, z7, z14, z15
1244
+    ssimDist_1_sve2 z8, z9, z30, z31
1245
+    cbnz            w12, .loop_ssimDist32_sve2
1246
+    ssimDist_end
1247
+    ret
1248
+.vl_gt_16_ssimDist32:
1249
+    cmp             x9, #48
1250
+    bgt             .vl_gt_48_ssimDist32
1251
+    ssimDist_start_sve2
1252
+    ptrue           p0.s, vl8
1253
+.vl_gt_16_loop_ssimDist32_sve2:
1254
+    sub             w12, w12, #1
1255
+    ld1b            {z2.s}, p0/z, x0
1256
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1257
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1258
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1259
+    add             x0, x0, x1
1260
+    ld1b            {z10.s}, p0/z, x2
1261
+    ld1b            {z11.s}, p0/z, x2, #1, mul vl
1262
+    ld1b            {z12.s}, p0/z, x2, #2, mul vl
1263
+    ld1b            {z13.s}, p0/z, x2, #3, mul vl
1264
+    add             x2, x2, x3
1265
+    ssimDist_1_sve2 z2, z3, z10, z11
1266
+    ssimDist_1_sve2 z4, z5, z12, z13
1267
+    cbnz            w12, .vl_gt_16_loop_ssimDist32_sve2
1268
+    ssimDist_end_sve2
1269
+    ret
1270
+.vl_gt_48_ssimDist32:
1271
+    cmp             x9, #112
1272
+    bgt             .vl_gt_112_ssimDist32
1273
+    ssimDist_start_sve2
1274
+    ptrue           p0.s, vl16
1275
+.vl_gt_48_loop_ssimDist32_sve2:
1276
+    sub             w12, w12, #1
1277
+    ld1b            {z2.s}, p0/z, x0
1278
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1279
+    add             x0, x0, x1
1280
+    ld1b            {z10.s}, p0/z, x2
1281
+    ld1b            {z11.s}, p0/z, x2, #1, mul vl
1282
+    add             x2, x2, x3
1283
+    ssimDist_1_sve2 z2, z3, z10, z11
1284
+    cbnz            w12, .vl_gt_48_loop_ssimDist32_sve2
1285
+    ssimDist_end_sve2
1286
+    ret
1287
+.vl_gt_112_ssimDist32:
1288
+    ssimDist_start_sve2
1289
+    ptrue           p0.s, vl32
1290
+.vl_gt_112_loop_ssimDist32_sve2:
1291
+    sub             w12, w12, #1
1292
+    ld1b            {z2.s}, p0/z, x0
1293
+    add             x0, x0, x1
1294
+    ld1b            {z10.s}, p0/z, x2
1295
+    add             x2, x2, x3
1296
+    sub             z20.s, z2.s, z10.s
1297
+    mul             z16.s, z2.s, z2.s
1298
+    mul             z18.s, z20.s, z20.s
1299
+    add             z0.s, z0.s, z16.s
1300
+    add             z1.s, z1.s, z18.s
1301
+    cbnz            w12, .vl_gt_112_loop_ssimDist32_sve2
1302
+    ssimDist_end_sve2
1303
+    ret
1304
+endfunc
1305
+
1306
+function PFX(ssimDist64_sve2)
1307
+    mov             w12, #64
1308
+    rdvl            x9, #1
1309
+    cmp             x9, #16
1310
+    bgt             .vl_gt_16_ssimDist64
1311
+    ssimDist_start
1312
+    ptrue           p0.s, vl4
1313
+.loop_ssimDist64_sve2:
1314
+    sub             w12, w12, #1
1315
+    ld1b            {z2.s}, p0/z, x0
1316
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1317
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1318
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1319
+    ld1b            {z6.s}, p0/z, x0, #4, mul vl
1320
+    ld1b            {z7.s}, p0/z, x0, #5, mul vl
1321
+    ld1b            {z8.s}, p0/z, x0, #6, mul vl
1322
+    ld1b            {z9.s}, p0/z, x0, #7, mul vl
1323
+    ld1b            {z23.s}, p0/z, x2
1324
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1325
+    ld1b            {z25.s}, p0/z, x2, #2, mul vl
1326
+    ld1b            {z26.s}, p0/z, x2, #3, mul vl
1327
+    ld1b            {z27.s}, p0/z, x2, #4, mul vl
1328
+    ld1b            {z28.s}, p0/z, x2, #5, mul vl
1329
+    ld1b            {z29.s}, p0/z, x2, #6, mul vl
1330
+    ld1b            {z30.s}, p0/z, x2, #7, mul vl
1331
+    ssimDist_1_sve2 z2, z3, z23, z24
1332
+    ssimDist_1_sve2 z4, z5, z25, z26
1333
+    ssimDist_1_sve2 z6, z7, z27, z28
1334
+    ssimDist_1_sve2 z8, z9, z29, z30
1335
+    mov             x4, x0
1336
+    mov             x5, x2
1337
+    add             x4, x4, #32
1338
+    add             x5, x5, #32
1339
+    ld1b            {z2.s}, p0/z, x4
1340
+    ld1b            {z3.s}, p0/z, x4, #1, mul vl
1341
+    ld1b            {z4.s}, p0/z, x4, #2, mul vl
1342
+    ld1b            {z5.s}, p0/z, x4, #3, mul vl
1343
+    ld1b            {z6.s}, p0/z, x4, #4, mul vl
1344
+    ld1b            {z7.s}, p0/z, x4, #5, mul vl
1345
+    ld1b            {z8.s}, p0/z, x4, #6, mul vl
1346
+    ld1b            {z9.s}, p0/z, x4, #7, mul vl
1347
+    ld1b            {z23.s}, p0/z, x5
1348
+    ld1b            {z24.s}, p0/z, x5, #1, mul vl
1349
+    ld1b            {z25.s}, p0/z, x5, #2, mul vl
1350
+    ld1b            {z26.s}, p0/z, x5, #3, mul vl
1351
+    ld1b            {z27.s}, p0/z, x5, #4, mul vl
1352
+    ld1b            {z28.s}, p0/z, x5, #5, mul vl
1353
+    ld1b            {z29.s}, p0/z, x5, #6, mul vl
1354
+    ld1b            {z30.s}, p0/z, x5, #7, mul vl
1355
+    ssimDist_1_sve2 z2, z3, z23, z24
1356
+    ssimDist_1_sve2 z4, z5, z25, z26
1357
+    ssimDist_1_sve2 z6, z7, z27, z28
1358
+    ssimDist_1_sve2 z8, z9, z29, z30
1359
+    add             x0, x0, x1
1360
+    add             x2, x2, x3
1361
+    cbnz            w12, .loop_ssimDist64_sve2
1362
+    ssimDist_end
1363
+    ret
1364
+.vl_gt_16_ssimDist64:
1365
+    cmp             x9, #48
1366
+    bgt             .vl_gt_48_ssimDist64
1367
+    ssimDist_start_sve2
1368
+    ptrue           p0.s, vl8
1369
+.vl_gt_16_loop_ssimDist64_sve2:
1370
+    sub             w12, w12, #1
1371
+    ld1b            {z2.s}, p0/z, x0
1372
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1373
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1374
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1375
+    ld1b            {z6.s}, p0/z, x0, #4, mul vl
1376
+    ld1b            {z7.s}, p0/z, x0, #5, mul vl
1377
+    ld1b            {z8.s}, p0/z, x0, #6, mul vl
1378
+    ld1b            {z9.s}, p0/z, x0, #7, mul vl
1379
+    ld1b            {z23.s}, p0/z, x2
1380
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1381
+    ld1b            {z25.s}, p0/z, x2, #2, mul vl
1382
+    ld1b            {z26.s}, p0/z, x2, #3, mul vl
1383
+    ld1b            {z27.s}, p0/z, x2, #4, mul vl
1384
+    ld1b            {z28.s}, p0/z, x2, #5, mul vl
1385
+    ld1b            {z29.s}, p0/z, x2, #6, mul vl
1386
+    ld1b            {z30.s}, p0/z, x2, #7, mul vl
1387
+    ssimDist_1_sve2 z2, z3, z23, z24
1388
+    ssimDist_1_sve2 z4, z5, z25, z26
1389
+    ssimDist_1_sve2 z6, z7, z27, z28
1390
+    ssimDist_1_sve2 z8, z9, z29, z30
1391
+    add             x0, x0, x1
1392
+    add             x2, x2, x3
1393
+    cbnz            w12, .vl_gt_16_loop_ssimDist64_sve2
1394
+    ssimDist_end_sve2
1395
+    ret
1396
+.vl_gt_48_ssimDist64:
1397
+    cmp             x9, #112
1398
+    bgt             .vl_gt_112_ssimDist64
1399
+    ssimDist_start_sve2
1400
+    ptrue           p0.s, vl16
1401
+.vl_gt_48_loop_ssimDist64_sve2:
1402
+    sub             w12, w12, #1
1403
+    ld1b            {z2.s}, p0/z, x0
1404
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1405
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1406
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1407
+    ld1b            {z23.s}, p0/z, x2
1408
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1409
+    ld1b            {z25.s}, p0/z, x2, #2, mul vl
1410
+    ld1b            {z26.s}, p0/z, x2, #3, mul vl
1411
+    ssimDist_1_sve2 z2, z3, z23, z24
1412
+    ssimDist_1_sve2 z4, z5, z25, z26
1413
+    add             x0, x0, x1
1414
+    add             x2, x2, x3
1415
+    cbnz            w12, .vl_gt_48_loop_ssimDist64_sve2
1416
+    ssimDist_end_sve2
1417
+    ret
1418
+.vl_gt_112_ssimDist64:
1419
+    ssimDist_start_sve2
1420
+    ptrue           p0.s, vl32
1421
+.vl_gt_112_loop_ssimDist64_sve2:
1422
+    sub             w12, w12, #1
1423
+    ld1b            {z2.s}, p0/z, x0
1424
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1425
+    ld1b            {z23.s}, p0/z, x2
1426
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1427
+    ssimDist_1_sve2 z2, z3, z23, z24
1428
+    add             x0, x0, x1
1429
+    add             x2, x2, x3
1430
+    cbnz            w12, .vl_gt_112_loop_ssimDist64_sve2
1431
+    ssimDist_end_sve2
1432
+    ret
1433
+endfunc
1434
+
1435
+// void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
1436
+.macro normFact_start_sve2
1437
+    mov             z0.d, #0
1438
+.endm
1439
+
1440
+.macro normFact_1_sve2  z0, z1
1441
+    mul             z16.s, \z0\().s, \z0\().s
1442
+    mul             z17.s, \z1\().s, \z1\().s
1443
+    add             z0.s, z0.s, z16.s
1444
+    add             z0.s, z0.s, z17.s
1445
+.endm
1446
+
1447
+.macro normFact_end_sve2
1448
+    uaddv           d0, p0, z0.s
1449
+    str             d0, [x3]
1450
+.endm
1451
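The normFact_c prototype in the comment above describes what the normFactN kernels compute: the sum of squared source samples over a blockSize x blockSize block, written to *z_k. A scalar sketch follows; the shift argument is not referenced by the 8-bit paths shown here, so the sketch ignores it.

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of normFact: *z_k = sum(src[i]^2) over the block.
    static void normFact_ref(const pixel* src, uint32_t blockSize, int /*shift*/, uint64_t* z_k)
    {
        uint64_t acc = 0;
        for (uint32_t y = 0; y < blockSize; y++)
            for (uint32_t x = 0; x < blockSize; x++)
                acc += (uint64_t)src[y * blockSize + x] * src[y * blockSize + x];
        *z_k = acc;  // the assembly reduces z0 with uaddv and stores it through x3
    }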
+
1452
+function PFX(normFact8_sve2)
1453
+    rdvl            x9, #1
1454
+    cmp             x9, #16
1455
+    bgt             .vl_gt_16_normFact8
1456
+    normFact_start
1457
+    ptrue           p0.s, vl4
1458
+.rept 8
1459
+    ld1b            {z4.s}, p0/z, x0
1460
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1461
+    add             x0, x0, x1
1462
+    normFact_1_sve2 z4, z5
1463
+.endr
1464
+    normFact_end
1465
+    ret
1466
+.vl_gt_16_normFact8:
1467
+    normFact_start_sve2
1468
+    ptrue           p0.s, vl8
1469
+.rept 8
1470
+    ld1b            {z4.s}, p0/z, x0
1471
+    add             x0, x0, x1
1472
+    mul             z16.s, z4.s, z4.s
1473
+    add             z0.s, z0.s, z16.s
1474
+.endr
1475
+    normFact_end_sve2
1476
+    ret
1477
+endfunc
1478
+
1479
+function PFX(normFact16_sve2)
1480
+    mov             w12, #16
1481
+    rdvl            x9, #1
1482
+    cmp             x9, #16
1483
+    bgt             .vl_gt_16_normFact16
1484
+    normFact_start
1485
+    ptrue           p0.s, vl4
1486
+.loop_normFact16_sve2:
1487
+    sub             w12, w12, #1
1488
+    ld1b            {z4.s}, p0/z, x0
1489
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1490
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1491
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1492
+    add             x0, x0, x1
1493
+    normFact_1_sve2 z4, z5
1494
+    normFact_1_sve2 z6, z7
1495
+    cbnz            w12, .loop_normFact16_sve2
1496
+    normFact_end
1497
+    ret
1498
+.vl_gt_16_normFact16:
1499
+    cmp             x9, #48
1500
+    bgt             .vl_gt_48_normFact16
1501
+    normFact_start_sve2
1502
+    ptrue           p0.s, vl8
1503
+.vl_gt_16_loop_normFact16_sve2:
1504
+    sub             w12, w12, #1
1505
+    ld1b            {z4.s}, p0/z, x0
1506
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1507
+    add             x0, x0, x1
1508
+    normFact_1_sve2 z4, z5
1509
+    cbnz            w12, .vl_gt_16_loop_normFact16_sve2
1510
+    normFact_end_sve2
1511
+    ret
1512
+.vl_gt_48_normFact16:
1513
+    normFact_start_sve2
1514
+    ptrue           p0.s, vl16
1515
+.vl_gt_48_loop_normFact16_sve2:
1516
+    sub             w12, w12, #1
1517
+    ld1b            {z4.s}, p0/z, x0
1518
+    add             x0, x0, x1
1519
+    mul             z16.s, z4.s, z4.s
1520
+    add             z0.s, z0.s, z16.s
1521
+    cbnz            w12, .vl_gt_48_loop_normFact16_sve2
1522
+    normFact_end_sve2
1523
+    ret
1524
+endfunc
1525
+
1526
+function PFX(normFact32_sve2)
1527
+    mov             w12, #32
1528
+    rdvl            x9, #1
1529
+    cmp             x9, #16
1530
+    bgt             .vl_gt_16_normFact32
1531
+    normFact_start
1532
+    ptrue           p0.s, vl4
1533
+.loop_normFact32_sve2:
1534
+    sub             w12, w12, #1
1535
+    ld1b            {z4.s}, p0/z, x0
1536
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1537
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1538
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1539
+    ld1b            {z8.s}, p0/z, x0, #4, mul vl
1540
+    ld1b            {z9.s}, p0/z, x0, #5, mul vl
1541
+    ld1b            {z10.s}, p0/z, x0, #6, mul vl
1542
+    ld1b            {z11.s}, p0/z, x0, #7, mul vl
1543
+    add             x0, x0, x1
1544
+    normFact_1_sve2 z4, z5
1545
+    normFact_1_sve2 z6, z7
1546
+    normFact_1_sve2 z8, z9
1547
+    normFact_1_sve2 z10, z11
1548
+    cbnz            w12, .loop_normFact32_sve2
1549
+    normFact_end
1550
+    ret
1551
+.vl_gt_16_normFact32:
1552
+    cmp             x9, #48
1553
+    bgt             .vl_gt_48_normFact32
1554
+    normFact_start_sve2
1555
+    ptrue           p0.s, vl8
1556
+.vl_gt_16_loop_normFact32_sve2:
1557
+    sub             w12, w12, #1
1558
+    ld1b            {z4.s}, p0/z, x0
1559
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1560
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1561
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1562
+    add             x0, x0, x1
1563
+    normFact_1_sve2 z4, z5
1564
+    normFact_1_sve2 z6, z7
1565
+    cbnz            w12, .vl_gt_16_loop_normFact32_sve2
1566
+    normFact_end_sve2
1567
+    ret
1568
+.vl_gt_48_normFact32:
1569
+    cmp             x9, #112
1570
+    bgt             .vl_gt_112_normFact32
1571
+    normFact_start_sve2
1572
+    ptrue           p0.s, vl16
1573
+.vl_gt_48_loop_normFact32_sve2:
1574
+    sub             w12, w12, #1
1575
+    ld1b            {z4.s}, p0/z, x0
1576
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1577
+    add             x0, x0, x1
1578
+    normFact_1_sve2 z4, z5
1579
+    cbnz            w12, .vl_gt_48_loop_normFact32_sve2
1580
+    normFact_end_sve2
1581
+    ret
1582
+.vl_gt_112_normFact32:
1583
+    normFact_start_sve2
1584
+    ptrue           p0.s, vl32
1585
+.vl_gt_112_loop_normFact32_sve2:
1586
+    sub             w12, w12, #1
1587
+    ld1b            {z4.s}, p0/z, x0
1588
+    add             x0, x0, x1
1589
+    mul             z16.s, z4.s, z4.s
1590
+    add             z0.s, z0.s, z16.s
1591
+    cbnz            w12, .vl_gt_112_loop_normFact32_sve2
1592
+    normFact_end_sve2
1593
+    ret
1594
+endfunc
1595
+
1596
+function PFX(normFact64_sve2)
1597
+    mov             w12, #64
1598
+    rdvl            x9, #1
1599
+    cmp             x9, #16
1600
+    bgt             .vl_gt_16_normFact64
1601
+    normFact_start
1602
+    ptrue           p0.s, vl4
1603
+.loop_normFact64_sve2:
1604
+    sub             w12, w12, #1
1605
+    ld1b            {z4.s}, p0/z, x0
1606
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1607
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1608
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1609
+    ld1b            {z8.s}, p0/z, x0, #4, mul vl
1610
+    ld1b            {z9.s}, p0/z, x0, #5, mul vl
1611
+    ld1b            {z10.s}, p0/z, x0, #6, mul vl
1612
+    ld1b            {z11.s}, p0/z, x0, #7, mul vl
1613
+    normFact_1_sve2 z4, z5
1614
+    normFact_1_sve2 z6, z7
1615
+    normFact_1_sve2 z8, z9
1616
+    normFact_1_sve2 z10, z11
1617
+    mov             x2, x0
1618
+    add             x2, x2, #32
1619
+    ld1b            {z4.s}, p0/z, x2
1620
+    ld1b            {z5.s}, p0/z, x2, #1, mul vl
1621
+    ld1b            {z6.s}, p0/z, x2, #2, mul vl
1622
+    ld1b            {z7.s}, p0/z, x2, #3, mul vl
1623
+    ld1b            {z8.s}, p0/z, x2, #4, mul vl
1624
+    ld1b            {z9.s}, p0/z, x2, #5, mul vl
1625
+    ld1b            {z10.s}, p0/z, x2, #6, mul vl
1626
+    ld1b            {z11.s}, p0/z, x2, #7, mul vl
1627
+    normFact_1_sve2 z4, z5
1628
+    normFact_1_sve2 z6, z7
1629
+    normFact_1_sve2 z8, z9
1630
+    normFact_1_sve2 z10, z11
1631
+    add             x0, x0, x1
1632
+    cbnz            w12, .loop_normFact64_sve2
1633
+    normFact_end
1634
+    ret
1635
+.vl_gt_16_normFact64:
1636
+    cmp             x9, #48
1637
+    bgt             .vl_gt_48_normFact64
1638
+    normFact_start_sve2
1639
+    ptrue           p0.s, vl8
1640
+.vl_gt_16_loop_normFact64_sve2:
1641
+    sub             w12, w12, #1
1642
+    ld1b            {z4.s}, p0/z, x0
1643
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1644
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1645
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1646
+    ld1b            {z8.s}, p0/z, x0, #4, mul vl
1647
+    ld1b            {z9.s}, p0/z, x0, #5, mul vl
1648
+    ld1b            {z10.s}, p0/z, x0, #6, mul vl
1649
+    ld1b            {z11.s}, p0/z, x0, #7, mul vl
1650
+    normFact_1_sve2 z4, z5
1651
+    normFact_1_sve2 z6, z7
1652
+    normFact_1_sve2 z8, z9
1653
+    normFact_1_sve2 z10, z11
1654
+    add             x0, x0, x1
1655
+    cbnz            w12, .vl_gt_16_loop_normFact64_sve2
1656
+    normFact_end_sve2
1657
+    ret
1658
+.vl_gt_48_normFact64:
1659
+    cmp             x9, #112
1660
+    bgt             .vl_gt_112_normFact64
1661
+    normFact_start_sve2
1662
+    ptrue           p0.s, vl16
1663
+.vl_gt_48_loop_normFact64_sve2:
1664
+    sub             w12, w12, #1
1665
+    ld1b            {z4.s}, p0/z, x0
1666
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1667
+    ld1b            {z6.s}, p0/z, x0, #2, mul vl
1668
+    ld1b            {z7.s}, p0/z, x0, #3, mul vl
1669
+    normFact_1_sve2 z4, z5
1670
+    normFact_1_sve2 z6, z7
1671
+    add             x0, x0, x1
1672
+    cbnz            w12, .vl_gt_48_loop_normFact64_sve2
1673
+    normFact_end_sve2
1674
+    ret
1675
+.vl_gt_112_normFact64:
1676
+    normFact_start_sve2
1677
+    ptrue           p0.s, vl32
1678
+.vl_gt_112_loop_normFact64_sve2:
1679
+    sub             w12, w12, #1
1680
+    ld1b            {z4.s}, p0/z, x0
1681
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1682
+    normFact_1_sve2 z4, z5
1683
+    add             x0, x0, x1
1684
+    cbnz            w12, .vl_gt_112_loop_normFact64_sve2
1685
+    normFact_end_sve2
1686
+    ret
1687
+endfunc
1688
x265_3.5.tar.gz/source/common/aarch64/pixel-util.S -> x265_3.6.tar.gz/source/common/aarch64/pixel-util.S Changed
2419
 
1
@@ -1,8 +1,9 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Yimeng Su <yimeng.su@huawei.com>
7
  *          Hongbin Liu <liuhongbin1@huawei.com>
8
+ *          Sebastian Pop <spop@amazon.com>
9
  *
10
  * This program is free software; you can redistribute it and/or modify
11
  * it under the terms of the GNU General Public License as published by
12
@@ -23,13 +24,652 @@
13
  *****************************************************************************/
14
 
15
 #include "asm.S"
16
+#include "pixel-util-common.S"
17
 
18
+#ifdef __APPLE__
19
+.section __RODATA,__rodata
20
+#else
21
 .section .rodata
22
+#endif
23
 
24
 .align 4
25
 
26
 .text
27
 
28
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
29
+function PFX(pixel_var_8x8_neon)
30
+    ld1             {v4.8b}, [x0], x1        // pix[x]
31
+    uxtl            v0.8h, v4.8b             // sum = pix[x]
32
+    umull           v1.8h, v4.8b, v4.8b
33
+    uaddlp          v1.4s, v1.8h             // sqr = pix[x] * pix[x]
34
+
35
+.rept 7
36
+    ld1             {v4.8b}, [x0], x1        // pix[x]
37
+    umull           v31.8h, v4.8b, v4.8b
38
+    uaddw           v0.8h, v0.8h, v4.8b      // sum += pix[x]
39
+    uadalp          v1.4s, v31.8h            // sqr += pix[x] * pix[x]
40
+.endr
41
+    uaddlv          s0, v0.8h
42
+    uaddlv          d1, v1.4s
43
+    fmov            w0, s0
44
+    fmov            x1, d1
45
+    orr             x0, x0, x1, lsl #32      // return sum + ((uint64_t)sqr << 32);
46
+    ret
47
+endfunc
48
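pixel_var_8x8_neon returns two 32-bit sums packed into one 64-bit value, the pixel sum in the low half and the sum of squared pixels in the high half, exactly as the comment on the final orr instruction says. A scalar equivalent for the 8x8 case:

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of pixel_var_8x8: low 32 bits = sum, high 32 bits = sum of squares.
    static uint64_t pixel_var_8x8_ref(const pixel* pix, intptr_t i_stride)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                sum += pix[x];
                sqr += (uint32_t)pix[x] * pix[x];
            }
            pix += i_stride;
        }
        return sum + ((uint64_t)sqr << 32);
    }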
+
49
+function PFX(pixel_var_16x16_neon)
50
+    pixel_var_start
51
+    mov             w12, #16
52
+.loop_var_16:
53
+    sub             w12, w12, #1
54
+    ld1             {v4.16b}, [x0], x1
55
+    pixel_var_1 v4
56
+    cbnz            w12, .loop_var_16
57
+    pixel_var_end
58
+    ret
59
+endfunc
60
+
61
+function PFX(pixel_var_32x32_neon)
62
+    pixel_var_start
63
+    mov             w12, #32
64
+.loop_var_32:
65
+    sub             w12, w12, #1
66
+    ld1             {v4.16b-v5.16b}, [x0], x1
67
+    pixel_var_1 v4
68
+    pixel_var_1 v5
69
+    cbnz            w12, .loop_var_32
70
+    pixel_var_end
71
+    ret
72
+endfunc
73
+
74
+function PFX(pixel_var_64x64_neon)
75
+    pixel_var_start
76
+    mov             w12, #64
77
+.loop_var_64:
78
+    sub             w12, w12, #1
79
+    ld1             {v4.16b-v7.16b}, [x0], x1
80
+    pixel_var_1 v4
81
+    pixel_var_1 v5
82
+    pixel_var_1 v6
83
+    pixel_var_1 v7
84
+    cbnz            w12, .loop_var_64
85
+    pixel_var_end
86
+    ret
87
+endfunc
88
+
89
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
90
+function PFX(getResidual4_neon)
91
+    lsl             x4, x3, #1
92
+.rept 2
93
+    ld1             {v0.8b}, x0, x3
94
+    ld1             {v1.8b}, x1, x3
95
+    ld1             {v2.8b}, x0, x3
96
+    ld1             {v3.8b}, x1, x3
97
+    usubl           v4.8h, v0.8b, v1.8b
98
+    usubl           v5.8h, v2.8b, v3.8b
99
+    st1             {v4.8b}, x2, x4
100
+    st1             {v5.8b}, x2, x4
101
+.endr
102
+    ret
103
+endfunc
104
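getResidual widens and subtracts: every output int16_t is the source pixel minus the co-located prediction pixel, which the usubl/usubl2 instructions in these functions do eight or sixteen lanes at a time. A scalar sketch covering the 4/8/16/32 variants through an illustrative template parameter:

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of getResidualN: residual[x] = fenc[x] - pred[x], widened to int16_t.
    // All three buffers share one stride, as in the prototype comment above.
    template <int N>
    static void getResidual_ref(const pixel* fenc, const pixel* pred,
                                int16_t* residual, intptr_t stride)
    {
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                residual[x] = (int16_t)(fenc[x] - pred[x]);
            fenc     += stride;
            pred     += stride;
            residual += stride;
        }
    }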
+
105
+function PFX(getResidual8_neon)
106
+    lsl             x4, x3, #1
107
+.rept 4
108
+    ld1             {v0.8b}, x0, x3
109
+    ld1             {v1.8b}, x1, x3
110
+    ld1             {v2.8b}, x0, x3
111
+    ld1             {v3.8b}, x1, x3
112
+    usubl           v4.8h, v0.8b, v1.8b
113
+    usubl           v5.8h, v2.8b, v3.8b
114
+    st1             {v4.16b}, x2, x4
115
+    st1             {v5.16b}, x2, x4
116
+.endr
117
+    ret
118
+endfunc
119
+
120
+function PFX(getResidual16_neon)
121
+    lsl             x4, x3, #1
122
+.rept 8
123
+    ld1             {v0.16b}, x0, x3
124
+    ld1             {v1.16b}, x1, x3
125
+    ld1             {v2.16b}, x0, x3
126
+    ld1             {v3.16b}, x1, x3
127
+    usubl           v4.8h, v0.8b, v1.8b
128
+    usubl2          v5.8h, v0.16b, v1.16b
129
+    usubl           v6.8h, v2.8b, v3.8b
130
+    usubl2          v7.8h, v2.16b, v3.16b
131
+    st1             {v4.8h-v5.8h}, x2, x4
132
+    st1             {v6.8h-v7.8h}, x2, x4
133
+.endr
134
+    ret
135
+endfunc
136
+
137
+function PFX(getResidual32_neon)
138
+    lsl             x4, x3, #1
139
+    mov             w12, #4
140
+.loop_residual_32:
141
+    sub             w12, w12, #1
142
+.rept 4
143
+    ld1             {v0.16b-v1.16b}, x0, x3
144
+    ld1             {v2.16b-v3.16b}, x1, x3
145
+    ld1             {v4.16b-v5.16b}, x0, x3
146
+    ld1             {v6.16b-v7.16b}, x1, x3
147
+    usubl           v16.8h, v0.8b, v2.8b
148
+    usubl2          v17.8h, v0.16b, v2.16b
149
+    usubl           v18.8h, v1.8b, v3.8b
150
+    usubl2          v19.8h, v1.16b, v3.16b
151
+    usubl           v20.8h, v4.8b, v6.8b
152
+    usubl2          v21.8h, v4.16b, v6.16b
153
+    usubl           v22.8h, v5.8b, v7.8b
154
+    usubl2          v23.8h, v5.16b, v7.16b
155
+    st1             {v16.8h-v19.8h}, x2, x4
156
+    st1             {v20.8h-v23.8h}, x2, x4
157
+.endr
158
+    cbnz            w12, .loop_residual_32
159
+    ret
160
+endfunc
161
+
162
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
163
+function PFX(pixel_sub_ps_4x4_neon)
164
+    lsl             x1, x1, #1
165
+.rept 2
166
+    ld1             {v0.8b}, x2, x4
167
+    ld1             {v1.8b}, x3, x5
168
+    ld1             {v2.8b}, x2, x4
169
+    ld1             {v3.8b}, x3, x5
170
+    usubl           v4.8h, v0.8b, v1.8b
171
+    usubl           v5.8h, v2.8b, v3.8b
172
+    st1             {v4.4h}, x0, x1
173
+    st1             {v5.4h}, x0, x1
174
+.endr
175
+    ret
176
+endfunc
177
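pixel_sub_ps performs the same subtraction with separate source strides and a destination stride counted in int16_t elements; that is why each variant starts with lsl x1, x1, #1, turning the element stride into the byte offset the post-incremented stores need. A scalar sketch with the block size as illustrative template parameters:

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of pixel_sub_ps: a[x] = b0[x] - b1[x] as int16_t.
    template <int W, int H>
    static void pixel_sub_ps_ref(int16_t* a, intptr_t dstride,
                                 const pixel* b0, const pixel* b1,
                                 intptr_t sstride0, intptr_t sstride1)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                a[x] = (int16_t)(b0[x] - b1[x]);
            a  += dstride;
            b0 += sstride0;
            b1 += sstride1;
        }
    }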
+
178
+function PFX(pixel_sub_ps_8x8_neon)
179
+    lsl             x1, x1, #1
180
+.rept 4
181
+    ld1             {v0.8b}, x2, x4
182
+    ld1             {v1.8b}, x3, x5
183
+    ld1             {v2.8b}, x2, x4
184
+    ld1             {v3.8b}, x3, x5
185
+    usubl           v4.8h, v0.8b, v1.8b
186
+    usubl           v5.8h, v2.8b, v3.8b
187
+    st1             {v4.8h}, x0, x1
188
+    st1             {v5.8h}, x0, x1
189
+.endr
190
+    ret
191
+endfunc
192
+
193
+function PFX(pixel_sub_ps_16x16_neon)
194
+    lsl             x1, x1, #1
195
+.rept 8
196
+    ld1             {v0.16b}, x2, x4
197
+    ld1             {v1.16b}, x3, x5
198
+    ld1             {v2.16b}, x2, x4
199
+    ld1             {v3.16b}, x3, x5
200
+    usubl           v4.8h, v0.8b, v1.8b
201
+    usubl2          v5.8h, v0.16b, v1.16b
202
+    usubl           v6.8h, v2.8b, v3.8b
203
+    usubl2          v7.8h, v2.16b, v3.16b
204
+    st1             {v4.8h-v5.8h}, x0, x1
205
+    st1             {v6.8h-v7.8h}, x0, x1
206
+.endr
207
+    ret
208
+endfunc
209
+
210
+function PFX(pixel_sub_ps_32x32_neon)
211
+    lsl             x1, x1, #1
212
+    mov             w12, #4
213
+.loop_sub_ps_32:
214
+    sub             w12, w12, #1
215
+.rept 4
216
+    ld1             {v0.16b-v1.16b}, x2, x4
217
+    ld1             {v2.16b-v3.16b}, x3, x5
218
+    ld1             {v4.16b-v5.16b}, x2, x4
219
+    ld1             {v6.16b-v7.16b}, x3, x5
220
+    usubl           v16.8h, v0.8b, v2.8b
221
+    usubl2          v17.8h, v0.16b, v2.16b
222
+    usubl           v18.8h, v1.8b, v3.8b
223
+    usubl2          v19.8h, v1.16b, v3.16b
224
+    usubl           v20.8h, v4.8b, v6.8b
225
+    usubl2          v21.8h, v4.16b, v6.16b
226
+    usubl           v22.8h, v5.8b, v7.8b
227
+    usubl2          v23.8h, v5.16b, v7.16b
228
+    st1             {v16.8h-v19.8h}, x0, x1
229
+    st1             {v20.8h-v23.8h}, x0, x1
230
+.endr
231
+    cbnz            w12, .loop_sub_ps_32
232
+    ret
233
+endfunc
234
+
235
+function PFX(pixel_sub_ps_64x64_neon)
236
+    lsl             x1, x1, #1
237
+    sub             x1, x1, #64
238
+    mov             w12, #16
239
+.loop_sub_ps_64:
240
+    sub             w12, w12, #1
241
+.rept 4
242
+    ld1             {v0.16b-v3.16b}, x2, x4
243
+    ld1             {v4.16b-v7.16b}, x3, x5
244
+    usubl           v16.8h, v0.8b, v4.8b
245
+    usubl2          v17.8h, v0.16b, v4.16b
246
+    usubl           v18.8h, v1.8b, v5.8b
247
+    usubl2          v19.8h, v1.16b, v5.16b
248
+    usubl           v20.8h, v2.8b, v6.8b
249
+    usubl2          v21.8h, v2.16b, v6.16b
250
+    usubl           v22.8h, v3.8b, v7.8b
251
+    usubl2          v23.8h, v3.16b, v7.16b
252
+    st1             {v16.8h-v19.8h}, x0, #64
253
+    st1             {v20.8h-v23.8h}, x0, x1
254
+.endr
255
+    cbnz            w12, .loop_sub_ps_64
256
+    ret
257
+endfunc
258
+
259
+// chroma sub_ps
260
+function PFX(pixel_sub_ps_4x8_neon)
261
+    lsl             x1, x1, #1
262
+.rept 4
263
+    ld1             {v0.8b}, x2, x4
264
+    ld1             {v1.8b}, x3, x5
265
+    ld1             {v2.8b}, x2, x4
266
+    ld1             {v3.8b}, x3, x5
267
+    usubl           v4.8h, v0.8b, v1.8b
268
+    usubl           v5.8h, v2.8b, v3.8b
269
+    st1             {v4.4h}, x0, x1
270
+    st1             {v5.4h}, x0, x1
271
+.endr
272
+    ret
273
+endfunc
274
+
275
+function PFX(pixel_sub_ps_8x16_neon)
276
+    lsl             x1, x1, #1
277
+.rept 8
278
+    ld1             {v0.8b}, x2, x4
279
+    ld1             {v1.8b}, x3, x5
280
+    ld1             {v2.8b}, x2, x4
281
+    ld1             {v3.8b}, x3, x5
282
+    usubl           v4.8h, v0.8b, v1.8b
283
+    usubl           v5.8h, v2.8b, v3.8b
284
+    st1             {v4.8h}, x0, x1
285
+    st1             {v5.8h}, x0, x1
286
+.endr
287
+    ret
288
+endfunc
289
+
290
+function PFX(pixel_sub_ps_16x32_neon)
291
+    lsl             x1, x1, #1
292
+.rept 16
293
+    ld1             {v0.16b}, x2, x4
294
+    ld1             {v1.16b}, x3, x5
295
+    ld1             {v2.16b}, x2, x4
296
+    ld1             {v3.16b}, x3, x5
297
+    usubl           v4.8h, v0.8b, v1.8b
298
+    usubl2          v5.8h, v0.16b, v1.16b
299
+    usubl           v6.8h, v2.8b, v3.8b
300
+    usubl2          v7.8h, v2.16b, v3.16b
301
+    st1             {v4.8h-v5.8h}, x0, x1
302
+    st1             {v6.8h-v7.8h}, x0, x1
303
+.endr
304
+    ret
305
+endfunc
306
+
307
+function PFX(pixel_sub_ps_32x64_neon)
308
+    lsl             x1, x1, #1
309
+    mov             w12, #8
310
+.loop_sub_ps_32x64:
311
+    sub             w12, w12, #1
312
+.rept 4
313
+    ld1             {v0.16b-v1.16b}, x2, x4
314
+    ld1             {v2.16b-v3.16b}, x3, x5
315
+    ld1             {v4.16b-v5.16b}, x2, x4
316
+    ld1             {v6.16b-v7.16b}, x3, x5
317
+    usubl           v16.8h, v0.8b, v2.8b
318
+    usubl2          v17.8h, v0.16b, v2.16b
319
+    usubl           v18.8h, v1.8b, v3.8b
320
+    usubl2          v19.8h, v1.16b, v3.16b
321
+    usubl           v20.8h, v4.8b, v6.8b
322
+    usubl2          v21.8h, v4.16b, v6.16b
323
+    usubl           v22.8h, v5.8b, v7.8b
324
+    usubl2          v23.8h, v5.16b, v7.16b
325
+    st1             {v16.8h-v19.8h}, x0, x1
326
+    st1             {v20.8h-v23.8h}, x0, x1
327
+.endr
328
+    cbnz            w12, .loop_sub_ps_32x64
329
+    ret
330
+endfunc
331
+
332
+// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
333
+function PFX(pixel_add_ps_4x4_neon)
334
+    lsl             x5, x5, #1
335
+.rept 2
336
+    ld1             {v0.8b}, x2, x4
337
+    ld1             {v1.8b}, x2, x4
338
+    ld1             {v2.4h}, x3, x5
339
+    ld1             {v3.4h}, x3, x5
340
+    uxtl            v0.8h, v0.8b
341
+    uxtl            v1.8h, v1.8b
342
+    add             v4.8h, v0.8h, v2.8h
343
+    add             v5.8h, v1.8h, v3.8h
344
+    sqxtun          v4.8b, v4.8h
345
+    sqxtun          v5.8b, v5.8h
346
+    st1             {v4.s}0, x0, x1
347
+    st1             {v5.s}0, x0, x1
348
+.endr
349
+    ret
350
+endfunc
351
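pixel_add_ps is the inverse step: it adds an int16_t residual block to a pixel prediction and clamps the result back into the 8-bit pixel range, which is what the sqxtun saturating-narrow instructions above provide. A scalar sketch:

    #include <algorithm>
    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of pixel_add_ps: a[x] = clip(b0[x] + b1[x]) into 0..255.
    template <int W, int H>
    static void pixel_add_ps_ref(pixel* a, intptr_t dstride,
                                 const pixel* b0, const int16_t* b1,
                                 intptr_t sstride0, intptr_t sstride1)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                a[x] = (pixel)std::min(255, std::max(0, b0[x] + b1[x]));
            a  += dstride;
            b0 += sstride0;
            b1 += sstride1;
        }
    }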
+
352
+function PFX(pixel_add_ps_8x8_neon)
353
+    lsl             x5, x5, #1
354
+.rept 4
355
+    ld1             {v0.8b}, x2, x4
356
+    ld1             {v1.8b}, x2, x4
357
+    ld1             {v2.8h}, x3, x5
358
+    ld1             {v3.8h}, x3, x5
359
+    uxtl            v0.8h, v0.8b
360
+    uxtl            v1.8h, v1.8b
361
+    add             v4.8h, v0.8h, v2.8h
362
+    add             v5.8h, v1.8h, v3.8h
363
+    sqxtun          v4.8b, v4.8h
364
+    sqxtun          v5.8b, v5.8h
365
+    st1             {v4.8b}, x0, x1
366
+    st1             {v5.8b}, x0, x1
367
+.endr
368
+    ret
369
+endfunc
370
+
371
+.macro pixel_add_ps_16xN_neon h
372
+function PFX(pixel_add_ps_16x\h\()_neon)
373
+    lsl             x5, x5, #1
374
+    mov             w12, #\h / 8
375
+.loop_add_ps_16x\h\():
376
+    sub             w12, w12, #1
377
+.rept 4
378
+    ld1             {v0.16b}, x2, x4
379
+    ld1             {v1.16b}, x2, x4
380
+    ld1             {v16.8h-v17.8h}, x3, x5
381
+    ld1             {v18.8h-v19.8h}, x3, x5
382
+    uxtl            v4.8h, v0.8b
383
+    uxtl2           v5.8h, v0.16b
384
+    uxtl            v6.8h, v1.8b
385
+    uxtl2           v7.8h, v1.16b
386
+    add             v24.8h, v4.8h, v16.8h
387
+    add             v25.8h, v5.8h, v17.8h
388
+    add             v26.8h, v6.8h, v18.8h
389
+    add             v27.8h, v7.8h, v19.8h
390
+    sqxtun          v4.8b, v24.8h
391
+    sqxtun2         v4.16b, v25.8h
392
+    sqxtun          v5.8b, v26.8h
393
+    sqxtun2         v5.16b, v27.8h
394
+    st1             {v4.16b}, x0, x1
395
+    st1             {v5.16b}, x0, x1
396
+.endr
397
+    cbnz            w12, .loop_add_ps_16x\h
398
+    ret
399
+endfunc
400
+.endm
401
+
402
+pixel_add_ps_16xN_neon 16
403
+pixel_add_ps_16xN_neon 32
404
+
405
+.macro pixel_add_ps_32xN_neon h
406
+ function PFX(pixel_add_ps_32x\h\()_neon)
407
+    lsl             x5, x5, #1
408
+    mov             w12, #\h / 4
409
+.loop_add_ps_32x\h\():
410
+    sub             w12, w12, #1
411
+.rept 4
412
+    ld1             {v0.16b-v1.16b}, x2, x4
413
+    ld1             {v16.8h-v19.8h}, x3, x5
414
+    uxtl            v4.8h, v0.8b
415
+    uxtl2           v5.8h, v0.16b
416
+    uxtl            v6.8h, v1.8b
417
+    uxtl2           v7.8h, v1.16b
418
+    add             v24.8h, v4.8h, v16.8h
419
+    add             v25.8h, v5.8h, v17.8h
420
+    add             v26.8h, v6.8h, v18.8h
421
+    add             v27.8h, v7.8h, v19.8h
422
+    sqxtun          v4.8b, v24.8h
423
+    sqxtun2         v4.16b, v25.8h
424
+    sqxtun          v5.8b, v26.8h
425
+    sqxtun2         v5.16b, v27.8h
426
+    st1             {v4.16b-v5.16b}, x0, x1
427
+.endr
428
+    cbnz            w12, .loop_add_ps_32x\h
429
+    ret
430
+endfunc
431
+.endm
432
+
433
+pixel_add_ps_32xN_neon 32
434
+pixel_add_ps_32xN_neon 64
435
+
436
+function PFX(pixel_add_ps_64x64_neon)
437
+    lsl             x5, x5, #1
438
+    sub             x5, x5, #64
439
+    mov             w12, #32
440
+.loop_add_ps_64x64:
441
+    sub             w12, w12, #1
442
+.rept 2
443
+    ld1             {v0.16b-v3.16b}, x2, x4
444
+    ld1             {v16.8h-v19.8h}, x3, #64
445
+    ld1             {v20.8h-v23.8h}, x3, x5
446
+    uxtl            v4.8h, v0.8b
447
+    uxtl2           v5.8h, v0.16b
448
+    uxtl            v6.8h, v1.8b
449
+    uxtl2           v7.8h, v1.16b
450
+    uxtl            v24.8h, v2.8b
451
+    uxtl2           v25.8h, v2.16b
452
+    uxtl            v26.8h, v3.8b
453
+    uxtl2           v27.8h, v3.16b
454
+    add             v0.8h, v4.8h, v16.8h
455
+    add             v1.8h, v5.8h, v17.8h
456
+    add             v2.8h, v6.8h, v18.8h
457
+    add             v3.8h, v7.8h, v19.8h
458
+    add             v4.8h, v24.8h, v20.8h
459
+    add             v5.8h, v25.8h, v21.8h
460
+    add             v6.8h, v26.8h, v22.8h
461
+    add             v7.8h, v27.8h, v23.8h
462
+    sqxtun          v0.8b, v0.8h
463
+    sqxtun2         v0.16b, v1.8h
464
+    sqxtun          v1.8b, v2.8h
465
+    sqxtun2         v1.16b, v3.8h
466
+    sqxtun          v2.8b, v4.8h
467
+    sqxtun2         v2.16b, v5.8h
468
+    sqxtun          v3.8b, v6.8h
469
+    sqxtun2         v3.16b, v7.8h
470
+    st1             {v0.16b-v3.16b}, x0, x1
471
+.endr
472
+    cbnz            w12, .loop_add_ps_64x64
473
+    ret
474
+endfunc
475
+
476
+// Chroma add_ps
477
+function PFX(pixel_add_ps_4x8_neon)
478
+    lsl             x5, x5, #1
479
+.rept 4
480
+    ld1             {v0.8b}, x2, x4
481
+    ld1             {v1.8b}, x2, x4
482
+    ld1             {v2.4h}, x3, x5
483
+    ld1             {v3.4h}, x3, x5
484
+    uxtl            v0.8h, v0.8b
485
+    uxtl            v1.8h, v1.8b
486
+    add             v4.8h, v0.8h, v2.8h
487
+    add             v5.8h, v1.8h, v3.8h
488
+    sqxtun          v4.8b, v4.8h
489
+    sqxtun          v5.8b, v5.8h
490
+    st1             {v4.s}0, x0, x1
491
+    st1             {v5.s}0, x0, x1
492
+.endr
493
+    ret
494
+endfunc
495
+
496
+function PFX(pixel_add_ps_8x16_neon)
497
+    lsl             x5, x5, #1
498
+.rept 8
499
+    ld1             {v0.8b}, x2, x4
500
+    ld1             {v1.8b}, x2, x4
501
+    ld1             {v2.8h}, x3, x5
502
+    ld1             {v3.8h}, x3, x5
503
+    uxtl            v0.8h, v0.8b
504
+    uxtl            v1.8h, v1.8b
505
+    add             v4.8h, v0.8h, v2.8h
506
+    add             v5.8h, v1.8h, v3.8h
507
+    sqxtun          v4.8b, v4.8h
508
+    sqxtun          v5.8b, v5.8h
509
+    st1             {v4.8b}, x0, x1
510
+    st1             {v5.8b}, x0, x1
511
+.endr
512
+    ret
513
+endfunc
514
+
515
+// void scale1D_128to64(pixel *dst, const pixel *src)
516
+function PFX(scale1D_128to64_neon)
517
+.rept 2
518
+    ld2             {v0.16b, v1.16b}, x1, #32
519
+    ld2             {v2.16b, v3.16b}, x1, #32
520
+    ld2             {v4.16b, v5.16b}, x1, #32
521
+    ld2             {v6.16b, v7.16b}, x1, #32
522
+    urhadd          v0.16b, v0.16b, v1.16b
523
+    urhadd          v1.16b, v2.16b, v3.16b
524
+    urhadd          v2.16b, v4.16b, v5.16b
525
+    urhadd          v3.16b, v6.16b, v7.16b
526
+    st1             {v0.16b-v3.16b}, x0, #64
527
+.endr
528
+    ret
529
+endfunc
530
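scale1D_128to64 halves a 128-pixel row to 64 pixels by averaging adjacent pairs with rounding: ld2 de-interleaves even and odd pixels and urhadd is an unsigned rounding halving add. A scalar sketch:

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of scale1D_128to64: dst[i] = (src[2i] + src[2i+1] + 1) >> 1.
    static void scale1D_128to64_ref(pixel* dst, const pixel* src)
    {
        for (int i = 0; i < 64; i++)
            dst[i] = (pixel)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
    }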
+
531
+.macro scale2D_1  v0, v1
532
+    uaddlp          \v0\().8h, \v0\().16b
533
+    uaddlp          \v1\().8h, \v1\().16b
534
+    add             \v0\().8h, \v0\().8h, \v1\().8h
535
+.endm
536
+
537
+// void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
538
+function PFX(scale2D_64to32_neon)
539
+    mov             w12, #32
540
+.loop_scale2D:
541
+    ld1             {v0.16b-v3.16b}, x1, x2
542
+    sub             w12, w12, #1
543
+    ld1             {v4.16b-v7.16b}, x1, x2
544
+    scale2D_1       v0, v4
545
+    scale2D_1       v1, v5
546
+    scale2D_1       v2, v6
547
+    scale2D_1       v3, v7
548
+    uqrshrn         v0.8b, v0.8h, #2
549
+    uqrshrn2        v0.16b, v1.8h, #2
550
+    uqrshrn         v1.8b, v2.8h, #2
551
+    uqrshrn2        v1.16b, v3.8h, #2
552
+    st1             {v0.16b-v1.16b}, x0, #32
553
+    cbnz            w12, .loop_scale2D
554
+    ret
555
+endfunc
556
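scale2D_64to32 is the two-dimensional version: each output pixel is the rounded average of a 2x2 input block. uaddlp sums horizontal pairs, the add merges two source rows, and uqrshrn #2 divides by four with rounding. A scalar sketch:

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of scale2D_64to32: rounded 2x2 average into a contiguous 32x32 output.
    static void scale2D_64to32_ref(pixel* dst, const pixel* src, intptr_t stride)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++)
            {
                int sum = src[(2 * y) * stride + 2 * x]
                        + src[(2 * y) * stride + 2 * x + 1]
                        + src[(2 * y + 1) * stride + 2 * x]
                        + src[(2 * y + 1) * stride + 2 * x + 1];
                dst[y * 32 + x] = (pixel)((sum + 2) >> 2);
            }
    }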
+
557
+// void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
558
+function PFX(pixel_planecopy_cp_neon)
559
+    dup             v2.16b, w6
560
+    sub             x5, x5, #1
561
+.loop_h:
562
+    mov             x6, x0
563
+    mov             x12, x2
564
+    mov             x7, #0
565
+.loop_w:
566
+    ldr             q0, x6, #16
567
+    ushl            v0.16b, v0.16b, v2.16b
568
+    str             q0, x12, #16
569
+    add             x7, x7, #16
570
+    cmp             x7, x4
571
+    blt             .loop_w
572
+
573
+    add             x0, x0, x1
574
+    add             x2, x2, x3
575
+    sub             x5, x5, #1
576
+    cbnz            x5, .loop_h
577
+
578
+// handle last row
579
+    mov             x5, x4
580
+    lsr             x5, x5, #3
581
+.loopW8:
582
+    ldr             d0, x0, #8
583
+    ushl            v0.8b, v0.8b, v2.8b
584
+    str             d0, x2, #8
585
+    sub             x4, x4, #8
586
+    sub             x5, x5, #1
587
+    cbnz            x5, .loopW8
588
+
589
+    mov             x5, #8
590
+    sub             x5, x5, x4
591
+    sub             x0, x0, x5
592
+    sub             x2, x2, x5
593
+    ldr             d0, x0
594
+    ushl            v0.8b, v0.8b, v2.8b
595
+    str             d0, x2
596
+    ret
597
+endfunc
598
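planecopy_cp copies a uint8_t plane while applying a per-sample shift; the shift amount is broadcast from w6 and applied with ushl, and the code after the main loop handles the final row in smaller chunks. A scalar sketch, treating the shift as a left shift (an assumption of this sketch, consistent with ushl and a non-negative shift value):

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Scalar sketch of planecopy_cp: dst[x] = src[x] << shift, row by row.
    static void planecopy_cp_ref(const uint8_t* src, intptr_t srcStride,
                                 pixel* dst, intptr_t dstStride,
                                 int width, int height, int shift)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (pixel)(src[x] << shift);
            src += srcStride;
            dst += dstStride;
        }
    }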
+
599
+//******* satd *******
600
+.macro satd_4x4_neon
601
+    ld1             {v0.s}0, x0, x1
602
+    ld1             {v0.s}1, x0, x1
603
+    ld1             {v1.s}0, x2, x3
604
+    ld1             {v1.s}1, x2, x3
605
+    ld1             {v2.s}0, x0, x1
606
+    ld1             {v2.s}1, x0, x1
607
+    ld1             {v3.s}0, x2, x3
608
+    ld1             {v3.s}1, x2, x3
609
+
610
+    usubl           v4.8h, v0.8b, v1.8b
611
+    usubl           v5.8h, v2.8b, v3.8b
612
+
613
+    add             v6.8h, v4.8h, v5.8h
614
+    sub             v7.8h, v4.8h, v5.8h
615
+
616
+    mov             v4.d0, v6.d1
617
+    add             v0.4h, v6.4h, v4.4h
618
+    sub             v2.4h, v6.4h, v4.4h
619
+
620
+    mov             v5.d0, v7.d1
621
+    add             v1.4h, v7.4h, v5.4h
622
+    sub             v3.4h, v7.4h, v5.4h
623
+
624
+    trn1            v4.4h, v0.4h, v1.4h
625
+    trn2            v5.4h, v0.4h, v1.4h
626
+
627
+    trn1            v6.4h, v2.4h, v3.4h
628
+    trn2            v7.4h, v2.4h, v3.4h
629
+
630
+    add             v0.4h, v4.4h, v5.4h
631
+    sub             v1.4h, v4.4h, v5.4h
632
+
633
+    add             v2.4h, v6.4h, v7.4h
634
+    sub             v3.4h, v6.4h, v7.4h
635
+
636
+    trn1            v4.2s, v0.2s, v1.2s
637
+    trn2            v5.2s, v0.2s, v1.2s
638
+
639
+    trn1            v6.2s, v2.2s, v3.2s
640
+    trn2            v7.2s, v2.2s, v3.2s
641
+
642
+    abs             v4.4h, v4.4h
643
+    abs             v5.4h, v5.4h
644
+    abs             v6.4h, v6.4h
645
+    abs             v7.4h, v7.4h
646
+
647
+    smax            v1.4h, v4.4h, v5.4h
648
+    smax            v2.4h, v6.4h, v7.4h
649
+
650
+    add             v0.4h, v1.4h, v2.4h
651
+    uaddlp          v0.2s, v0.4h
652
+    uaddlp          v0.1d, v0.2s
653
+.endm
654
+
655
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
656
+function PFX(pixel_satd_4x4_neon)
657
+    satd_4x4_neon
658
+    fmov            x0, d0
659
+    ret
660
+endfunc
661
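SATD is the sum of absolute values of a Hadamard-transformed difference block. The satd_4x4_neon macro above builds the 4x4 transform from add/sub and transpose steps and folds the absolute values with the usual smax shortcut, so its scale can differ from a textbook sum by a constant factor. A naive scalar sketch of the underlying transform-and-sum idea, without any final normalization:

    #include <cstdint>
    #include <cstdlib>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Naive SATD sketch for a 4x4 block: Hadamard rows, Hadamard columns, sum |.|.
    static int satd_4x4_ref(const pixel* pix1, intptr_t stride1,
                            const pixel* pix2, intptr_t stride2)
    {
        int d[4][4];
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];

        for (int y = 0; y < 4; y++)   // 1-D Hadamard on each row
        {
            int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
            int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
            d[y][0] = a0 + a2; d[y][2] = a0 - a2;
            d[y][1] = a1 + a3; d[y][3] = a1 - a3;
        }
        int sum = 0;
        for (int x = 0; x < 4; x++)   // 1-D Hadamard on each column, then accumulate
        {
            int a0 = d[0][x] + d[1][x], a1 = d[0][x] - d[1][x];
            int a2 = d[2][x] + d[3][x], a3 = d[2][x] - d[3][x];
            sum += std::abs(a0 + a2) + std::abs(a0 - a2)
                 + std::abs(a1 + a3) + std::abs(a1 - a3);
        }
        return sum;
    }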
+
662
 .macro x265_satd_4x8_8x4_end_neon
663
     add             v0.8h, v4.8h, v6.8h
664
     add             v1.8h, v5.8h, v7.8h
665
@@ -59,7 +699,7 @@
666
 .endm
667
 
668
 .macro pixel_satd_4x8_neon
669
-    ld1r             {v1.2s}, x2, x3
670
+    ld1r            {v1.2s}, x2, x3
671
     ld1r            {v0.2s}, x0, x1
672
     ld1r            {v3.2s}, x2, x3
673
     ld1r            {v2.2s}, x0, x1
674
@@ -82,129 +722,995 @@
675
     sub             v5.8h, v0.8h, v1.8h
676
     ld1             {v6.s}1, x0, x1
677
     usubl           v3.8h, v6.8b, v7.8b
678
-    add         v6.8h, v2.8h, v3.8h
679
-    sub         v7.8h, v2.8h, v3.8h
680
+    add             v6.8h, v2.8h, v3.8h
681
+    sub             v7.8h, v2.8h, v3.8h
682
     x265_satd_4x8_8x4_end_neon
683
 .endm
684
 
685
-// template<int w, int h>
686
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
687
-function x265_pixel_satd_4x8_neon
688
-    pixel_satd_4x8_neon
689
-    mov               w0, v0.s0
690
-    ret
691
+// template<int w, int h>
692
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
693
+function PFX(pixel_satd_4x8_neon)
694
+    pixel_satd_4x8_neon
695
+    mov             w0, v0.s0
696
+    ret
697
+endfunc
698
+
699
+function PFX(pixel_satd_4x16_neon)
700
+    mov             w4, #0
701
+    pixel_satd_4x8_neon
702
+    mov             w5, v0.s0
703
+    add             w4, w4, w5
704
+    pixel_satd_4x8_neon
705
+    mov             w5, v0.s0
706
+    add             w0, w5, w4
707
+    ret
708
+endfunc
709
+
710
+function PFX(pixel_satd_4x32_neon)
711
+    mov             w4, #0
712
+.rept 4
713
+    pixel_satd_4x8_neon
714
+    mov             w5, v0.s0
715
+    add             w4, w4, w5
716
+.endr
717
+    mov             w0, w4
718
+    ret
719
+endfunc
720
+
721
+function PFX(pixel_satd_12x16_neon)
722
+    mov             x4, x0
723
+    mov             x5, x2
724
+    mov             w7, #0
725
+    pixel_satd_4x8_neon
726
+    mov             w6, v0.s0
727
+    add             w7, w7, w6
728
+    pixel_satd_4x8_neon
729
+    mov             w6, v0.s0
730
+    add             w7, w7, w6
731
+
732
+    add             x0, x4, #4
733
+    add             x2, x5, #4
734
+    pixel_satd_4x8_neon
735
+    mov             w6, v0.s0
736
+    add             w7, w7, w6
737
+    pixel_satd_4x8_neon
738
+    mov             w6, v0.s0
739
+    add             w7, w7, w6
740
+
741
+    add             x0, x4, #8
742
+    add             x2, x5, #8
743
+    pixel_satd_4x8_neon
744
+    mov             w6, v0.s0
745
+    add             w7, w7, w6
746
+    pixel_satd_4x8_neon
747
+    mov             w6, v0.s0
748
+    add             w0, w7, w6
749
+    ret
750
+endfunc
751
+
752
+function PFX(pixel_satd_12x32_neon)
753
+    mov             x4, x0
754
+    mov             x5, x2
755
+    mov             w7, #0
756
+.rept 4
757
+    pixel_satd_4x8_neon
758
+    mov             w6, v0.s0
759
+    add             w7, w7, w6
760
+.endr
761
+
762
+    add             x0, x4, #4
763
+    add             x2, x5, #4
764
+.rept 4
765
+    pixel_satd_4x8_neon
766
+    mov             w6, v0.s0
767
+    add             w7, w7, w6
768
+.endr
769
+
770
+    add             x0, x4, #8
771
+    add             x2, x5, #8
772
+.rept 4
773
+    pixel_satd_4x8_neon
774
+    mov             w6, v0.s0
775
+    add             w7, w7, w6
776
+.endr
777
+
778
+    mov             w0, w7
779
+    ret
780
+endfunc
781
+
782
+function PFX(pixel_satd_8x4_neon)
783
+    mov             x4, x0
784
+    mov             x5, x2
785
+    satd_4x4_neon
786
+    add             x0, x4, #4
787
+    add             x2, x5, #4
788
+    umov            x6, v0.d0
789
+    satd_4x4_neon
790
+    umov            x0, v0.d0
791
+    add             x0, x0, x6
792
+    ret
793
+endfunc
794
+
795
+.macro LOAD_DIFF_8x4 v0 v1 v2 v3
796
+    ld1             {v0.8b}, x0, x1
797
+    ld1             {v1.8b}, x2, x3
798
+    ld1             {v2.8b}, x0, x1
799
+    ld1             {v3.8b}, x2, x3
800
+    ld1             {v4.8b}, x0, x1
801
+    ld1             {v5.8b}, x2, x3
802
+    ld1             {v6.8b}, x0, x1
803
+    ld1             {v7.8b}, x2, x3
804
+    usubl           \v0, v0.8b, v1.8b
805
+    usubl           \v1, v2.8b, v3.8b
806
+    usubl           \v2, v4.8b, v5.8b
807
+    usubl           \v3, v6.8b, v7.8b
808
+.endm
809
+
810
+.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
811
+    ld1             {v0.16b}, x0, x1
812
+    ld1             {v1.16b}, x2, x3
813
+    ld1             {v2.16b}, x0, x1
814
+    ld1             {v3.16b}, x2, x3
815
+    ld1             {v4.16b}, x0, x1
816
+    ld1             {v5.16b}, x2, x3
817
+    ld1             {v6.16b}, x0, x1
818
+    ld1             {v7.16b}, x2, x3
819
+    usubl           \v0, v0.8b, v1.8b
820
+    usubl           \v1, v2.8b, v3.8b
821
+    usubl           \v2, v4.8b, v5.8b
822
+    usubl           \v3, v6.8b, v7.8b
823
+    usubl2          \v4, v0.16b, v1.16b
824
+    usubl2          \v5, v2.16b, v3.16b
825
+    usubl2          \v6, v4.16b, v5.16b
826
+    usubl2          \v7, v6.16b, v7.16b
827
+.endm
828
+
829
+function PFX(satd_16x4_neon), export=0
830
+    LOAD_DIFF_16x4  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
831
+    b               PFX(satd_8x4v_8x8h_neon)
832
+endfunc
833
+
834
+function PFX(satd_8x8_neon), export=0
835
+    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
836
+    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
837
+    b               PFX(satd_8x4v_8x8h_neon)
838
+endfunc
839
+
840
+// one vertical hadamard pass and two horizontal
841
+function PFX(satd_8x4v_8x8h_neon), export=0
842
+    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
843
+    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
844
+    trn4            v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
845
+    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
846
+    SUMSUB_ABCD     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
847
+    SUMSUB_ABCD     v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
848
+    trn4            v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
849
+    trn4            v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
850
+    ABS8            v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
851
+    smax            v0.8h, v0.8h, v2.8h
852
+    smax            v1.8h, v1.8h, v3.8h
853
+    smax            v2.8h, v4.8h, v6.8h
854
+    smax            v3.8h, v5.8h, v7.8h
855
+    ret
856
+endfunc
857
+
858
+function PFX(pixel_satd_8x8_neon)
859
+    mov             x10, x30
860
+    bl              PFX(satd_8x8_neon)
861
+    add             v0.8h, v0.8h, v1.8h
862
+    add             v1.8h, v2.8h, v3.8h
863
+    add             v0.8h, v0.8h, v1.8h
864
+    uaddlv          s0, v0.8h
865
+    mov             w0, v0.s0
866
+    ret             x10
867
+endfunc
868
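The larger SATD sizes are built from the 8x8 and 16x4 helpers: each call leaves per-lane partial sums in v0-v3, the callers that follow accumulate them into v30/v31, and a single uaddlv reduction at the end produces the scalar result. Conceptually the composition is just a tiling loop; the sketch below takes the tile kernel as a parameter because the real helper is internal to this assembly file:

    #include <cstdint>

    typedef uint8_t pixel;  // assuming the 8-bit pixel type

    // Illustrative composition of a W x H SATD from an 8x8 tile kernel.
    template <int W, int H>
    static int satd_tiled_ref(const pixel* pix1, intptr_t stride1,
                              const pixel* pix2, intptr_t stride2,
                              int (*satd8x8)(const pixel*, intptr_t, const pixel*, intptr_t))
    {
        int sum = 0;
        for (int y = 0; y < H; y += 8)
            for (int x = 0; x < W; x += 8)
                sum += satd8x8(pix1 + y * stride1 + x, stride1,
                               pix2 + y * stride2 + x, stride2);
        return sum;
    }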
+
869
+function PFX(pixel_satd_8x12_neon)
870
+    mov             x4, x0
871
+    mov             x5, x2
872
+    mov             x7, #0
873
+    satd_4x4_neon
874
+    umov            x6, v0.d0
875
+    add             x7, x7, x6
876
+    add             x0, x4, #4
877
+    add             x2, x5, #4
878
+    satd_4x4_neon
879
+    umov            x6, v0.d0
880
+    add             x7, x7, x6
881
+.rept 2
882
+    sub             x0, x0, #4
883
+    sub             x2, x2, #4
884
+    mov             x4, x0
885
+    mov             x5, x2
886
+    satd_4x4_neon
887
+    umov            x6, v0.d0
888
+    add             x7, x7, x6
889
+    add             x0, x4, #4
890
+    add             x2, x5, #4
891
+    satd_4x4_neon
892
+    umov            x6, v0.d0
893
+    add             x7, x7, x6
894
+.endr
895
+    mov             x0, x7
896
+    ret
897
+endfunc
898
+
899
+function PFX(pixel_satd_8x16_neon)
900
+    mov             x10, x30
901
+    bl              PFX(satd_8x8_neon)
902
+    add             v30.8h, v0.8h, v1.8h
903
+    add             v31.8h, v2.8h, v3.8h
904
+    bl              PFX(satd_8x8_neon)
905
+    add             v30.8h, v30.8h, v0.8h
906
+    add             v31.8h, v31.8h, v1.8h
907
+    add             v30.8h, v30.8h, v2.8h
908
+    add             v31.8h, v31.8h, v3.8h
909
+    add             v0.8h, v30.8h, v31.8h
910
+    uaddlv          s0, v0.8h
911
+    mov             w0, v0.s0
912
+    ret             x10
913
+endfunc
914
+
915
+function PFX(pixel_satd_8x32_neon)
916
+    mov             x10, x30
917
+    bl              PFX(satd_8x8_neon)
918
+    add             v30.8h, v0.8h, v1.8h
919
+    add             v31.8h, v2.8h, v3.8h
920
+.rept 3
921
+    bl              PFX(satd_8x8_neon)
922
+    add             v30.8h, v30.8h, v0.8h
923
+    add             v31.8h, v31.8h, v1.8h
924
+    add             v30.8h, v30.8h, v2.8h
925
+    add             v31.8h, v31.8h, v3.8h
926
+.endr
927
+    add             v0.8h, v30.8h, v31.8h
928
+    uaddlv          s0, v0.8h
929
+    mov             w0, v0.s0
930
+    ret             x10
931
+endfunc
932
+
933
+function PFX(pixel_satd_8x64_neon)
934
+    mov             x10, x30
935
+    bl              PFX(satd_8x8_neon)
936
+    add             v30.8h, v0.8h, v1.8h
937
+    add             v31.8h, v2.8h, v3.8h
938
+.rept 7
939
+    bl              PFX(satd_8x8_neon)
940
+    add             v30.8h, v30.8h, v0.8h
941
+    add             v31.8h, v31.8h, v1.8h
942
+    add             v30.8h, v30.8h, v2.8h
943
+    add             v31.8h, v31.8h, v3.8h
944
+.endr
945
+    add             v0.8h, v30.8h, v31.8h
946
+    uaddlv          s0, v0.8h
947
+    mov             w0, v0.s0
948
+    ret             x10
949
+endfunc
950
+
951
+function PFX(pixel_satd_16x4_neon)
952
+    mov             x10, x30
953
+    bl              PFX(satd_16x4_neon)
954
+    add             v30.8h, v0.8h, v1.8h
955
+    add             v31.8h, v2.8h, v3.8h
956
+    add             v0.8h, v30.8h, v31.8h
957
+    uaddlv          s0, v0.8h
958
+    mov             w0, v0.s0
959
+    ret             x10
960
+endfunc
961
+
962
+function PFX(pixel_satd_16x8_neon)
963
+    mov             x10, x30
964
+    bl              PFX(satd_16x4_neon)
965
+    add             v30.8h, v0.8h, v1.8h
966
+    add             v31.8h, v2.8h, v3.8h
967
+    bl              PFX(satd_16x4_neon)
968
+    add             v30.8h, v30.8h, v0.8h
969
+    add             v31.8h, v31.8h, v1.8h
970
+    add             v30.8h, v30.8h, v2.8h
971
+    add             v31.8h, v31.8h, v3.8h
972
+    add             v0.8h, v30.8h, v31.8h
973
+    uaddlv          s0, v0.8h
974
+    mov             w0, v0.s0
975
+    ret             x10
976
+endfunc
977
+
978
+function PFX(pixel_satd_16x12_neon)
979
+    mov             x10, x30
980
+    bl              PFX(satd_16x4_neon)
981
+    add             v30.8h, v0.8h, v1.8h
982
+    add             v31.8h, v2.8h, v3.8h
983
+.rept 2
984
+    bl              PFX(satd_16x4_neon)
985
+    add             v30.8h, v30.8h, v0.8h
986
+    add             v31.8h, v31.8h, v1.8h
987
+    add             v30.8h, v30.8h, v2.8h
988
+    add             v31.8h, v31.8h, v3.8h
989
+.endr
990
+    add             v0.8h, v30.8h, v31.8h
991
+    uaddlv          s0, v0.8h
992
+    mov             w0, v0.s0
993
+    ret             x10
994
+endfunc
995
+
996
+function PFX(pixel_satd_16x16_neon)
997
+    mov             x10, x30
998
+    bl              PFX(satd_16x4_neon)
999
+    add             v30.8h, v0.8h, v1.8h
1000
+    add             v31.8h, v2.8h, v3.8h
1001
+.rept 3
1002
+    bl              PFX(satd_16x4_neon)
1003
+    add             v30.8h, v30.8h, v0.8h
1004
+    add             v31.8h, v31.8h, v1.8h
1005
+    add             v30.8h, v30.8h, v2.8h
1006
+    add             v31.8h, v31.8h, v3.8h
1007
+.endr
1008
+    add             v0.8h, v30.8h, v31.8h
1009
+    uaddlv          s0, v0.8h
1010
+    mov             w0, v0.s0
1011
+    ret             x10
1012
+endfunc
1013
+
1014
+function PFX(pixel_satd_16x24_neon)
1015
+    mov             x10, x30
1016
+    bl              PFX(satd_16x4_neon)
1017
+    add             v30.8h, v0.8h, v1.8h
1018
+    add             v31.8h, v2.8h, v3.8h
1019
+.rept 5
1020
+    bl              PFX(satd_16x4_neon)
1021
+    add             v30.8h, v30.8h, v0.8h
1022
+    add             v31.8h, v31.8h, v1.8h
1023
+    add             v30.8h, v30.8h, v2.8h
1024
+    add             v31.8h, v31.8h, v3.8h
1025
+.endr
1026
+    add             v0.8h, v30.8h, v31.8h
1027
+    uaddlv          s0, v0.8h
1028
+    mov             w0, v0.s0
1029
+    ret             x10
1030
+endfunc
1031
+
1032
+.macro pixel_satd_16x32_neon
1033
+    bl              PFX(satd_16x4_neon)
1034
+    add             v30.8h, v0.8h, v1.8h
1035
+    add             v31.8h, v2.8h, v3.8h
1036
+.rept 7
1037
+    bl              PFX(satd_16x4_neon)
1038
+    add             v30.8h, v30.8h, v0.8h
1039
+    add             v31.8h, v31.8h, v1.8h
1040
+    add             v30.8h, v30.8h, v2.8h
1041
+    add             v31.8h, v31.8h, v3.8h
1042
+.endr
1043
+.endm
1044
+
1045
+function PFX(pixel_satd_16x32_neon)
1046
+    mov             x10, x30
1047
+    pixel_satd_16x32_neon
1048
+    add             v0.8h, v30.8h, v31.8h
1049
+    uaddlv          s0, v0.8h
1050
+    mov             w0, v0.s0
1051
+    ret             x10
1052
 endfunc
1053
 
1054
-// template<int w, int h>
1055
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1056
-function x265_pixel_satd_4x16_neon
1057
-    eor             w4, w4, w4
1058
-    pixel_satd_4x8_neon
1059
-    mov               w5, v0.s0
1060
-    add             w4, w4, w5
1061
-    pixel_satd_4x8_neon
1062
-    mov               w5, v0.s0
1063
-    add             w0, w5, w4
1064
-    ret
1065
+function PFX(pixel_satd_16x64_neon)
1066
+    mov             x10, x30
1067
+    bl              PFX(satd_16x4_neon)
1068
+    add             v30.8h, v0.8h, v1.8h
1069
+    add             v31.8h, v2.8h, v3.8h
1070
+.rept 15
1071
+    bl              PFX(satd_16x4_neon)
1072
+    add             v30.8h, v30.8h, v0.8h
1073
+    add             v31.8h, v31.8h, v1.8h
1074
+    add             v30.8h, v30.8h, v2.8h
1075
+    add             v31.8h, v31.8h, v3.8h
1076
+.endr
1077
+    add             v0.8h, v30.8h, v31.8h
1078
+    uaddlv          s0, v0.8h
1079
+    mov             w0, v0.s0
1080
+    ret             x10
1081
 endfunc
1082
 
1083
-// template<int w, int h>
1084
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1085
-function x265_pixel_satd_4x32_neon
1086
-    eor             w4, w4, w4
1087
+function PFX(pixel_satd_24x32_neon)
1088
+    mov             x10, x30
1089
+    mov             x7, #0
1090
+    mov             x4, x0
1091
+    mov             x5, x2
1092
+.rept 3
1093
+    movi            v30.8h, #0
1094
+    movi            v31.8h, #0
1095
 .rept 4
1096
-    pixel_satd_4x8_neon
1097
-    mov             w5, v0.s0
1098
-    add             w4, w4, w5
1099
+    bl              PFX(satd_8x8_neon)
1100
+    add             v30.8h, v30.8h, v0.8h
1101
+    add             v31.8h, v31.8h, v1.8h
1102
+    add             v30.8h, v30.8h, v2.8h
1103
+    add             v31.8h, v31.8h, v3.8h
1104
 .endr
1105
-    mov             w0, w4
1106
-    ret
1107
+    add             v0.8h, v30.8h, v31.8h
1108
+    uaddlv          s0, v0.8h
1109
+    mov             w6, v0.s0
1110
+    add             x7, x7, x6
1111
+    add             x4, x4, #8
1112
+    add             x5, x5, #8
1113
+    mov             x0, x4
1114
+    mov             x2, x5
1115
+.endr
1116
+    mov             x0, x7
1117
+    ret             x10
1118
 endfunc
1119
 
1120
-// template<int w, int h>
1121
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1122
-function x265_pixel_satd_12x16_neon
1123
+function PFX(pixel_satd_24x64_neon)
1124
+    mov             x10, x30
1125
+    mov             x7, #0
1126
     mov             x4, x0
1127
     mov             x5, x2
1128
-    eor             w7, w7, w7
1129
-    pixel_satd_4x8_neon
1130
+.rept 3
1131
+    movi            v30.8h, #0
1132
+    movi            v31.8h, #0
1133
+.rept 4
1134
+    bl              PFX(satd_8x8_neon)
1135
+    add             v30.8h, v30.8h, v0.8h
1136
+    add             v31.8h, v31.8h, v1.8h
1137
+    add             v30.8h, v30.8h, v2.8h
1138
+    add             v31.8h, v31.8h, v3.8h
1139
+.endr
1140
+    add             v0.8h, v30.8h, v31.8h
1141
+    uaddlv          s0, v0.8h
1142
     mov             w6, v0.s0
1143
-    add             w7, w7, w6
1144
-    pixel_satd_4x8_neon
1145
+    add             x7, x7, x6
1146
+    add             x4, x4, #8
1147
+    add             x5, x5, #8
1148
+    mov             x0, x4
1149
+    mov             x2, x5
1150
+.endr
1151
+    sub             x4, x4, #24
1152
+    sub             x5, x5, #24
1153
+    add             x0, x4, x1, lsl #5
1154
+    add             x2, x5, x3, lsl #5
1155
+    mov             x4, x0
1156
+    mov             x5, x2
1157
+.rept 3
1158
+    movi            v30.8h, #0
1159
+    movi            v31.8h, #0
1160
+.rept 4
1161
+    bl              PFX(satd_8x8_neon)
1162
+    add             v30.8h, v30.8h, v0.8h
1163
+    add             v31.8h, v31.8h, v1.8h
1164
+    add             v30.8h, v30.8h, v2.8h
1165
+    add             v31.8h, v31.8h, v3.8h
1166
+.endr
1167
+    add             v0.8h, v30.8h, v31.8h
1168
+    uaddlv          s0, v0.8h
1169
     mov             w6, v0.s0
1170
-    add             w7, w7, w6
1171
+    add             x7, x7, x6
1172
+    add             x4, x4, #8
1173
+    add             x5, x5, #8
1174
+    mov             x0, x4
1175
+    mov             x2, x5
1176
+.endr
1177
+    mov             x0, x7
1178
+    ret             x10
1179
+endfunc
1180
 
1181
-    add             x0, x4, #4
1182
-    add             x2, x5, #4
1183
-    pixel_satd_4x8_neon
1184
-    mov             w6, v0.s0
1185
-    add             w7, w7, w6
1186
-    pixel_satd_4x8_neon
1187
-    mov             w6, v0.s0
1188
-    add             w7, w7, w6
1189
+.macro pixel_satd_32x8
1190
+    mov             x4, x0
1191
+    mov             x5, x2
1192
+.rept 2
1193
+    bl              PFX(satd_16x4_neon)
1194
+    add             v30.8h, v30.8h, v0.8h
1195
+    add             v31.8h, v31.8h, v1.8h
1196
+    add             v30.8h, v30.8h, v2.8h
1197
+    add             v31.8h, v31.8h, v3.8h
1198
+.endr
1199
+    add             x0, x4, #16
1200
+    add             x2, x5, #16
1201
+.rept 2
1202
+    bl              PFX(satd_16x4_neon)
1203
+    add             v30.8h, v30.8h, v0.8h
1204
+    add             v31.8h, v31.8h, v1.8h
1205
+    add             v30.8h, v30.8h, v2.8h
1206
+    add             v31.8h, v31.8h, v3.8h
1207
+.endr
1208
+.endm
1209
 
1210
-    add             x0, x4, #8
1211
-    add             x2, x5, #8
1212
-    pixel_satd_4x8_neon
1213
-    mov             w6, v0.s0
1214
-    add             w7, w7, w6
1215
-    pixel_satd_4x8_neon
1216
+.macro satd_32x16_neon
1217
+    movi            v30.8h, #0
1218
+    movi            v31.8h, #0
1219
+    pixel_satd_32x8
1220
+    sub             x0, x0, #16
1221
+    sub             x2, x2, #16
1222
+    pixel_satd_32x8
1223
+    add             v0.8h, v30.8h, v31.8h
1224
+    uaddlv          s0, v0.8h
1225
     mov             w6, v0.s0
1226
-    add             w0, w7, w6
1227
-    ret
1228
-endfunc
1229
+.endm
1230
 
1231
-// template<int w, int h>
1232
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1233
-function x265_pixel_satd_12x32_neon
1234
+.macro satd_64x16_neon
1235
+    mov             x8, x0
1236
+    mov             x9, x2
1237
+    satd_32x16_neon
1238
+    add             x7, x7, x6
1239
+    add             x0, x8, #32
1240
+    add             x2, x9, #32
1241
+    satd_32x16_neon
1242
+    add             x7, x7, x6
1243
+.endm
1244
+
1245
+function PFX(pixel_satd_32x8_neon)
1246
+    mov             x10, x30
1247
+    mov             x7, #0
1248
     mov             x4, x0
1249
     mov             x5, x2
1250
-    eor             w7, w7, w7
1251
-.rept 4
1252
-    pixel_satd_4x8_neon
1253
-    mov             w6, v0.s0
1254
-    add             w7, w7, w6
1255
+    movi            v30.8h, #0
1256
+    movi            v31.8h, #0
1257
+    pixel_satd_32x8
1258
+    add             v0.8h, v30.8h, v31.8h
1259
+    uaddlv          s0, v0.8h
1260
+    mov             w0, v0.s0
1261
+    ret             x10
1262
+endfunc
1263
+
1264
+function PFX(pixel_satd_32x16_neon)
1265
+    mov             x10, x30
1266
+    satd_32x16_neon
1267
+    mov             x0, x6
1268
+    ret             x10
1269
+endfunc
1270
+
1271
+function PFX(pixel_satd_32x24_neon)
1272
+    mov             x10, x30
1273
+    satd_32x16_neon
1274
+    movi            v30.8h, #0
1275
+    movi            v31.8h, #0
1276
+    sub             x0, x0, #16
1277
+    sub             x2, x2, #16
1278
+    pixel_satd_32x8
1279
+    add             v0.8h, v30.8h, v31.8h
1280
+    uaddlv          s0, v0.8h
1281
+    mov             w0, v0.s0
1282
+    add             x0, x0, x6
1283
+    ret             x10
1284
+endfunc
1285
+
1286
+function PFX(pixel_satd_32x32_neon)
1287
+    mov             x10, x30
1288
+    mov             x7, #0
1289
+    satd_32x16_neon
1290
+    sub             x0, x0, #16
1291
+    sub             x2, x2, #16
1292
+    add             x7, x7, x6
1293
+    satd_32x16_neon
1294
+    add             x0, x7, x6
1295
+    ret             x10
1296
+endfunc
1297
+
1298
+function PFX(pixel_satd_32x48_neon)
1299
+    mov             x10, x30
1300
+    mov             x7, #0
1301
+.rept 2
1302
+    satd_32x16_neon
1303
+    sub             x0, x0, #16
1304
+    sub             x2, x2, #16
1305
+    add             x7, x7, x6
1306
 .endr
1307
+    satd_32x16_neon
1308
+    add             x0, x7, x6
1309
+    ret             x10
1310
+endfunc
1311
 
1312
-    add             x0, x4, #4
1313
-    add             x2, x5, #4
1314
-.rept 4
1315
-    pixel_satd_4x8_neon
1316
-    mov             w6, v0.s0
1317
-    add             w7, w7, w6
1318
+function PFX(pixel_satd_32x64_neon)
1319
+    mov             x10, x30
1320
+    mov             x7, #0
1321
+.rept 3
1322
+    satd_32x16_neon
1323
+    sub             x0, x0, #16
1324
+    sub             x2, x2, #16
1325
+    add             x7, x7, x6
1326
 .endr
1327
+    satd_32x16_neon
1328
+    add             x0, x7, x6
1329
+    ret             x10
1330
+endfunc
1331
 
1332
-    add             x0, x4, #8
1333
-    add             x2, x5, #8
1334
-.rept 4
1335
-    pixel_satd_4x8_neon
1336
-    mov             w6, v0.s0
1337
-    add             w7, w7, w6
1338
+function PFX(pixel_satd_64x16_neon)
1339
+    mov             x10, x30
1340
+    mov             x7, #0
1341
+    satd_64x16_neon
1342
+    mov             x0, x7
1343
+    ret             x10
1344
+endfunc
1345
+
1346
+function PFX(pixel_satd_64x32_neon)
1347
+    mov             x10, x30
1348
+    mov             x7, #0
1349
+    satd_64x16_neon
1350
+    sub             x0, x0, #48
1351
+    sub             x2, x2, #48
1352
+    satd_64x16_neon
1353
+    mov             x0, x7
1354
+    ret             x10
1355
+endfunc
1356
+
1357
+function PFX(pixel_satd_64x48_neon)
1358
+    mov             x10, x30
1359
+    mov             x7, #0
1360
+.rept 2
1361
+    satd_64x16_neon
1362
+    sub             x0, x0, #48
1363
+    sub             x2, x2, #48
1364
 .endr
1365
+    satd_64x16_neon
1366
+    mov             x0, x7
1367
+    ret             x10
1368
+endfunc
1369
 
1370
-    mov             w0, w7
1371
+function PFX(pixel_satd_64x64_neon)
1372
+    mov             x10, x30
1373
+    mov             x7, #0
1374
+.rept 3
1375
+    satd_64x16_neon
1376
+    sub             x0, x0, #48
1377
+    sub             x2, x2, #48
1378
+.endr
1379
+    satd_64x16_neon
1380
+    mov             x0, x7
1381
+    ret             x10
1382
+endfunc
1383
+
1384
+function PFX(pixel_satd_48x64_neon)
1385
+    mov             x10, x30
1386
+    mov             x7, #0
1387
+    mov             x8, x0
1388
+    mov             x9, x2
1389
+.rept 3
1390
+    satd_32x16_neon
1391
+    sub             x0, x0, #16
1392
+    sub             x2, x2, #16
1393
+    add             x7, x7, x6
1394
+.endr
1395
+    satd_32x16_neon
1396
+    add             x7, x7, x6
1397
+
1398
+    add             x0, x8, #32
1399
+    add             x2, x9, #32
1400
+    pixel_satd_16x32_neon
1401
+    add             v0.8h, v30.8h, v31.8h
1402
+    uaddlv          s0, v0.8h
1403
+    mov             w6, v0.s0
1404
+    add             x7, x7, x6
1405
+
1406
+    movi            v30.8h, #0
1407
+    movi            v31.8h, #0
1408
+    pixel_satd_16x32_neon
1409
+    add             v0.8h, v30.8h, v31.8h
1410
+    uaddlv          s0, v0.8h
1411
+    mov             w6, v0.s0
1412
+    add             x0, x7, x6
1413
+    ret             x10
1414
+endfunc
1415
+
1416
+function PFX(sa8d_8x8_neon), export=0
1417
+    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
1418
+    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
1419
+    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
1420
+    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
1421
+    SUMSUB_ABCD     v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
1422
+    SUMSUB_ABCD     v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
1423
+    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
1424
+    trn4            v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
1425
+    SUMSUB_ABCD     v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
1426
+    SUMSUB_ABCD     v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
1427
+    trn4            v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
1428
+    trn4            v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
1429
+    SUMSUB_ABCD     v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
1430
+    SUMSUB_ABCD     v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
1431
+    trn4            v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
1432
+    trn4            v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
1433
+    ABS8            v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
1434
+    smax            v16.8h, v16.8h, v20.8h
1435
+    smax            v17.8h, v17.8h, v21.8h
1436
+    smax            v18.8h, v18.8h, v22.8h
1437
+    smax            v19.8h, v19.8h, v23.8h
1438
+    add             v0.8h, v16.8h, v17.8h
1439
+    add             v1.8h, v18.8h, v19.8h
1440
     ret
1441
 endfunc
1442
 
1443
-// template<int w, int h>
1444
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1445
-function x265_pixel_satd_8x8_neon
1446
-    eor             w4, w4, w4
1447
-    mov             x6, x0
1448
-    mov             x7, x2
1449
-    pixel_satd_4x8_neon
1450
-    mov             w5, v0.s0
1451
-    add             w4, w4, w5
1452
-    add             x0, x6, #4
1453
-    add             x2, x7, #4
1454
-    pixel_satd_4x8_neon
1455
+function PFX(pixel_sa8d_8x8_neon)
1456
+    mov             x10, x30
1457
+    bl              PFX(sa8d_8x8_neon)
1458
+    add             v0.8h, v0.8h, v1.8h
1459
+    uaddlv          s0, v0.8h
1460
+    mov             w0, v0.s0
1461
+    add             w0, w0, #1
1462
+    lsr             w0, w0, #1
1463
+    ret             x10
1464
+endfunc
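+
+// Rough scalar view (assuming the usual sa8d definition; the C reference is
+// not part of this file): sa8d_8x8_neon returns the un-normalised sum of
+// absolute 8x8 Hadamard coefficients of the difference block in v0/v1, and
+// each caller applies the final rounding seen above as "add #1; lsr #1":
+//
+//   sa8d_8x8 = (sum_abs_hadamard_8x8(pix1 - pix2) + 1) >> 1;
+//
+// The larger block sizes below simply add up these rounded partial results.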
1465
+
1466
+function PFX(pixel_sa8d_8x16_neon)
1467
+    mov             x10, x30
1468
+    bl              PFX(sa8d_8x8_neon)
1469
+    add             v0.8h, v0.8h, v1.8h
1470
+    uaddlv          s0, v0.8h
1471
     mov             w5, v0.s0
1472
+    add             w5, w5, #1
1473
+    lsr             w5, w5, #1
1474
+    bl              PFX(sa8d_8x8_neon)
1475
+    add             v0.8h, v0.8h, v1.8h
1476
+    uaddlv          s0, v0.8h
1477
+    mov             w4, v0.s0
1478
+    add             w4, w4, #1
1479
+    lsr             w4, w4, #1
1480
+    add             w0, w4, w5
1481
+    ret             x10
1482
+endfunc
1483
+
1484
+.macro sa8d_16x16 reg
1485
+    bl              PFX(sa8d_8x8_neon)
1486
+    uaddlp          v30.4s, v0.8h
1487
+    uaddlp          v31.4s, v1.8h
1488
+    bl              PFX(sa8d_8x8_neon)
1489
+    uadalp          v30.4s, v0.8h
1490
+    uadalp          v31.4s, v1.8h
1491
+    sub             x0, x0, x1, lsl #4
1492
+    sub             x2, x2, x3, lsl #4
1493
+    add             x0, x0, #8
1494
+    add             x2, x2, #8
1495
+    bl              PFX(sa8d_8x8_neon)
1496
+    uadalp          v30.4s, v0.8h
1497
+    uadalp          v31.4s, v1.8h
1498
+    bl              PFX(sa8d_8x8_neon)
1499
+    uadalp          v30.4s, v0.8h
1500
+    uadalp          v31.4s, v1.8h
1501
+    add             v0.4s, v30.4s, v31.4s
1502
+    addv            s0, v0.4s
1503
+    mov             \reg, v0.s0
1504
+    add             \reg, \reg, #1
1505
+    lsr             \reg, \reg, #1
1506
+.endm
1507
+
1508
+function PFX(pixel_sa8d_16x16_neon)
1509
+    mov             x10, x30
1510
+    sa8d_16x16      w0
1511
+    ret             x10
1512
+endfunc
1513
+
1514
+function PFX(pixel_sa8d_16x32_neon)
1515
+    mov             x10, x30
1516
+    sa8d_16x16      w4
1517
+    sub             x0, x0, #8
1518
+    sub             x2, x2, #8
1519
+    sa8d_16x16      w5
1520
     add             w0, w4, w5
1521
+    ret             x10
1522
+endfunc
1523
+
1524
+function PFX(pixel_sa8d_32x32_neon)
1525
+    mov             x10, x30
1526
+    sa8d_16x16      w4
1527
+    sub             x0, x0, x1, lsl #4
1528
+    sub             x2, x2, x3, lsl #4
1529
+    add             x0, x0, #8
1530
+    add             x2, x2, #8
1531
+    sa8d_16x16      w5
1532
+    sub             x0, x0, #24
1533
+    sub             x2, x2, #24
1534
+    sa8d_16x16      w6
1535
+    sub             x0, x0, x1, lsl #4
1536
+    sub             x2, x2, x3, lsl #4
1537
+    add             x0, x0, #8
1538
+    add             x2, x2, #8
1539
+    sa8d_16x16      w7
1540
+    add             w4, w4, w5
1541
+    add             w6, w6, w7
1542
+    add             w0, w4, w6
1543
+    ret             x10
1544
+endfunc
1545
+
1546
+function PFX(pixel_sa8d_32x64_neon)
1547
+    mov             x10, x30
1548
+    mov             w11, #4
1549
+    mov             w9, #0
1550
+.loop_sa8d_32:
1551
+    sub             w11, w11, #1
1552
+    sa8d_16x16      w4
1553
+    sub             x0, x0, x1, lsl #4
1554
+    sub             x2, x2, x3, lsl #4
1555
+    add             x0, x0, #8
1556
+    add             x2, x2, #8
1557
+    sa8d_16x16      w5
1558
+    add             w4, w4, w5
1559
+    add             w9, w9, w4
1560
+    sub             x0, x0, #24
1561
+    sub             x2, x2, #24
1562
+    cbnz            w11, .loop_sa8d_32
1563
+    mov             w0, w9
1564
+    ret             x10
1565
+endfunc
1566
+
1567
+function PFX(pixel_sa8d_64x64_neon)
1568
+    mov             x10, x30
1569
+    mov             w11, #4
1570
+    mov             w9, #0
1571
+.loop_sa8d_64:
1572
+    sub             w11, w11, #1
1573
+    sa8d_16x16      w4
1574
+    sub             x0, x0, x1, lsl #4
1575
+    sub             x2, x2, x3, lsl #4
1576
+    add             x0, x0, #8
1577
+    add             x2, x2, #8
1578
+    sa8d_16x16      w5
1579
+    sub             x0, x0, x1, lsl #4
1580
+    sub             x2, x2, x3, lsl #4
1581
+    add             x0, x0, #8
1582
+    add             x2, x2, #8
1583
+    sa8d_16x16      w6
1584
+    sub             x0, x0, x1, lsl #4
1585
+    sub             x2, x2, x3, lsl #4
1586
+    add             x0, x0, #8
1587
+    add             x2, x2, #8
1588
+    sa8d_16x16      w7
1589
+    add             w4, w4, w5
1590
+    add             w6, w6, w7
1591
+    add             w8, w4, w6
1592
+    add             w9, w9, w8
1593
+
1594
+    sub             x0, x0, #56
1595
+    sub             x2, x2, #56
1596
+    cbnz            w11, .loop_sa8d_64
1597
+    mov             w0, w9
1598
+    ret             x10
1599
+endfunc
1600
+
1601
+/***** dequant_scaling*****/
1602
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
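+//
+// Rough scalar equivalent of the two paths below (x265_clip3 is assumed to be
+// the usual clamp helper; it is not defined in this file):
+//
+//   shift += 4;
+//   if (shift >= per) {
+//       int add = 1 << (shift - per - 1);
+//       for (int n = 0; n < num; n++)
+//           coef[n] = (int16_t)x265_clip3(-32768, 32767,
+//                         (quantCoef[n] * deQuantCoef[n] + add) >> (shift - per));
+//   } else {
+//       for (int n = 0; n < num; n++)
+//           coef[n] = (int16_t)x265_clip3(-32768, 32767,
+//                         x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]) << (per - shift));
+//   }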
1603
+function PFX(dequant_scaling_neon)
1604
+    add             x5, x5, #4              // shift + 4
1605
+    lsr             x3, x3, #3              // num / 8
1606
+    cmp             x5, x4
1607
+    blt             .dequant_skip
1608
+
1609
+    mov             x12, #1
1610
+    sub             x6, x5, x4          // shift - per
1611
+    sub             x6, x6, #1          // shift - per - 1
1612
+    lsl             x6, x12, x6         // 1 << shift - per - 1 (add)
1613
+    dup             v0.4s, w6
1614
+    sub             x7, x4, x5          // per - shift
1615
+    dup             v3.4s, w7
1616
+
1617
+.dequant_loop1:
1618
+    ld1             {v19.8h}, x0, #16 // quantCoef
1619
+    ld1             {v2.4s}, x1, #16  // deQuantCoef
1620
+    ld1             {v20.4s}, x1, #16
1621
+    sub             x3, x3, #1
1622
+    sxtl            v1.4s, v19.4h
1623
+    sxtl2           v19.4s, v19.8h
1624
+
1625
+    mul             v1.4s, v1.4s, v2.4s // quantCoef * deQuantCoef
1626
+    mul             v19.4s, v19.4s, v20.4s
1627
+    add             v1.4s, v1.4s, v0.4s // quantCoef * deQuantCoef + add
1628
+    add             v19.4s, v19.4s, v0.4s
1629
+
1630
+    sshl            v1.4s, v1.4s, v3.4s
1631
+    sshl            v19.4s, v19.4s, v3.4s
1632
+    sqxtn           v16.4h, v1.4s       // x265_clip3
1633
+    sqxtn2          v16.8h, v19.4s
1634
+    st1             {v16.8h}, x2, #16
1635
+    cbnz            x3, .dequant_loop1
1636
+    ret
1637
+
1638
+.dequant_skip:
1639
+    sub             x6, x4, x5          // per - shift
1640
+    dup             v0.8h, w6
1641
+
1642
+.dequant_loop2:
1643
+    ld1             {v19.8h}, x0, #16 // quantCoef
1644
+    ld1             {v2.4s}, x1, #16  // deQuantCoef
1645
+    ld1             {v20.4s}, x1, #16
1646
+    sub             x3, x3, #1
1647
+    sxtl            v1.4s, v19.4h
1648
+    sxtl2           v19.4s, v19.8h
1649
+
1650
+    mul             v1.4s, v1.4s, v2.4s // quantCoef * deQuantCoef
1651
+    mul             v19.4s, v19.4s, v20.4s
1652
+    sqxtn           v16.4h, v1.4s       // x265_clip3
1653
+    sqxtn2          v16.8h, v19.4s
1654
+
1655
+    sqshl           v16.8h, v16.8h, v0.8h // coefQ << per - shift
1656
+    st1             {v16.8h}, x2, #16
1657
+    cbnz            x3, .dequant_loop2
1658
+    ret
1659
+endfunc
1660
+
1661
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
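+//
+// Roughly, per coefficient (the rounding right shift is done below with srshl
+// by -shift; x265_clip3 is the assumed clamp helper):
+//
+//   coef[n] = (int16_t)x265_clip3(-32768, 32767,
+//                 (quantCoef[n] * scale + (1 << (shift - 1))) >> shift);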
1662
+function PFX(dequant_normal_neon)
1663
+    lsr             w2, w2, #4              // num / 16
1664
+    neg             w4, w4
1665
+    dup             v0.8h, w3
1666
+    dup             v1.4s, w4
1667
+
1668
+.dqn_loop1:
1669
+    ld1             {v2.8h, v3.8h}, x0, #32
1670
+    smull           v16.4s, v2.4h, v0.4h
1671
+    smull2          v17.4s, v2.8h, v0.8h
1672
+    smull           v18.4s, v3.4h, v0.4h
1673
+    smull2          v19.4s, v3.8h, v0.8h
1674
+
1675
+    srshl           v16.4s, v16.4s, v1.4s
1676
+    srshl           v17.4s, v17.4s, v1.4s
1677
+    srshl           v18.4s, v18.4s, v1.4s
1678
+    srshl           v19.4s, v19.4s, v1.4s
1679
+
1680
+    sqxtn           v2.4h, v16.4s
1681
+    sqxtn2          v2.8h, v17.4s
1682
+    sqxtn           v3.4h, v18.4s
1683
+    sqxtn2          v3.8h, v19.4s
1684
+
1685
+    sub             w2, w2, #1
1686
+    st1             {v2.8h, v3.8h}, x1, #32
1687
+    cbnz            w2, .dqn_loop1
1688
+    ret
1689
+endfunc
1690
+
1691
+/********* ssim ***********/
1692
+// void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
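+//
+// The routine produces, for two horizontally adjacent 4x4 blocks (z = 0, 1),
+// the four sums consumed by the SSIM formula - roughly:
+//
+//   s1  = sum(pix1);   s2  = sum(pix2);
+//   ss  = sum(pix1 * pix1) + sum(pix2 * pix2);
+//   s12 = sum(pix1 * pix2);
+//   sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
+//
+// Below, v28/v29/v30/v31 hold s1/s2/ss/s12 and the final st4 interleaves them
+// into the sums array.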
1693
+function PFX(ssim_4x4x2_core_neon)
1694
+    ld1             {v0.8b}, x0, x1
1695
+    ld1             {v1.8b}, x0, x1
1696
+    ld1             {v2.8b}, x0, x1
1697
+    ld1             {v3.8b}, x0, x1
1698
+
1699
+    ld1             {v4.8b}, x2, x3
1700
+    ld1             {v5.8b}, x2, x3
1701
+    ld1             {v6.8b}, x2, x3
1702
+    ld1             {v7.8b}, x2, x3
1703
+
1704
+    umull           v16.8h, v0.8b, v0.8b
1705
+    umull           v17.8h, v1.8b, v1.8b
1706
+    umull           v18.8h, v2.8b, v2.8b
1707
+    uaddlp          v30.4s, v16.8h
1708
+    umull           v19.8h, v3.8b, v3.8b
1709
+    umull           v20.8h, v4.8b, v4.8b
1710
+    umull           v21.8h, v5.8b, v5.8b
1711
+    uadalp          v30.4s, v17.8h
1712
+    umull           v22.8h, v6.8b, v6.8b
1713
+    umull           v23.8h, v7.8b, v7.8b
1714
+
1715
+    umull           v24.8h, v0.8b, v4.8b
1716
+    uadalp          v30.4s, v18.8h
1717
+    umull           v25.8h, v1.8b, v5.8b
1718
+    umull           v26.8h, v2.8b, v6.8b
1719
+    umull           v27.8h, v3.8b, v7.8b
1720
+    uadalp          v30.4s, v19.8h
1721
+
1722
+    uaddl           v28.8h, v0.8b, v1.8b
1723
+    uaddl           v29.8h, v4.8b, v5.8b
1724
+    uadalp          v30.4s, v20.8h
1725
+    uaddlp          v31.4s, v24.8h
1726
+
1727
+    uaddw           v28.8h, v28.8h, v2.8b
1728
+    uaddw           v29.8h, v29.8h, v6.8b
1729
+    uadalp          v30.4s, v21.8h
1730
+    uadalp          v31.4s, v25.8h
1731
+
1732
+    uaddw           v28.8h, v28.8h, v3.8b
1733
+    uaddw           v29.8h, v29.8h, v7.8b
1734
+    uadalp          v30.4s, v22.8h
1735
+    uadalp          v31.4s, v26.8h
1736
+
1737
+    uaddlp          v28.4s, v28.8h
1738
+    uaddlp          v29.4s, v29.8h
1739
+    uadalp          v30.4s, v23.8h
1740
+    uadalp          v31.4s, v27.8h
1741
+
1742
+    addp            v28.4s, v28.4s, v28.4s
1743
+    addp            v29.4s, v29.4s, v29.4s
1744
+    addp            v30.4s, v30.4s, v30.4s
1745
+    addp            v31.4s, v31.4s, v31.4s
1746
+
1747
+    st4             {v28.2s, v29.2s, v30.2s, v31.2s}, x4
1748
     ret
1749
 endfunc
1750
 
1751
 // int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
1752
-function x265_psyCost_4x4_neon
1753
+function PFX(psyCost_4x4_neon)
1754
     ld1r            {v4.2s}, x0, x1
1755
     ld1r            {v5.2s}, x0, x1
1756
     ld1             {v4.s}1, x0, x1
1757
@@ -286,7 +1792,7 @@
1758
 endfunc
1759
 
1760
 // uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
1761
-function x265_quant_neon
1762
+function PFX(quant_neon)
1763
     mov             w9, #1
1764
     lsl             w9, w9, w4
1765
     dup             v0.2s, w9
1766
@@ -341,79 +1847,597 @@
1767
     ret
1768
 endfunc
1769
 
1770
-.macro satd_4x4_neon
1771
-    ld1             {v1.s}0, x2, x3
1772
-    ld1             {v0.s}0, x0, x1
1773
-    ld1             {v3.s}0, x2, x3
1774
-    ld1             {v2.s}0, x0, x1
1775
+// uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
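+//
+// Rough scalar equivalent of the loop below (the abs-of-clipped value mirrors
+// the sqxtn + abs sequence; x265_clip3 is the assumed clamp helper):
+//
+//   uint32_t numSig = 0;
+//   for (int n = 0; n < numCoeff; n++) {
+//       int sign     = (coef[n] < 0) ? -1 : 1;
+//       int tmplevel = abs(coef[n]) * quantCoeff[n];
+//       int level    = (tmplevel + add) >> qBits;
+//       numSig      += (level != 0);
+//       qCoef[n]     = (int16_t)abs(x265_clip3(-32768, 32767, level * sign));
+//   }
+//   return numSig;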
1776
+function PFX(nquant_neon)
1777
+    neg             x12, x3
1778
+    dup             v0.4s, w12             // q0= -qbits
1779
+    dup             v1.4s, w4              // add
1780
 
1781
-    ld1             {v1.s}1, x2, x3
1782
-    ld1             {v0.s}1, x0, x1
1783
-    ld1             {v3.s}1, x2, x3
1784
-    ld1             {v2.s}1, x0, x1
1785
+    lsr             w5, w5, #2
1786
+    movi            v4.4s, #0              // v4= accumulate numsig
1787
+    mov             x4, #0
1788
+    movi            v22.4s, #0
1789
 
1790
-    usubl           v4.8h, v0.8b, v1.8b
1791
-    usubl           v5.8h, v2.8b, v3.8b
1792
+.loop_nquant:
1793
+    ld1             {v16.4h}, x0, #8
1794
+    sub             w5, w5, #1
1795
+    sxtl            v19.4s, v16.4h         // v19 = coef[blockpos]
1796
 
1797
-    add             v6.8h, v4.8h, v5.8h
1798
-    sub             v7.8h, v4.8h, v5.8h
1799
+    cmlt            v18.4s, v19.4s, #0     // v18 = sign
1800
 
1801
-    mov             v4.d0, v6.d1
1802
-    add             v0.8h, v6.8h, v4.8h
1803
-    sub             v2.8h, v6.8h, v4.8h
1804
+    abs             v19.4s, v19.4s         // v19 = level=abs(coefblockpos)
1805
+    ld1             {v20.4s}, x1, #16    // v20 = quantCoeffblockpos
1806
+    mul             v19.4s, v19.4s, v20.4s // v19 = tmplevel = abs(level) * quantCoeffblockpos;
1807
 
1808
-    mov             v5.d0, v7.d1
1809
-    add             v1.8h, v7.8h, v5.8h
1810
-    sub             v3.8h, v7.8h, v5.8h
1811
+    add             v20.4s, v19.4s, v1.4s  // v20 = tmplevel+add
1812
+    sshl            v20.4s, v20.4s, v0.4s  // v20 = level =(tmplevel+add) >> qbits
1813
 
1814
-    trn1            v4.4h, v0.4h, v1.4h
1815
-    trn2            v5.4h, v0.4h, v1.4h
1816
+    // numsig
1817
+    cmeq            v21.4s, v20.4s, v22.4s
1818
+    add             v4.4s, v4.4s, v21.4s
1819
+    add             x4, x4, #4
1820
 
1821
-    trn1            v6.4h, v2.4h, v3.4h
1822
-    trn2            v7.4h, v2.4h, v3.4h
1823
+    eor             v21.16b, v20.16b, v18.16b
1824
+    sub             v21.4s, v21.4s, v18.4s
1825
+    sqxtn           v16.4h, v21.4s
1826
+    abs             v17.4h, v16.4h
1827
+    st1             {v17.4h}, x2, #8
1828
 
1829
-    add             v0.4h, v4.4h, v5.4h
1830
-    sub             v1.4h, v4.4h, v5.4h
1831
+    cbnz            w5, .loop_nquant
1832
 
1833
-    add             v2.4h, v6.4h, v7.4h
1834
-    sub             v3.4h, v6.4h, v7.4h
1835
+    uaddlv          d4, v4.4s
1836
+    fmov            x12, d4
1837
+    add             x0, x4, x12
1838
+    ret
1839
+endfunc
1840
 
1841
-    trn1            v4.2s, v0.2s, v1.2s
1842
-    trn2            v5.2s, v0.2s, v1.2s
1843
+// void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
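+//
+// Per the ssimDist_1 macro below, each kernel accumulates (ignoring the
+// bit-depth shift handled by ssimDist_start/ssimDist_end, which are defined
+// elsewhere) the source energy and the distortion of the block:
+//
+//   ac_k    += fenc[x] * fenc[x];
+//   ssBlock += (fenc[x] - recon[x]) * (fenc[x] - recon[x]);
+//
+// summed over the whole NxN block given by the function name (4 ... 64).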
1844
+.macro ssimDist_1  v4 v5
1845
+    sub             v20.8h, \v4\().8h, \v5\().8h
1846
+    smull           v16.4s, \v4\().4h, \v4\().4h
1847
+    smull2          v17.4s, \v4\().8h, \v4\().8h
1848
+    smull           v18.4s, v20.4h, v20.4h
1849
+    smull2          v19.4s, v20.8h, v20.8h
1850
+    add             v0.4s, v0.4s, v16.4s
1851
+    add             v0.4s, v0.4s, v17.4s
1852
+    add             v1.4s, v1.4s, v18.4s
1853
+    add             v1.4s, v1.4s, v19.4s
1854
+.endm
1855
 
1856
-    trn1            v6.2s, v2.2s, v3.2s
1857
-    trn2            v7.2s, v2.2s, v3.2s
1858
+function PFX(ssimDist4_neon)
1859
+    ssimDist_start
1860
+.rept 4
1861
+    ld1             {v4.s}0, x0, x1
1862
+    ld1             {v5.s}0, x2, x3
1863
+    uxtl            v4.8h, v4.8b
1864
+    uxtl            v5.8h, v5.8b
1865
+    sub             v2.4h, v4.4h, v5.4h
1866
+    smull           v3.4s, v4.4h, v4.4h
1867
+    smull           v2.4s, v2.4h, v2.4h
1868
+    add             v0.4s, v0.4s, v3.4s
1869
+    add             v1.4s, v1.4s, v2.4s
1870
+.endr
1871
+    ssimDist_end
1872
+    ret
1873
+endfunc
1874
 
1875
-    abs             v4.4h, v4.4h
1876
-    abs             v5.4h, v5.4h
1877
-    abs             v6.4h, v6.4h
1878
-    abs             v7.4h, v7.4h
1879
+function PFX(ssimDist8_neon)
1880
+    ssimDist_start
1881
+.rept 8
1882
+    ld1             {v4.8b}, x0, x1
1883
+    ld1             {v5.8b}, x2, x3
1884
+    uxtl            v4.8h, v4.8b
1885
+    uxtl            v5.8h, v5.8b
1886
+    ssimDist_1      v4, v5
1887
+.endr
1888
+    ssimDist_end
1889
+    ret
1890
+endfunc
1891
 
1892
-    smax            v1.4h, v4.4h, v5.4h
1893
-    smax            v2.4h, v6.4h, v7.4h
1894
+function PFX(ssimDist16_neon)
1895
+    mov w12, #16
1896
+    ssimDist_start
1897
+.loop_ssimDist16:
1898
+    sub             w12, w12, #1
1899
+    ld1             {v4.16b}, x0, x1
1900
+    ld1             {v5.16b}, x2, x3
1901
+    uxtl            v6.8h, v4.8b
1902
+    uxtl            v7.8h, v5.8b
1903
+    uxtl2           v4.8h, v4.16b
1904
+    uxtl2           v5.8h, v5.16b
1905
+    ssimDist_1      v6, v7
1906
+    ssimDist_1      v4, v5
1907
+    cbnz            w12, .loop_ssimDist16
1908
+    ssimDist_end
1909
+    ret
1910
+endfunc
1911
 
1912
-    add             v0.4h, v1.4h, v2.4h
1913
-    uaddlp          v0.2s, v0.4h
1914
-    uaddlp          v0.1d, v0.2s
1915
+function PFX(ssimDist32_neon)
1916
+    mov w12, #32
1917
+    ssimDist_start
1918
+.loop_ssimDist32:
1919
+    sub             w12, w12, #1
1920
+    ld1             {v4.16b-v5.16b}, x0, x1
1921
+    ld1             {v6.16b-v7.16b}, x2, x3
1922
+    uxtl            v21.8h, v4.8b
1923
+    uxtl            v22.8h, v6.8b
1924
+    uxtl            v23.8h, v5.8b
1925
+    uxtl            v24.8h, v7.8b
1926
+    uxtl2           v25.8h, v4.16b
1927
+    uxtl2           v26.8h, v6.16b
1928
+    uxtl2           v27.8h, v5.16b
1929
+    uxtl2           v28.8h, v7.16b
1930
+    ssimDist_1      v21, v22
1931
+    ssimDist_1      v23, v24
1932
+    ssimDist_1      v25, v26
1933
+    ssimDist_1      v27, v28
1934
+    cbnz            w12, .loop_ssimDist32
1935
+    ssimDist_end
1936
+    ret
1937
+endfunc
1938
+
1939
+function PFX(ssimDist64_neon)
1940
+    mov w12, #64
1941
+    ssimDist_start
1942
+.loop_ssimDist64:
1943
+    sub             w12, w12, #1
1944
+    ld1             {v4.16b-v7.16b}, x0, x1
1945
+    ld1             {v16.16b-v19.16b}, x2, x3
1946
+    uxtl            v21.8h, v4.8b
1947
+    uxtl            v22.8h, v16.8b
1948
+    uxtl            v23.8h, v5.8b
1949
+    uxtl            v24.8h, v17.8b
1950
+    uxtl2           v25.8h, v4.16b
1951
+    uxtl2           v26.8h, v16.16b
1952
+    uxtl2           v27.8h, v5.16b
1953
+    uxtl2           v28.8h, v17.16b
1954
+    ssimDist_1      v21, v22
1955
+    ssimDist_1      v23, v24
1956
+    ssimDist_1      v25, v26
1957
+    ssimDist_1      v27, v28
1958
+    uxtl            v21.8h, v6.8b
1959
+    uxtl            v22.8h, v18.8b
1960
+    uxtl            v23.8h, v7.8b
1961
+    uxtl            v24.8h, v19.8b
1962
+    uxtl2           v25.8h, v6.16b
1963
+    uxtl2           v26.8h, v18.16b
1964
+    uxtl2           v27.8h, v7.16b
1965
+    uxtl2           v28.8h, v19.16b
1966
+    ssimDist_1      v21, v22
1967
+    ssimDist_1      v23, v24
1968
+    ssimDist_1      v25, v26
1969
+    ssimDist_1      v27, v28
1970
+    cbnz            w12, .loop_ssimDist64
1971
+    ssimDist_end
1972
+    ret
1973
+endfunc
1974
+
1975
+// void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
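+// Per the normFact_1 macro below, each kernel accumulates the source energy
+// (again ignoring the shift handled by normFact_start/normFact_end, defined
+// elsewhere):
+//
+//   z_k += src[x] * src[x];   // summed over the blockSize x blockSize block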
1976
+
1977
+.macro normFact_1  v4
1978
+    smull           v16.4s, \v4\().4h, \v4\().4h
1979
+    smull2          v17.4s, \v4\().8h, \v4\().8h
1980
+    add             v0.4s, v0.4s, v16.4s
1981
+    add             v0.4s, v0.4s, v17.4s
1982
 .endm
1983
 
1984
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1985
-function x265_pixel_satd_4x4_neon
1986
-    satd_4x4_neon
1987
-    umov            x0, v0.d0
1988
+function PFX(normFact8_neon)
1989
+    normFact_start
1990
+.rept 8
1991
+    ld1             {v4.8b}, x0, x1
1992
+    uxtl            v4.8h, v4.8b
1993
+    normFact_1      v4
1994
+.endr
1995
+    normFact_end
1996
     ret
1997
 endfunc
1998
 
1999
-// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2000
-function x265_pixel_satd_8x4_neon
2001
-    mov             x4, x0
2002
-    mov             x5, x2
2003
-    satd_4x4_neon
2004
-    add             x0, x4, #4
2005
-    add             x2, x5, #4
2006
-    umov            x6, v0.d0
2007
-    satd_4x4_neon
2008
-    umov            x0, v0.d0
2009
-    add             x0, x0, x6
2010
+function PFX(normFact16_neon)
2011
+    mov w12, #16
2012
+    normFact_start
2013
+.loop_normFact16:
2014
+    sub             w12, w12, #1
2015
+    ld1             {v4.16b}, x0, x1
2016
+    uxtl            v5.8h, v4.8b
2017
+    uxtl2           v4.8h, v4.16b
2018
+    normFact_1      v5
2019
+    normFact_1      v4
2020
+    cbnz            w12, .loop_normFact16
2021
+    normFact_end
2022
+    ret
2023
+endfunc
2024
+
2025
+function PFX(normFact32_neon)
2026
+    mov w12, #32
2027
+    normFact_start
2028
+.loop_normFact32:
2029
+    sub             w12, w12, #1
2030
+    ld1             {v4.16b-v5.16b}, x0, x1
2031
+    uxtl            v6.8h, v4.8b
2032
+    uxtl2           v4.8h, v4.16b
2033
+    uxtl            v7.8h, v5.8b
2034
+    uxtl2           v5.8h, v5.16b
2035
+    normFact_1      v4
2036
+    normFact_1      v5
2037
+    normFact_1      v6
2038
+    normFact_1      v7
2039
+    cbnz            w12, .loop_normFact32
2040
+    normFact_end
2041
+    ret
2042
+endfunc
2043
+
2044
+function PFX(normFact64_neon)
2045
+    mov w12, #64
2046
+    normFact_start
2047
+.loop_normFact64:
2048
+    sub             w12, w12, #1
2049
+    ld1             {v4.16b-v7.16b}, x0, x1
2050
+    uxtl            v26.8h, v4.8b
2051
+    uxtl2           v24.8h, v4.16b
2052
+    uxtl            v27.8h, v5.8b
2053
+    uxtl2           v25.8h, v5.16b
2054
+    normFact_1      v24
2055
+    normFact_1      v25
2056
+    normFact_1      v26
2057
+    normFact_1      v27
2058
+    uxtl            v26.8h, v6.8b
2059
+    uxtl2           v24.8h, v6.16b
2060
+    uxtl            v27.8h, v7.8b
2061
+    uxtl2           v25.8h, v7.16b
2062
+    normFact_1      v24
2063
+    normFact_1      v25
2064
+    normFact_1      v26
2065
+    normFact_1      v27
2066
+    cbnz            w12, .loop_normFact64
2067
+    normFact_end
2068
+    ret
2069
+endfunc
2070
+
2071
+// void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
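+//
+// Rough scalar equivalent of all three paths below ("correction" is the
+// internal bit-depth correction, loaded as a constant 6 for this build):
+//
+//   for (int y = 0; y < height; y++, src += stride, dst += stride)
+//       for (int x = 0; x < width; x++)
+//           dst[x] = x265_clip(((src[x] * (w0 << correction) + round) >> shift) + offset);
+//
+// The fast paths fold "<< correction >> shift" into the multiplier when shift
+// only removes trailing zero bits of w0, and stay in 16-bit arithmetic when
+// the intermediate result cannot overflow 16 bits.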
2072
+function PFX(weight_pp_neon)
2073
+    sub             x2, x2, x3
2074
+    ldr             w9, sp              // offset
2075
+    lsl             w5, w5, #6            // w0 << correction
2076
+
2077
+    // count trailing zeros in w5 and compare against shift right amount.
2078
+    rbit            w10, w5
2079
+    clz             w10, w10
2080
+    cmp             w10, w7
2081
+    b.lt            .unfoldedShift
2082
+
2083
+    // shift right only removes trailing zeros: hoist LSR out of the loop.
2084
+    lsr             w10, w5, w7           // w0 << correction >> shift
2085
+    dup             v25.16b, w10
2086
+    lsr             w6, w6, w7            // round >> shift
2087
+    add             w6, w6, w9            // round >> shift + offset
2088
+    dup             v26.8h, w6
2089
+
2090
+    // Check arithmetic range.
2091
+    mov             w11, #255
2092
+    madd            w11, w11, w10, w6
2093
+    add             w11, w11, w9
2094
+    lsr             w11, w11, #16
2095
+    cbnz            w11, .widenTo32Bit
2096
+
2097
+    // 16-bit arithmetic is enough.
2098
+.loopHpp:
2099
+    mov             x12, x3
2100
+.loopWpp:
2101
+    ldr             q0, x0, #16
2102
+    sub             x12, x12, #16
2103
+    umull           v1.8h, v0.8b, v25.8b  // val *= w0 << correction >> shift
2104
+    umull2          v2.8h, v0.16b, v25.16b
2105
+    add             v1.8h, v1.8h, v26.8h  // val += round >> shift + offset
2106
+    add             v2.8h, v2.8h, v26.8h
2107
+    sqxtun          v0.8b, v1.8h          // val = x265_clip(val)
2108
+    sqxtun2         v0.16b, v2.8h
2109
+    str             q0, x1, #16
2110
+    cbnz            x12, .loopWpp
2111
+    add             x1, x1, x2
2112
+    add             x0, x0, x2
2113
+    sub             x4, x4, #1
2114
+    cbnz            x4, .loopHpp
2115
+    ret
2116
+
2117
+    // 32-bit arithmetic is needed.
2118
+.widenTo32Bit:
2119
+.loopHpp32:
2120
+    mov             x12, x3
2121
+.loopWpp32:
2122
+    ldr             d0, x0, #8
2123
+    sub             x12, x12, #8
2124
+    uxtl            v0.8h, v0.8b
2125
+    umull           v1.4s, v0.4h, v25.4h  // val *= w0 << correction >> shift
2126
+    umull2          v2.4s, v0.8h, v25.8h
2127
+    add             v1.4s, v1.4s, v26.4s  // val += round >> shift + offset
2128
+    add             v2.4s, v2.4s, v26.4s
2129
+    sqxtn           v0.4h, v1.4s          // val = x265_clip(val)
2130
+    sqxtn2          v0.8h, v2.4s
2131
+    sqxtun          v0.8b, v0.8h
2132
+    str             d0, x1, #8
2133
+    cbnz            x12, .loopWpp32
2134
+    add             x1, x1, x2
2135
+    add             x0, x0, x2
2136
+    sub             x4, x4, #1
2137
+    cbnz            x4, .loopHpp32
2138
+    ret
2139
+
2140
+    // The shift right cannot be moved out of the loop.
2141
+.unfoldedShift:
2142
+    dup             v25.8h, w5            // w0 << correction
2143
+    dup             v26.4s, w6            // round
2144
+    neg             w7, w7                // -shift
2145
+    dup             v27.4s, w7
2146
+    dup             v29.4s, w9            // offset
2147
+.loopHppUS:
2148
+    mov             x12, x3
2149
+.loopWppUS:
2150
+    ldr             d0, x0, #8
2151
+    sub             x12, x12, #8
2152
+    uxtl            v0.8h, v0.8b
2153
+    umull           v1.4s, v0.4h, v25.4h  // val *= w0
2154
+    umull2          v2.4s, v0.8h, v25.8h
2155
+    add             v1.4s, v1.4s, v26.4s  // val += round
2156
+    add             v2.4s, v2.4s, v26.4s
2157
+    sshl            v1.4s, v1.4s, v27.4s  // val >>= shift
2158
+    sshl            v2.4s, v2.4s, v27.4s
2159
+    add             v1.4s, v1.4s, v29.4s  // val += offset
2160
+    add             v2.4s, v2.4s, v29.4s
2161
+    sqxtn           v0.4h, v1.4s          // val = x265_clip(val)
2162
+    sqxtn2          v0.8h, v2.4s
2163
+    sqxtun          v0.8b, v0.8h
2164
+    str             d0, x1, #8
2165
+    cbnz            x12, .loopWppUS
2166
+    add             x1, x1, x2
2167
+    add             x0, x0, x2
2168
+    sub             x4, x4, #1
2169
+    cbnz            x4, .loopHppUS
2170
+    ret
2171
+endfunc
2172
+
2173
+// int scanPosLast(
2174
+//     const uint16_t *scan,      // x0
2175
+//     const coeff_t *coeff,      // x1
2176
+//     uint16_t *coeffSign,       // x2
2177
+//     uint16_t *coeffFlag,       // x3
2178
+//     uint8_t *coeffNum,         // x4
2179
+//     int numSig,                // x5
2180
+//     const uint16_t* scanCG4x4, // x6
2181
+//     const int trSize)          // x7
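+//
+// Rough scalar equivalent (per 4x4 coefficient group cg, following the usual
+// scanPosLast_c logic - an assumption, since the C reference lives elsewhere):
+//
+//   pos            = scan[i];
+//   sig            = (coeff[pos] != 0);
+//   coeffSign[cg] += (coeff[pos] < 0) << coeffNum[cg];  // sign bits of non-zero coeffs
+//   coeffFlag[cg]  = (coeffFlag[cg] << 1) | sig;        // significance bitmap
+//   coeffNum[cg]  += sig;
+//
+// iterated until numSig significant coefficients have been seen; the return
+// value is the scan position of the last significant coefficient.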
2182
+function PFX(scanPosLast_neon)
2183
+    // convert unit of Stride(trSize) to int16_t
2184
+    add             x7, x7, x7
2185
+
2186
+    // load scan table and convert to Byte
2187
+    ldp             q0, q1, x6
2188
+    xtn             v0.8b, v0.8h
2189
+    xtn2            v0.16b, v1.8h   // v0 - Zigzag scan table
2190
+
2191
+    movrel          x10, g_SPL_and_mask
2192
+    ldr             q28, x10      // v28 = mask for pmovmskb
2193
+    movi            v31.16b, #0     // v31 = {0, ..., 0}
2194
+    add             x10, x7, x7     // 2*x7
2195
+    add             x11, x10, x7    // 3*x7
2196
+    add             x9, x4, #1      // CG count
2197
+
2198
+.loop_spl:
2199
+    // position of current CG
2200
+    ldrh            w6, x0, #32
2201
+    add             x6, x1, x6, lsl #1
2202
+
2203
+    // loading current CG
2204
+    ldr             d2, x6
2205
+    ldr             d3, x6, x7
2206
+    ldr             d4, x6, x10
2207
+    ldr             d5, x6, x11
2208
+    mov             v2.d1, v3.d0
2209
+    mov             v4.d1, v5.d0
2210
+    sqxtn           v2.8b, v2.8h
2211
+    sqxtn2          v2.16b, v4.8h
2212
+
2213
+    // Zigzag
2214
+    tbl             v3.16b, {v2.16b}, v0.16b
2215
+
2216
+    // get sign
2217
+    cmhi            v5.16b, v3.16b, v31.16b   // v5 = non-zero
2218
+    cmlt            v3.16b, v3.16b, #0        // v3 = negative
2219
+
2220
+    // val - w13 = pmovmskb(v3)
2221
+    and             v3.16b, v3.16b, v28.16b
2222
+    mov             d4, v3.d1
2223
+    addv            b23, v3.8b
2224
+    addv            b24, v4.8b
2225
+    mov             v23.b1, v24.b0
2226
+    fmov            w13, s23
2227
+
2228
+    // mask - w15 = pmovmskb(v5)
2229
+    and             v5.16b, v5.16b, v28.16b
2230
+    mov             d6, v5.d1
2231
+    addv            b25, v5.8b
2232
+    addv            b26, v6.8b
2233
+    mov             v25.b1, v26.b0
2234
+    fmov            w15, s25
2235
+
2236
+    // coeffFlag = reverse_bit(w15) in 16-bit
2237
+    rbit            w12, w15
2238
+    lsr             w12, w12, #16
2239
+    fmov            s30, w12
2240
+    strh            w12, x3, #2
2241
+
2242
+    // accelerate by preparing w13 = w13 & w15
2243
+    and             w13, w13, w15
2244
+    mov             x14, xzr
2245
+.loop_spl_1:
2246
+    cbz             w15, .pext_end
2247
+    clz             w6, w15
2248
+    lsl             w13, w13, w6
2249
+    lsl             w15, w15, w6
2250
+    extr            w14, w14, w13, #31
2251
+    bfm             w15, wzr, #1, #0
2252
+    b               .loop_spl_1
2253
+.pext_end:
2254
+    strh            w14, x2, #2
2255
+
2256
+    // compute coeffNum = popcount(coeffFlag)
2257
+    cnt             v30.8b, v30.8b
2258
+    addp            v30.8b, v30.8b, v30.8b
2259
+    fmov            w6, s30
2260
+    sub             x5, x5, x6
2261
+    strb            w6, x4, #1
2262
+
2263
+    cbnz            x5, .loop_spl
2264
+
2265
+    // count trailing zeros
2266
+    rbit            w13, w12
2267
+    clz             w13, w13
2268
+    lsr             w12, w12, w13
2269
+    strh            w12, x3, #-2
2270
+
2271
+    // get last pos
2272
+    sub             x9, x4, x9
2273
+    lsl             x0, x9, #4
2274
+    eor             w13, w13, #15
2275
+    add             x0, x0, x13
2276
+    ret
2277
+endfunc
2278
+
2279
+// uint32_t costCoeffNxN(
2280
+//    uint16_t *scan,        // x0
2281
+//    coeff_t *coeff,        // x1
2282
+//    intptr_t trSize,       // x2
2283
+//    uint16_t *absCoeff,    // x3
2284
+//    uint8_t *tabSigCtx,    // x4
2285
+//    uint16_t scanFlagMask, // x5
2286
+//    uint8_t *baseCtx,      // x6
2287
+//    int offset,            // x7
2288
+//    int scanPosSigOff,     // sp
2289
+//    int subPosBase)        // sp + 8
2290
+function PFX(costCoeffNxN_neon)
2291
+    // abs(coeff)
2292
+    add             x2, x2, x2
2293
+    ld1             {v1.d}0, x1, x2
2294
+    ld1             {v1.d}1, x1, x2
2295
+    ld1             {v2.d}0, x1, x2
2296
+    ld1             {v2.d}1, x1, x2
2297
+    abs             v1.8h, v1.8h
2298
+    abs             v2.8h, v2.8h
2299
+
2300
+    // WARNING: beyond-bound read here!
2301
+    // loading scan table
2302
+    ldr             w2, sp
2303
+    eor             w15, w2, #15
2304
+    add             x1, x0, x15, lsl #1
2305
+    ldp             q20, q21, x1
2306
+    uzp1            v20.16b, v20.16b, v21.16b
2307
+    movi            v21.16b, #15
2308
+    eor             v0.16b, v20.16b, v21.16b
2309
+
2310
+    // reorder coeff
2311
+    uzp1           v22.16b, v1.16b, v2.16b
2312
+    uzp2           v23.16b, v1.16b, v2.16b
2313
+    tbl            v24.16b, {v22.16b}, v0.16b
2314
+    tbl            v25.16b, {v23.16b}, v0.16b
2315
+    zip1           v2.16b, v24.16b, v25.16b
2316
+    zip2           v3.16b, v24.16b, v25.16b
2317
+
2318
+    // loading tabSigCtx (+offset)
2319
+    ldr             q1, x4
2320
+    tbl             v1.16b, {v1.16b}, v0.16b
2321
+    dup             v4.16b, w7
2322
+    movi            v5.16b, #0
2323
+    tbl             v4.16b, {v4.16b}, v5.16b
2324
+    add             v1.16b, v1.16b, v4.16b
2325
+
2326
+    // register mapping
2327
+    // x0 - sum
2328
+    // x1 - entropyStateBits
2329
+    // v1 - sigCtx
2330
+    // {v3,v2} - abs(coeff)
2331
+    // x2 - scanPosSigOff
2332
+    // x3 - absCoeff
2333
+    // x4 - numNonZero
2334
+    // x5 - scanFlagMask
2335
+    // x6 - baseCtx
2336
+    mov             x0, #0
2337
+    movrel          x1, PFX_C(entropyStateBits)
2338
+    mov             x4, #0
2339
+    mov             x11, #0
2340
+    movi            v31.16b, #0
2341
+    cbz             x2, .idx_zero
2342
+.loop_ccnn:
2343
+//   {
2344
+//        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
2345
+//        ctxSig = cnt & posZeroMask;
2346
+//        const uint32_t mstate = baseCtx[ctxSig];
2347
+//        const uint32_t mps = mstate & 1;
2348
+//        const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
2349
+//        uint32_t nextState = (stateBits >> 24) + mps;
2350
+//        if ((mstate ^ sig) == 1)
2351
+//            nextState = sig;
2352
+//        baseCtx[ctxSig] = (uint8_t)nextState;
2353
+//        sum += stateBits;
2354
+//    }
2355
+//    absCoeff[numNonZero] = tmpCoeff[blkPos];
2356
+//    numNonZero += sig;
2357
+//    scanPosSigOff--;
2358
+
2359
+    add             x13, x3, x4, lsl #1
2360
+    sub             x2, x2, #1
2361
+    str             h2, x13             // absCoeff[numNonZero] = tmpCoeff[blkPos]
2362
+    fmov            w14, s1               // x14 = ctxSig
2363
+    uxtb            w14, w14
2364
+    ubfx            w11, w5, #0, #1       // x11 = sig
2365
+    lsr             x5, x5, #1
2366
+    add             x4, x4, x11           // numNonZero += sig
2367
+    ext             v1.16b, v1.16b, v31.16b, #1
2368
+    ext             v2.16b, v2.16b, v3.16b, #2
2369
+    ext             v3.16b, v3.16b, v31.16b, #2
2370
+    ldrb            w9, x6, x14         // mstate = baseCtx[ctxSig]
2371
+    and             w10, w9, #1           // mps = mstate & 1
2372
+    eor             w9, w9, w11           // x9 = mstate ^ sig
2373
+    add             x12, x1, x9, lsl #2
2374
+    ldr             w13, x12
2375
+    add             w0, w0, w13           // sum += x265_entropyStateBits[mstate ^ sig]
2376
+    ldrb            w13, x12, #3
2377
+    add             w10, w10, w13         // nextState = (stateBits >> 24) + mps
2378
+    cmp             w9, #1
2379
+    csel            w10, w11, w10, eq
2380
+    strb            w10, x6, x14
2381
+    cbnz            x2, .loop_ccnn
2382
+.idx_zero:
2383
+
2384
+    add             x13, x3, x4, lsl #1
2385
+    add             x4, x4, x15
2386
+    str             h2, x13              // absCoeff[numNonZero] = tmpCoeff[blkPos]
2387
+
2388
+    ldr             x9, sp, #8           // subPosBase
2389
+    uxth            w9, w9
2390
+    cmp             w9, #0
2391
+    cset            x2, eq
2392
+    add             x4, x4, x2
2393
+    cbz             x4, .exit_ccnn
2394
+
2395
+    sub             w2, w2, #1
2396
+    uxtb            w2, w2
2397
+    fmov            w3, s1
2398
+    and             w2, w2, w3
2399
+
2400
+    ldrb            w3, x6, x2         // mstate = baseCtx[ctxSig]
2401
+    eor             w4, w5, w3            // x5 = mstate ^ sig
2402
+    and             w3, w3, #1            // mps = mstate & 1
2403
+    add             x1, x1, x4, lsl #2
2404
+    ldr             w11, x1
2405
+    ldrb            w12, x1, #3
2406
+    add             w0, w0, w11           // sum += x265_entropyStateBits[mstate ^ sig]
2407
+    add             w3, w3, w12           // nextState = (stateBits >> 24) + mps
2408
+    cmp             w4, #1
2409
+    csel            w3, w5, w3, eq
2410
+    strb            w3, x6, x2
2411
+.exit_ccnn:
2412
+    ubfx            w0, w0, #0, #24
2413
     ret
2414
 endfunc
2415
+
2416
+const g_SPL_and_mask, align=8
2417
+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
2418
+endconst
2419
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S Added
516
 
1
@@ -0,0 +1,514 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
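+//
+// All of these macros implement plain sums of absolute differences; for a WxH
+// block the scalar equivalent is simply:
+//
+//   int sad(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2)
+//   {
+//       int sum = 0;
+//       for (int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2)
+//           for (int x = 0; x < W; x++)
+//               sum += abs(pix1[x] - pix2[x]);
+//       return sum;
+//   }
+//
+// The NEON versions keep 16-bit accumulators (uabal/uabal2) and widen only in
+// the SAD_END_* reductions.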
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+#ifdef __APPLE__
33
+.section __RODATA,__rodata
34
+#else
35
+.section .rodata
36
+#endif
37
+
38
+.align 4
39
+
40
+.macro SAD_START_4 f
41
+    ld1             {v0.s}0, x0, x1
42
+    ld1             {v0.s}1, x0, x1
43
+    ld1             {v1.s}0, x2, x3
44
+    ld1             {v1.s}1, x2, x3
45
+    \f              v16.8h, v0.8b, v1.8b
46
+.endm
47
+
48
+.macro SAD_4 h
49
+.rept \h / 2 - 1
50
+    SAD_START_4 uabal
51
+.endr
52
+.endm
53
+
54
+.macro SAD_START_8 f
55
+    ld1             {v0.8b}, x0, x1
56
+    ld1             {v1.8b}, x2, x3
57
+    ld1             {v2.8b}, x0, x1
58
+    ld1             {v3.8b}, x2, x3
59
+    \f              v16.8h, v0.8b, v1.8b
60
+    \f              v17.8h, v2.8b, v3.8b
61
+.endm
62
+
63
+.macro SAD_8 h
64
+.rept \h / 2 - 1
65
+    SAD_START_8 uabal
66
+.endr
67
+.endm
68
+
69
+.macro SAD_START_16 f
70
+    ld1             {v0.16b}, x0, x1
71
+    ld1             {v1.16b}, x2, x3
72
+    ld1             {v2.16b}, x0, x1
73
+    ld1             {v3.16b}, x2, x3
74
+    \f              v16.8h, v0.8b, v1.8b
75
+    \f\()2          v17.8h, v0.16b, v1.16b
76
+    uabal           v16.8h, v2.8b, v3.8b
77
+    uabal2          v17.8h, v2.16b, v3.16b
78
+.endm
79
+
80
+.macro SAD_16 h
81
+.rept \h / 2 - 1
82
+    SAD_START_16 uabal
83
+.endr
84
+.endm
85
+
86
+.macro SAD_START_32
87
+    movi            v16.16b, #0
88
+    movi            v17.16b, #0
89
+    movi            v18.16b, #0
90
+    movi            v19.16b, #0
91
+.endm
92
+
93
+.macro SAD_32
94
+    ld1             {v0.16b-v1.16b}, x0, x1
95
+    ld1             {v2.16b-v3.16b}, x2, x3
96
+    ld1             {v4.16b-v5.16b}, x0, x1
97
+    ld1             {v6.16b-v7.16b}, x2, x3
98
+    uabal           v16.8h, v0.8b, v2.8b
99
+    uabal2          v17.8h, v0.16b, v2.16b
100
+    uabal           v18.8h, v1.8b, v3.8b
101
+    uabal2          v19.8h, v1.16b, v3.16b
102
+    uabal           v16.8h, v4.8b, v6.8b
103
+    uabal2          v17.8h, v4.16b, v6.16b
104
+    uabal           v18.8h, v5.8b, v7.8b
105
+    uabal2          v19.8h, v5.16b, v7.16b
106
+.endm
107
+
108
+.macro SAD_END_32
109
+    add             v16.8h, v16.8h, v17.8h
110
+    add             v17.8h, v18.8h, v19.8h
111
+    add             v16.8h, v16.8h, v17.8h
112
+    uaddlv          s0, v16.8h
113
+    fmov            w0, s0
114
+    ret
115
+.endm
116
+
117
+.macro SAD_START_64
118
+    movi            v16.16b, #0
119
+    movi            v17.16b, #0
120
+    movi            v18.16b, #0
121
+    movi            v19.16b, #0
122
+    movi            v20.16b, #0
123
+    movi            v21.16b, #0
124
+    movi            v22.16b, #0
125
+    movi            v23.16b, #0
126
+.endm
127
+
128
+.macro SAD_64
129
+    ld1             {v0.16b-v3.16b}, x0, x1
130
+    ld1             {v4.16b-v7.16b}, x2, x3
131
+    ld1             {v24.16b-v27.16b}, x0, x1
132
+    ld1             {v28.16b-v31.16b}, x2, x3
133
+    uabal           v16.8h, v0.8b, v4.8b
134
+    uabal2          v17.8h, v0.16b, v4.16b
135
+    uabal           v18.8h, v1.8b, v5.8b
136
+    uabal2          v19.8h, v1.16b, v5.16b
137
+    uabal           v20.8h, v2.8b, v6.8b
138
+    uabal2          v21.8h, v2.16b, v6.16b
139
+    uabal           v22.8h, v3.8b, v7.8b
140
+    uabal2          v23.8h, v3.16b, v7.16b
141
+
142
+    uabal           v16.8h, v24.8b, v28.8b
143
+    uabal2          v17.8h, v24.16b, v28.16b
144
+    uabal           v18.8h, v25.8b, v29.8b
145
+    uabal2          v19.8h, v25.16b, v29.16b
146
+    uabal           v20.8h, v26.8b, v30.8b
147
+    uabal2          v21.8h, v26.16b, v30.16b
148
+    uabal           v22.8h, v27.8b, v31.8b
149
+    uabal2          v23.8h, v27.16b, v31.16b
150
+.endm
151
+
152
+.macro SAD_END_64
153
+    add             v16.8h, v16.8h, v17.8h
154
+    add             v17.8h, v18.8h, v19.8h
155
+    add             v16.8h, v16.8h, v17.8h
156
+    uaddlp          v16.4s, v16.8h
157
+    add             v18.8h, v20.8h, v21.8h
158
+    add             v19.8h, v22.8h, v23.8h
159
+    add             v17.8h, v18.8h, v19.8h
160
+    uaddlp          v17.4s, v17.8h
161
+    add             v16.4s, v16.4s, v17.4s
162
+    uaddlv          d0, v16.4s
163
+    fmov            x0, d0
164
+    ret
165
+.endm
166
+
167
+.macro SAD_START_12
168
+    movrel          x12, sad12_mask
169
+    ld1             {v31.16b}, x12
170
+    movi            v16.16b, #0
171
+    movi            v17.16b, #0
172
+.endm
173
+
174
+.macro SAD_12
175
+    ld1             {v0.16b}, x0, x1
176
+    and             v0.16b, v0.16b, v31.16b
177
+    ld1             {v1.16b}, x2, x3
178
+    and             v1.16b, v1.16b, v31.16b
179
+    ld1             {v2.16b}, x0, x1
180
+    and             v2.16b, v2.16b, v31.16b
181
+    ld1             {v3.16b}, x2, x3
182
+    and             v3.16b, v3.16b, v31.16b
183
+    uabal           v16.8h, v0.8b, v1.8b
184
+    uabal2          v17.8h, v0.16b, v1.16b
185
+    uabal           v16.8h, v2.8b, v3.8b
186
+    uabal2          v17.8h, v2.16b, v3.16b
187
+.endm
188
+
189
+.macro SAD_END_12
190
+    add             v16.8h, v16.8h, v17.8h
191
+    uaddlv          s0, v16.8h
192
+    fmov            w0, s0
193
+    ret
194
+.endm
195
+
196
+.macro SAD_START_24
197
+    movi            v16.16b, #0
198
+    movi            v17.16b, #0
199
+    movi            v18.16b, #0
200
+    sub             x1, x1, #16
201
+    sub             x3, x3, #16
202
+.endm
203
+
204
+.macro SAD_24
205
+    ld1             {v0.16b}, x0, #16
206
+    ld1             {v1.8b}, x0, x1
207
+    ld1             {v2.16b}, x2, #16
208
+    ld1             {v3.8b}, x2, x3
209
+    ld1             {v4.16b}, x0, #16
210
+    ld1             {v5.8b}, x0, x1
211
+    ld1             {v6.16b}, x2, #16
212
+    ld1             {v7.8b}, x2, x3
213
+    uabal           v16.8h, v0.8b, v2.8b
214
+    uabal2          v17.8h, v0.16b, v2.16b
215
+    uabal           v18.8h, v1.8b, v3.8b
216
+    uabal           v16.8h, v4.8b, v6.8b
217
+    uabal2          v17.8h, v4.16b, v6.16b
218
+    uabal           v18.8h, v5.8b, v7.8b
219
+.endm
220
+
221
+.macro SAD_END_24
222
+    add             v16.8h, v16.8h, v17.8h
223
+    add             v16.8h, v16.8h, v18.8h
224
+    uaddlv          s0, v16.8h
225
+    fmov            w0, s0
226
+    ret
227
+.endm
228
+
229
+.macro SAD_START_48
230
+    movi            v16.16b, #0
231
+    movi            v17.16b, #0
232
+    movi            v18.16b, #0
233
+    movi            v19.16b, #0
234
+    movi            v20.16b, #0
235
+    movi            v21.16b, #0
236
+.endm
237
+
238
+.macro SAD_48
239
+    ld1             {v0.16b-v2.16b}, [x0], x1
240
+    ld1             {v4.16b-v6.16b}, [x2], x3
241
+    ld1             {v24.16b-v26.16b}, [x0], x1
242
+    ld1             {v28.16b-v30.16b}, [x2], x3
243
+    uabal           v16.8h, v0.8b, v4.8b
244
+    uabal2          v17.8h, v0.16b, v4.16b
245
+    uabal           v18.8h, v1.8b, v5.8b
246
+    uabal2          v19.8h, v1.16b, v5.16b
247
+    uabal           v20.8h, v2.8b, v6.8b
248
+    uabal2          v21.8h, v2.16b, v6.16b
249
+
250
+    uabal           v16.8h, v24.8b, v28.8b
251
+    uabal2          v17.8h, v24.16b, v28.16b
252
+    uabal           v18.8h, v25.8b, v29.8b
253
+    uabal2          v19.8h, v25.16b, v29.16b
254
+    uabal           v20.8h, v26.8b, v30.8b
255
+    uabal2          v21.8h, v26.16b, v30.16b
256
+.endm
257
+
258
+.macro SAD_END_48
259
+    add             v16.8h, v16.8h, v17.8h
260
+    add             v17.8h, v18.8h, v19.8h
261
+    add             v16.8h, v16.8h, v17.8h
262
+    uaddlv          s0, v16.8h
263
+    fmov            w0, s0
264
+    add             v18.8h, v20.8h, v21.8h
265
+    uaddlv          s1, v18.8h
266
+    fmov            w1, s1
267
+    add             w0, w0, w1
268
+    ret
269
+.endm
270
+
271
+.macro SAD_X_START_4 h, x, f
272
+    ld1             {v0.s}[0], [x0], x9
273
+    ld1             {v0.s}[1], [x0], x9
274
+    ld1             {v1.s}[0], [x1], x5
275
+    ld1             {v1.s}[1], [x1], x5
276
+    ld1             {v2.s}[0], [x2], x5
277
+    ld1             {v2.s}[1], [x2], x5
278
+    ld1             {v3.s}[0], [x3], x5
279
+    ld1             {v3.s}[1], [x3], x5
280
+    \f              v16.8h, v0.8b, v1.8b
281
+    \f              v17.8h, v0.8b, v2.8b
282
+    \f              v18.8h, v0.8b, v3.8b
283
+.if \x == 4
284
+    ld1             {v4.s}[0], [x4], x5
285
+    ld1             {v4.s}[1], [x4], x5
286
+    \f              v19.8h, v0.8b, v4.8b
287
+.endif
288
+.endm
289
+
290
+.macro SAD_X_4 h, x
291
+.rept \h/2 - 1
292
+    SAD_X_START_4 \h, \x, uabal
293
+.endr
294
+.endm
295
+
296
+.macro SAD_X_END_4 x
297
+    uaddlv          s0, v16.8h
298
+    uaddlv          s1, v17.8h
299
+    uaddlv          s2, v18.8h
300
+    stp             s0, s1, [x6]
301
+.if \x == 3
302
+    str             s2, [x6, #8]
303
+.elseif \x == 4
304
+    uaddlv          s3, v19.8h
305
+    stp             s2, s3, [x6, #8]
306
+.endif
307
+    ret
308
+.endm
309
+
310
+.macro SAD_X_START_8 h, x, f
311
+    ld1             {v0.8b}, [x0], x9
312
+    ld1             {v1.8b}, [x1], x5
313
+    ld1             {v2.8b}, [x2], x5
314
+    ld1             {v3.8b}, [x3], x5
315
+    \f              v16.8h, v0.8b, v1.8b
316
+    \f              v17.8h, v0.8b, v2.8b
317
+    \f              v18.8h, v0.8b, v3.8b
318
+.if \x == 4
319
+    ld1             {v4.8b}, [x4], x5
320
+    \f              v19.8h, v0.8b, v4.8b
321
+.endif
322
+.endm
323
+
324
+.macro SAD_X_8 h x
325
+.rept \h - 1
326
+    SAD_X_START_8 \h, \x, uabal
327
+.endr
328
+.endm
329
+
330
+.macro SAD_X_END_8 x
331
+    SAD_X_END_4 \x
332
+.endm
333
+
334
+.macro SAD_X_START_12 h, x, f
335
+    ld1             {v0.16b}, [x0], x9
336
+    and             v0.16b, v0.16b, v31.16b
337
+    ld1             {v1.16b}, [x1], x5
338
+    and             v1.16b, v1.16b, v31.16b
339
+    ld1             {v2.16b}, [x2], x5
340
+    and             v2.16b, v2.16b, v31.16b
341
+    ld1             {v3.16b}, [x3], x5
342
+    and             v3.16b, v3.16b, v31.16b
343
+    \f              v16.8h, v1.8b, v0.8b
344
+    \f\()2          v20.8h, v1.16b, v0.16b
345
+    \f              v17.8h, v2.8b, v0.8b
346
+    \f\()2          v21.8h, v2.16b, v0.16b
347
+    \f              v18.8h, v3.8b, v0.8b
348
+    \f\()2          v22.8h, v3.16b, v0.16b
349
+.if \x == 4
350
+    ld1             {v4.16b}, [x4], x5
351
+    and             v4.16b, v4.16b, v31.16b
352
+    \f              v19.8h, v4.8b, v0.8b
353
+    \f\()2          v23.8h, v4.16b, v0.16b
354
+.endif
355
+.endm
356
+
357
+.macro SAD_X_12 h x
358
+.rept \h - 1
359
+    SAD_X_START_12 \h, \x, uabal
360
+.endr
361
+.endm
362
+
363
+.macro SAD_X_END_12 x
364
+    SAD_X_END_16 \x
365
+.endm
366
+
367
+.macro SAD_X_START_16 h, x, f
368
+    ld1             {v0.16b}, [x0], x9
369
+    ld1             {v1.16b}, [x1], x5
370
+    ld1             {v2.16b}, [x2], x5
371
+    ld1             {v3.16b}, [x3], x5
372
+    \f              v16.8h, v1.8b, v0.8b
373
+    \f\()2          v20.8h, v1.16b, v0.16b
374
+    \f              v17.8h, v2.8b, v0.8b
375
+    \f\()2          v21.8h, v2.16b, v0.16b
376
+    \f              v18.8h, v3.8b, v0.8b
377
+    \f\()2          v22.8h, v3.16b, v0.16b
378
+.if \x == 4
379
+    ld1             {v4.16b}, [x4], x5
380
+    \f              v19.8h, v4.8b, v0.8b
381
+    \f\()2          v23.8h, v4.16b, v0.16b
382
+.endif
383
+.endm
384
+
385
+.macro SAD_X_16 h x
386
+.rept \h - 1
387
+    SAD_X_START_16 \h, \x, uabal
388
+.endr
389
+.endm
390
+
391
+.macro SAD_X_END_16 x
392
+    add             v16.8h, v16.8h, v20.8h
393
+    add             v17.8h, v17.8h, v21.8h
394
+    add             v18.8h, v18.8h, v22.8h
395
+.if \x == 4
396
+    add             v19.8h, v19.8h, v23.8h
397
+.endif
398
+
399
+    SAD_X_END_4 \x
400
+.endm
401
+
402
+.macro SAD_X_START_24 x
403
+    SAD_X_START_32 \x
404
+    sub             x5, x5, #16
405
+    sub             x9, x9, #16
406
+.endm
407
+
408
+.macro SAD_X_24 base v1 v2
409
+    ld1             {v0.16b}, [\base], #16
410
+    ld1             {v1.8b}, [\base], x5
411
+    uabal           \v1\().8h, v0.8b, v6.8b
412
+    uabal           \v1\().8h, v1.8b, v7.8b
413
+    uabal2          \v2\().8h, v0.16b, v6.16b
414
+.endm
415
+
416
+.macro SAD_X_END_24 x
417
+    SAD_X_END_16 \x
418
+.endm
419
+
420
+.macro SAD_X_START_32 x
421
+    movi v16.16b, #0
422
+    movi v17.16b, #0
423
+    movi v18.16b, #0
424
+    movi v20.16b, #0
425
+    movi v21.16b, #0
426
+    movi v22.16b, #0
427
+.if \x == 4
428
+    movi v19.16b, #0
429
+    movi v23.16b, #0
430
+.endif
431
+.endm
432
+
433
+.macro SAD_X_32 base v1 v2
434
+    ld1             {v0.16b-v1.16b}, [\base], x5
435
+    uabal           \v1\().8h, v0.8b, v6.8b
436
+    uabal           \v1\().8h, v1.8b, v7.8b
437
+    uabal2          \v2\().8h, v0.16b, v6.16b
438
+    uabal2          \v2\().8h, v1.16b, v7.16b
439
+.endm
440
+
441
+.macro SAD_X_END_32 x
442
+    SAD_X_END_16 \x
443
+.endm
444
+
445
+.macro SAD_X_START_48 x
446
+    SAD_X_START_32 \x
447
+.endm
448
+
449
+.macro SAD_X_48 x1 v1 v2
450
+    ld1             {v0.16b-v2.16b}, [\x1], x5
451
+    uabal           \v1\().8h, v0.8b, v4.8b
452
+    uabal           \v1\().8h, v1.8b, v5.8b
453
+    uabal           \v1\().8h, v2.8b, v6.8b
454
+    uabal2          \v2\().8h, v0.16b, v4.16b
455
+    uabal2          \v2\().8h, v1.16b, v5.16b
456
+    uabal2          \v2\().8h, v2.16b, v6.16b
457
+.endm
458
+
459
+.macro SAD_X_END_48 x
460
+    SAD_X_END_64 \x
461
+.endm
462
+
463
+.macro SAD_X_START_64 x
464
+    SAD_X_START_32 \x
465
+.endm
466
+
467
+.macro SAD_X_64 x1 v1 v2
468
+    ld1             {v0.16b-v3.16b}, [\x1], x5
469
+    uabal           \v1\().8h, v0.8b, v4.8b
470
+    uabal           \v1\().8h, v1.8b, v5.8b
471
+    uabal           \v1\().8h, v2.8b, v6.8b
472
+    uabal           \v1\().8h, v3.8b, v7.8b
473
+    uabal2          \v2\().8h, v0.16b, v4.16b
474
+    uabal2          \v2\().8h, v1.16b, v5.16b
475
+    uabal2          \v2\().8h, v2.16b, v6.16b
476
+    uabal2          \v2\().8h, v3.16b, v7.16b
477
+.endm
478
+
479
+.macro SAD_X_END_64 x
480
+    uaddlp          v16.4s, v16.8h
481
+    uaddlp          v17.4s, v17.8h
482
+    uaddlp          v18.4s, v18.8h
483
+    uaddlp          v20.4s, v20.8h
484
+    uaddlp          v21.4s, v21.8h
485
+    uaddlp          v22.4s, v22.8h
486
+    add             v16.4s, v16.4s, v20.4s
487
+    add             v17.4s, v17.4s, v21.4s
488
+    add             v18.4s, v18.4s, v22.4s
489
+    trn2            v20.2d, v16.2d, v16.2d
490
+    trn2            v21.2d, v17.2d, v17.2d
491
+    trn2            v22.2d, v18.2d, v18.2d
492
+    add             v16.2s, v16.2s, v20.2s
493
+    add             v17.2s, v17.2s, v21.2s
494
+    add             v18.2s, v18.2s, v22.2s
495
+    uaddlp          v16.1d, v16.2s
496
+    uaddlp          v17.1d, v17.2s
497
+    uaddlp          v18.1d, v18.2s
498
+    stp             s16, s17, [x6], #8
499
+.if \x == 3
500
+    str             s18, [x6]
501
+.elseif \x == 4
502
+    uaddlp          v19.4s, v19.8h
503
+    uaddlp          v23.4s, v23.8h
504
+    add             v19.4s, v19.4s, v23.4s
505
+    trn2            v23.2d, v19.2d, v19.2d
506
+    add             v19.2s, v19.2s, v23.2s
507
+    uaddlp          v19.1d, v19.2s
508
+    stp             s18, s19, [x6]
509
+.endif
510
+    ret
511
+.endm
512
+
513
+const sad12_mask, align=8
514
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
515
+endconst
516
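
The sad12_mask table above enables 12-pixel-wide SAD without partial
loads: both inputs are loaded 16 bytes at a time and AND-ed with the
mask, which zeroes bytes 12..15 in both operands, so those lanes
contribute |0 - 0| = 0 to the accumulator. A hedged C sketch of the idea
(it assumes both buffers may be over-read by 4 bytes, which holds for
x265's padded pixel planes):

    #include <stdint.h>

    static const uint8_t sad12_mask[16] = {
        255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 0, 0, 0, 0
    };

    /* One row of 12-wide SAD computed as a masked 16-wide SAD. */
    static uint32_t sad12_row_ref(const uint8_t *a, const uint8_t *b)
    {
        uint32_t sum = 0;
        for (int x = 0; x < 16; x++) {
            int va = a[x] & sad12_mask[x];
            int vb = b[x] & sad12_mask[x];
            sum += (uint32_t)(va > vb ? va - vb : vb - va);
        }
        return sum;   /* equals the SAD over bytes 0..11 only */
    }
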
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S Added
513
 
1
@@ -0,0 +1,511 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "sad-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.macro SAD_SVE2_16 h
41
+    mov             z16.d, #0
42
+    ptrue           p0.h, vl16
43
+.rept \h
44
+    ld1b            {z0.h}, p0/z, [x0]
45
+    ld1b            {z2.h}, p0/z, [x2]
46
+    add             x0, x0, x1
47
+    add             x2, x2, x3
48
+    uaba            z16.h, z0.h, z2.h
49
+.endr
50
+    uaddv           d0, p0, z16.h
51
+    fmov            w0, s0
52
+    ret
53
+.endm
54
+
55
+.macro SAD_SVE2_32 h
56
+    ptrue           p0.b, vl32
57
+.rept \h
58
+    ld1b            {z0.b}, p0/z, [x0]
59
+    ld1b            {z4.b}, p0/z, [x2]
60
+    add             x0, x0, x1
61
+    add             x2, x2, x3
62
+    uabalb          z16.h, z0.b, z4.b
63
+    uabalt          z16.h, z0.b, z4.b
64
+.endr
65
+    uaddv           d0, p0, z16.h
66
+    fmov            w0, s0
67
+    ret
68
+.endm
69
+
70
+.macro SAD_SVE2_64 h
71
+    cmp             x9, #48
72
+    bgt             .vl_gt_48_pixel_sad_64x\h
73
+    mov             z16.d, #0
74
+    mov             z17.d, #0
75
+    mov             z18.d, #0
76
+    mov             z19.d, #0
77
+    ptrue           p0.b, vl32
78
+.rept \h
79
+    ld1b            {z0.b}, p0/z, [x0]
80
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
81
+    ld1b            {z4.b}, p0/z, [x2]
82
+    ld1b            {z5.b}, p0/z, [x2, #1, mul vl]
83
+    add             x0, x0, x1
84
+    add             x2, x2, x3
85
+    uabalb          z16.h, z0.b, z4.b
86
+    uabalt          z17.h, z0.b, z4.b
87
+    uabalb          z18.h, z1.b, z5.b
88
+    uabalt          z19.h, z1.b, z5.b
89
+.endr
90
+    add             z16.h, z16.h, z17.h
91
+    add             z17.h, z18.h, z19.h
92
+    add             z16.h, z16.h, z17.h
93
+    uadalp          z24.s, p0/m, z16.h
94
+    uaddv           d5, p0, z24.s
95
+    fmov            x0, d5
96
+    ret
97
+.vl_gt_48_pixel_sad_64x\h\():
98
+    mov             z16.d, #0
99
+    mov             z17.d, #0
100
+    mov             z24.d, #0
101
+    ptrue           p0.b, vl64
102
+.rept \h
103
+    ld1b            {z0.b}, p0/z, x0
104
+    ld1b            {z4.b}, p0/z, x2
105
+    add             x0, x0, x1
106
+    add             x2, x2, x3
107
+    uabalb          z16.h, z0.b, z4.b
108
+    uabalt          z17.h, z0.b, z4.b
109
+.endr
110
+    add             z16.h, z16.h, z17.h
111
+    uadalp          z24.s, p0/m, z16.h
112
+    uaddv           d5, p0, z24.s
113
+    fmov            x0, d5
114
+    ret
115
+.endm
116
+
117
+.macro SAD_SVE2_24 h
118
+    mov             z16.d, #0
119
+    mov             x10, #24
120
+    mov             x11, #0
121
+    whilelt         p0.b, x11, x10
122
+.rept \h
123
+    ld1b            {z0.b}, p0/z, x0
124
+    ld1b            {z8.b}, p0/z, x2
125
+    add             x0, x0, x1
126
+    add             x2, x2, x3
127
+    uabalb          z16.h, z0.b, z8.b
128
+    uabalt          z16.h, z0.b, z8.b
129
+.endr
130
+    uaddv           d5, p0, z16.h
131
+    fmov            w0, s5
132
+    ret
133
+.endm
134
+
135
+.macro SAD_SVE2_48 h
136
+    cmp             x9, #48
137
+    bgt             .vl_gt_48_pixel_sad_48x\h
138
+    mov             z16.d, #0
139
+    mov             z17.d, #0
140
+    mov             z18.d, #0
141
+    mov             z19.d, #0
142
+    ptrue           p0.b, vl32
143
+    ptrue           p1.b, vl16
144
+.rept \h
145
+    ld1b            {z0.b}, p0/z, x0
146
+    ld1b            {z1.b}, p1/z, x0, #1, mul vl
147
+    ld1b            {z8.b}, p0/z, x2
148
+    ld1b            {z9.b}, p1/z, x2, #1, mul vl
149
+    add             x0, x0, x1
150
+    add             x2, x2, x3
151
+    uabalb          z16.h, z0.b, z8.b
152
+    uabalt          z17.h, z0.b, z8.b
153
+    uabalb          z18.h, z1.b, z9.b
154
+    uabalt          z19.h, z1.b, z9.b
155
+.endr
156
+    add             z16.h, z16.h, z17.h
157
+    add             z17.h, z18.h, z19.h
158
+    add             z16.h, z16.h, z17.h
159
+    uaddv           d5, p0, z16.h
160
+    fmov            w0, s5
161
+    ret
162
+.vl_gt_48_pixel_sad_48x\h\():
163
+    mov             z16.d, #0
164
+    mov             z17.d, #0
165
+    mov             x10, #48
166
+    mov             x11, #0
167
+    whilelt         p0.b, x11, x10
168
+.rept \h
169
+    ld1b            {z0.b}, p0/z, x0
170
+    ld1b            {z8.b}, p0/z, x2
171
+    add             x0, x0, x1
172
+    add             x2, x2, x3
173
+    uabalb          z16.h, z0.b, z8.b
174
+    uabalt          z17.h, z0.b, z8.b
175
+.endr
176
+    add             z16.h, z16.h, z17.h
177
+    uaddv           d5, p0, z16.h
178
+    fmov            w0, s5
179
+    ret
180
+.endm
181
+
182
+// Fully unrolled.
183
+.macro SAD_FUNC_SVE2 w, h
184
+function PFX(pixel_sad_\w\()x\h\()_sve2)
185
+    rdvl            x9, #1
186
+    cmp             x9, #16
187
+    bgt             .vl_gt_16_pixel_sad_\w\()x\h
188
+    SAD_START_\w uabdl
189
+    SAD_\w \h
190
+.if \w > 4
191
+    add             v16.8h, v16.8h, v17.8h
192
+.endif
193
+    uaddlv          s0, v16.8h
194
+    fmov            w0, s0
195
+    ret
196
+.vl_gt_16_pixel_sad_\w\()x\h\():
197
+.if \w == 4 || \w == 8 || \w == 12
198
+    SAD_START_\w uabdl
199
+    SAD_\w \h
200
+.if \w > 4
201
+    add             v16.8h, v16.8h, v17.8h
202
+.endif
203
+    uaddlv          s0, v16.8h
204
+    fmov            w0, s0
205
+    ret
206
+.else
207
+    SAD_SVE2_\w \h
208
+.endif
209
+endfunc
210
+.endm
211
+
212
+// Loop unrolled 4.
213
+.macro SAD_FUNC_LOOP_SVE2 w, h
214
+function PFX(pixel_sad_\w\()x\h\()_sve2)
215
+    rdvl            x9, #1
216
+    cmp             x9, #16
217
+    bgt             .vl_gt_16_pixel_sad_loop_\w\()x\h
218
+    SAD_START_\w
219
+
220
+    mov             w9, #\h/8
221
+.loop_sve2_\w\()x\h:
222
+    sub             w9, w9, #1
223
+.rept 4
224
+    SAD_\w
225
+.endr
226
+    cbnz            w9, .loop_sve2_\w\()x\h
227
+
228
+    SAD_END_\w
229
+
230
+.vl_gt_16_pixel_sad_loop_\w\()x\h\():
231
+.if \w == 4 || \w == 8 || \w == 12
232
+    SAD_START_\w
233
+
234
+    mov             w9, #\h/8
235
+.loop_sve2_loop_\w\()x\h:
236
+    sub             w9, w9, #1
237
+.rept 4
238
+    SAD_\w
239
+.endr
240
+    cbnz            w9, .loop_sve2_loop_\w\()x\h
241
+
242
+    SAD_END_\w
243
+.else
244
+    SAD_SVE2_\w \h
245
+.endif
246
+endfunc
247
+.endm
248
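
The two generator macros above start every _sve2 entry point with
"rdvl x9, #1", which reads the SVE vector length in bytes; when it is 16
(128-bit vectors) the function falls through to the plain NEON macros,
and the wider SVE2 code is only used on implementations with longer
vectors. A minimal C sketch of that dispatch using the ACLE intrinsic
svcntb() (the two kernel names are hypothetical stand-ins):

    #include <arm_sve.h>   /* requires -march=armv8-a+sve */
    #include <stdint.h>

    typedef uint32_t (*sad_fn)(const uint8_t *, intptr_t,
                               const uint8_t *, intptr_t);

    extern uint32_t sad_neon_kernel(const uint8_t *, intptr_t,
                                    const uint8_t *, intptr_t);
    extern uint32_t sad_sve2_kernel(const uint8_t *, intptr_t,
                                    const uint8_t *, intptr_t);

    static sad_fn select_sad(void)
    {
        /* svcntb() is the vector length in bytes, the same value that
         * "rdvl x9, #1" produces. */
        return (svcntb() <= 16) ? sad_neon_kernel : sad_sve2_kernel;
    }
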
+
249
+SAD_FUNC_SVE2  4,  4
250
+SAD_FUNC_SVE2  4,  8
251
+SAD_FUNC_SVE2  4,  16
252
+SAD_FUNC_SVE2  8,  4
253
+SAD_FUNC_SVE2  8,  8
254
+SAD_FUNC_SVE2  8,  16
255
+SAD_FUNC_SVE2  8,  32
256
+SAD_FUNC_SVE2  16, 4
257
+SAD_FUNC_SVE2  16, 8
258
+SAD_FUNC_SVE2  16, 12
259
+SAD_FUNC_SVE2  16, 16
260
+SAD_FUNC_SVE2  16, 32
261
+SAD_FUNC_SVE2  16, 64
262
+
263
+SAD_FUNC_LOOP_SVE2  32, 8
264
+SAD_FUNC_LOOP_SVE2  32, 16
265
+SAD_FUNC_LOOP_SVE2  32, 24
266
+SAD_FUNC_LOOP_SVE2  32, 32
267
+SAD_FUNC_LOOP_SVE2  32, 64
268
+SAD_FUNC_LOOP_SVE2  64, 16
269
+SAD_FUNC_LOOP_SVE2  64, 32
270
+SAD_FUNC_LOOP_SVE2  64, 48
271
+SAD_FUNC_LOOP_SVE2  64, 64
272
+SAD_FUNC_LOOP_SVE2  12, 16
273
+SAD_FUNC_LOOP_SVE2  24, 32
274
+SAD_FUNC_LOOP_SVE2  48, 64
275
+
276
+// SAD_X3 and SAD_X4 code start
277
+
278
+.macro SAD_X_SVE2_24_INNER_GT_16 base z
279
+    ld1b            {z4.b}, p0/z, [\base]
280
+    add             \base, \base, x5
281
+    uabalb          \z\().h, z4.b, z0.b
282
+    uabalt          \z\().h, z4.b, z0.b
283
+.endm
284
+
285
+.macro SAD_X_SVE2_24 h x
286
+    mov             z20.d, #0
287
+    mov             z21.d, #0
288
+    mov             z22.d, #0
289
+    mov             z23.d, #0
290
+    mov             x10, #24
291
+    mov             x11, #0
292
+    whilelt         p0.b, x11, x10
293
+.rept \h
294
+    ld1b            {z0.b}, p0/z, x0
295
+    add             x0, x0, x9
296
+    SAD_X_SVE2_24_INNER_GT_16 x1, z20
297
+    SAD_X_SVE2_24_INNER_GT_16 x2, z21
298
+    SAD_X_SVE2_24_INNER_GT_16 x3, z22
299
+.if \x == 4
300
+    SAD_X_SVE2_24_INNER_GT_16 x4, z23
301
+.endif
302
+.endr
303
+    uaddlv          s0, v20.8h
304
+    uaddlv          s1, v21.8h
305
+    uaddlv          s2, v22.8h
306
+    stp             s0, s1, [x6]
307
+.if \x == 3
308
+    str             s2, [x6, #8]
309
+.elseif \x == 4
310
+    uaddv           d0, p0, z20.h
311
+    uaddv           d1, p0, z21.h
312
+    uaddv           d2, p0, z22.h
313
+    stp             s2, s3, [x6, #8]
314
+.endif
315
+    ret
316
+.endm
317
+
318
+.macro SAD_X_SVE2_32_INNER_GT_16 base z
319
+    ld1b            {z4.b}, p0/z, [\base]
320
+    add             \base, \base, x5
321
+    uabalb          \z\().h, z4.b, z0.b
322
+    uabalt          \z\().h, z4.b, z0.b
323
+.endm
324
+
325
+.macro SAD_X_SVE2_32 h x
326
+    mov             z20.d, #0
327
+    mov             z21.d, #0
328
+    mov             z22.d, #0
329
+    mov             z23.d, #0
330
+    ptrue           p0.b, vl32
331
+.rept \h
332
+    ld1b            {z0.b}, p0/z, x0
333
+    add             x0, x0, x9
334
+    SAD_X_SVE2_32_INNER_GT_16 x1, z20
335
+    SAD_X_SVE2_32_INNER_GT_16 x2, z21
336
+    SAD_X_SVE2_32_INNER_GT_16 x3, z22
337
+.if \x == 4
338
+    SAD_X_SVE2_32_INNER_GT_16 x4, z23
339
+.endif
340
+.endr
341
+    uaddv           d0, p0, z20.h
342
+    uaddv           d1, p0, z21.h
343
+    uaddv           d2, p0, z22.h
344
+    stp             s0, s1, [x6]
345
+.if \x == 3
346
+    str             s2, [x6, #8]
347
+.elseif \x == 4
348
+    uaddv           d3, p0, z23.h
349
+    stp             s2, s3, [x6, #8]
350
+.endif
351
+    ret
352
+.endm
353
+
354
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
356
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
356
+.macro SAD_X_FUNC_SVE2 x, w, h
357
+function PFX(sad_x\x\()_\w\()x\h\()_sve2)
358
+    mov             x9, #FENC_STRIDE
359
+
360
+// Make function arguments for x == 3 look like x == 4.
361
+.if \x == 3
362
+    mov             x6, x5
363
+    mov             x5, x4
364
+.endif
365
+    rdvl            x11, #1
366
+    cmp             x11, #16
367
+    bgt             .vl_gt_16_sad_x\x\()_\w\()x\h
368
+.if \w == 12
369
+    movrel          x12, sad12_mask
370
+    ld1             {v31.16b}, [x12]
371
+.endif
372
+
373
+    SAD_X_START_\w \h, \x, uabdl
374
+    SAD_X_\w \h, \x
375
+    SAD_X_END_\w \x
376
+.vl_gt_16_sad_x\x\()_\w\()x\h\():
377
+.if \w == 24 || \w == 32
378
+    SAD_X_SVE2_\w \h, \x
379
+.else
380
+.if \w == 12
381
+    movrel          x12, sad12_mask
382
+    ld1             {v31.16b}, [x12]
383
+.endif
384
+
385
+    SAD_X_START_\w \h, \x, uabdl
386
+    SAD_X_\w \h, \x
387
+    SAD_X_END_\w \x
388
+.endif
389
+endfunc
390
+.endm
391
+
392
+.macro SAD_X_LOOP_SVE2 x, w, h
393
+function PFX(sad_x\x\()_\w\()x\h\()_sve2)
394
+    mov             x9, #FENC_STRIDE
395
+
396
+// Make function arguments for x == 3 look like x == 4.
397
+.if \x == 3
398
+    mov             x6, x5
399
+    mov             x5, x4
400
+.endif
401
+    rdvl            x11, #1
402
+    cmp             x11, #16
403
+    bgt             .vl_gt_16_sad_x_loop_\x\()_\w\()x\h
404
+    SAD_X_START_\w \x
405
+    mov             w12, #\h/4
406
+.loop_sad_sve2_x\x\()_\w\()x\h:
407
+    sub             w12, w12, #1
408
+ .rept 4
409
+  .if \w == 24
410
+    ld1             {v6.16b}, [x0], #16
411
+    ld1             {v7.8b}, [x0], x9
412
+  .elseif \w == 32
413
+    ld1             {v6.16b-v7.16b}, [x0], x9
414
+  .elseif \w == 48
415
+    ld1             {v4.16b-v6.16b}, [x0], x9
416
+  .elseif \w == 64
417
+    ld1             {v4.16b-v7.16b}, [x0], x9
418
+  .endif
419
+    SAD_X_\w x1, v16, v20
420
+    SAD_X_\w x2, v17, v21
421
+    SAD_X_\w x3, v18, v22
422
+  .if \x == 4
423
+    SAD_X_\w x4, v19, v23
424
+  .endif
425
+ .endr
426
+    cbnz            w12, .loop_sad_sve2_x\x\()_\w\()x\h
427
+    SAD_X_END_\w \x
428
+.vl_gt_16_sad_x_loop_\x\()_\w\()x\h\():
429
+.if \w == 24 || \w == 32
430
+    SAD_X_SVE2_\w \h, \x
431
+    ret
432
+.else
433
+    SAD_X_START_\w \x
434
+    mov             w12, #\h/4
435
+.loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
436
+    sub             w12, w12, #1
437
+ .rept 4
438
+  .if \w == 24
439
+    ld1             {v6.16b}, [x0], #16
440
+    ld1             {v7.8b}, [x0], x9
441
+  .elseif \w == 32
442
+    ld1             {v6.16b-v7.16b}, [x0], x9
443
+  .elseif \w == 48
444
+    ld1             {v4.16b-v6.16b}, [x0], x9
445
+  .elseif \w == 64
446
+    ld1             {v4.16b-v7.16b}, [x0], x9
447
+  .endif
448
+    SAD_X_\w x1, v16, v20
449
+    SAD_X_\w x2, v17, v21
450
+    SAD_X_\w x3, v18, v22
451
+  .if \x == 4
452
+    SAD_X_\w x4, v19, v23
453
+  .endif
454
+ .endr
455
+    cbnz            w12, .loop_sad_sve2_gt_16_x\x\()_\w\()x\h
456
+    SAD_X_END_\w \x
457
+.endif
458
+endfunc
459
+.endm
460
+
461
+
462
+SAD_X_FUNC_SVE2  3, 4,  4
463
+SAD_X_FUNC_SVE2  3, 4,  8
464
+SAD_X_FUNC_SVE2  3, 4,  16
465
+SAD_X_FUNC_SVE2  3, 8,  4
466
+SAD_X_FUNC_SVE2  3, 8,  8
467
+SAD_X_FUNC_SVE2  3, 8,  16
468
+SAD_X_FUNC_SVE2  3, 8,  32
469
+SAD_X_FUNC_SVE2  3, 12, 16
470
+SAD_X_FUNC_SVE2  3, 16, 4
471
+SAD_X_FUNC_SVE2  3, 16, 8
472
+SAD_X_FUNC_SVE2  3, 16, 12
473
+SAD_X_FUNC_SVE2  3, 16, 16
474
+SAD_X_FUNC_SVE2  3, 16, 32
475
+SAD_X_FUNC_SVE2  3, 16, 64
476
+SAD_X_LOOP_SVE2  3, 24, 32
477
+SAD_X_LOOP_SVE2  3, 32, 8
478
+SAD_X_LOOP_SVE2  3, 32, 16
479
+SAD_X_LOOP_SVE2  3, 32, 24
480
+SAD_X_LOOP_SVE2  3, 32, 32
481
+SAD_X_LOOP_SVE2  3, 32, 64
482
+SAD_X_LOOP_SVE2  3, 48, 64
483
+SAD_X_LOOP_SVE2  3, 64, 16
484
+SAD_X_LOOP_SVE2  3, 64, 32
485
+SAD_X_LOOP_SVE2  3, 64, 48
486
+SAD_X_LOOP_SVE2  3, 64, 64
487
+
488
+SAD_X_FUNC_SVE2  4, 4,  4
489
+SAD_X_FUNC_SVE2  4, 4,  8
490
+SAD_X_FUNC_SVE2  4, 4,  16
491
+SAD_X_FUNC_SVE2  4, 8,  4
492
+SAD_X_FUNC_SVE2  4, 8,  8
493
+SAD_X_FUNC_SVE2  4, 8,  16
494
+SAD_X_FUNC_SVE2  4, 8,  32
495
+SAD_X_FUNC_SVE2  4, 12, 16
496
+SAD_X_FUNC_SVE2  4, 16, 4
497
+SAD_X_FUNC_SVE2  4, 16, 8
498
+SAD_X_FUNC_SVE2  4, 16, 12
499
+SAD_X_FUNC_SVE2  4, 16, 16
500
+SAD_X_FUNC_SVE2  4, 16, 32
501
+SAD_X_FUNC_SVE2  4, 16, 64
502
+SAD_X_LOOP_SVE2  4, 24, 32
503
+SAD_X_LOOP_SVE2  4, 32, 8
504
+SAD_X_LOOP_SVE2  4, 32, 16
505
+SAD_X_LOOP_SVE2  4, 32, 24
506
+SAD_X_LOOP_SVE2  4, 32, 32
507
+SAD_X_LOOP_SVE2  4, 32, 64
508
+SAD_X_LOOP_SVE2  4, 48, 64
509
+SAD_X_LOOP_SVE2  4, 64, 16
510
+SAD_X_LOOP_SVE2  4, 64, 32
511
+SAD_X_LOOP_SVE2  4, 64, 48
512
+SAD_X_LOOP_SVE2  4, 64, 64
513
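
A note on the predicated SVE2 loads used throughout this file: ld1b with
a /z predicate zeroes inactive lanes, and whilelt builds a predicate
covering exactly the block width (24 or 48 bytes), so odd widths need no
mask constant like the NEON sad12_mask. A hedged intrinsics sketch of one
24-wide accumulation step, assuming SVE2 is available (svabalb/svabalt
are the intrinsic forms of the uabalb/uabalt instructions used above):

    #include <arm_sve.h>   /* requires -march=armv8-a+sve2 */
    #include <stdint.h>

    static svuint16_t sad24_row(svuint16_t acc,
                                const uint8_t *a, const uint8_t *b)
    {
        svbool_t p = svwhilelt_b8((uint64_t)0, (uint64_t)24);
        svuint8_t va = svld1_u8(p, a);   /* lanes 24.. load as zero */
        svuint8_t vb = svld1_u8(p, b);
        acc = svabalb_u16(acc, va, vb);  /* abs-diff accumulate, even lanes */
        acc = svabalt_u16(acc, va, vb);  /* abs-diff accumulate, odd lanes */
        return acc;
    }
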
x265_3.5.tar.gz/source/common/aarch64/sad-a.S -> x265_3.6.tar.gz/source/common/aarch64/sad-a.S Changed
256
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -22,84 +23,186 @@
12
  *****************************************************************************/
13
 
14
 #include "asm.S"
15
+#include "sad-a-common.S"
16
 
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
 .section .rodata
21
+#endif
22
 
23
 .align 4
24
 
25
 .text
26
 
27
-.macro SAD_X_START_8 x
28
-    ld1             {v0.8b}, x0, x9
29
-.if \x == 3
30
-    ld1             {v1.8b}, x1, x4
31
-    ld1             {v2.8b}, x2, x4
32
-    ld1             {v3.8b}, x3, x4
33
-.elseif \x == 4
34
-    ld1             {v1.8b}, x1, x5
35
-    ld1             {v2.8b}, x2, x5
36
-    ld1             {v3.8b}, x3, x5
37
-    ld1             {v4.8b}, x4, x5
38
-.endif
39
-    uabdl           v16.8h, v0.8b, v1.8b
40
-    uabdl           v17.8h, v0.8b, v2.8b
41
-    uabdl           v18.8h, v0.8b, v3.8b
42
-.if \x == 4
43
-    uabdl           v19.8h, v0.8b, v4.8b
44
+// Fully unrolled.
45
+.macro SAD_FUNC w, h
46
+function PFX(pixel_sad_\w\()x\h\()_neon)
47
+    SAD_START_\w uabdl
48
+    SAD_\w \h
49
+.if \w > 4
50
+    add             v16.8h, v16.8h, v17.8h
51
 .endif
52
+    uaddlv          s0, v16.8h
53
+    fmov            w0, s0
54
+    ret
55
+endfunc
56
+.endm
57
+
58
+// Loop unrolled 4.
59
+.macro SAD_FUNC_LOOP w, h
60
+function PFX(pixel_sad_\w\()x\h\()_neon)
61
+    SAD_START_\w
62
+
63
+    mov             w9, #\h/8
64
+.loop_\w\()x\h:
65
+    sub             w9, w9, #1
66
+.rept 4
67
+    SAD_\w
68
+.endr
69
+    cbnz            w9, .loop_\w\()x\h
70
+
71
+    SAD_END_\w
72
+endfunc
73
 .endm
74
 
75
-.macro SAD_X_8 x
76
-    ld1             {v0.8b}, x0, x9
77
+SAD_FUNC  4,  4
78
+SAD_FUNC  4,  8
79
+SAD_FUNC  4,  16
80
+SAD_FUNC  8,  4
81
+SAD_FUNC  8,  8
82
+SAD_FUNC  8,  16
83
+SAD_FUNC  8,  32
84
+SAD_FUNC  16, 4
85
+SAD_FUNC  16, 8
86
+SAD_FUNC  16, 12
87
+SAD_FUNC  16, 16
88
+SAD_FUNC  16, 32
89
+SAD_FUNC  16, 64
90
+
91
+SAD_FUNC_LOOP  32, 8
92
+SAD_FUNC_LOOP  32, 16
93
+SAD_FUNC_LOOP  32, 24
94
+SAD_FUNC_LOOP  32, 32
95
+SAD_FUNC_LOOP  32, 64
96
+SAD_FUNC_LOOP  64, 16
97
+SAD_FUNC_LOOP  64, 32
98
+SAD_FUNC_LOOP  64, 48
99
+SAD_FUNC_LOOP  64, 64
100
+SAD_FUNC_LOOP  12, 16
101
+SAD_FUNC_LOOP  24, 32
102
+SAD_FUNC_LOOP  48, 64
103
+
104
+// SAD_X3 and SAD_X4 code start
105
+
106
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
107
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
108
+.macro SAD_X_FUNC x, w, h
109
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
110
+    mov             x9, #FENC_STRIDE
111
+
112
+// Make function arguments for x == 3 look like x == 4.
113
 .if \x == 3
114
-    ld1             {v1.8b}, x1, x4
115
-    ld1             {v2.8b}, x2, x4
116
-    ld1             {v3.8b}, x3, x4
117
-.elseif \x == 4
118
-    ld1             {v1.8b}, x1, x5
119
-    ld1             {v2.8b}, x2, x5
120
-    ld1             {v3.8b}, x3, x5
121
-    ld1             {v4.8b}, x4, x5
122
+    mov             x6, x5
123
+    mov             x5, x4
124
 .endif
125
-    uabal           v16.8h, v0.8b, v1.8b
126
-    uabal           v17.8h, v0.8b, v2.8b
127
-    uabal           v18.8h, v0.8b, v3.8b
128
-.if \x == 4
129
-    uabal           v19.8h, v0.8b, v4.8b
130
+
131
+.if \w == 12
132
+    movrel          x12, sad12_mask
133
+    ld1             {v31.16b}, [x12]
134
 .endif
135
+
136
+    SAD_X_START_\w \h, \x, uabdl
137
+    SAD_X_\w \h, \x
138
+    SAD_X_END_\w \x
139
+endfunc
140
 .endm
141
 
142
-.macro SAD_X_8xN x, h
143
-function x265_sad_x\x\()_8x\h\()_neon
144
+.macro SAD_X_LOOP x, w, h
145
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
146
     mov             x9, #FENC_STRIDE
147
-    SAD_X_START_8 \x
148
-.rept \h - 1
149
-    SAD_X_8 \x
150
-.endr
151
-    uaddlv          s0, v16.8h
152
-    uaddlv          s1, v17.8h
153
-    uaddlv          s2, v18.8h
154
-.if \x == 4
155
-    uaddlv          s3, v19.8h
156
-.endif
157
 
158
+// Make function arguments for x == 3 look like x == 4.
159
 .if \x == 3
160
-    stp             s0, s1, x5
161
-    str             s2, x5, #8
162
-.elseif \x == 4
163
-    stp             s0, s1, x6
164
-    stp             s2, s3, x6, #8
165
+    mov             x6, x5
166
+    mov             x5, x4
167
 .endif
168
-    ret
169
+    SAD_X_START_\w \x
170
+    mov             w12, #\h/4
171
+.loop_sad_x\x\()_\w\()x\h:
172
+    sub             w12, w12, #1
173
+ .rept 4
174
+  .if \w == 24
175
+    ld1             {v6.16b}, [x0], #16
176
+    ld1             {v7.8b}, [x0], x9
177
+  .elseif \w == 32
178
+    ld1             {v6.16b-v7.16b}, [x0], x9
179
+  .elseif \w == 48
180
+    ld1             {v4.16b-v6.16b}, [x0], x9
181
+  .elseif \w == 64
182
+    ld1             {v4.16b-v7.16b}, [x0], x9
183
+  .endif
184
+    SAD_X_\w x1, v16, v20
185
+    SAD_X_\w x2, v17, v21
186
+    SAD_X_\w x3, v18, v22
187
+  .if \x == 4
188
+    SAD_X_\w x4, v19, v23
189
+  .endif
190
+ .endr
191
+    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
192
+    SAD_X_END_\w \x
193
 endfunc
194
 .endm
195
 
196
-SAD_X_8xN 3 4
197
-SAD_X_8xN 3 8
198
-SAD_X_8xN 3 16
199
-SAD_X_8xN 3 32
200
 
201
-SAD_X_8xN 4 4
202
-SAD_X_8xN 4 8
203
-SAD_X_8xN 4 16
204
-SAD_X_8xN 4 32
205
+SAD_X_FUNC  3, 4,  4
206
+SAD_X_FUNC  3, 4,  8
207
+SAD_X_FUNC  3, 4,  16
208
+SAD_X_FUNC  3, 8,  4
209
+SAD_X_FUNC  3, 8,  8
210
+SAD_X_FUNC  3, 8,  16
211
+SAD_X_FUNC  3, 8,  32
212
+SAD_X_FUNC  3, 12, 16
213
+SAD_X_FUNC  3, 16, 4
214
+SAD_X_FUNC  3, 16, 8
215
+SAD_X_FUNC  3, 16, 12
216
+SAD_X_FUNC  3, 16, 16
217
+SAD_X_FUNC  3, 16, 32
218
+SAD_X_FUNC  3, 16, 64
219
+SAD_X_LOOP  3, 24, 32
220
+SAD_X_LOOP  3, 32, 8
221
+SAD_X_LOOP  3, 32, 16
222
+SAD_X_LOOP  3, 32, 24
223
+SAD_X_LOOP  3, 32, 32
224
+SAD_X_LOOP  3, 32, 64
225
+SAD_X_LOOP  3, 48, 64
226
+SAD_X_LOOP  3, 64, 16
227
+SAD_X_LOOP  3, 64, 32
228
+SAD_X_LOOP  3, 64, 48
229
+SAD_X_LOOP  3, 64, 64
230
+
231
+SAD_X_FUNC  4, 4,  4
232
+SAD_X_FUNC  4, 4,  8
233
+SAD_X_FUNC  4, 4,  16
234
+SAD_X_FUNC  4, 8,  4
235
+SAD_X_FUNC  4, 8,  8
236
+SAD_X_FUNC  4, 8,  16
237
+SAD_X_FUNC  4, 8,  32
238
+SAD_X_FUNC  4, 12, 16
239
+SAD_X_FUNC  4, 16, 4
240
+SAD_X_FUNC  4, 16, 8
241
+SAD_X_FUNC  4, 16, 12
242
+SAD_X_FUNC  4, 16, 16
243
+SAD_X_FUNC  4, 16, 32
244
+SAD_X_FUNC  4, 16, 64
245
+SAD_X_LOOP  4, 24, 32
246
+SAD_X_LOOP  4, 32, 8
247
+SAD_X_LOOP  4, 32, 16
248
+SAD_X_LOOP  4, 32, 24
249
+SAD_X_LOOP  4, 32, 32
250
+SAD_X_LOOP  4, 32, 64
251
+SAD_X_LOOP  4, 48, 64
252
+SAD_X_LOOP  4, 64, 16
253
+SAD_X_LOOP  4, 64, 32
254
+SAD_X_LOOP  4, 64, 48
255
+SAD_X_LOOP  4, 64, 64
256
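
For context on the sad_x3/sad_x4 functions rewritten above: one encoder
block with the fixed stride FENC_STRIDE (64 in x265) is compared against
three or four candidate blocks sharing a single stride, and the resulting
SADs are stored through the results pointer. A scalar C sketch of the x4
variant (the function name is hypothetical):

    #include <stdint.h>

    enum { FENC_STRIDE = 64 };

    static void sad_x4_ref(int w, int h, const uint8_t *fenc,
                           const uint8_t *pix[4], intptr_t stride,
                           int32_t res[4])
    {
        for (int i = 0; i < 4; i++) {
            uint32_t sum = 0;
            const uint8_t *f = fenc, *p = pix[i];
            for (int y = 0; y < h; y++) {
                for (int x = 0; x < w; x++) {
                    int d = f[x] - p[x];
                    sum += (uint32_t)(d < 0 ? -d : d);
                }
                f += FENC_STRIDE;
                p += stride;
            }
            res[i] = (int32_t)sum;
        }
    }

The x == 3 case reuses the same macro bodies: the "Make function
arguments for x == 3 look like x == 4" moves shift the stride and result
pointer into the registers the x == 4 layout expects.
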
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using the NEON instruction set
26
+// that are also used by the SVE2 functions.
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+.macro ret_v0_w0
33
+    trn2            v1.2d, v0.2d, v0.2d
34
+    add             v0.2s, v0.2s, v1.2s
35
+    addp            v0.2s, v0.2s, v0.2s
36
+    fmov            w0, s0
37
+    ret
38
+.endm
39
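
The ret_v0_w0 helper above is a horizontal add of the four 32-bit lanes
of v0: trn2 copies the high 64 bits over the low ones, the first add
folds lanes {0,2} and {1,3}, addp folds the remaining pair, and fmov
moves lane 0 to w0. In scalar terms:

    #include <stdint.h>

    /* Scalar meaning of ret_v0_w0: w0 = v0[0] + v0[1] + v0[2] + v0[3]. */
    static uint32_t hadd_4s(const uint32_t v[4])
    {
        return (v[0] + v[2]) + (v[1] + v[3]);
    }
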
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S Added
80
 
1
@@ -0,0 +1,78 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+function PFX(pixel_sse_pp_4x4_sve)
40
+    ptrue           p0.s, vl4
41
+    ld1b            {z0.s}, p0/z, [x0]
42
+    ld1b            {z17.s}, p0/z, [x2]
43
+    add             x0, x0, x1
44
+    add             x2, x2, x3
45
+    sub             z0.s, p0/m, z0.s, z17.s
46
+    mul             z0.s, p0/m, z0.s, z0.s
47
+.rept 3
48
+    ld1b            {z16.s}, p0/z, [x0]
49
+    ld1b            {z17.s}, p0/z, [x2]
50
+    add             x0, x0, x1
51
+    add             x2, x2, x3
52
+    sub             z16.s, p0/m, z16.s, z17.s
53
+    mla             z0.s, p0/m, z16.s, z16.s
54
+.endr
55
+    uaddv           d0, p0, z0.s
56
+    fmov            w0, s0
57
+    ret
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_sve)
61
+    ptrue           p0.s, vl4
62
+    ld1b            {z0.s}, p0/z, x0
63
+    ld1b            {z17.s}, p0/z, x2
64
+    add             x0, x0, x1
65
+    add             x2, x2, x3
66
+    sub             z0.s, p0/m, z0.s, z17.s
67
+    mul             z0.s, p0/m, z0.s, z0.s
68
+.rept 7
69
+    ld1b            {z16.s}, p0/z, [x0]
70
+    ld1b            {z17.s}, p0/z, [x2]
71
+    add             x0, x0, x1
72
+    add             x2, x2, x3
73
+    sub             z16.s, p0/m, z16.s, z17.s
74
+    mla             z0.s, p0/m, z16.s, z16.s
75
+.endr
76
+    uaddv           d0, p0, z0.s
77
+    fmov            w0, s0
78
+    ret
79
+endfunc
80
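
The pixel_sse_pp functions above compute SSE (sum of squared errors)
rather than SAD; note the ld1b {z0.s} loads, which widen each 8-bit
pixel to a 32-bit lane so the subtract and multiply-accumulate run at
element size .s. A scalar C sketch (the function name is hypothetical):

    #include <stdint.h>

    static uint32_t sse_pp_ref(int w, int h,
                               const uint8_t *pix1, intptr_t stride1,
                               const uint8_t *pix2, intptr_t stride2)
    {
        uint32_t sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int d = pix1[x] - pix2[x];
                sum += (uint32_t)(d * d);   /* squared error */
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }
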
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S Added
889
 
1
@@ -0,0 +1,887 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "ssd-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sse_pp_32x32_sve2)
41
+    rdvl            x9, #1
42
+    cmp             x9, #16
43
+    bgt             .vl_gt_16_pixel_sse_pp_32x32
44
+    mov             w12, #8
45
+    movi            v0.16b, #0
46
+    movi            v1.16b, #0
47
+.loop_sse_pp_32_sve2:
48
+    sub             w12, w12, #1
49
+.rept 4
50
+    ld1             {v16.16b,v17.16b}, x0, x1
51
+    ld1             {v18.16b,v19.16b}, x2, x3
52
+    usubl           v2.8h, v16.8b, v18.8b
53
+    usubl2          v3.8h, v16.16b, v18.16b
54
+    usubl           v4.8h, v17.8b, v19.8b
55
+    usubl2          v5.8h, v17.16b, v19.16b
56
+    smlal           v0.4s, v2.4h, v2.4h
57
+    smlal2          v1.4s, v2.8h, v2.8h
58
+    smlal           v0.4s, v3.4h, v3.4h
59
+    smlal2          v1.4s, v3.8h, v3.8h
60
+    smlal           v0.4s, v4.4h, v4.4h
61
+    smlal2          v1.4s, v4.8h, v4.8h
62
+    smlal           v0.4s, v5.4h, v5.4h
63
+    smlal2          v1.4s, v5.8h, v5.8h
64
+.endr
65
+    cbnz            w12, .loop_sse_pp_32_sve2
66
+    add             v0.4s, v0.4s, v1.4s
67
+    ret_v0_w0
68
+.vl_gt_16_pixel_sse_pp_32x32:
69
+    ptrue           p0.b, vl32
70
+    ld1b            {z16.b}, p0/z, x0
71
+    ld1b            {z18.b}, p0/z, x2
72
+    add             x0, x0, x1
73
+    add             x2, x2, x3
74
+    usublb          z1.h, z16.b, z18.b
75
+    usublt          z2.h, z16.b, z18.b
76
+    smullb          z0.s, z1.h, z1.h
77
+    smlalt          z0.s, z1.h, z1.h
78
+    smlalb          z0.s, z2.h, z2.h
79
+    smlalt          z0.s, z2.h, z2.h
80
+.rept 31
81
+    ld1b            {z16.b}, p0/z, x0
82
+    ld1b            {z18.b}, p0/z, x2
83
+    add             x0, x0, x1
84
+    add             x2, x2, x3
85
+    usublb          z1.h, z16.b, z18.b
86
+    usublt          z2.h, z16.b, z18.b
87
+    smullb          z0.s, z1.h, z1.h
88
+    smlalt          z0.s, z1.h, z1.h
89
+    smlalb          z0.s, z2.h, z2.h
90
+    smlalt          z0.s, z2.h, z2.h
91
+.endr
92
+    uaddv           d3, p0, z0.s
93
+    fmov            w0, s3
94
+    ret
95
+endfunc
96
+
97
+function PFX(pixel_sse_pp_32x64_sve2)
98
+    rdvl            x9, #1
99
+    cmp             x9, #16
100
+    bgt             .vl_gt_16_pixel_sse_pp_32x64
101
+    ptrue           p0.b, vl16
102
+    ld1b            {z16.b}, p0/z, x0
103
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
104
+    ld1b            {z18.b}, p0/z, x2
105
+    ld1b            {z19.b}, p0/z, x2, #1, mul vl
106
+    add             x0, x0, x1
107
+    add             x2, x2, x3
108
+    usublb          z1.h, z16.b, z18.b
109
+    usublt          z2.h, z16.b, z18.b
110
+    usublb          z3.h, z17.b, z19.b
111
+    usublt          z4.h, z17.b, z19.b
112
+    smullb          z20.s, z1.h, z1.h
113
+    smullt          z21.s, z1.h, z1.h
114
+    smlalb          z20.s, z2.h, z2.h
115
+    smlalt          z21.s, z2.h, z2.h
116
+    smlalb          z20.s, z3.h, z3.h
117
+    smlalt          z21.s, z3.h, z3.h
118
+    smlalb          z20.s, z4.h, z4.h
119
+    smlalt          z21.s, z4.h, z4.h
120
+.rept 63
121
+    ld1b            {z16.b}, p0/z, x0
122
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
123
+    ld1b            {z18.b}, p0/z, x2
124
+    ld1b            {z19.b}, p0/z, x2, #1, mul vl
125
+    add             x0, x0, x1
126
+    add             x2, x2, x3
127
+    usublb          z1.h, z16.b, z18.b
128
+    usublt          z2.h, z16.b, z18.b
129
+    usublb          z3.h, z17.b, z19.b
130
+    usublt          z4.h, z17.b, z19.b
131
+    smlalb          z20.s, z1.h, z1.h
132
+    smlalt          z21.s, z1.h, z1.h
133
+    smlalb          z20.s, z2.h, z2.h
134
+    smlalt          z21.s, z2.h, z2.h
135
+    smlalb          z20.s, z3.h, z3.h
136
+    smlalt          z21.s, z3.h, z3.h
137
+    smlalb          z20.s, z4.h, z4.h
138
+    smlalt          z21.s, z4.h, z4.h
139
+.endr
140
+    uaddv           d3, p0, z20.s
141
+    fmov            w0, s3
142
+    uaddv           d4, p0, z21.s
143
+    fmov            w1, s4
144
+    add             w0, w0, w1
145
+    ret
146
+.vl_gt_16_pixel_sse_pp_32x64:
147
+    ptrue           p0.b, vl32
148
+    ld1b            {z16.b}, p0/z, x0
149
+    ld1b            {z18.b}, p0/z, x2
150
+    add             x0, x0, x1
151
+    add             x2, x2, x3
152
+    usublb          z1.h, z16.b, z18.b
153
+    usublt          z2.h, z16.b, z18.b
154
+    smullb          z20.s, z1.h, z1.h
155
+    smullt          z21.s, z1.h, z1.h
156
+    smlalb          z20.s, z2.h, z2.h
157
+    smlalt          z21.s, z2.h, z2.h
158
+.rept 63
159
+    ld1b            {z16.b}, p0/z, x0
160
+    ld1b            {z18.b}, p0/z, x2
161
+    add             x0, x0, x1
162
+    add             x2, x2, x3
163
+    usublb          z1.h, z16.b, z18.b
164
+    usublt          z2.h, z16.b, z18.b
165
+    smlalb          z20.s, z1.h, z1.h
166
+    smlalt          z21.s, z1.h, z1.h
167
+    smlalb          z20.s, z2.h, z2.h
168
+    smlalt          z21.s, z2.h, z2.h
169
+.endr
170
+    uaddv           d3, p0, z20.s
171
+    fmov            w0, s3
172
+    uaddv           d4, p0, z21.s
173
+    fmov            w1, s4
174
+    add             w0, w0, w1
175
+    ret
176
+endfunc
177
+
178
+function PFX(pixel_sse_pp_64x64_sve2)
179
+    rdvl            x9, #1
180
+    cmp             x9, #16
181
+    bgt             .vl_gt_16_pixel_sse_pp_64x64
182
+    mov             w12, #16
183
+    movi            v0.16b, #0
184
+    movi            v1.16b, #0
185
+
186
+.loop_sse_pp_64_sve2:
187
+    sub             w12, w12, #1
188
+.rept 4
189
+    ld1             {v16.16b-v19.16b}, x0, x1
190
+    ld1             {v20.16b-v23.16b}, x2, x3
191
+
192
+    usubl           v2.8h, v16.8b, v20.8b
193
+    usubl2          v3.8h, v16.16b, v20.16b
194
+    usubl           v4.8h, v17.8b, v21.8b
195
+    usubl2          v5.8h, v17.16b, v21.16b
196
+    smlal           v0.4s, v2.4h, v2.4h
197
+    smlal2          v1.4s, v2.8h, v2.8h
198
+    smlal           v0.4s, v3.4h, v3.4h
199
+    smlal2          v1.4s, v3.8h, v3.8h
200
+    smlal           v0.4s, v4.4h, v4.4h
201
+    smlal2          v1.4s, v4.8h, v4.8h
202
+    smlal           v0.4s, v5.4h, v5.4h
203
+    smlal2          v1.4s, v5.8h, v5.8h
204
+
205
+    usubl           v2.8h, v18.8b, v22.8b
206
+    usubl2          v3.8h, v18.16b, v22.16b
207
+    usubl           v4.8h, v19.8b, v23.8b
208
+    usubl2          v5.8h, v19.16b, v23.16b
209
+    smlal           v0.4s, v2.4h, v2.4h
210
+    smlal2          v1.4s, v2.8h, v2.8h
211
+    smlal           v0.4s, v3.4h, v3.4h
212
+    smlal2          v1.4s, v3.8h, v3.8h
213
+    smlal           v0.4s, v4.4h, v4.4h
214
+    smlal2          v1.4s, v4.8h, v4.8h
215
+    smlal           v0.4s, v5.4h, v5.4h
216
+    smlal2          v1.4s, v5.8h, v5.8h
217
+.endr
218
+    cbnz            w12, .loop_sse_pp_64_sve2
219
+    add             v0.4s, v0.4s, v1.4s
220
+    ret_v0_w0
221
+.vl_gt_16_pixel_sse_pp_64x64:
222
+    cmp             x9, #48
223
+    bgt             .vl_gt_48_pixel_sse_pp_64x64
224
+    ptrue           p0.b, vl32
225
+    ld1b            {z16.b}, p0/z, x0
226
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
227
+    ld1b            {z20.b}, p0/z, x2
228
+    ld1b            {z21.b}, p0/z, x2, #1, mul vl
229
+    add             x0, x0, x1
230
+    add             x2, x2, x3
231
+    usublb          z1.h, z16.b, z20.b
232
+    usublt          z2.h, z16.b, z20.b
233
+    usublb          z3.h, z17.b, z21.b
234
+    usublt          z4.h, z17.b, z21.b
235
+    smullb          z24.s, z1.h, z1.h
236
+    smullt          z25.s, z1.h, z1.h
237
+    smlalb          z24.s, z2.h, z2.h
238
+    smlalt          z25.s, z2.h, z2.h
239
+    smlalb          z24.s, z3.h, z3.h
240
+    smlalt          z25.s, z3.h, z3.h
241
+    smlalb          z24.s, z4.h, z4.h
242
+    smlalt          z25.s, z4.h, z4.h
243
+.rept 63
244
+    ld1b            {z16.b}, p0/z, x0
245
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
246
+    ld1b            {z20.b}, p0/z, x2
247
+    ld1b            {z21.b}, p0/z, x2, #1, mul vl
248
+    add             x0, x0, x1
249
+    add             x2, x2, x3
250
+    usublb          z1.h, z16.b, z20.b
251
+    usublt          z2.h, z16.b, z20.b
252
+    usublb          z3.h, z17.b, z21.b
253
+    usublt          z4.h, z17.b, z21.b
254
+    smlalb          z24.s, z1.h, z1.h
255
+    smlalt          z25.s, z1.h, z1.h
256
+    smlalb          z24.s, z2.h, z2.h
257
+    smlalt          z25.s, z2.h, z2.h
258
+    smlalb          z24.s, z3.h, z3.h
259
+    smlalt          z25.s, z3.h, z3.h
260
+    smlalb          z24.s, z4.h, z4.h
261
+    smlalt          z25.s, z4.h, z4.h
262
+.endr
263
+    uaddv           d3, p0, z24.s
264
+    fmov            w0, s3
265
+    uaddv           d4, p0, z25.s
266
+    fmov            w1, s4
267
+    add             w0, w0, w1
268
+    ret
269
+.vl_gt_48_pixel_sse_pp_64x64:
270
+    ptrue           p0.b, vl64
271
+    ld1b            {z16.b}, p0/z, x0
272
+    ld1b            {z20.b}, p0/z, x2
273
+    add             x0, x0, x1
274
+    add             x2, x2, x3
275
+    usublb          z1.h, z16.b, z20.b
276
+    usublt          z2.h, z16.b, z20.b
277
+    smullb          z24.s, z1.h, z1.h
278
+    smullt          z25.s, z1.h, z1.h
279
+    smlalb          z24.s, z2.h, z2.h
280
+    smlalt          z25.s, z2.h, z2.h
281
+.rept 63
282
+    ld1b            {z16.b}, p0/z, x0
283
+    ld1b            {z20.b}, p0/z, x2
284
+    add             x0, x0, x1
285
+    add             x2, x2, x3
286
+    usublb          z1.h, z16.b, z20.b
287
+    usublt          z2.h, z16.b, z20.b
288
+    smlalb          z24.s, z1.h, z1.h
289
+    smlalt          z25.s, z1.h, z1.h
290
+    smlalb          z24.s, z2.h, z2.h
291
+    smlalt          z25.s, z2.h, z2.h
292
+.endr
293
+    uaddv           d3, p0, z24.s
294
+    fmov            w0, s3
295
+    uaddv           d4, p0, z25.s
296
+    fmov            w1, s4
297
+    add             w0, w0, w1
298
+    ret
299
+endfunc
300
+
301
+function PFX(pixel_sse_ss_4x4_sve2)
302
+    ptrue           p0.b, vl8
303
+    ld1b            {z16.b}, p0/z, x0
304
+    ld1b            {z17.b}, p0/z, x2
305
+    add             x0, x0, x1, lsl #1
306
+    add             x2, x2, x3, lsl #1
307
+    sub             z1.h, z16.h, z17.h
308
+    smullb          z3.s, z1.h, z1.h
309
+    smullt          z4.s, z1.h, z1.h
310
+.rept 3
311
+    ld1b            {z16.b}, p0/z, x0
312
+    ld1b            {z17.b}, p0/z, x2
313
+    add             x0, x0, x1, lsl #1
314
+    add             x2, x2, x3, lsl #1
315
+    sub             z1.h, z16.h, z17.h
316
+    smlalb          z3.s, z1.h, z1.h
317
+    smlalt          z4.s, z1.h, z1.h
318
+.endr
319
+    uaddv           d3, p0, z3.s
320
+    fmov            w0, s3
321
+    uaddv           d4, p0, z4.s
322
+    fmov            w1, s4
323
+    add             w0, w0, w1
324
+    ret
325
+endfunc
326
+
327
+function PFX(pixel_sse_ss_8x8_sve2)
328
+    ptrue           p0.b, vl16
329
+    ld1b            {z16.b}, p0/z, x0
330
+    ld1b            {z17.b}, p0/z, x2
331
+    add             x0, x0, x1, lsl #1
332
+    add             x2, x2, x3, lsl #1
333
+    sub             z1.h, z16.h, z17.h
334
+    smullb          z3.s, z1.h, z1.h
335
+    smullt          z4.s, z1.h, z1.h
336
+.rept 7
337
+    ld1b            {z16.b}, p0/z, x0
338
+    ld1b            {z17.b}, p0/z, x2
339
+    add             x0, x0, x1, lsl #1
340
+    add             x2, x2, x3, lsl #1
341
+    sub             z1.h, z16.h, z17.h
342
+    smlalb          z3.s, z1.h, z1.h
343
+    smlalt          z4.s, z1.h, z1.h
344
+.endr
345
+    uaddv           d3, p0, z3.s
346
+    fmov            w0, s3
347
+    uaddv           d4, p0, z4.s
348
+    fmov            w1, s4
349
+    add             w0, w0, w1
350
+    ret
351
+endfunc
352
+
353
+function PFX(pixel_sse_ss_16x16_sve2)
354
+    rdvl            x9, #1
355
+    cmp             x9, #16
356
+    bgt             .vl_gt_16_pixel_sse_ss_16x16
357
+    ptrue           p0.b, vl16
358
+    ld1b            {z16.b}, p0/z, x0
359
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
360
+    ld1b            {z18.b}, p0/z, x2
361
+    ld1b            {z19.b}, p0/z, x2, #1, mul vl
362
+    add             x0, x0, x1, lsl #1
363
+    add             x2, x2, x3, lsl #1
364
+    sub             z1.h, z16.h, z18.h
365
+    sub             z2.h, z17.h, z19.h
366
+    smullb          z3.s, z1.h, z1.h
367
+    smullt          z4.s, z1.h, z1.h
368
+    smlalb          z3.s, z2.h, z2.h
369
+    smlalt          z4.s, z2.h, z2.h
370
+.rept 15
371
+    ld1b            {z16.b}, p0/z, x0
372
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
373
+    ld1b            {z18.b}, p0/z, x2
374
+    ld1b            {z19.b}, p0/z, x2, #1, mul vl
375
+    add             x0, x0, x1, lsl #1
376
+    add             x2, x2, x3, lsl #1
377
+    sub             z1.h, z16.h, z18.h
378
+    sub             z2.h, z17.h, z19.h
379
+    smlalb          z3.s, z1.h, z1.h
380
+    smlalt          z4.s, z1.h, z1.h
381
+    smlalb          z3.s, z2.h, z2.h
382
+    smlalt          z4.s, z2.h, z2.h
383
+.endr
384
+    uaddv           d3, p0, z3.s
385
+    fmov            w0, s3
386
+    uaddv           d4, p0, z4.s
387
+    fmov            w1, s4
388
+    add             w0, w0, w1
389
+    ret
390
+.vl_gt_16_pixel_sse_ss_16x16:
391
+    ptrue           p0.b, vl32
392
+    ld1b            {z16.b}, p0/z, x0
393
+    ld1b            {z18.b}, p0/z, x2
394
+    add             x0, x0, x1, lsl #1
395
+    add             x2, x2, x3, lsl #1
396
+    sub             z1.h, z16.h, z18.h
397
+    smullb          z3.s, z1.h, z1.h
398
+    smullt          z4.s, z1.h, z1.h
399
+.rept 15
400
+    ld1b            {z16.b}, p0/z, x0
401
+    ld1b            {z18.b}, p0/z, x2
402
+    add             x0, x0, x1, lsl #1
403
+    add             x2, x2, x3, lsl #1
404
+    sub             z1.h, z16.h, z18.h
405
+    smlalb          z3.s, z1.h, z1.h
406
+    smlalt          z4.s, z1.h, z1.h
407
+.endr
408
+    uaddv           d3, p0, z3.s
409
+    fmov            w0, s3
410
+    uaddv           d4, p0, z4.s
411
+    fmov            w1, s4
412
+    add             w0, w0, w1
413
+    ret
414
+endfunc
415
+
416
+function PFX(pixel_sse_ss_32x32_sve2)
417
+    rdvl            x9, #1
418
+    cmp             x9, #16
419
+    bgt             .vl_gt_16_pixel_sse_ss_32x32
420
+    ptrue           p0.b, vl16
421
+    ld1b            {z16.b}, p0/z, x0
422
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
423
+    ld1b            {z18.b}, p0/z, x0, #2, mul vl
424
+    ld1b            {z19.b}, p0/z, x0, #3, mul vl
425
+    ld1b            {z20.b}, p0/z, x2
426
+    ld1b            {z21.b}, p0/z, x2, #1, mul vl
427
+    ld1b            {z22.b}, p0/z, x2, #2, mul vl
428
+    ld1b            {z23.b}, p0/z, x2, #3, mul vl
429
+    add             x0, x0, x1, lsl #1
430
+    add             x2, x2, x3, lsl #1
431
+    sub             z1.h, z16.h, z20.h
432
+    sub             z2.h, z17.h, z21.h
433
+    sub             z3.h, z18.h, z22.h
434
+    sub             z4.h, z19.h, z23.h
435
+    smullb          z5.s, z1.h, z1.h
436
+    smullt          z6.s, z1.h, z1.h
437
+    smlalb          z5.s, z2.h, z2.h
438
+    smlalt          z6.s, z2.h, z2.h
439
+    smlalb          z5.s, z3.h, z3.h
440
+    smlalt          z6.s, z3.h, z3.h
441
+    smlalb          z5.s, z4.h, z4.h
442
+    smlalt          z6.s, z4.h, z4.h
443
+.rept 31
444
+    ld1b            {z16.b}, p0/z, x0
445
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
446
+    ld1b            {z18.b}, p0/z, x0, #2, mul vl
447
+    ld1b            {z19.b}, p0/z, x0, #3, mul vl
448
+    ld1b            {z20.b}, p0/z, x2
449
+    ld1b            {z21.b}, p0/z, x2, #1, mul vl
450
+    ld1b            {z22.b}, p0/z, x2, #2, mul vl
451
+    ld1b            {z23.b}, p0/z, x2, #3, mul vl
452
+    add             x0, x0, x1, lsl #1
453
+    add             x2, x2, x3, lsl #1
454
+    sub             z1.h, z16.h, z20.h
455
+    sub             z2.h, z17.h, z21.h
456
+    sub             z3.h, z18.h, z22.h
457
+    sub             z4.h, z19.h, z23.h
458
+    smlalb          z5.s, z1.h, z1.h
459
+    smlalt          z6.s, z1.h, z1.h
460
+    smlalb          z5.s, z2.h, z2.h
461
+    smlalt          z6.s, z2.h, z2.h
462
+    smlalb          z5.s, z3.h, z3.h
463
+    smlalt          z6.s, z3.h, z3.h
464
+    smlalb          z5.s, z4.h, z4.h
465
+    smlalt          z6.s, z4.h, z4.h
466
+.endr
467
+    uaddv           d3, p0, z5.s
468
+    fmov            w0, s3
469
+    uaddv           d4, p0, z6.s
470
+    fmov            w1, s4
471
+    add             w0, w0, w1
472
+    ret
473
+.vl_gt_16_pixel_sse_ss_32x32:
474
+    cmp             x9, #48
475
+    bgt             .vl_gt_48_pixel_sse_ss_32x32
476
+    ptrue           p0.b, vl32
477
+    ld1b            {z16.b}, p0/z, x0
478
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
479
+    ld1b            {z20.b}, p0/z, x2
480
+    ld1b            {z21.b}, p0/z, x2, #1, mul vl
481
+    add             x0, x0, x1, lsl #1
482
+    add             x2, x2, x3, lsl #1
483
+    sub             z1.h, z16.h, z20.h
484
+    sub             z2.h, z17.h, z21.h
485
+    smullb          z5.s, z1.h, z1.h
486
+    smullt          z6.s, z1.h, z1.h
487
+    smlalb          z5.s, z2.h, z2.h
488
+    smlalt          z6.s, z2.h, z2.h
489
+.rept 31
490
+    ld1b            {z16.b}, p0/z, x0
491
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
492
+    ld1b            {z20.b}, p0/z, x2
493
+    ld1b            {z21.b}, p0/z, x2, #1, mul vl
494
+    add             x0, x0, x1, lsl #1
495
+    add             x2, x2, x3, lsl #1
496
+    sub             z1.h, z16.h, z20.h
497
+    sub             z2.h, z17.h, z21.h
498
+    smlalb          z5.s, z1.h, z1.h
499
+    smlalt          z6.s, z1.h, z1.h
500
+    smlalb          z5.s, z2.h, z2.h
501
+    smlalt          z6.s, z2.h, z2.h
502
+.endr
503
+    uaddv           d3, p0, z5.s
504
+    fmov            w0, s3
505
+    uaddv           d4, p0, z6.s
506
+    fmov            w1, s4
507
+    add             w0, w0, w1
508
+    ret
509
+.vl_gt_48_pixel_sse_ss_32x32:
510
+    ptrue           p0.b, vl64
511
+    ld1b            {z16.b}, p0/z, x0
512
+    ld1b            {z20.b}, p0/z, x2
513
+    add             x0, x0, x1, lsl #1
514
+    add             x2, x2, x3, lsl #1
515
+    sub             z1.h, z16.h, z20.h
516
+    smullb          z5.s, z1.h, z1.h
517
+    smullt          z6.s, z1.h, z1.h
518
+.rept 31
519
+    ld1b            {z16.b}, p0/z, x0
520
+    ld1b            {z20.b}, p0/z, x2
521
+    add             x0, x0, x1, lsl #1
522
+    add             x2, x2, x3, lsl #1
523
+    sub             z1.h, z16.h, z20.h
524
+    smlalb          z5.s, z1.h, z1.h
525
+    smlalt          z6.s, z1.h, z1.h
526
+.endr
527
+    uaddv           d3, p0, z5.s
528
+    fmov            w0, s3
529
+    uaddv           d4, p0, z6.s
530
+    fmov            w1, s4
531
+    add             w0, w0, w1
532
+    ret
533
+endfunc
534
+
535
+function PFX(pixel_sse_ss_64x64_sve2)
536
+    rdvl            x9, #1
537
+    cmp             x9, #16
538
+    bgt             .vl_gt_16_pixel_sse_ss_64x64
539
+    ptrue           p0.b, vl16
540
+    ld1b            {z24.b}, p0/z, x0
541
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
542
+    ld1b            {z26.b}, p0/z, x0, #2, mul vl
543
+    ld1b            {z27.b}, p0/z, x0, #3, mul vl
544
+    ld1b            {z28.b}, p0/z, x2
545
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
546
+    ld1b            {z30.b}, p0/z, x2, #2, mul vl
547
+    ld1b            {z31.b}, p0/z, x2, #3, mul vl
548
+    sub             z0.h, z24.h, z28.h
549
+    sub             z1.h, z25.h, z29.h
550
+    sub             z2.h, z26.h, z30.h
551
+    sub             z3.h, z27.h, z31.h
552
+    smullb          z5.s, z0.h, z0.h
553
+    smullt          z6.s, z0.h, z0.h
554
+    smlalb          z5.s, z1.h, z1.h
555
+    smlalt          z6.s, z1.h, z1.h
556
+    smlalb          z5.s, z2.h, z2.h
557
+    smlalt          z6.s, z2.h, z2.h
558
+    smlalb          z5.s, z3.h, z3.h
559
+    smlalt          z6.s, z3.h, z3.h
560
+    ld1b            {z24.b}, p0/z, x0, #4, mul vl
561
+    ld1b            {z25.b}, p0/z, x0, #5, mul vl
562
+    ld1b            {z26.b}, p0/z, x0, #6, mul vl
563
+    ld1b            {z27.b}, p0/z, x0, #7, mul vl
564
+    ld1b            {z28.b}, p0/z, x2, #4, mul vl
565
+    ld1b            {z29.b}, p0/z, x2, #5, mul vl
566
+    ld1b            {z30.b}, p0/z, x2, #6, mul vl
567
+    ld1b            {z31.b}, p0/z, x2, #7, mul vl
568
+    sub             z0.h, z24.h, z28.h
569
+    sub             z1.h, z25.h, z29.h
570
+    sub             z2.h, z26.h, z30.h
571
+    sub             z3.h, z27.h, z31.h
572
+    smlalb          z5.s, z0.h, z0.h
573
+    smlalt          z6.s, z0.h, z0.h
574
+    smlalb          z5.s, z1.h, z1.h
575
+    smlalt          z6.s, z1.h, z1.h
576
+    smlalb          z5.s, z2.h, z2.h
577
+    smlalt          z6.s, z2.h, z2.h
578
+    smlalb          z5.s, z3.h, z3.h
579
+    smlalt          z6.s, z3.h, z3.h
580
+    add             x0, x0, x1, lsl #1
581
+    add             x2, x2, x3, lsl #1
582
+.rept 63
583
+    ld1b            {z24.b}, p0/z, x0
584
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
585
+    ld1b            {z26.b}, p0/z, x0, #2, mul vl
586
+    ld1b            {z27.b}, p0/z, x0, #3, mul vl
587
+    ld1b            {z28.b}, p0/z, x2
588
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
589
+    ld1b            {z30.b}, p0/z, x2, #2, mul vl
590
+    ld1b            {z31.b}, p0/z, x2, #3, mul vl
591
+    sub             z0.h, z24.h, z28.h
592
+    sub             z1.h, z25.h, z29.h
593
+    sub             z2.h, z26.h, z30.h
594
+    sub             z3.h, z27.h, z31.h
595
+    smlalb          z5.s, z0.h, z0.h
596
+    smlalt          z6.s, z0.h, z0.h
597
+    smlalb          z5.s, z1.h, z1.h
598
+    smlalt          z6.s, z1.h, z1.h
599
+    smlalb          z5.s, z2.h, z2.h
600
+    smlalt          z6.s, z2.h, z2.h
601
+    smlalb          z5.s, z3.h, z3.h
602
+    smlalt          z6.s, z3.h, z3.h
603
+    ld1b            {z24.b}, p0/z, x0, #4, mul vl
604
+    ld1b            {z25.b}, p0/z, x0, #5, mul vl
605
+    ld1b            {z26.b}, p0/z, x0, #6, mul vl
606
+    ld1b            {z27.b}, p0/z, x0, #7, mul vl
607
+    ld1b            {z28.b}, p0/z, x2, #4, mul vl
608
+    ld1b            {z29.b}, p0/z, x2, #5, mul vl
609
+    ld1b            {z30.b}, p0/z, x2, #6, mul vl
610
+    ld1b            {z31.b}, p0/z, x2, #7, mul vl
611
+    sub             z0.h, z24.h, z28.h
612
+    sub             z1.h, z25.h, z29.h
613
+    sub             z2.h, z26.h, z30.h
614
+    sub             z3.h, z27.h, z31.h
615
+    smlalb          z5.s, z0.h, z0.h
616
+    smlalt          z6.s, z0.h, z0.h
617
+    smlalb          z5.s, z1.h, z1.h
618
+    smlalt          z6.s, z1.h, z1.h
619
+    smlalb          z5.s, z2.h, z2.h
620
+    smlalt          z6.s, z2.h, z2.h
621
+    smlalb          z5.s, z3.h, z3.h
622
+    smlalt          z6.s, z3.h, z3.h
623
+    add             x0, x0, x1, lsl #1
624
+    add             x2, x2, x3, lsl #1
625
+.endr
626
+    uaddv           d3, p0, z5.s
627
+    fmov            w0, s3
628
+    uaddv           d4, p0, z6.s
629
+    fmov            w1, s4
630
+    add             w0, w0, w1
631
+    ret
632
+.vl_gt_16_pixel_sse_ss_64x64:
633
+    cmp             x9, #48
634
+    bgt             .vl_gt_48_pixel_sse_ss_64x64
635
+    ptrue           p0.b, vl32
636
+    ld1b            {z24.b}, p0/z, x0
637
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
638
+    ld1b            {z28.b}, p0/z, x2
639
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
640
+    sub             z0.h, z24.h, z28.h
641
+    sub             z1.h, z25.h, z29.h
642
+    smullb          z5.s, z0.h, z0.h
643
+    smullt          z6.s, z0.h, z0.h
644
+    smlalb          z5.s, z1.h, z1.h
645
+    smlalt          z6.s, z1.h, z1.h
646
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
647
+    ld1b            {z25.b}, p0/z, x0, #2, mul vl
648
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
649
+    ld1b            {z29.b}, p0/z, x2, #2, mul vl
650
+    sub             z0.h, z24.h, z28.h
651
+    sub             z1.h, z25.h, z29.h
652
+    smlalb          z5.s, z0.h, z0.h
653
+    smlalt          z6.s, z0.h, z0.h
654
+    smlalb          z5.s, z1.h, z1.h
655
+    smlalt          z6.s, z1.h, z1.h
656
+    add             x0, x0, x1, lsl #1
657
+    add             x2, x2, x3, lsl #1
658
+.rept 63
659
+    ld1b            {z24.b}, p0/z, x0
660
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
661
+    ld1b            {z28.b}, p0/z, x2
662
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
663
+    sub             z0.h, z24.h, z28.h
664
+    sub             z1.h, z25.h, z29.h
665
+    smlalb          z5.s, z0.h, z0.h
666
+    smlalt          z6.s, z0.h, z0.h
667
+    smlalb          z5.s, z1.h, z1.h
668
+    smlalt          z6.s, z1.h, z1.h
669
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
670
+    ld1b            {z25.b}, p0/z, x0, #2, mul vl
671
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
672
+    ld1b            {z29.b}, p0/z, x2, #2, mul vl
673
+    sub             z0.h, z24.h, z28.h
674
+    sub             z1.h, z25.h, z29.h
675
+    smlalb          z5.s, z0.h, z0.h
676
+    smlalt          z6.s, z0.h, z0.h
677
+    smlalb          z5.s, z1.h, z1.h
678
+    smlalt          z6.s, z1.h, z1.h
679
+    add             x0, x0, x1, lsl #1
680
+    add             x2, x2, x3, lsl #1
681
+.endr
682
+    uaddv           d3, p0, z5.s
683
+    fmov            w0, s3
684
+    uaddv           d4, p0, z6.s
685
+    fmov            w1, s4
686
+    add             w0, w0, w1
687
+    ret
688
+.vl_gt_48_pixel_sse_ss_64x64:
689
+    cmp             x9, #112
690
+    bgt             .vl_gt_112_pixel_sse_ss_64x64
691
+    ptrue           p0.b, vl64
692
+    ld1b            {z24.b}, p0/z, x0
693
+    ld1b            {z28.b}, p0/z, x2
694
+    sub             z0.h, z24.h, z28.h
695
+    smullb          z5.s, z0.h, z0.h
696
+    smullt          z6.s, z0.h, z0.h
697
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
698
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
699
+    sub             z0.h, z24.h, z28.h
700
+    smlalb          z5.s, z0.h, z0.h
701
+    smlalt          z6.s, z0.h, z0.h
702
+    add             x0, x0, x1, lsl #1
703
+    add             x2, x2, x3, lsl #1
704
+.rept 63
705
+    ld1b            {z24.b}, p0/z, x0
706
+    ld1b            {z28.b}, p0/z, x2
707
+    sub             z0.h, z24.h, z28.h
708
+    smlalb          z5.s, z0.h, z0.h
709
+    smlalt          z6.s, z0.h, z0.h
710
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
711
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
712
+    sub             z0.h, z24.h, z28.h
713
+    smlalb          z5.s, z0.h, z0.h
714
+    smlalt          z6.s, z0.h, z0.h
715
+    add             x0, x0, x1, lsl #1
716
+    add             x2, x2, x3, lsl #1
717
+.endr
718
+    uaddv           d3, p0, z5.s
719
+    fmov            w0, s3
720
+    uaddv           d4, p0, z6.s
721
+    fmov            w1, s4
722
+    add             w0, w0, w1
723
+    ret
724
+.vl_gt_112_pixel_sse_ss_64x64:
725
+    ptrue           p0.b, vl128
726
+    ld1b            {z24.b}, p0/z, x0
727
+    ld1b            {z28.b}, p0/z, x2
728
+    sub             z0.h, z24.h, z28.h
729
+    smullb          z5.s, z0.h, z0.h
730
+    smullt          z6.s, z0.h, z0.h
731
+    add             x0, x0, x1, lsl #1
732
+    add             x2, x2, x3, lsl #1
733
+.rept 63
734
+    ld1b            {z24.b}, p0/z, x0
735
+    ld1b            {z28.b}, p0/z, x2
736
+    sub             z0.h, z24.h, z28.h
737
+    smlalb          z5.s, z0.h, z0.h
738
+    smlalt          z6.s, z0.h, z0.h
739
+    add             x0, x0, x1, lsl #1
740
+    add             x2, x2, x3, lsl #1
741
+.endr
742
+    uaddv           d3, p0, z5.s
743
+    fmov            w0, s3
744
+    uaddv           d4, p0, z6.s
745
+    fmov            w1, s4
746
+    add             w0, w0, w1
747
+    ret
748
+endfunc
749
+
750
+function PFX(pixel_ssd_s_4x4_sve2)
751
+    ptrue           p0.b, vl8
752
+    ld1b            {z16.b}, p0/z, x0
753
+    add             x0, x0, x1, lsl #1
754
+    smullb          z0.s, z16.h, z16.h
755
+    smlalt          z0.s, z16.h, z16.h
756
+.rept 3
757
+    ld1b            {z16.b}, p0/z, x0
758
+    add             x0, x0, x1, lsl #1
759
+    smlalb          z0.s, z16.h, z16.h
760
+    smlalt          z0.s, z16.h, z16.h
761
+.endr
762
+    uaddv           d3, p0, z0.s
763
+    fmov            w0, s3
764
+    ret
765
+endfunc
766
+
767
+function PFX(pixel_ssd_s_8x8_sve2)
768
+    ptrue           p0.b, vl16
769
+    ld1b            {z16.b}, p0/z, x0
770
+    add             x0, x0, x1, lsl #1
771
+    smullb          z0.s, z16.h, z16.h
772
+    smlalt          z0.s, z16.h, z16.h
773
+.rept 7
774
+    ld1b            {z16.b}, p0/z, x0
775
+    add             x0, x0, x1, lsl #1
776
+    smlalb          z0.s, z16.h, z16.h
777
+    smlalt          z0.s, z16.h, z16.h
778
+.endr
779
+    uaddv           d3, p0, z0.s
780
+    fmov            w0, s3
781
+    ret
782
+endfunc
783
+
784
+function PFX(pixel_ssd_s_16x16_sve2)
785
+    rdvl            x9, #1
786
+    cmp             x9, #16
787
+    bgt             .vl_gt_16_pixel_ssd_s_16x16
788
+    add             x1, x1, x1
789
+    mov             w12, #4
790
+    movi            v0.16b, #0
791
+    movi            v1.16b, #0
792
+.loop_ssd_s_16_sve2:
793
+    sub             w12, w12, #1
794
+.rept 2
795
+    ld1             {v4.16b,v5.16b}, x0, x1
796
+    ld1             {v6.16b,v7.16b}, x0, x1
797
+    smlal           v0.4s, v4.4h, v4.4h
798
+    smlal2          v1.4s, v4.8h, v4.8h
799
+    smlal           v0.4s, v5.4h, v5.4h
800
+    smlal2          v1.4s, v5.8h, v5.8h
801
+    smlal           v0.4s, v6.4h, v6.4h
802
+    smlal2          v1.4s, v6.8h, v6.8h
803
+    smlal           v0.4s, v7.4h, v7.4h
804
+    smlal2          v1.4s, v7.8h, v7.8h
805
+.endr
806
+    cbnz            w12, .loop_ssd_s_16_sve2
807
+    add             v0.4s, v0.4s, v1.4s
808
+    ret_v0_w0
809
+.vl_gt_16_pixel_ssd_s_16x16:
810
+    ptrue           p0.b, vl32
811
+    ld1b            {z16.b}, p0/z, x0
812
+    add             x0, x0, x1, lsl #1
813
+    smullb          z0.s, z16.h, z16.h
814
+    smlalt          z0.s, z16.h, z16.h
815
+.rept 15
816
+    ld1b            {z16.b}, p0/z, x0
817
+    add             x0, x0, x1, lsl #1
818
+    smlalb          z0.s, z16.h, z16.h
819
+    smlalt          z0.s, z16.h, z16.h
820
+.endr
821
+    uaddv           d3, p0, z0.s
822
+    fmov            w0, s3
823
+    ret
824
+endfunc
825
+
826
+function PFX(pixel_ssd_s_32x32_sve2)
827
+    rdvl            x9, #1
828
+    cmp             x9, #16
829
+    bgt             .vl_gt_16_pixel_ssd_s_32x32
830
+    add             x1, x1, x1
831
+    mov             w12, #8
832
+    movi            v0.16b, #0
833
+    movi            v1.16b, #0
834
+.loop_ssd_s_32:
835
+    sub             w12, w12, #1
836
+.rept 4
837
+    ld1             {v4.16b-v7.16b}, x0, x1
838
+    smlal           v0.4s, v4.4h, v4.4h
839
+    smlal2          v1.4s, v4.8h, v4.8h
840
+    smlal           v0.4s, v5.4h, v5.4h
841
+    smlal2          v1.4s, v5.8h, v5.8h
842
+    smlal           v0.4s, v6.4h, v6.4h
843
+    smlal2          v1.4s, v6.8h, v6.8h
844
+    smlal           v0.4s, v7.4h, v7.4h
845
+    smlal2          v1.4s, v7.8h, v7.8h
846
+.endr
847
+    cbnz            w12, .loop_ssd_s_32
848
+    add             v0.4s, v0.4s, v1.4s
849
+    ret_v0_w0
850
+.vl_gt_16_pixel_ssd_s_32x32:
851
+    cmp             x9, #48
852
+    bgt             .vl_gt_48_pixel_ssd_s_32x32
853
+    ptrue           p0.b, vl32
854
+    ld1b            {z16.b}, p0/z, x0
855
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
856
+    add             x0, x0, x1, lsl #1
857
+    smullb          z0.s, z16.h, z16.h
858
+    smlalt          z0.s, z16.h, z16.h
859
+    smlalb          z0.s, z17.h, z17.h
860
+    smlalt          z0.s, z17.h, z17.h
861
+.rept 31
862
+    ld1b            {z16.b}, p0/z, x0
863
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
864
+    add             x0, x0, x1, lsl #1
865
+    smlalb          z0.s, z16.h, z16.h
866
+    smlalt          z0.s, z16.h, z16.h
867
+    smlalb          z0.s, z17.h, z17.h
868
+    smlalt          z0.s, z17.h, z17.h
869
+.endr
870
+    uaddv           d3, p0, z0.s
871
+    fmov            w0, s3
872
+    ret
873
+.vl_gt_48_pixel_ssd_s_32x32:
874
+    ptrue           p0.b, vl64
875
+    ld1b            {z16.b}, p0/z, x0
876
+    add             x0, x0, x1, lsl #1
877
+    smullb          z0.s, z16.h, z16.h
878
+    smlalt          z0.s, z16.h, z16.h
879
+.rept 31
880
+    ld1b            {z16.b}, p0/z, x0
881
+    add             x0, x0, x1, lsl #1
882
+    smlalb          z0.s, z16.h, z16.h
883
+    smlalt          z0.s, z16.h, z16.h
884
+.endr
885
+    uaddv           d3, p0, z0.s
886
+    fmov            w0, s3
887
+    ret
888
+endfunc
889
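
[Reviewer note] The pixel_sse_* and pixel_ssd_s_* kernels in the two files around this point all compute a plain sum of squared values (ssd_s) or squared differences (sse_pp/sse_ss) over a fixed block; the SVE2 variants only vary how many lanes each load covers, chosen from the runtime vector length read with rdvl. A minimal scalar C++ sketch of what a pixel_sse_ss kernel returns is below; the function name and the int16_t inputs are assumptions for illustration, not code from this patch.

    #include <cstdint>

    // Scalar reference for a sum of squared differences between two 16-bit
    // residual blocks; strides are in elements, matching the asm's
    // "add x0, x0, x1, lsl #1" row step (stride * sizeof(int16_t) bytes).
    static int sse_ss_ref(const int16_t* a, intptr_t strideA,
                          const int16_t* b, intptr_t strideB,
                          int width, int height)
    {
        int64_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int d = a[x] - b[x];
                sum += d * d;   // same products the smlalb/smlalt pairs accumulate
            }
            a += strideA;
            b += strideB;
        }
        return (int)sum;        // the asm folds its two partial accumulators into w0
    }
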
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S Added
478
 
1
@@ -0,0 +1,476 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "ssd-a-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+function PFX(pixel_sse_pp_4x4_neon)
39
+    ld1             {v16.s}0, x0, x1
40
+    ld1             {v17.s}0, x2, x3
41
+    ld1             {v18.s}0, x0, x1
42
+    ld1             {v19.s}0, x2, x3
43
+    ld1             {v20.s}0, x0, x1
44
+    ld1             {v21.s}0, x2, x3
45
+    ld1             {v22.s}0, x0, x1
46
+    ld1             {v23.s}0, x2, x3
47
+
48
+    usubl           v1.8h, v16.8b, v17.8b
49
+    usubl           v2.8h, v18.8b, v19.8b
50
+    usubl           v3.8h, v20.8b, v21.8b
51
+    usubl           v4.8h, v22.8b, v23.8b
52
+
53
+    smull           v0.4s, v1.4h, v1.4h
54
+    smlal           v0.4s, v2.4h, v2.4h
55
+    smlal           v0.4s, v3.4h, v3.4h
56
+    smlal           v0.4s, v4.4h, v4.4h
57
+    ret_v0_w0
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_neon)
61
+    ld1             {v16.s}0, x0, x1
62
+    ld1             {v17.s}0, x2, x3
63
+    usubl           v1.8h, v16.8b, v17.8b
64
+    ld1             {v16.s}0, x0, x1
65
+    ld1             {v17.s}0, x2, x3
66
+    smull           v0.4s, v1.4h, v1.4h
67
+.rept 6
68
+    usubl           v1.8h, v16.8b, v17.8b
69
+    ld1             {v16.s}0, x0, x1
70
+    smlal           v0.4s, v1.4h, v1.4h
71
+    ld1             {v17.s}0, x2, x3
72
+.endr
73
+    usubl           v1.8h, v16.8b, v17.8b
74
+    smlal           v0.4s, v1.4h, v1.4h
75
+    ret_v0_w0
76
+endfunc
77
+
78
+function PFX(pixel_sse_pp_8x8_neon)
79
+    ld1             {v16.8b}, x0, x1
80
+    ld1             {v17.8b}, x2, x3
81
+    usubl           v1.8h, v16.8b, v17.8b
82
+    ld1             {v16.8b}, x0, x1
83
+    smull           v0.4s, v1.4h, v1.4h
84
+    smlal2          v0.4s, v1.8h, v1.8h
85
+    ld1             {v17.8b}, x2, x3
86
+
87
+.rept 6
88
+    usubl           v1.8h, v16.8b, v17.8b
89
+    ld1             {v16.8b}, x0, x1
90
+    smlal           v0.4s, v1.4h, v1.4h
91
+    smlal2          v0.4s, v1.8h, v1.8h
92
+    ld1             {v17.8b}, x2, x3
93
+.endr
94
+    usubl           v1.8h, v16.8b, v17.8b
95
+    smlal           v0.4s, v1.4h, v1.4h
96
+    smlal2          v0.4s, v1.8h, v1.8h
97
+    ret_v0_w0
98
+endfunc
99
+
100
+function PFX(pixel_sse_pp_8x16_neon)
101
+    ld1             {v16.8b}, x0, x1
102
+    ld1             {v17.8b}, x2, x3
103
+    usubl           v1.8h, v16.8b, v17.8b
104
+    ld1             {v16.8b}, x0, x1
105
+    smull           v0.4s, v1.4h, v1.4h
106
+    smlal2          v0.4s, v1.8h, v1.8h
107
+    ld1             {v17.8b}, x2, x3
108
+
109
+.rept 14
110
+    usubl           v1.8h, v16.8b, v17.8b
111
+    ld1             {v16.8b}, x0, x1
112
+    smlal           v0.4s, v1.4h, v1.4h
113
+    smlal2          v0.4s, v1.8h, v1.8h
114
+    ld1             {v17.8b}, x2, x3
115
+.endr
116
+    usubl           v1.8h, v16.8b, v17.8b
117
+    smlal           v0.4s, v1.4h, v1.4h
118
+    smlal2          v0.4s, v1.8h, v1.8h
119
+    ret_v0_w0
120
+endfunc
121
+
122
+.macro sse_pp_16xN h
123
+function PFX(pixel_sse_pp_16x\h\()_neon)
124
+    ld1             {v16.16b}, x0, x1
125
+    ld1             {v17.16b}, x2, x3
126
+    usubl           v1.8h, v16.8b, v17.8b
127
+    usubl2          v2.8h, v16.16b, v17.16b
128
+    ld1             {v16.16b}, x0, x1
129
+    ld1             {v17.16b}, x2, x3
130
+    smull           v0.4s, v1.4h, v1.4h
131
+    smlal2          v0.4s, v1.8h, v1.8h
132
+    smlal           v0.4s, v2.4h, v2.4h
133
+    smlal2          v0.4s, v2.8h, v2.8h
134
+.rept \h - 2
135
+    usubl           v1.8h, v16.8b, v17.8b
136
+    usubl2          v2.8h, v16.16b, v17.16b
137
+    ld1             {v16.16b}, x0, x1
138
+    smlal           v0.4s, v1.4h, v1.4h
139
+    smlal2          v0.4s, v1.8h, v1.8h
140
+    ld1             {v17.16b}, x2, x3
141
+    smlal           v0.4s, v2.4h, v2.4h
142
+    smlal2          v0.4s, v2.8h, v2.8h
143
+.endr
144
+    usubl           v1.8h, v16.8b, v17.8b
145
+    usubl2          v2.8h, v16.16b, v17.16b
146
+    smlal           v0.4s, v1.4h, v1.4h
147
+    smlal2          v0.4s, v1.8h, v1.8h
148
+    smlal           v0.4s, v2.4h, v2.4h
149
+    smlal2          v0.4s, v2.8h, v2.8h
150
+    ret_v0_w0
151
+endfunc
152
+.endm
153
+
154
+sse_pp_16xN 16
155
+sse_pp_16xN 32
156
+
157
+function PFX(pixel_sse_pp_32x32_neon)
158
+    mov             w12, #8
159
+    movi            v0.16b, #0
160
+    movi            v1.16b, #0
161
+.loop_sse_pp_32:
162
+    sub             w12, w12, #1
163
+.rept 4
164
+    ld1             {v16.16b,v17.16b}, x0, x1
165
+    ld1             {v18.16b,v19.16b}, x2, x3
166
+    usubl           v2.8h, v16.8b, v18.8b
167
+    usubl2          v3.8h, v16.16b, v18.16b
168
+    usubl           v4.8h, v17.8b, v19.8b
169
+    usubl2          v5.8h, v17.16b, v19.16b
170
+    smlal           v0.4s, v2.4h, v2.4h
171
+    smlal2          v1.4s, v2.8h, v2.8h
172
+    smlal           v0.4s, v3.4h, v3.4h
173
+    smlal2          v1.4s, v3.8h, v3.8h
174
+    smlal           v0.4s, v4.4h, v4.4h
175
+    smlal2          v1.4s, v4.8h, v4.8h
176
+    smlal           v0.4s, v5.4h, v5.4h
177
+    smlal2          v1.4s, v5.8h, v5.8h
178
+.endr
179
+    cbnz            w12, .loop_sse_pp_32
180
+    add             v0.4s, v0.4s, v1.4s
181
+    ret_v0_w0
182
+endfunc
183
+
184
+function PFX(pixel_sse_pp_32x64_neon)
185
+    mov             w12, #16
186
+    movi            v0.16b, #0
187
+    movi            v1.16b, #0
188
+.loop_sse_pp_32x64:
189
+    sub             w12, w12, #1
190
+.rept 4
191
+    ld1             {v16.16b,v17.16b}, x0, x1
192
+    ld1             {v18.16b,v19.16b}, x2, x3
193
+    usubl           v2.8h, v16.8b, v18.8b
194
+    usubl2          v3.8h, v16.16b, v18.16b
195
+    usubl           v4.8h, v17.8b, v19.8b
196
+    usubl2          v5.8h, v17.16b, v19.16b
197
+    smlal           v0.4s, v2.4h, v2.4h
198
+    smlal2          v1.4s, v2.8h, v2.8h
199
+    smlal           v0.4s, v3.4h, v3.4h
200
+    smlal2          v1.4s, v3.8h, v3.8h
201
+    smlal           v0.4s, v4.4h, v4.4h
202
+    smlal2          v1.4s, v4.8h, v4.8h
203
+    smlal           v0.4s, v5.4h, v5.4h
204
+    smlal2          v1.4s, v5.8h, v5.8h
205
+.endr
206
+    cbnz            w12, .loop_sse_pp_32x64
207
+    add             v0.4s, v0.4s, v1.4s
208
+    ret_v0_w0
209
+endfunc
210
+
211
+function PFX(pixel_sse_pp_64x64_neon)
212
+    mov             w12, #16
213
+    movi            v0.16b, #0
214
+    movi            v1.16b, #0
215
+
216
+.loop_sse_pp_64:
217
+    sub             w12, w12, #1
218
+.rept 4
219
+    ld1             {v16.16b-v19.16b}, x0, x1
220
+    ld1             {v20.16b-v23.16b}, x2, x3
221
+
222
+    usubl           v2.8h, v16.8b, v20.8b
223
+    usubl2          v3.8h, v16.16b, v20.16b
224
+    usubl           v4.8h, v17.8b, v21.8b
225
+    usubl2          v5.8h, v17.16b, v21.16b
226
+    smlal           v0.4s, v2.4h, v2.4h
227
+    smlal2          v1.4s, v2.8h, v2.8h
228
+    smlal           v0.4s, v3.4h, v3.4h
229
+    smlal2          v1.4s, v3.8h, v3.8h
230
+    smlal           v0.4s, v4.4h, v4.4h
231
+    smlal2          v1.4s, v4.8h, v4.8h
232
+    smlal           v0.4s, v5.4h, v5.4h
233
+    smlal2          v1.4s, v5.8h, v5.8h
234
+
235
+    usubl           v2.8h, v18.8b, v22.8b
236
+    usubl2          v3.8h, v18.16b, v22.16b
237
+    usubl           v4.8h, v19.8b, v23.8b
238
+    usubl2          v5.8h, v19.16b, v23.16b
239
+    smlal           v0.4s, v2.4h, v2.4h
240
+    smlal2          v1.4s, v2.8h, v2.8h
241
+    smlal           v0.4s, v3.4h, v3.4h
242
+    smlal2          v1.4s, v3.8h, v3.8h
243
+    smlal           v0.4s, v4.4h, v4.4h
244
+    smlal2          v1.4s, v4.8h, v4.8h
245
+    smlal           v0.4s, v5.4h, v5.4h
246
+    smlal2          v1.4s, v5.8h, v5.8h
247
+.endr
248
+    cbnz            w12, .loop_sse_pp_64
249
+    add             v0.4s, v0.4s, v1.4s
250
+    ret_v0_w0
251
+endfunc
252
+
253
+function PFX(pixel_sse_ss_4x4_neon)
254
+    add             x1, x1, x1
255
+    add             x3, x3, x3
256
+    ld1             {v16.8b}, x0, x1
257
+    ld1             {v17.8b}, x2, x3
258
+    sub             v2.4h, v16.4h, v17.4h
259
+    ld1             {v16.8b}, x0, x1
260
+    ld1             {v17.8b}, x2, x3
261
+    smull           v0.4s, v2.4h, v2.4h
262
+    sub             v2.4h, v16.4h, v17.4h
263
+    ld1             {v16.8b}, x0, x1
264
+    ld1             {v17.8b}, x2, x3
265
+    smlal           v0.4s, v2.4h, v2.4h
266
+    sub             v2.4h, v16.4h, v17.4h
267
+    ld1             {v16.8b}, x0, x1
268
+    smlal           v0.4s, v2.4h, v2.4h
269
+    ld1             {v17.8b}, x2, x3
270
+    sub             v2.4h, v16.4h, v17.4h
271
+    smlal           v0.4s, v2.4h, v2.4h
272
+    ret_v0_w0
273
+endfunc
274
+
275
+function PFX(pixel_sse_ss_8x8_neon)
276
+    add             x1, x1, x1
277
+    add             x3, x3, x3
278
+    ld1             {v16.16b}, x0, x1
279
+    ld1             {v17.16b}, x2, x3
280
+    sub             v2.8h, v16.8h, v17.8h
281
+    ld1             {v16.16b}, x0, x1
282
+    ld1             {v17.16b}, x2, x3
283
+    smull           v0.4s, v2.4h, v2.4h
284
+    smull2          v1.4s, v2.8h, v2.8h
285
+    sub             v2.8h, v16.8h, v17.8h
286
+.rept 6
287
+    ld1             {v16.16b}, x0, x1
288
+    ld1             {v17.16b}, x2, x3
289
+    smlal           v0.4s, v2.4h, v2.4h
290
+    smlal2          v1.4s, v2.8h, v2.8h
291
+    sub             v2.8h, v16.8h, v17.8h
292
+.endr
293
+    smlal           v0.4s, v2.4h, v2.4h
294
+    smlal2          v1.4s, v2.8h, v2.8h
295
+    add             v0.4s, v0.4s, v1.4s
296
+    ret_v0_w0
297
+endfunc
298
+
299
+function PFX(pixel_sse_ss_16x16_neon)
300
+    add             x1, x1, x1
301
+    add             x3, x3, x3
302
+    mov             w12, #4
303
+    movi            v0.16b, #0
304
+    movi            v1.16b, #0
305
+.loop_sse_ss_16:
306
+    sub             w12, w12, #1
307
+.rept 4
308
+    ld1             {v16.16b, v17.16b}, x0, x1
309
+    ld1             {v18.16b, v19.16b}, x2, x3
310
+    sub             v2.8h, v16.8h, v18.8h
311
+    sub             v3.8h, v17.8h, v19.8h
312
+    smlal           v0.4s, v2.4h, v2.4h
313
+    smlal2          v1.4s, v2.8h, v2.8h
314
+    smlal           v0.4s, v3.4h, v3.4h
315
+    smlal2          v1.4s, v3.8h, v3.8h
316
+.endr
317
+    cbnz            w12, .loop_sse_ss_16
318
+    add             v0.4s, v0.4s, v1.4s
319
+    ret_v0_w0
320
+endfunc
321
+
322
+function PFX(pixel_sse_ss_32x32_neon)
323
+    add             x1, x1, x1
324
+    add             x3, x3, x3
325
+
326
+    mov             w12, #8
327
+    movi            v0.16b, #0
328
+    movi            v1.16b, #0
329
+.loop_sse_ss_32:
330
+    sub             w12, w12, #1
331
+.rept 4
332
+    ld1             {v16.16b-v19.16b}, x0, x1
333
+    ld1             {v20.16b-v23.16b}, x2, x3
334
+    sub             v2.8h, v16.8h, v20.8h
335
+    sub             v3.8h, v17.8h, v21.8h
336
+    sub             v4.8h, v18.8h, v22.8h
337
+    sub             v5.8h, v19.8h, v23.8h
338
+    smlal           v0.4s, v2.4h, v2.4h
339
+    smlal2          v1.4s, v2.8h, v2.8h
340
+    smlal           v0.4s, v3.4h, v3.4h
341
+    smlal2          v1.4s, v3.8h, v3.8h
342
+    smlal           v0.4s, v4.4h, v4.4h
343
+    smlal2          v1.4s, v4.8h, v4.8h
344
+    smlal           v0.4s, v5.4h, v5.4h
345
+    smlal2          v1.4s, v5.8h, v5.8h
346
+.endr
347
+    cbnz            w12, .loop_sse_ss_32
348
+    add             v0.4s, v0.4s, v1.4s
349
+    ret_v0_w0
350
+endfunc
351
+
352
+function PFX(pixel_sse_ss_64x64_neon)
353
+    add             x1, x1, x1
354
+    add             x3, x3, x3
355
+    sub             x1, x1, #64
356
+    sub             x3, x3, #64
357
+
358
+    mov             w12, #32
359
+    movi            v0.16b, #0
360
+    movi            v1.16b, #0
361
+.loop_sse_ss_64:
362
+    sub             w12, w12, #1
363
+.rept 2
364
+    ld1             {v16.16b-v19.16b}, x0, #64
365
+    ld1             {v20.16b-v23.16b}, x2, #64
366
+    sub             v2.8h, v16.8h, v20.8h
367
+    sub             v3.8h, v17.8h, v21.8h
368
+    sub             v4.8h, v18.8h, v22.8h
369
+    sub             v5.8h, v19.8h, v23.8h
370
+    ld1             {v16.16b-v19.16b}, x0, x1
371
+    ld1             {v20.16b-v23.16b}, x2, x3
372
+    smlal           v0.4s, v2.4h, v2.4h
373
+    smlal2          v1.4s, v2.8h, v2.8h
374
+    smlal           v0.4s, v3.4h, v3.4h
375
+    smlal2          v1.4s, v3.8h, v3.8h
376
+    smlal           v0.4s, v4.4h, v4.4h
377
+    smlal2          v1.4s, v4.8h, v4.8h
378
+    smlal           v0.4s, v5.4h, v5.4h
379
+    smlal2          v1.4s, v5.8h, v5.8h
380
+    sub             v2.8h, v16.8h, v20.8h
381
+    sub             v3.8h, v17.8h, v21.8h
382
+    sub             v4.8h, v18.8h, v22.8h
383
+    sub             v5.8h, v19.8h, v23.8h
384
+    smlal           v0.4s, v2.4h, v2.4h
385
+    smlal2          v1.4s, v2.8h, v2.8h
386
+    smlal           v0.4s, v3.4h, v3.4h
387
+    smlal2          v1.4s, v3.8h, v3.8h
388
+    smlal           v0.4s, v4.4h, v4.4h
389
+    smlal2          v1.4s, v4.8h, v4.8h
390
+    smlal           v0.4s, v5.4h, v5.4h
391
+    smlal2          v1.4s, v5.8h, v5.8h
392
+.endr
393
+    cbnz            w12, .loop_sse_ss_64
394
+    add             v0.4s, v0.4s, v1.4s
395
+    ret_v0_w0
396
+endfunc
397
+
398
+function PFX(pixel_ssd_s_4x4_neon)
399
+    add             x1, x1, x1
400
+    ld1             {v4.8b}, x0, x1
401
+    ld1             {v5.8b}, x0, x1
402
+    ld1             {v6.8b}, x0, x1
403
+    ld1             {v7.8b}, x0
404
+    smull           v0.4s, v4.4h, v4.4h
405
+    smull           v1.4s, v5.4h, v5.4h
406
+    smlal           v0.4s, v6.4h, v6.4h
407
+    smlal           v1.4s, v7.4h, v7.4h
408
+    add             v0.4s, v0.4s, v1.4s
409
+    ret_v0_w0
410
+endfunc
411
+
412
+function PFX(pixel_ssd_s_8x8_neon)
413
+    add             x1, x1, x1
414
+    ld1             {v4.16b}, x0, x1
415
+    ld1             {v5.16b}, x0, x1
416
+    smull           v0.4s, v4.4h, v4.4h
417
+    smull2          v1.4s, v4.8h, v4.8h
418
+    smlal           v0.4s, v5.4h, v5.4h
419
+    smlal2          v1.4s, v5.8h, v5.8h
420
+.rept 3
421
+    ld1             {v4.16b}, x0, x1
422
+    ld1             {v5.16b}, x0, x1
423
+    smlal           v0.4s, v4.4h, v4.4h
424
+    smlal2          v1.4s, v4.8h, v4.8h
425
+    smlal           v0.4s, v5.4h, v5.4h
426
+    smlal2          v1.4s, v5.8h, v5.8h
427
+.endr
428
+    add             v0.4s, v0.4s, v1.4s
429
+    ret_v0_w0
430
+endfunc
431
+
432
+function PFX(pixel_ssd_s_16x16_neon)
433
+    add             x1, x1, x1
434
+    mov             w12, #4
435
+    movi            v0.16b, #0
436
+    movi            v1.16b, #0
437
+.loop_ssd_s_16:
438
+    sub             w12, w12, #1
439
+.rept 2
440
+    ld1             {v4.16b,v5.16b}, x0, x1
441
+    ld1             {v6.16b,v7.16b}, x0, x1
442
+    smlal           v0.4s, v4.4h, v4.4h
443
+    smlal2          v1.4s, v4.8h, v4.8h
444
+    smlal           v0.4s, v5.4h, v5.4h
445
+    smlal2          v1.4s, v5.8h, v5.8h
446
+    smlal           v0.4s, v6.4h, v6.4h
447
+    smlal2          v1.4s, v6.8h, v6.8h
448
+    smlal           v0.4s, v7.4h, v7.4h
449
+    smlal2          v1.4s, v7.8h, v7.8h
450
+.endr
451
+    cbnz            w12, .loop_ssd_s_16
452
+    add             v0.4s, v0.4s, v1.4s
453
+    ret_v0_w0
454
+endfunc
455
+
456
+function PFX(pixel_ssd_s_32x32_neon)
457
+    add             x1, x1, x1
458
+    mov             w12, #8
459
+    movi            v0.16b, #0
460
+    movi            v1.16b, #0
461
+.loop_ssd_s_32:
462
+    sub             w12, w12, #1
463
+.rept 4
464
+    ld1             {v4.16b-v7.16b}, x0, x1
465
+    smlal           v0.4s, v4.4h, v4.4h
466
+    smlal2          v1.4s, v4.8h, v4.8h
467
+    smlal           v0.4s, v5.4h, v5.4h
468
+    smlal2          v1.4s, v5.8h, v5.8h
469
+    smlal           v0.4s, v6.4h, v6.4h
470
+    smlal2          v1.4s, v6.8h, v6.8h
471
+    smlal           v0.4s, v7.4h, v7.4h
472
+    smlal2          v1.4s, v7.8h, v7.8h
473
+.endr
474
+    cbnz            w12, .loop_ssd_s_32
475
+    add             v0.4s, v0.4s, v1.4s
476
+    ret_v0_w0
477
+endfunc
478
x265_3.5.tar.gz/source/common/common.h -> x265_3.6.tar.gz/source/common/common.h Changed
51
 
1
@@ -130,7 +130,6 @@
2
 typedef uint64_t pixel4;
3
 typedef int64_t  ssum2_t;
4
 #define SHIFT_TO_BITPLANE 9
5
-#define HISTOGRAM_BINS 1024
6
 #else
7
 typedef uint8_t  pixel;
8
 typedef uint16_t sum_t;
9
@@ -138,7 +137,6 @@
10
 typedef uint32_t pixel4;
11
 typedef int32_t  ssum2_t; // Signed sum
12
 #define SHIFT_TO_BITPLANE 7
13
-#define HISTOGRAM_BINS 256
14
 #endif // if HIGH_BIT_DEPTH
15
 
16
 #if X265_DEPTH < 10
17
@@ -162,6 +160,8 @@
18
 
19
 #define MIN_QPSCALE     0.21249999999999999
20
 #define MAX_MAX_QPSCALE 615.46574234477100
21
+#define FRAME_BRIGHTNESS_THRESHOLD  50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
22
+#define FRAME_EDGE_THRESHOLD  10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
23
 
24
 
25
 template<typename T>
26
@@ -340,6 +340,9 @@
27
 #define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
28
 
29
 #define MAX_NUM_DYN_REFINE          (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
30
+#define X265_BYTE 8
31
+
32
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
33
 
34
 namespace X265_NS {
35
 
36
@@ -434,6 +437,14 @@
37
 #define  x265_unlink(fileName) unlink(fileName)
38
 #define  x265_rename(oldName, newName) rename(oldName, newName)
39
 #endif
40
+/* Close a file */
41
+#define  x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
42
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
43
+    if (fread(val, size, readSize, fileOffset) != readSize)\
44
+    {\
45
+        x265_log(NULL, X265_LOG_ERROR, errorMessage); \
46
+        return; \
47
+    }
48
 int      x265_exp2fix8(double x);
49
 
50
 double   x265_ssim2dB(double ssim);
51
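
[Reviewer note] The new x265_fread macro above expands to an early return from the calling function when fread() comes back short, so it is only usable inside functions returning void; x265_fclose guards against NULL and resets the handle. A hypothetical caller (the function name, buffer and message are made up for illustration, assuming common.h is included) would look like:

    // Hypothetical use of the x265_fread / x265_fclose helpers.
    // On a short read the macro logs the message and returns from loadStats().
    static void loadStats(FILE* statsFile, uint16_t* propagateCost, size_t count)
    {
        x265_fread(propagateCost, sizeof(uint16_t), count, statsFile,
                   "Error reading propagate cost data\n");
        x265_fclose(statsFile);   // fclose() only if non-NULL, then set to NULL
    }
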
x265_3.5.tar.gz/source/common/cpu.cpp -> x265_3.6.tar.gz/source/common/cpu.cpp Changed
58
 
1
@@ -7,6 +7,8 @@
2
  *          Steve Borho <steve@borho.org>
3
  *          Hongbin Liu <liuhongbin1@huawei.com>
4
  *          Yimeng Su <yimeng.su@huawei.com>
5
+ *          Josh Dekker <josh@itanimul.li>
6
+ *          Jean-Baptiste Kempf <jb@videolan.org>
7
  *
8
  * This program is free software; you can redistribute it and/or modify
9
  * it under the terms of the GNU General Public License as published by
10
@@ -105,6 +107,14 @@
11
     { "NEON",            X265_CPU_NEON },
12
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
13
 
14
+#elif X265_ARCH_ARM64
15
+    { "NEON",            X265_CPU_NEON },
16
+#if defined(HAVE_SVE)
17
+    { "SVE",            X265_CPU_SVE },
18
+#endif
19
+#if defined(HAVE_SVE2)
20
+    { "SVE2",            X265_CPU_SVE2 },
21
+#endif
22
 #elif X265_ARCH_POWER8
23
     { "Altivec",         X265_CPU_ALTIVEC },
24
 
25
@@ -369,12 +379,30 @@
26
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
27
 #endif
28
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
29
-#elif X265_ARCH_ARM64
30
-    flags |= X265_CPU_NEON;
31
 #endif // if HAVE_ARMV6
32
     return flags;
33
 }
34
 
35
+#elif X265_ARCH_ARM64
36
+
37
+uint32_t cpu_detect(bool benableavx512)
38
+{
39
+    int flags = 0;
40
+
41
+    #if defined(HAVE_SVE2)
42
+         flags |= X265_CPU_SVE2;
43
+         flags |= X265_CPU_SVE;
44
+         flags |= X265_CPU_NEON;
45
+    #elif defined(HAVE_SVE)
46
+         flags |= X265_CPU_SVE;
47
+         flags |= X265_CPU_NEON;
48
+    #elif HAVE_NEON
49
+         flags |= X265_CPU_NEON;
50
+    #endif
51
+        
52
+    return flags;
53
+}
54
+
55
 #elif X265_ARCH_POWER8
56
 
57
 uint32_t cpu_detect(bool benableavx512)
58
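
[Reviewer note] On AArch64 the new cpu_detect() derives its answer purely from compile-time flags (HAVE_SVE2 / HAVE_SVE / HAVE_NEON) rather than probing the CPU at run time, so an SVE2 build is assumed to run on SVE2 hardware. A dispatch on the returned mask would look roughly like the sketch below; setupARMPrimitives is a placeholder name and the X265_CPU_* bits come from x265.h, nothing here is added by the patch.

    // Illustrative dispatch on the mask returned by the ARM64 cpu_detect().
    static void setupARMPrimitives(uint32_t cpuMask)
    {
        if (cpuMask & X265_CPU_SVE2)
        {
            // an SVE2 build also reports X265_CPU_SVE and X265_CPU_NEON,
            // so the most specific kernels can simply be installed last
        }
        else if (cpuMask & X265_CPU_SVE)  { /* SVE-only kernels */ }
        else if (cpuMask & X265_CPU_NEON) { /* baseline NEON kernels */ }
    }
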
x265_3.5.tar.gz/source/common/frame.cpp -> x265_3.6.tar.gz/source/common/frame.cpp Changed
102
 
1
@@ -64,12 +64,40 @@
2
     m_edgeBitPlane = NULL;
3
     m_edgeBitPic = NULL;
4
     m_isInsideWindow = 0;
5
+
6
+    // mcstf
7
+    m_isSubSampled = NULL;
8
+    m_mcstf = NULL;
9
+    m_refPicCnt[0] = 0;
10
+    m_refPicCnt[1] = 0;
11
+    m_nextMCSTF = NULL;
12
+    m_prevMCSTF = NULL;
13
+
14
+    m_tempLayer = 0;
15
+    m_sameLayerRefPic = false;
16
 }
17
 
18
 bool Frame::create(x265_param *param, float* quantOffsets)
19
 {
20
     m_fencPic = new PicYuv;
21
     m_param = param;
22
+
23
+    if (m_param->bEnableTemporalFilter)
24
+    {
25
+        m_mcstf = new TemporalFilter;
26
+        m_mcstf->init(param);
27
+
28
+        m_fencPicSubsampled2 = new PicYuv;
29
+        m_fencPicSubsampled4 = new PicYuv;
30
+
31
+        if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
32
+            return false;
33
+        if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
34
+            return false;
35
+
36
+        CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
37
+    }
38
+
39
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
40
 
41
     if (param->bCTUInfo)
42
@@ -151,6 +179,22 @@
43
     return false;
44
 }
45
 
46
+bool Frame::createSubSample()
47
+{
48
+
49
+    m_fencPicSubsampled2 = new PicYuv;
50
+    m_fencPicSubsampled4 = new PicYuv;
51
+
52
+    if (!m_fencPicSubsampled2->createScaledPicYUV(m_param, 2))
53
+        return false;
54
+    if (!m_fencPicSubsampled4->createScaledPicYUV(m_param, 4))
55
+        return false;
56
+    CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
57
+    return true;
58
+fail:
59
+    return false;
60
+}
61
+
62
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
63
 {
64
     m_encData = new FrameData;
65
@@ -207,6 +251,26 @@
66
         m_fencPic = NULL;
67
     }
68
 
69
+    if (m_param->bEnableTemporalFilter)
70
+    {
71
+
72
+        if (m_fencPicSubsampled2)
73
+        {
74
+            m_fencPicSubsampled2->destroy();
75
+            delete m_fencPicSubsampled2;
76
+            m_fencPicSubsampled2 = NULL;
77
+        }
78
+
79
+        if (m_fencPicSubsampled4)
80
+        {
81
+            m_fencPicSubsampled4->destroy();
82
+            delete m_fencPicSubsampled4;
83
+            m_fencPicSubsampled4 = NULL;
84
+        }
85
+        delete m_mcstf;
86
+        X265_FREE(m_isSubSampled);
87
+    }
88
+
89
     if (m_reconPic)
90
     {
91
         m_reconPic->destroy();
92
@@ -267,7 +331,8 @@
93
         X265_FREE(m_addOnPrevChange);
94
         m_addOnPrevChange = NULL;
95
     }
96
-    m_lowres.destroy();
97
+
98
+    m_lowres.destroy(m_param);
99
     X265_FREE(m_rcData);
100
 
101
     if (m_param->bDynamicRefine)
102
x265_3.5.tar.gz/source/common/frame.h -> x265_3.6.tar.gz/source/common/frame.h Changed
60
 
1
@@ -28,6 +28,7 @@
2
 #include "common.h"
3
 #include "lowres.h"
4
 #include "threading.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 // private namespace
9
@@ -70,6 +71,7 @@
10
     double   count[4];
11
     double   offset[4];
12
     double   bufferFillFinal;
13
+    int64_t  currentSatd;
14
 };
15
 
16
 class Frame
17
@@ -83,8 +85,12 @@
18
 
19
     /* Data associated with x265_picture */
20
     PicYuv*                m_fencPic;
21
+    PicYuv*                m_fencPicSubsampled2;
22
+    PicYuv*                m_fencPicSubsampled4;
23
+
24
     int                    m_poc;
25
     int                    m_encodeOrder;
26
+    int                    m_gopOffset;
27
     int64_t                m_pts;                // user provided presentation time stamp
28
     int64_t                m_reorderedPts;
29
     int64_t                m_dts;
30
@@ -132,6 +138,13 @@
31
     bool                   m_classifyFrame;
32
     int                    m_fieldNum;
33
 
34
+    /*MCSTF*/
35
+    TemporalFilter*        m_mcstf;
36
+    int                    m_refPicCnt[2];
37
+    Frame*                 m_nextMCSTF;           // PicList doubly linked list pointers
38
+    Frame*                 m_prevMCSTF;
39
+    int*                   m_isSubSampled;
40
+
41
     /* aq-mode 4 : Gaussian, edge and theta frames for edge information */
42
     pixel*                 m_edgePic;
43
     pixel*                 m_gaussianPic;
44
@@ -143,9 +156,15 @@
45
 
46
     int                    m_isInsideWindow;
47
 
48
+    /*Frame's temporal layer info*/
49
+    uint8_t                m_tempLayer;
50
+    int8_t                 m_gopId;
51
+    bool                   m_sameLayerRefPic;
52
+
53
     Frame();
54
 
55
     bool create(x265_param *param, float* quantOffsets);
56
+    bool createSubSample();
57
     bool allocEncodeData(x265_param *param, const SPS& sps);
58
     void reinit(const SPS& sps);
59
     void destroy();
60
x265_3.5.tar.gz/source/common/framedata.cpp -> x265_3.6.tar.gz/source/common/framedata.cpp Changed
10
 
1
@@ -62,7 +62,7 @@
2
     }
3
     else
4
         return false;
5
-    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
6
+    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame + 1);
7
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
8
     reinit(sps);
9
     
10
x265_3.5.tar.gz/source/common/lowres.cpp -> x265_3.6.tar.gz/source/common/lowres.cpp Changed
154
 
1
@@ -28,6 +28,28 @@
2
 
3
 using namespace X265_NS;
4
 
5
+/*
6
+ * Down Sample input picture
7
+ */
8
+static
9
+void frame_lowres_core(const pixel* src0, pixel* dst0,
10
+    intptr_t src_stride, intptr_t dst_stride, int width, int height)
11
+{
12
+    for (int y = 0; y < height; y++)
13
+    {
14
+        const pixel* src1 = src0 + src_stride;
15
+        for (int x = 0; x < width; x++)
16
+        {
17
+            // slower than naive bilinear, but matches asm
18
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
19
+            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
20
+#undef FILTER
21
+        }
22
+        src0 += src_stride * 2;
23
+        dst0 += dst_stride;
24
+    }
25
+}
26
+
27
 bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
28
 {
29
     aqPartWidth = partWidth;
30
@@ -73,7 +95,7 @@
31
 
32
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
33
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
34
-    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
35
+    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
36
     {
37
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
38
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
39
@@ -190,13 +212,45 @@
40
         }
41
     }
42
 
43
+    if (param->bHistBasedSceneCut)
44
+    {
45
+        quarterSampleLowResWidth = widthFullRes / 4;
46
+        quarterSampleLowResHeight = heightFullRes / 4;
47
+        quarterSampleLowResOriginX = 16;
48
+        quarterSampleLowResOriginY = 16;
49
+        quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
50
+
51
+        size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
52
+        /* allocate quarter sampled lowres buffers */
53
+        CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
54
+
55
+        // Allocate memory for Histograms
56
+        picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
57
+        picHistogram[0] = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
58
+        for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
59
+            picHistogram[wd] = picHistogram[0] + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
60
+        }
61
+
62
+        for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
63
+        {
64
+            for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
65
+            {
66
+                picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex] = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
67
+                picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0] = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
68
+                for (uint32_t wd = 1; wd < 3; wd++) {
69
+                    picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][wd] = picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0] + wd * HISTOGRAM_NUMBER_OF_BINS;
70
+                }
71
+            }
72
+        }
73
+    }
74
+
75
     return true;
76
 
77
 fail:
78
     return false;
79
 }
80
 
81
-void Lowres::destroy()
82
+void Lowres::destroy(x265_param* param)
83
 {
84
     X265_FREE(buffer[0]);
85
     if(bEnableHME)
86
@@ -234,7 +288,8 @@
87
     X265_FREE(invQscaleFactor8x8);
88
     X265_FREE(edgeInclined);
89
     X265_FREE(qpAqMotionOffset);
90
-    X265_FREE(blockVariance);
91
+    if (param->bDynamicRefine || param->bEnableFades)
92
+        X265_FREE(blockVariance);
93
     if (maxAQDepth > 0)
94
     {
95
         for (uint32_t d = 0; d < 4; d++)
96
@@ -254,6 +309,29 @@
97
 
98
         delete pAQLayer;
99
     }
100
+
101
+    // Histograms
102
+    if (param->bHistBasedSceneCut)
103
+    {
104
+        for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
105
+        {
106
+            if (picHistogram[segmentInFrameWidthIdx])
107
+            {
108
+                for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
109
+                {
110
+                    if (picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx])
111
+                        X265_FREE(picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx][0]);
112
+                    X265_FREE(picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx]);
113
+                }
114
+            }
115
+        }
116
+        if (picHistogram)
117
+            X265_FREE(picHistogram[0]);
118
+        X265_FREE(picHistogram);
119
+
120
+        X265_FREE(quarterSampleLowResBuffer);
121
+
122
+    }
123
 }
124
 // (re) initialize lowres state
125
 void Lowres::init(PicYuv *origPic, int poc)
126
@@ -266,10 +344,6 @@
127
     indB = 0;
128
     memset(costEst, -1, sizeof(costEst));
129
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
130
-    interPCostPercDiff = 0.0;
131
-    intraCostPercDiff = 0.0;
132
-    m_bIsMaxThres = false;
133
-    m_bIsHardScenecut = false;
134
 
135
     if (qpAqOffset && invQscaleFactor)
136
         memset(costEstAq, -1, sizeof(costEstAq));
137
@@ -314,4 +388,16 @@
138
     }
139
 
140
     fpelPlane[0] = lowresPlane[0];
141
+
142
+    if (origPic->m_param->bHistBasedSceneCut)
143
+    {
144
+        // Quarter Sampled Input Picture Formation
145
+        // TO DO: Replace with ASM function
146
+        frame_lowres_core(
147
+            lowresPlane[0],
148
+            quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
149
+            lumaStride,
150
+            quarterSampleLowResStrideY,
151
+            widthFullRes / 4, heightFullRes / 4);
152
+    }
153
 }
154
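
[Reviewer note] The frame_lowres_core() helper added at the top of this file builds the quarter-resolution plane that the histogram-based scene-cut analysis works on: every output pixel is a rounded average of a 2x2 input block, taken as two pairwise averages so the result bit-matches the existing downscale assembly. A small standalone check of that rounding (test values are arbitrary):

    #include <cassert>

    // Same rounding as the FILTER macro in frame_lowres_core(): average each
    // vertical pair first, then average the two intermediate results.
    static int filter2x2(int a, int b, int c, int d)
    {
        return (((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1;
    }

    int main()
    {
        assert(filter2x2(10, 12, 11, 13) == 12);     // (11 + 12 + 1) >> 1
        assert(filter2x2(0, 0, 0, 0) == 0);
        assert(filter2x2(255, 255, 255, 255) == 255);
        return 0;
    }
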
x265_3.5.tar.gz/source/common/lowres.h -> x265_3.6.tar.gz/source/common/lowres.h Changed
73
 
1
@@ -32,6 +32,10 @@
2
 namespace X265_NS {
3
 // private namespace
4
 
5
+#define HISTOGRAM_NUMBER_OF_BINS         256
6
+#define NUMBER_OF_SEGMENTS_IN_WIDTH      4
7
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT     4
8
+
9
 struct ReferencePlanes
10
 {
11
     ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
12
@@ -171,6 +175,7 @@
13
 
14
     int    frameNum;         // Presentation frame number
15
     int    sliceType;        // Slice type decided by lookahead
16
+    int    sliceTypeReq;     // Slice type required as per the QP file
17
     int    width;            // width of lowres frame in pixels
18
     int    lines;            // height of lowres frame in pixel lines
19
     int    leadingBframes;   // number of leading B frames for P or I
20
@@ -214,13 +219,13 @@
21
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
22
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
23
     double*   qpAqMotionOffset;
24
-    int*      invQscaleFactor; // qScale values for qp Aq Offsets
25
+    int*      invQscaleFactor;    // qScale values for qp Aq Offsets
26
     int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
27
     uint32_t* blockVariance;
28
     uint64_t  wp_ssd[3];     // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
29
     uint64_t  wp_sum[3];
30
     double    frameVariance;
31
-    int* edgeInclined;
32
+    int*      edgeInclined;
33
 
34
 
35
     /* cutree intermediate data */
36
@@ -230,18 +235,30 @@
37
     uint32_t heightFullRes;
38
     uint32_t m_maxCUSize;
39
     uint32_t m_qgSize;
40
-    
41
+
42
     uint16_t* propagateCost;
43
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
44
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
45
+
46
     /* For hist-based scenecut */
47
-    bool   m_bIsMaxThres;
48
-    double interPCostPercDiff;
49
-    double intraCostPercDiff;
50
-    bool   m_bIsHardScenecut;
51
+    int          quarterSampleLowResWidth;     // width of 1/4 lowres frame in pixels
52
+    int          quarterSampleLowResHeight;    // height of 1/4 lowres frame in pixels
53
+    int          quarterSampleLowResStrideY;
54
+    int          quarterSampleLowResOriginX;
55
+    int          quarterSampleLowResOriginY;
56
+    pixel       *quarterSampleLowResBuffer;
57
+    bool         bHistScenecutAnalyzed;
58
+
59
+    uint16_t     picAvgVariance;
60
+    uint16_t     picAvgVarianceCb;
61
+    uint16_t     picAvgVarianceCr;
62
+
63
+    uint32_t ****picHistogram;
64
+    uint64_t     averageIntensityPerSegment[NUMBER_OF_SEGMENTS_IN_WIDTH][NUMBER_OF_SEGMENTS_IN_HEIGHT][3];
65
+    uint8_t      averageIntensity[3];
66
 
67
     bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
68
-    void destroy();
69
+    void destroy(x265_param* param);
70
     void init(PicYuv *origPic, int poc);
71
 };
72
 }
73
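
[Reviewer note] The picHistogram member declared above is a four-level table indexed as [segment-x][segment-y][plane][bin]: 4x4 picture segments, 3 colour planes and 256 bins per plane (NUMBER_OF_SEGMENTS_IN_WIDTH/HEIGHT and HISTOGRAM_NUMBER_OF_BINS). Lowres::create() allocates the outer levels as flat arrays and patches up the row pointers. A hypothetical accumulation over one segment's luma plane, assuming that layout (everything except the member names is illustrative):

    #include <cstdint>

    // Fill one segment's luma histogram; plane index 0 is taken to be luma.
    static void addSegmentLuma(uint32_t**** picHistogram, int segX, int segY,
                               const uint8_t* pix, intptr_t stride, int w, int h)
    {
        uint32_t* lumaBins = picHistogram[segX][segY][0];  // HISTOGRAM_NUMBER_OF_BINS entries
        for (int y = 0; y < h; y++, pix += stride)
            for (int x = 0; x < w; x++)
                lumaBins[pix[x]]++;
    }
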
x265_3.5.tar.gz/source/common/mv.h -> x265_3.6.tar.gz/source/common/mv.h Changed
10
 
1
@@ -105,6 +105,8 @@
2
     {
3
         return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y;
4
     }
5
+
6
+    void set(int32_t _x, int32_t _y) { x = _x; y = _y; }
7
 };
8
 }
9
 
10
x265_3.5.tar.gz/source/common/param.cpp -> x265_3.6.tar.gz/source/common/param.cpp Changed
668
 
1
@@ -145,6 +145,8 @@
2
     param->bAnnexB = 1;
3
     param->bRepeatHeaders = 0;
4
     param->bEnableAccessUnitDelimiters = 0;
5
+    param->bEnableEndOfBitstream = 0;
6
+    param->bEnableEndOfSequence = 0;
7
     param->bEmitHRDSEI = 0;
8
     param->bEmitInfoSEI = 1;
9
     param->bEmitHDRSEI = 0; /*Deprecated*/
10
@@ -163,12 +165,12 @@
11
     param->keyframeMax = 250;
12
     param->gopLookahead = 0;
13
     param->bOpenGOP = 1;
14
+   param->craNal = 0;
15
     param->bframes = 4;
16
     param->lookaheadDepth = 20;
17
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
18
     param->bBPyramid = 1;
19
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
20
-    param->edgeTransitionThreshold = 0.03;
21
     param->bHistBasedSceneCut = 0;
22
     param->lookaheadSlices = 8;
23
     param->lookaheadThreads = 0;
24
@@ -179,12 +181,20 @@
25
     param->bEnableHRDConcatFlag = 0;
26
     param->bEnableFades = 0;
27
     param->bEnableSceneCutAwareQp = 0;
28
-    param->fwdScenecutWindow = 500;
29
-    param->fwdRefQpDelta = 5;
30
-    param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
31
-    param->bwdScenecutWindow = 100;
32
-    param->bwdRefQpDelta = -1;
33
-    param->bwdNonRefQpDelta = -1;
34
+    param->fwdMaxScenecutWindow = 1200;
35
+    param->bwdMaxScenecutWindow = 600;
36
+    for (int i = 0; i < 6; i++)
37
+    {
38
+        int deltas[6] = { 5, 4, 3, 2, 1, 0 };
39
+
40
+        param->fwdScenecutWindow[i] = 200;
41
+        param->fwdRefQpDelta[i] = deltas[i];
42
+        param->fwdNonRefQpDelta[i] = param->fwdRefQpDelta[i] + (SLICE_TYPE_DELTA * param->fwdRefQpDelta[i]);
43
+
44
+        param->bwdScenecutWindow[i] = 100;
45
+        param->bwdRefQpDelta[i] = -1;
46
+        param->bwdNonRefQpDelta[i] = -1;
47
+    }
48
 
49
     /* Intra Coding Tools */
50
     param->bEnableConstrainedIntra = 0;
51
@@ -278,7 +288,10 @@
52
     param->rc.rfConstantMin = 0;
53
     param->rc.bStatRead = 0;
54
     param->rc.bStatWrite = 0;
55
+    param->rc.dataShareMode = X265_SHARE_MODE_FILE;
56
     param->rc.statFileName = NULL;
57
+    param->rc.sharedMemName = NULL;
58
+    param->rc.bEncFocusedFramesOnly = 0;
59
     param->rc.complexityBlur = 20;
60
     param->rc.qblur = 0.5;
61
     param->rc.zoneCount = 0;
62
@@ -321,6 +334,7 @@
63
     param->maxLuma = PIXEL_MAX;
64
     param->log2MaxPocLsb = 8;
65
     param->maxSlices = 1;
66
+    param->videoSignalTypePreset = NULL;
67
 
68
     /*Conformance window*/
69
     param->confWinRightOffset = 0;
70
@@ -373,10 +387,17 @@
71
     param->bEnableSvtHevc = 0;
72
     param->svtHevcParam = NULL;
73
 
74
+    /* MCSTF */
75
+    param->bEnableTemporalFilter = 0;
76
+    param->temporalFilterStrength = 0.95;
77
+
78
 #ifdef SVT_HEVC
79
     param->svtHevcParam = svtParam;
80
     svt_param_default(param);
81
 #endif
82
+    /* Film grain characteristics model filename */
83
+    param->filmGrain = NULL;
84
+    param->bEnableSBRC = 0;
85
 }
86
 
87
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
88
@@ -666,6 +687,46 @@
89
 #define atof(str) x265_atof(str, bError)
90
 #define atobool(str) (x265_atobool(str, bError))
91
 
92
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
93
+{
94
+    bool bError = false;
95
+    char nameBuf[64];
96
+    if (!name)
97
+        return X265_PARAM_BAD_NAME;
98
+    // skip -- prefix if provided
99
+    if (name[0] == '-' && name[1] == '-')
100
+        name += 2;
101
+    // s/_/-/g
102
+    if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
103
+    {
104
+        char *c;
105
+        strcpy(nameBuf, name);
106
+        while ((c = strchr(nameBuf, '_')) != 0)
107
+            *c = '-';
108
+        name = nameBuf;
109
+    }
110
+    if (!value)
111
+        value = "true";
112
+    else if (value[0] == '=')
113
+        value++;
114
+#define OPT(STR) else if (!strcmp(name, STR))
115
+    if (0);
116
+    OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
117
+    OPT("masking-strength") bError = parseMaskingStrength(p, value);
118
+    else
119
+        return X265_PARAM_BAD_NAME;
120
+#undef OPT
121
+    return bError ? X265_PARAM_BAD_VALUE : 0;
122
+}
123
+
124
+
125
+/* internal versions of string-to-int with additional error checking */
126
+#undef atoi
127
+#undef atof
128
+#define atoi(str) x265_atoi(str, bError)
129
+#define atof(str) x265_atof(str, bError)
130
+#define atobool(str) (x265_atobool(str, bError))
131
+
132
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
133
 {
134
     bool bError = false;
135
@@ -949,10 +1010,9 @@
136
        {
137
            bError = false;
138
            p->scenecutThreshold = atoi(value);
139
-           p->bHistBasedSceneCut = 0;
140
        }
141
     }
142
-    OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
143
+    OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
144
     OPT("keyint") p->keyframeMax = atoi(value);
145
     OPT("min-keyint") p->keyframeMin = atoi(value);
146
     OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
147
@@ -1184,6 +1244,7 @@
148
         int pass = x265_clip3(0, 3, atoi(value));
149
         p->rc.bStatWrite = pass & 1;
150
         p->rc.bStatRead = pass & 2;
151
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
152
     }
153
     OPT("stats") p->rc.statFileName = strdup(value);
154
     OPT("scaling-list") p->scalingLists = strdup(value);
155
@@ -1216,21 +1277,7 @@
156
         OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
157
         OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
158
         OPT("scenecut-bias") p->scenecutBias = atof(value);
159
-        OPT("hist-scenecut")
160
-        {
161
-            p->bHistBasedSceneCut = atobool(value);
162
-            if (bError)
163
-            {
164
-                bError = false;
165
-                p->bHistBasedSceneCut = 0;
166
-            }
167
-            if (p->bHistBasedSceneCut)
168
-            {
169
-                bError = false;
170
-                p->scenecutThreshold = 0;
171
-            }
172
-        }
173
-        OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
174
+        OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
175
         OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
176
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
177
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
178
@@ -1238,6 +1285,7 @@
179
         OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
180
         OPT("aq-motion") p->bAQMotion = atobool(value);
181
         OPT("dynamic-rd") p->dynamicRd = atof(value);
182
+       OPT("cra-nal") p->craNal = atobool(value);
183
         OPT("analysis-reuse-level")
184
         {
185
             p->analysisReuseLevel = atoi(value);
186
@@ -1348,71 +1396,7 @@
187
         }
188
         OPT("fades") p->bEnableFades = atobool(value);
189
         OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
190
-        OPT("masking-strength")
191
-        {
192
-            int window1;
193
-            double refQpDelta1, nonRefQpDelta1;
194
-
195
-            if (p->bEnableSceneCutAwareQp == FORWARD)
196
-            {
197
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
198
-                {
199
-                    if (window1 > 0)
200
-                        p->fwdScenecutWindow = window1;
201
-                    if (refQpDelta1 > 0)
202
-                        p->fwdRefQpDelta = refQpDelta1;
203
-                    if (nonRefQpDelta1 > 0)
204
-                        p->fwdNonRefQpDelta = nonRefQpDelta1;
205
-                }
206
-                else
207
-                {
208
-                    x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
209
-                    bError = true;
210
-                }
211
-            }
212
-            else if (p->bEnableSceneCutAwareQp == BACKWARD)
213
-            {
214
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
215
-                {
216
-                    if (window1 > 0)
217
-                        p->bwdScenecutWindow = window1;
218
-                    if (refQpDelta1 > 0)
219
-                        p->bwdRefQpDelta = refQpDelta1;
220
-                    if (nonRefQpDelta1 > 0)
221
-                        p->bwdNonRefQpDelta = nonRefQpDelta1;
222
-                }
223
-                else
224
-                {
225
-                    x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
226
-                    bError = true;
227
-                }
228
-            }
229
-            else if (p->bEnableSceneCutAwareQp == BI_DIRECTIONAL)
230
-            {
231
-                int window2;
232
-                double refQpDelta2, nonRefQpDelta2;
233
-                if (6 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1, &window2, &refQpDelta2, &nonRefQpDelta2))
234
-                {
235
-                    if (window1 > 0)
236
-                        p->fwdScenecutWindow = window1;
237
-                    if (refQpDelta1 > 0)
238
-                        p->fwdRefQpDelta = refQpDelta1;
239
-                    if (nonRefQpDelta1 > 0)
240
-                        p->fwdNonRefQpDelta = nonRefQpDelta1;
241
-                    if (window2 > 0)
242
-                        p->bwdScenecutWindow = window2;
243
-                    if (refQpDelta2 > 0)
244
-                        p->bwdRefQpDelta = refQpDelta2;
245
-                    if (nonRefQpDelta2 > 0)
246
-                        p->bwdNonRefQpDelta = nonRefQpDelta2;
247
-                }
248
-                else
249
-                {
250
-                    x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
251
-                    bError = true;
252
-                }
253
-            }
254
-        }
255
+        OPT("masking-strength") bError |= parseMaskingStrength(p, value);
256
         OPT("field") p->bField = atobool( value );
257
         OPT("cll") p->bEmitCLL = atobool(value);
258
         OPT("frame-dup") p->bEnableFrameDuplication = atobool(value);
259
@@ -1446,6 +1430,13 @@
260
         OPT("vbv-live-multi-pass") p->bliveVBV2pass = atobool(value);
261
         OPT("min-vbv-fullness") p->minVbvFullness = atof(value);
262
         OPT("max-vbv-fullness") p->maxVbvFullness = atof(value);
263
+        OPT("video-signal-type-preset") p->videoSignalTypePreset = strdup(value);
264
+        OPT("eob") p->bEnableEndOfBitstream = atobool(value);
265
+        OPT("eos") p->bEnableEndOfSequence = atobool(value);
266
+        /* Film grain characterstics model filename */
267
+        OPT("film-grain") p->filmGrain = (char* )value;
268
+        OPT("mcstf") p->bEnableTemporalFilter = atobool(value);
269
+        OPT("sbrc") p->bEnableSBRC = atobool(value);
270
         else
271
             return X265_PARAM_BAD_NAME;
272
     }
273
@@ -1761,8 +1752,6 @@
274
           "scenecutThreshold must be greater than 0");
275
     CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
276
             "scenecut-bias must be between 0 and 100");
277
-    CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 < param->edgeTransitionThreshold,
278
-            "hist-threshold must be between 0.0 and 1.0");
279
     CHECK(param->radl < 0 || param->radl > param->bframes,
280
           "radl must be between 0 and bframes");
281
     CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
282
@@ -1824,15 +1813,15 @@
283
         "Invalid refine-ctu-distortion value, must be either 0 or 1");
284
     CHECK(param->maxAUSizeFactor < 0.5 || param->maxAUSizeFactor > 1.0,
285
         "Supported factor for controlling max AU size is from 0.5 to 1");
286
-    CHECK((param->dolbyProfile != 0) && (param->dolbyProfile != 50) && (param->dolbyProfile != 81) && (param->dolbyProfile != 82),
287
-        "Unsupported Dolby Vision profile, only profile 5, profile 8.1 and profile 8.2 enabled");
288
+    CHECK((param->dolbyProfile != 0) && (param->dolbyProfile != 50) && (param->dolbyProfile != 81) && (param->dolbyProfile != 82) && (param->dolbyProfile != 84),
289
+        "Unsupported Dolby Vision profile, only profile 5, profile 8.1, profile 8.2 and profile 8.4 enabled");
290
     CHECK(param->dupThreshold < 1 || 99 < param->dupThreshold,
291
         "Invalid frame-duplication threshold. Value must be between 1 and 99.");
292
     if (param->dolbyProfile)
293
     {
294
         CHECK((param->rc.vbvMaxBitrate <= 0 || param->rc.vbvBufferSize <= 0), "Dolby Vision requires VBV settings to enable HRD.\n");
295
-        CHECK((param->internalBitDepth != 10), "Dolby Vision profile - 5, profile - 8.1 and profile - 8.2 is Main10 only\n");
296
-        CHECK((param->internalCsp != X265_CSP_I420), "Dolby Vision profile - 5, profile - 8.1 and profile - 8.2 requires YCbCr 4:2:0 color space\n");
297
+        CHECK((param->internalBitDepth != 10), "Dolby Vision profile - 5, profile - 8.1, profile - 8.2 and profile - 8.4 are Main10 only\n");
298
+        CHECK((param->internalCsp != X265_CSP_I420), "Dolby Vision profile - 5, profile - 8.1, profile - 8.2 and profile - 8.4 requires YCbCr 4:2:0 color space\n");
299
         if (param->dolbyProfile == 81)
300
             CHECK(!(param->masteringDisplayColorVolume), "Dolby Vision profile - 8.1 requires Mastering display color volume information\n");
301
     }
302
@@ -1854,19 +1843,22 @@
303
         {
304
             CHECK(param->bEnableSceneCutAwareQp < 0 || param->bEnableSceneCutAwareQp > 3,
305
             "Invalid masking direction. Value must be between 0 and 3(inclusive)");
306
-            CHECK(param->fwdScenecutWindow < 0 || param->fwdScenecutWindow > 1000,
307
-            "Invalid forward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
308
-            CHECK(param->fwdRefQpDelta < 0 || param->fwdRefQpDelta > 10,
309
-            "Invalid fwdRefQpDelta value. Value must be between 0 and 10 (inclusive)");
310
-            CHECK(param->fwdNonRefQpDelta < 0 || param->fwdNonRefQpDelta > 10,
311
-            "Invalid fwdNonRefQpDelta value. Value must be between 0 and 10 (inclusive)");
312
-
313
-            CHECK(param->bwdScenecutWindow < 0 || param->bwdScenecutWindow > 1000,
314
-                "Invalid backward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
315
-            CHECK(param->bwdRefQpDelta < -1 || param->bwdRefQpDelta > 10,
316
-                "Invalid bwdRefQpDelta value. Value must be between 0 and 10 (inclusive)");
317
-            CHECK(param->bwdNonRefQpDelta < -1 || param->bwdNonRefQpDelta > 10,
318
-                "Invalid bwdNonRefQpDelta value. Value must be between 0 and 10 (inclusive)");
319
+            for (int i = 0; i < 6; i++)
320
+            {
321
+                CHECK(param->fwdScenecutWindow[i] < 0 || param->fwdScenecutWindow[i] > 1000,
322
+                    "Invalid forward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
323
+                CHECK(param->fwdRefQpDelta[i] < 0 || param->fwdRefQpDelta[i] > 20,
324
+                    "Invalid fwdRefQpDelta value. Value must be between 0 and 20 (inclusive)");
325
+                CHECK(param->fwdNonRefQpDelta[i] < 0 || param->fwdNonRefQpDelta[i] > 20,
326
+                    "Invalid fwdNonRefQpDelta value. Value must be between 0 and 20 (inclusive)");
327
+
328
+                CHECK(param->bwdScenecutWindow[i] < 0 || param->bwdScenecutWindow[i] > 1000,
329
+                    "Invalid backward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
330
+                CHECK(param->bwdRefQpDelta[i] < -1 || param->bwdRefQpDelta[i] > 20,
331
+                    "Invalid bwdRefQpDelta value. Value must be between 0 and 20 (inclusive)");
332
+                CHECK(param->bwdNonRefQpDelta[i] < -1 || param->bwdNonRefQpDelta[i] > 20,
333
+                    "Invalid bwdNonRefQpDelta value. Value must be between 0 and 20 (inclusive)");
334
+            }
335
         }
336
     }
337
     if (param->bEnableHME)
338
@@ -1898,6 +1890,11 @@
339
         param->bSingleSeiNal = 0;
340
         x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
341
     }
342
+    if (param->bEnableTemporalFilter && (param->frameNumThreads > 1))
343
+    {
344
+        param->bEnableTemporalFilter = 0;
345
+        x265_log(param, X265_LOG_WARNING, "MCSTF can be enabled with frame thread = 1 only. Disabling MCSTF\n");
346
+    }
347
     CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
348
     CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
349
     CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
350
@@ -1910,6 +1907,7 @@
351
             x265_log(param, X265_LOG_WARNING, "Live VBV enabled without VBV settings.Disabling live VBV in 2 pass\n");
352
         }
353
     }
354
+    CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. It must be one of the X265_DATA_SHARE_MODES enum values\n" );
355
     return check_failed;
356
 }
357
 
358
@@ -1970,8 +1968,8 @@
359
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias  : %d / %d / %d / %.2lf \n",
360
                  param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
361
     else if (param->bHistBasedSceneCut && param->keyframeMax != INT_MAX) 
362
-        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / edge threshold  : %d / %d / %d / %.2lf\n",
363
-                 param->keyframeMin, param->keyframeMax, param->bHistBasedSceneCut, param->edgeTransitionThreshold);
364
+        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut  : %d / %d / %d\n",
365
+                 param->keyframeMin, param->keyframeMax, param->bHistBasedSceneCut);
366
     else if (param->keyframeMax == INT_MAX)
367
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : disabled\n");
368
 
369
@@ -2089,6 +2087,8 @@
370
         bufSize += strlen(p->numaPools);
371
     if (p->masteringDisplayColorVolume)
372
         bufSize += strlen(p->masteringDisplayColorVolume);
373
+    if (p->videoSignalTypePreset)
374
+        bufSize += strlen(p->videoSignalTypePreset);
375
 
376
     buf = s = X265_MALLOC(char, bufSize);
377
     if (!buf)
378
@@ -2126,10 +2126,12 @@
379
     BOOL(p->bRepeatHeaders, "repeat-headers");
380
     BOOL(p->bAnnexB, "annexb");
381
     BOOL(p->bEnableAccessUnitDelimiters, "aud");
382
+    BOOL(p->bEnableEndOfBitstream, "eob");
383
+    BOOL(p->bEnableEndOfSequence, "eos");
384
     BOOL(p->bEmitHRDSEI, "hrd");
385
     BOOL(p->bEmitInfoSEI, "info");
386
     s += sprintf(s, " hash=%d", p->decodedPictureHashSEI);
387
-    BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
388
+    s += sprintf(s, " temporal-layers=%d", p->bEnableTemporalSubLayers);
389
     BOOL(p->bOpenGOP, "open-gop");
390
     s += sprintf(s, " min-keyint=%d", p->keyframeMin);
391
     s += sprintf(s, " keyint=%d", p->keyframeMax);
392
@@ -2141,7 +2143,7 @@
393
     s += sprintf(s, " rc-lookahead=%d", p->lookaheadDepth);
394
     s += sprintf(s, " lookahead-slices=%d", p->lookaheadSlices);
395
     s += sprintf(s, " scenecut=%d", p->scenecutThreshold);
396
-    s += sprintf(s, " hist-scenecut=%d", p->bHistBasedSceneCut);
397
+    BOOL(p->bHistBasedSceneCut, "hist-scenecut");
398
     s += sprintf(s, " radl=%d", p->radl);
399
     BOOL(p->bEnableHRDConcatFlag, "splice");
400
     BOOL(p->bIntraRefresh, "intra-refresh");
401
@@ -2295,7 +2297,6 @@
402
     BOOL(p->bOptRefListLengthPPS, "opt-ref-list-length-pps");
403
     BOOL(p->bMultiPassOptRPS, "multi-pass-opt-rps");
404
     s += sprintf(s, " scenecut-bias=%.2f", p->scenecutBias);
405
-    s += sprintf(s, " hist-threshold=%.2f", p->edgeTransitionThreshold);
406
     BOOL(p->bOptCUDeltaQP, "opt-cu-delta-qp");
407
     BOOL(p->bAQMotion, "aq-motion");
408
     BOOL(p->bEmitHDR10SEI, "hdr10");
409
@@ -2328,10 +2329,14 @@
410
     s += sprintf(s, " qp-adaptation-range=%.2f", p->rc.qpAdaptationRange);
411
     s += sprintf(s, " scenecut-aware-qp=%d", p->bEnableSceneCutAwareQp);
412
     if (p->bEnableSceneCutAwareQp)
413
-        s += sprintf(s, " fwd-scenecut-window=%d fwd-ref-qp-delta=%f fwd-nonref-qp-delta=%f bwd-scenecut-window=%d bwd-ref-qp-delta=%f bwd-nonref-qp-delta=%f", p->fwdScenecutWindow, p->fwdRefQpDelta, p->fwdNonRefQpDelta, p->bwdScenecutWindow, p->bwdRefQpDelta, p->bwdNonRefQpDelta);
414
+        s += sprintf(s, " fwd-scenecut-window=%d fwd-ref-qp-delta=%f fwd-nonref-qp-delta=%f bwd-scenecut-window=%d bwd-ref-qp-delta=%f bwd-nonref-qp-delta=%f", p->fwdMaxScenecutWindow, p->fwdRefQpDelta0, p->fwdNonRefQpDelta0, p->bwdMaxScenecutWindow, p->bwdRefQpDelta0, p->bwdNonRefQpDelta0);
415
     s += sprintf(s, "conformance-window-offsets right=%d bottom=%d", p->confWinRightOffset, p->confWinBottomOffset);
416
     s += sprintf(s, " decoder-max-rate=%d", p->decoderVbvMaxRate);
417
     BOOL(p->bliveVBV2pass, "vbv-live-multi-pass");
418
+    if (p->filmGrain)
419
+        s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename
420
+    BOOL(p->bEnableTemporalFilter, "mcstf");
421
+    BOOL(p->bEnableSBRC, "sbrc");
422
 #undef BOOL
423
     return buf;
424
 }
425
@@ -2406,6 +2411,151 @@
426
     return false;
427
 }
428
 
429
+bool parseMaskingStrength(x265_param* p, const char* value)
430
+{
431
+    bool bError = false;
432
+    int window1[6];
433
+    double refQpDelta1[6], nonRefQpDelta1[6];
434
+    if (p->bEnableSceneCutAwareQp == FORWARD)
435
+    {
436
+        if (3 == sscanf(value, "%d,%lf,%lf", &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0]))
437
+        {
438
+            if (window1[0] > 0)
439
+                p->fwdMaxScenecutWindow = window1[0];
440
+            if (refQpDelta1[0] > 0)
441
+                p->fwdRefQpDelta[0] = refQpDelta1[0];
442
+            if (nonRefQpDelta1[0] > 0)
443
+                p->fwdNonRefQpDelta[0] = nonRefQpDelta1[0];
444
+
445
+            p->fwdScenecutWindow[0] = p->fwdMaxScenecutWindow / 6;
446
+            for (int i = 1; i < 6; i++)
447
+            {
448
+                p->fwdScenecutWindow[i] = p->fwdMaxScenecutWindow / 6;
449
+                p->fwdRefQpDelta[i] = p->fwdRefQpDelta[i - 1] - (0.15 * p->fwdRefQpDelta[i - 1]);
450
+                p->fwdNonRefQpDelta[i] = p->fwdNonRefQpDelta[i - 1] - (0.15 * p->fwdNonRefQpDelta[i - 1]);
451
+            }
452
+        }
453
+        else if (18 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
454
+            , &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window1[1], &refQpDelta1[1], &nonRefQpDelta1[1]
455
+            , &window1[2], &refQpDelta1[2], &nonRefQpDelta1[2], &window1[3], &refQpDelta1[3], &nonRefQpDelta1[3]
456
+            , &window1[4], &refQpDelta1[4], &nonRefQpDelta1[4], &window1[5], &refQpDelta1[5], &nonRefQpDelta1[5]))
457
+        {
458
+            p->fwdMaxScenecutWindow = 0;
459
+            for (int i = 0; i < 6; i++)
460
+            {
461
+                p->fwdScenecutWindow[i] = window1[i];
462
+                p->fwdRefQpDelta[i] = refQpDelta1[i];
463
+                p->fwdNonRefQpDelta[i] = nonRefQpDelta1[i];
464
+                p->fwdMaxScenecutWindow += p->fwdScenecutWindow[i];
465
+            }
466
+        }
467
+        else
468
+        {
469
+            x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
470
+            bError = true;
471
+        }
472
+    }
473
+    else if (p->bEnableSceneCutAwareQp == BACKWARD)
474
+    {
475
+        if (3 == sscanf(value, "%d,%lf,%lf", &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0]))
476
+        {
477
+            if (window1[0] > 0)
478
+                p->bwdMaxScenecutWindow = window1[0];
479
+            if (refQpDelta1[0] > 0)
480
+                p->bwdRefQpDelta[0] = refQpDelta1[0];
481
+            if (nonRefQpDelta1[0] > 0)
482
+                p->bwdNonRefQpDelta[0] = nonRefQpDelta1[0];
483
+
484
+            p->bwdScenecutWindow[0] = p->bwdMaxScenecutWindow / 6;
485
+            for (int i = 1; i < 6; i++)
486
+            {
487
+                p->bwdScenecutWindow[i] = p->bwdMaxScenecutWindow / 6;
488
+                p->bwdRefQpDelta[i] = p->bwdRefQpDelta[i - 1] - (0.15 * p->bwdRefQpDelta[i - 1]);
489
+                p->bwdNonRefQpDelta[i] = p->bwdNonRefQpDelta[i - 1] - (0.15 * p->bwdNonRefQpDelta[i - 1]);
490
+            }
491
+        }
492
+        else if (18 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
493
+            , &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window1[1], &refQpDelta1[1], &nonRefQpDelta1[1]
494
+            , &window1[2], &refQpDelta1[2], &nonRefQpDelta1[2], &window1[3], &refQpDelta1[3], &nonRefQpDelta1[3]
495
+            , &window1[4], &refQpDelta1[4], &nonRefQpDelta1[4], &window1[5], &refQpDelta1[5], &nonRefQpDelta1[5]))
496
+        {
497
+            p->bwdMaxScenecutWindow = 0;
498
+            for (int i = 0; i < 6; i++)
499
+            {
500
+                p->bwdScenecutWindow[i] = window1[i];
501
+                p->bwdRefQpDelta[i] = refQpDelta1[i];
502
+                p->bwdNonRefQpDelta[i] = nonRefQpDelta1[i];
503
+                p->bwdMaxScenecutWindow += p->bwdScenecutWindow[i];
504
+            }
505
+        }
506
+        else
507
+        {
508
+            x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
509
+            bError = true;
510
+        }
511
+    }
512
+    else if (p->bEnableSceneCutAwareQp == BI_DIRECTIONAL)
513
+    {
514
+        int window2[6];
515
+        double refQpDelta2[6], nonRefQpDelta2[6];
516
+        if (6 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf", &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window2[0], &refQpDelta2[0], &nonRefQpDelta2[0]))
517
+        {
518
+            if (window1[0] > 0)
519
+                p->fwdMaxScenecutWindow = window1[0];
520
+            if (refQpDelta1[0] > 0)
521
+                p->fwdRefQpDelta[0] = refQpDelta1[0];
522
+            if (nonRefQpDelta1[0] > 0)
523
+                p->fwdNonRefQpDelta[0] = nonRefQpDelta1[0];
524
+            if (window2[0] > 0)
525
+                p->bwdMaxScenecutWindow = window2[0];
526
+            if (refQpDelta2[0] > 0)
527
+                p->bwdRefQpDelta[0] = refQpDelta2[0];
528
+            if (nonRefQpDelta2[0] > 0)
529
+                p->bwdNonRefQpDelta[0] = nonRefQpDelta2[0];
530
+
531
+            p->fwdScenecutWindow[0] = p->fwdMaxScenecutWindow / 6;
532
+            p->bwdScenecutWindow[0] = p->bwdMaxScenecutWindow / 6;
533
+            for (int i = 1; i < 6; i++)
534
+            {
535
+                p->fwdScenecutWindow[i] = p->fwdMaxScenecutWindow / 6;
536
+                p->bwdScenecutWindow[i] = p->bwdMaxScenecutWindow / 6;
537
+                p->fwdRefQpDelta[i] = p->fwdRefQpDelta[i - 1] - (0.15 * p->fwdRefQpDelta[i - 1]);
538
+                p->fwdNonRefQpDelta[i] = p->fwdNonRefQpDelta[i - 1] - (0.15 * p->fwdNonRefQpDelta[i - 1]);
539
+                p->bwdRefQpDelta[i] = p->bwdRefQpDelta[i - 1] - (0.15 * p->bwdRefQpDelta[i - 1]);
540
+                p->bwdNonRefQpDelta[i] = p->bwdNonRefQpDelta[i - 1] - (0.15 * p->bwdNonRefQpDelta[i - 1]);
541
+            }
542
+        }
543
+        else if (36 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
544
+            , &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window1[1], &refQpDelta1[1], &nonRefQpDelta1[1]
545
+            , &window1[2], &refQpDelta1[2], &nonRefQpDelta1[2], &window1[3], &refQpDelta1[3], &nonRefQpDelta1[3]
546
+            , &window1[4], &refQpDelta1[4], &nonRefQpDelta1[4], &window1[5], &refQpDelta1[5], &nonRefQpDelta1[5]
547
+            , &window2[0], &refQpDelta2[0], &nonRefQpDelta2[0], &window2[1], &refQpDelta2[1], &nonRefQpDelta2[1]
548
+            , &window2[2], &refQpDelta2[2], &nonRefQpDelta2[2], &window2[3], &refQpDelta2[3], &nonRefQpDelta2[3]
549
+            , &window2[4], &refQpDelta2[4], &nonRefQpDelta2[4], &window2[5], &refQpDelta2[5], &nonRefQpDelta2[5]))
550
+        {
551
+            p->fwdMaxScenecutWindow = 0;
552
+            p->bwdMaxScenecutWindow = 0;
553
+            for (int i = 0; i < 6; i++)
554
+            {
555
+                p->fwdScenecutWindow[i] = window1[i];
556
+                p->fwdRefQpDelta[i] = refQpDelta1[i];
557
+                p->fwdNonRefQpDelta[i] = nonRefQpDelta1[i];
558
+                p->bwdScenecutWindow[i] = window2[i];
559
+                p->bwdRefQpDelta[i] = refQpDelta2[i];
560
+                p->bwdNonRefQpDelta[i] = nonRefQpDelta2[i];
561
+                p->fwdMaxScenecutWindow += p->fwdScenecutWindow[i];
562
+                p->bwdMaxScenecutWindow += p->bwdScenecutWindow[i];
563
+            }
564
+        }
565
+        else
566
+        {
567
+            x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
568
+            bError = true;
569
+        }
570
+    }
571
+    return bError;
572
+}
573
+
574
 void x265_copy_params(x265_param* dst, x265_param* src)
575
 {
576
     dst->cpuid = src->cpuid;
577
@@ -2440,10 +2590,13 @@
578
     dst->bRepeatHeaders = src->bRepeatHeaders;
579
     dst->bAnnexB = src->bAnnexB;
580
     dst->bEnableAccessUnitDelimiters = src->bEnableAccessUnitDelimiters;
581
+    dst->bEnableEndOfBitstream = src->bEnableEndOfBitstream;
582
+    dst->bEnableEndOfSequence = src->bEnableEndOfSequence;
583
     dst->bEmitInfoSEI = src->bEmitInfoSEI;
584
     dst->decodedPictureHashSEI = src->decodedPictureHashSEI;
585
     dst->bEnableTemporalSubLayers = src->bEnableTemporalSubLayers;
586
     dst->bOpenGOP = src->bOpenGOP;
587
+   dst->craNal = src->craNal;
588
     dst->keyframeMax = src->keyframeMax;
589
     dst->keyframeMin = src->keyframeMin;
590
     dst->bframes = src->bframes;
591
@@ -2541,8 +2694,11 @@
592
     dst->rc.rfConstantMin = src->rc.rfConstantMin;
593
     dst->rc.bStatWrite = src->rc.bStatWrite;
594
     dst->rc.bStatRead = src->rc.bStatRead;
595
+    dst->rc.dataShareMode = src->rc.dataShareMode;
596
     if (src->rc.statFileName) dst->rc.statFileName=strdup(src->rc.statFileName);
597
     else dst->rc.statFileName = NULL;
598
+    if (src->rc.sharedMemName) dst->rc.sharedMemName = strdup(src->rc.sharedMemName);
599
+    else dst->rc.sharedMemName = NULL;
600
     dst->rc.qblur = src->rc.qblur;
601
     dst->rc.complexityBlur = src->rc.complexityBlur;
602
     dst->rc.bEnableSlowFirstPass = src->rc.bEnableSlowFirstPass;
603
@@ -2550,6 +2706,7 @@
604
     dst->rc.zonefileCount = src->rc.zonefileCount;
605
     dst->reconfigWindowSize = src->reconfigWindowSize;
606
     dst->bResetZoneConfig = src->bResetZoneConfig;
607
+    dst->bNoResetZoneConfig = src->bNoResetZoneConfig;
608
     dst->decoderVbvMaxRate = src->decoderVbvMaxRate;
609
 
610
     if (src->rc.zonefileCount && src->rc.zones && src->bResetZoneConfig)
611
@@ -2557,6 +2714,7 @@
612
         for (int i = 0; i < src->rc.zonefileCount; i++)
613
         {
614
             dst->rc.zones[i].startFrame = src->rc.zones[i].startFrame;
615
+            dst->rc.zones[0].keyframeMax = src->rc.zones[0].keyframeMax;
616
             memcpy(dst->rc.zones[i].zoneParam, src->rc.zones[i].zoneParam, sizeof(x265_param));
617
         }
618
     }
619
@@ -2621,7 +2779,6 @@
620
     dst->bOptRefListLengthPPS = src->bOptRefListLengthPPS;
621
     dst->bMultiPassOptRPS = src->bMultiPassOptRPS;
622
     dst->scenecutBias = src->scenecutBias;
623
-    dst->edgeTransitionThreshold = src->edgeTransitionThreshold;
624
     dst->gopLookahead = src->lookaheadDepth;
625
     dst->bOptCUDeltaQP = src->bOptCUDeltaQP;
626
     dst->analysisMultiPassDistortion = src->analysisMultiPassDistortion;
627
@@ -2682,20 +2839,33 @@
628
     dst->bEnableSvtHevc = src->bEnableSvtHevc;
629
     dst->bEnableFades = src->bEnableFades;
630
     dst->bEnableSceneCutAwareQp = src->bEnableSceneCutAwareQp;
631
-    dst->fwdScenecutWindow = src->fwdScenecutWindow;
632
-    dst->fwdRefQpDelta = src->fwdRefQpDelta;
633
-    dst->fwdNonRefQpDelta = src->fwdNonRefQpDelta;
634
-    dst->bwdScenecutWindow = src->bwdScenecutWindow;
635
-    dst->bwdRefQpDelta = src->bwdRefQpDelta;
636
-    dst->bwdNonRefQpDelta = src->bwdNonRefQpDelta;
637
+    dst->fwdMaxScenecutWindow = src->fwdMaxScenecutWindow;
638
+    dst->bwdMaxScenecutWindow = src->bwdMaxScenecutWindow;
639
+    for (int i = 0; i < 6; i++)
640
+    {
641
+        dst->fwdScenecutWindow[i] = src->fwdScenecutWindow[i];
642
+        dst->fwdRefQpDelta[i] = src->fwdRefQpDelta[i];
643
+        dst->fwdNonRefQpDelta[i] = src->fwdNonRefQpDelta[i];
644
+        dst->bwdScenecutWindow[i] = src->bwdScenecutWindow[i];
645
+        dst->bwdRefQpDelta[i] = src->bwdRefQpDelta[i];
646
+        dst->bwdNonRefQpDelta[i] = src->bwdNonRefQpDelta[i];
647
+    }
648
     dst->bField = src->bField;
649
-
650
+    dst->bEnableTemporalFilter = src->bEnableTemporalFilter;
651
+    dst->temporalFilterStrength = src->temporalFilterStrength;
652
     dst->confWinRightOffset = src->confWinRightOffset;
653
     dst->confWinBottomOffset = src->confWinBottomOffset;
654
     dst->bliveVBV2pass = src->bliveVBV2pass;
655
+
656
+    if (src->videoSignalTypePreset) dst->videoSignalTypePreset = strdup(src->videoSignalTypePreset);
657
+    else dst->videoSignalTypePreset = NULL;
658
 #ifdef SVT_HEVC
659
     memcpy(dst->svtHevcParam, src->svtHevcParam, sizeof(EB_H265_ENC_CONFIGURATION));
660
 #endif
661
+    /* Film grain */
662
+    if (src->filmGrain)
663
+        dst->filmGrain = src->filmGrain;
664
+    dst->bEnableSBRC = src->bEnableSBRC;
665
 }
666
 
667
 #ifdef SVT_HEVC
668
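The parseMaskingStrength() function added above accepts either one window,refQpDelta,nonRefQpDelta triple per direction or six explicit triples. A minimal standalone sketch of the three-value form (variable names are illustrative, not x265's): the single window is split into six equal sub-windows and both QP deltas decay by 15% per sub-window.

    #include <cstdio>

    int main()
    {
        const char* value = "800,5,6";           // window,refQpDelta,nonRefQpDelta
        int window = 0;
        double refQpDelta = 0, nonRefQpDelta = 0;
        if (sscanf(value, "%d,%lf,%lf", &window, &refQpDelta, &nonRefQpDelta) != 3)
            return 1;                            // incomplete triple is rejected, as in the hunk above

        int win[6];
        double refDelta[6], nonRefDelta[6];
        win[0] = window / 6;
        refDelta[0] = refQpDelta;
        nonRefDelta[0] = nonRefQpDelta;
        for (int i = 1; i < 6; i++)
        {
            win[i] = window / 6;                                          // equal sub-windows
            refDelta[i] = refDelta[i - 1] - 0.15 * refDelta[i - 1];       // 15% decay per step
            nonRefDelta[i] = nonRefDelta[i - 1] - 0.15 * nonRefDelta[i - 1];
        }
        for (int i = 0; i < 6; i++)
            printf("sub-window %d: %d frames, refQpDelta %.2f, nonRefQpDelta %.2f\n",
                   i, win[i], refDelta[i], nonRefDelta[i]);
        return 0;
    }
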
x265_3.5.tar.gz/source/common/param.h -> x265_3.6.tar.gz/source/common/param.h Changed
17
 
1
@@ -38,6 +38,7 @@
2
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
3
 bool  parseLambdaFile(x265_param *param);
4
 void x265_copy_params(x265_param* dst, x265_param* src);
5
+bool parseMaskingStrength(x265_param* p, const char* value);
6
 
7
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
8
 static const char * const logLevelNames = { "none", "error", "warning", "info", "debug", "full", 0 };
9
@@ -52,6 +53,7 @@
10
 int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
11
 int x265_param_apply_profile(x265_param *, const char *profile);
12
 int x265_param_parse(x265_param *p, const char *name, const char *value);
13
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
14
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
15
 #define PARAM_NS X265_NS
16
 #endif
17
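For applications that drive libx265 through its public API, the options wired up in the param.cpp hunks can be set as in the minimal sketch below (this assumes building against the 3.6 headers and library, that x265_scenecut_aware_qp_param_parse declared above is visible to the caller, and that grain.fgc is only a placeholder file name).

    #include <x265.h>

    int main()
    {
        x265_param* param = x265_param_alloc();
        x265_param_default_preset(param, "medium", NULL);

        /* options introduced or changed in 3.6 (see the param.cpp hunks above) */
        x265_param_parse(param, "sbrc", "1");               /* segment based rate control */
        x265_param_parse(param, "mcstf", "1");              /* motion compensated spatio-temporal filter */
        x265_param_parse(param, "hist-scenecut", "1");      /* histogram based scene change detection */
        x265_param_parse(param, "temporal-layers", "3");    /* now an integer count, no longer a bool */
        x265_param_parse(param, "film-grain", "grain.fgc"); /* placeholder FGC model file name */

        /* dedicated entry point declared above for scenecut-aware QP options */
        x265_scenecut_aware_qp_param_parse(param, "scenecut-aware-qp", "1");
        x265_scenecut_aware_qp_param_parse(param, "masking-strength", "800,5,6");

        x265_param_free(param);
        return 0;
    }
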
x265_3.5.tar.gz/source/common/piclist.cpp -> x265_3.6.tar.gz/source/common/piclist.cpp Changed
134
 
1
@@ -45,6 +45,25 @@
2
     m_count++;
3
 }
4
 
5
+void PicList::pushFrontMCSTF(Frame& curFrame)
6
+{
7
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_nextMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
8
+    curFrame.m_nextMCSTF = m_start;
9
+    curFrame.m_prevMCSTF = NULL;
10
+
11
+    if (m_count)
12
+    {
13
+        m_start->m_prevMCSTF = &curFrame;
14
+        m_start = &curFrame;
15
+    }
16
+    else
17
+    {
18
+        m_start = m_end = &curFrame;
19
+    }
20
+    m_count++;
21
+
22
+}
23
+
24
 void PicList::pushBack(Frame& curFrame)
25
 {
26
     X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
27
@@ -63,6 +82,24 @@
28
     m_count++;
29
 }
30
 
31
+void PicList::pushBackMCSTF(Frame& curFrame)
32
+{
33
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
34
+    curFrame.m_nextMCSTF = NULL;
35
+    curFrame.m_prevMCSTF = m_end;
36
+
37
+    if (m_count)
38
+    {
39
+        m_end->m_nextMCSTF = &curFrame;
40
+        m_end = &curFrame;
41
+    }
42
+    else
43
+    {
44
+        m_start = m_end = &curFrame;
45
+    }
46
+    m_count++;
47
+}
48
+
49
 Frame *PicList::popFront()
50
 {
51
     if (m_start)
52
@@ -94,6 +131,14 @@
53
     return curFrame;
54
 }
55
 
56
+Frame* PicList::getPOCMCSTF(int poc)
57
+{
58
+    Frame *curFrame = m_start;
59
+    while (curFrame && curFrame->m_poc != poc)
60
+        curFrame = curFrame->m_nextMCSTF;
61
+    return curFrame;
62
+}
63
+
64
 Frame *PicList::popBack()
65
 {
66
     if (m_end)
67
@@ -117,6 +162,29 @@
68
         return NULL;
69
 }
70
 
71
+Frame *PicList::popBackMCSTF()
72
+{
73
+    if (m_end)
74
+    {
75
+        Frame* temp = m_end;
76
+        m_count--;
77
+
78
+        if (m_count)
79
+        {
80
+            m_end = m_end->m_prevMCSTF;
81
+            m_end->m_nextMCSTF = NULL;
82
+        }
83
+        else
84
+        {
85
+            m_start = m_end = NULL;
86
+        }
87
+        temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
88
+        return temp;
89
+    }
90
+    else
91
+        return NULL;
92
+}
93
+
94
 Frame* PicList::getCurFrame(void)
95
 {
96
     Frame *curFrame = m_start;
97
@@ -158,3 +226,36 @@
98
 
99
     curFrame.m_next = curFrame.m_prev = NULL;
100
 }
101
+
102
+void PicList::removeMCSTF(Frame& curFrame)
103
+{
104
+#if _DEBUG
105
+    Frame *tmp = m_start;
106
+    while (tmp && tmp != &curFrame)
107
+    {
108
+        tmp = tmp->m_nextMCSTF;
109
+    }
110
+
111
+    X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
112
+#endif
113
+
114
+    m_count--;
115
+    if (m_count)
116
+    {
117
+        if (m_start == &curFrame)
118
+            m_start = curFrame.m_nextMCSTF;
119
+        if (m_end == &curFrame)
120
+            m_end = curFrame.m_prevMCSTF;
121
+
122
+        if (curFrame.m_nextMCSTF)
123
+            curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
124
+        if (curFrame.m_prevMCSTF)
125
+            curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
126
+    }
127
+    else
128
+    {
129
+        m_start = m_end = NULL;
130
+    }
131
+
132
+    curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
133
+}
134
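The *MCSTF variants added above let a Frame sit in the ordinary picture list and in an MCSTF list at the same time, because each list walks its own pair of link pointers. A toy sketch of that idea with simplified types (not x265's Frame/PicList):

    #include <cstdio>

    struct Node
    {
        int   poc;
        Node* next;      Node* prev;        // links used by the "normal" list
        Node* nextMCSTF; Node* prevMCSTF;   // independent links used by the MCSTF list
        Node(int p) : poc(p), next(0), prev(0), nextMCSTF(0), prevMCSTF(0) {}
    };

    struct IntrusiveList
    {
        Node* head;
        Node* Node::*nextPtr;               // which link pair this list traverses
        Node* Node::*prevPtr;
        IntrusiveList(Node* Node::*n, Node* Node::*p) : head(0), nextPtr(n), prevPtr(p) {}

        void pushFront(Node& n)
        {
            n.*nextPtr = head;
            n.*prevPtr = 0;
            if (head)
                head->*prevPtr = &n;
            head = &n;
        }
    };

    int main()
    {
        Node a(1), b(2);
        IntrusiveList encodeList(&Node::next, &Node::prev);
        IntrusiveList mcstfList(&Node::nextMCSTF, &Node::prevMCSTF);
        encodeList.pushFront(a); encodeList.pushFront(b);    // both frames in encode order
        mcstfList.pushFront(a);                              // only one of them in the MCSTF list
        for (Node* n = encodeList.head; n; n = n->next)      printf("encode POC %d\n", n->poc);
        for (Node* n = mcstfList.head;  n; n = n->nextMCSTF) printf("mcstf  POC %d\n", n->poc);
        return 0;
    }
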
x265_3.5.tar.gz/source/common/piclist.h -> x265_3.6.tar.gz/source/common/piclist.h Changed
33
 
1
@@ -49,24 +49,31 @@
2
 
3
     /** Push picture to end of the list */
4
     void pushBack(Frame& pic);
5
+    void pushBackMCSTF(Frame& pic);
6
 
7
     /** Push picture to beginning of the list */
8
     void pushFront(Frame& pic);
9
+    void pushFrontMCSTF(Frame& pic);
10
 
11
     /** Pop picture from end of the list */
12
     Frame* popBack();
13
+    Frame* popBackMCSTF();
14
 
15
     /** Pop picture from beginning of the list */
16
     Frame* popFront();
17
 
18
     /** Find frame with specified POC */
19
     Frame* getPOC(int poc);
20
+    /* Find next MCSTF frame with specified POC */
21
+    Frame* getPOCMCSTF(int poc);
22
 
23
     /** Get the current Frame from the list **/
24
     Frame* getCurFrame(void);
25
 
26
     /** Remove picture from list */
27
     void remove(Frame& pic);
28
+    /* Remove MCSTF picture from list */
29
+    void removeMCSTF(Frame& pic);
30
 
31
     Frame* first()        { return m_start;   }
32
 
33
x265_3.5.tar.gz/source/common/picyuv.cpp -> x265_3.6.tar.gz/source/common/picyuv.cpp Changed
60
 
1
@@ -125,6 +125,58 @@
2
     return false;
3
 }
4
 
5
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
6
+void PicYuv::copyFromFrame(PicYuv* source)
7
+{
8
+    uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
9
+
10
+    int maxHeight = numCuInHeight * m_param->maxCUSize;
11
+    memcpy(m_picBuf[0], source->m_picBuf[0], sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
12
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
13
+
14
+    if (m_picCsp != X265_CSP_I400)
15
+    {
16
+        memcpy(m_picBuf[1], source->m_picBuf[1], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
17
+        memcpy(m_picBuf[2], source->m_picBuf[2], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
18
+
19
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
20
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
21
+    }
22
+    else
23
+    {
24
+        m_picBuf[1] = m_picBuf[2] = NULL;
25
+        m_picOrg[1] = m_picOrg[2] = NULL;
26
+    }
27
+}
28
+
29
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
30
+{
31
+    m_param = param;
32
+    m_picWidth = m_param->sourceWidth / scaleFactor;
33
+    m_picHeight = m_param->sourceHeight / scaleFactor;
34
+
35
+    m_picCsp = m_param->internalCsp;
36
+    m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
37
+    m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
38
+
39
+    uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
40
+    uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
41
+
42
+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
43
+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
44
+    m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
45
+
46
+    int maxHeight = numCuInHeight * param->maxCUSize;
47
+    CHECKED_MALLOC_ZERO(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
48
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
49
+    m_picBuf[1] = m_picBuf[2] = NULL;
50
+    m_picOrg[1] = m_picOrg[2] = NULL;
51
+    return true;
52
+
53
+fail:
54
+    return false;
55
+}
56
+
57
 int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
58
 {
59
     m_picWidth = picWidth;
60
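To make the geometry in createScaledPicYUV() concrete, here is the same arithmetic worked for a 1920x1080 source, scale factor 2, 64-pixel CTUs and the 128-pixel ME margins hard-coded above (plain arithmetic, no x265 types):

    #include <cstdio>

    int main()
    {
        const int sourceWidth = 1920, sourceHeight = 1080;
        const int maxCUSize = 64, scaleFactor = 2;
        const int lumaMarginX = 128, lumaMarginY = 128;                // ME search margins, as above

        int picWidth  = sourceWidth  / scaleFactor;                    // 960
        int picHeight = sourceHeight / scaleFactor;                    // 540
        int numCuInWidth  = (picWidth  + maxCUSize - 1) / maxCUSize;   // 15
        int numCuInHeight = (picHeight + maxCUSize - 1) / maxCUSize;   // 9
        int stride    = numCuInWidth * maxCUSize + 2 * lumaMarginX;    // 1216
        int maxHeight = numCuInHeight * maxCUSize;                     // 576
        long long lumaPixels = (long long)stride * (maxHeight + 2 * lumaMarginY);

        printf("stride=%d rows=%d luma buffer=%lld pixels\n",
               stride, maxHeight + 2 * lumaMarginY, lumaPixels);       // 1216 x 832 = 1011712
        return 0;
    }
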
x265_3.5.tar.gz/source/common/picyuv.h -> x265_3.6.tar.gz/source/common/picyuv.h Changed
15
 
1
@@ -78,11 +78,13 @@
2
     PicYuv();
3
 
4
     bool  create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
5
+    bool  createScaledPicYUV(x265_param* param, uint8_t scaleFactor);
6
     bool  createOffsets(const SPS& sps);
7
     void  destroy();
8
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
9
 
10
     void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
11
+    void  copyFromFrame(PicYuv* source);
12
 
13
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
14
 
15
x265_3.5.tar.gz/source/common/pixel.cpp -> x265_3.6.tar.gz/source/common/pixel.cpp Changed
51
 
1
@@ -266,7 +266,7 @@
2
 {
3
     int satd = 0;
4
 
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
6
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
7
     pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
8
 #endif
9
 
10
@@ -284,7 +284,7 @@
11
 {
12
     int satd = 0;
13
 
14
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
15
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
16
     pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
17
 #endif
18
 
19
@@ -627,6 +627,23 @@
20
     }
21
 }
22
 
23
+static
24
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
25
+{
26
+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
27
+    {
28
+        const pixel *inRow = src0;
29
+        const pixel *inRowBelow = src0 + src_stride;
30
+        pixel *target = dst0;
31
+        for (int x = 0; x < width; x++)
32
+        {
33
+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;
34
+            inRow += 2;
35
+            inRowBelow += 2;
36
+        }
37
+    }
38
+}
39
+
40
 /* structural similarity metric */
41
 static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
42
 {
43
@@ -1355,5 +1372,7 @@
44
     p.cu[BLOCK_16x16].normFact = normFact_c;
45
     p.cu[BLOCK_32x32].normFact = normFact_c;
46
     p.cu[BLOCK_64x64].normFact = normFact_c;
47
+    /* SubSample Luma*/
48
+    p.frameSubSampleLuma = frame_subsample_luma;
49
 }
50
 }
51
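The C fallback frame_subsample_luma() added above halves a luma plane with a 2x2 box filter, rounding each of the two averaging stages. A self-contained sketch of the same arithmetic on 8-bit samples (ordinary C++, not the optimized x265 primitive):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static void subsampleLuma(const uint8_t* src, uint8_t* dst,
                              ptrdiff_t srcStride, ptrdiff_t dstStride,
                              int dstWidth, int dstHeight)
    {
        for (int y = 0; y < dstHeight; y++, src += 2 * srcStride, dst += dstStride)
        {
            const uint8_t* row      = src;
            const uint8_t* rowBelow = src + srcStride;
            for (int x = 0; x < dstWidth; x++, row += 2, rowBelow += 2)
            {
                // vertical average of each column pair, then horizontal average, rounding both stages
                dst[x] = (uint8_t)((((row[0] + rowBelow[0] + 1) >> 1) +
                                    ((row[1] + rowBelow[1] + 1) >> 1) + 1) >> 1);
            }
        }
    }

    int main()
    {
        const int w = 8, h = 8;
        std::vector<uint8_t> full(w * h, 100), half((w / 2) * (h / 2), 0);
        subsampleLuma(full.data(), half.data(), w, w / 2, w / 2, h / 2);
        printf("subsampled[0] = %d\n", half[0]);   // 100: a flat plane stays flat
        return 0;
    }
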
x265_3.5.tar.gz/source/common/ppc/intrapred_altivec.cpp -> x265_3.6.tar.gz/source/common/ppc/intrapred_altivec.cpp Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include <assert.h>
3
 #include <math.h>
4
 #include <cmath>
5
-#include <linux/types.h>
6
+#include <sys/types.h>
7
 #include <stdlib.h>
8
 #include <stdio.h>
9
 #include <stdint.h>
10
x265_3.5.tar.gz/source/common/primitives.h -> x265_3.6.tar.gz/source/common/primitives.h Changed
28
 
1
@@ -232,6 +232,8 @@
2
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
3
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
4
 typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
5
+/* SubSampling Luma */
6
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
 /* Function pointers to optimized encoder primitives. Each pointer can reference
8
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
9
 struct EncoderPrimitives
10
@@ -353,6 +355,8 @@
11
 
12
     downscale_t           frameInitLowres;
13
     downscale_t           frameInitLowerRes;
14
+    /* Sub Sample Luma */
15
+    downscaleluma_t        frameSubSampleLuma;
16
     cutree_propagate_cost propagateCost;
17
     cutree_fix8_unpack    fix8Unpack;
18
     cutree_fix8_pack      fix8Pack;
19
@@ -488,7 +492,7 @@
20
 
21
 #if ENABLE_ASSEMBLY && X265_ARCH_ARM64
22
 extern "C" {
23
-#include "aarch64/pixel-util.h"
24
+#include "aarch64/fun-decls.h"
25
 }
26
 #endif
27
 
28
x265_3.6.tar.gz/source/common/ringmem.cpp Added
359
 
1
@@ -0,0 +1,357 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#include "ringmem.h"
26
+
27
+#ifndef _WIN32
28
+#include <sys/mman.h>
29
+#endif ////< _WIN32
30
+
31
+#ifdef _WIN32
32
+#define X265_SHARED_MEM_NAME                    "Local\\_x265_shr_mem_"
33
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "_x265_semW_"
34
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "_x265_semR_"
35
+#else /* POSIX / pthreads */
36
+#define X265_SHARED_MEM_NAME                    "/tmp/_x265_shr_mem_"
37
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "/tmp/_x265_semW_"
38
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "/tmp/_x265_semR_"
39
+#endif
40
+
41
+#define RINGMEM_ALLIGNMENT                       64
42
+
43
+namespace X265_NS {
44
+    RingMem::RingMem() 
45
+        : m_initialized(false)
46
+        , m_protectRW(false)
47
+        , m_itemSize(0)
48
+        , m_itemCnt(0)
49
+        , m_dataPool(NULL)
50
+        , m_shrMem(NULL)
51
+#ifdef _WIN32
52
+        , m_handle(NULL)
53
+#else //_WIN32
54
+        , m_filepath(NULL)
55
+#endif //_WIN32
56
+        , m_writeSem(NULL)
57
+        , m_readSem(NULL)
58
+    {
59
+    }
60
+
61
+
62
+    RingMem::~RingMem()
63
+    {
64
+    }
65
+
66
+    bool RingMem::skipRead(int32_t cnt) {
67
+        if (!m_initialized)
68
+        {
69
+            return false;
70
+        }
71
+
72
+        if (m_protectRW)
73
+        {
74
+            for (int i = 0; i < cnt; i++)
75
+            {
76
+                m_readSem->take();
77
+            }
78
+        }
79
+        
80
+        ATOMIC_ADD(&m_shrMem->m_read, cnt);
81
+
82
+        if (m_protectRW)
83
+        {
84
+            m_writeSem->give(cnt);
85
+        }
86
+
87
+        return true;
88
+    }
89
+
90
+    bool RingMem::skipWrite(int32_t cnt) {
91
+        if (!m_initialized)
92
+        {
93
+            return false;
94
+        }
95
+
96
+        if (m_protectRW)
97
+        {
98
+            for (int i = 0; i < cnt; i++)
99
+            {
100
+                m_writeSem->take();
101
+            }
102
+        }
103
+
104
+        ATOMIC_ADD(&m_shrMem->m_write, cnt);
105
+
106
+        if (m_protectRW)
107
+        {
108
+            m_readSem->give(cnt);
109
+        }
110
+
111
+        return true;
112
+    }
113
+
114
+    ///< initialize
115
+    bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
116
+    {
117
+        ///< check parameters
118
+        if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
119
+        {
120
+            ///< invalid parameters 
121
+            return false;
122
+        }
123
+
124
+        if (!m_initialized)
125
+        {
126
+            ///< formating names
127
+            char nameBuf[MAX_SHR_NAME_LEN] = { 0 };
128
+
129
+            ///< shared memory name
130
+            snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
131
+
132
+            ///< create or open shared memory
133
+            bool newCreated = false;
134
+
135
+            ///< calculate the size of the shared memory
136
+            int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
137
+
138
+#ifdef _WIN32
139
+            HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
140
+            if (!h)
141
+            {
142
+                h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
143
+
144
+                if (!h)
145
+                {
146
+                    return false;
147
+                }
148
+
149
+                newCreated = true;
150
+            }
151
+
152
+            void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
153
+
154
+            ///< should not close the handle here, otherwise the OpenFileMapping would fail
155
+            //CloseHandle(h);
156
+            m_handle = h;
157
+
158
+            if (!pool)
159
+            {
160
+                return false;
161
+            }
162
+
163
+#else /* POSIX / pthreads */
164
+            mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
165
+            int flag = O_RDWR;
166
+            int shrfd = -1;
167
+            if ((shrfd = open(nameBuf, flag, mode)) < 0)
168
+            {
169
+                flag |= O_CREAT;
170
+                
171
+                shrfd = open(nameBuf, flag, mode);
172
+                if (shrfd < 0)
173
+                {
174
+                    return false;
175
+                }
176
+                newCreated = true;
177
+
178
+                lseek(shrfd, shrMemSize - 1, SEEK_SET);
179
+
180
+                if (-1 == write(shrfd, "\0", 1))
181
+                {
182
+                    close(shrfd);
183
+                    return false;
184
+                }
185
+
186
+                if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
187
+                {
188
+                    close(shrfd);
189
+                    return false;
190
+                }
191
+            }
192
+
193
+            void *pool = mmap(0,
194
+                shrMemSize,
195
+                PROT_READ | PROT_WRITE,
196
+                MAP_SHARED,
197
+                shrfd,
198
+                0);
199
+
200
+            close(shrfd);
201
+            if (pool == MAP_FAILED)
202
+            {               
203
+                return false;
204
+            }
205
+
206
+            m_filepath = strdup(nameBuf);
207
+#endif ///< _WIN32
208
+
209
+            if (newCreated)
210
+            {
211
+                memset(pool, 0, shrMemSize);
212
+            }
213
+            
214
+            m_shrMem = reinterpret_cast<ShrMemCtrl *>(pool);
215
+            m_dataPool = reinterpret_cast<uint8_t *>(pool) + sizeof(ShrMemCtrl);
216
+            m_itemSize = itemSize;
217
+            m_itemCnt = itemCnt;
218
+            m_initialized = true;
219
+
220
+            if (protectRW)
221
+            {
222
+                m_protectRW = true;
223
+                m_writeSem = new NamedSemaphore();
224
+                if (!m_writeSem)
225
+                {
226
+                    release();
227
+                    return false;
228
+                }
229
+
230
+                ///< shared memory name
231
+                snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SEMAPHORE_RINGMEM_WRITER_NAME, name);
232
+                if (!m_writeSem->create(nameBuf, m_itemCnt, m_itemCnt))
233
+                {
234
+                    release();
235
+                    return false;
236
+                }
237
+
238
+                m_readSem = new NamedSemaphore();
239
+                if (!m_readSem)
240
+                {
241
+                    release();
242
+                    return false;
243
+                }
244
+
245
+                ///< shared memory name
246
+                snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SEMAPHORE_RINGMEM_READER_NAME, name);
247
+                if (!m_readSem->create(nameBuf, 0, m_itemCnt))
248
+                {
249
+                    release();
250
+                    return false;
251
+                }
252
+            }
253
+        }
254
+
255
+        return true;
256
+    }
257
+    ///< finalize
258
+    void RingMem::release()
259
+    {
260
+        if (m_initialized)
261
+        {
262
+            m_initialized = false;
263
+
264
+            if (m_shrMem)
265
+            {
266
+#ifdef _WIN32
267
+                UnmapViewOfFile(m_shrMem);
268
+                CloseHandle(m_handle);
269
+                m_handle = NULL;
270
+#else /* POSIX / pthreads */
271
+                int32_t shrMemSize = (m_itemSize * m_itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & (~RINGMEM_ALLIGNMENT - 1);
272
+                munmap(m_shrMem, shrMemSize);
273
+                unlink(m_filepath);
274
+                free(m_filepath);
275
+                m_filepath = NULL;
276
+#endif ///< _WIN32
277
+                m_shrMem = NULL;
278
+                m_dataPool = NULL;
279
+                m_itemSize = 0;
280
+                m_itemCnt = 0;
281
+            }
282
+            
283
+            if (m_protectRW)
284
+            {
285
+                m_protectRW = false;
286
+                if (m_writeSem)
287
+                {
288
+                    m_writeSem->release();
289
+
290
+                    delete m_writeSem;
291
+                    m_writeSem = NULL;
292
+                }
293
+
294
+                if (m_readSem)
295
+                {
296
+                    m_readSem->release();
297
+
298
+                    delete m_readSem;
299
+                    m_readSem = NULL;
300
+                }
301
+            }
302
+
303
+        }
304
+    }
305
+
306
+    ///< data read
307
+    bool RingMem::readNext(void* dst, fnRWSharedData callback)
308
+    {
309
+        if (!m_initialized || !callback || !dst)
310
+        {
311
+            return false;
312
+        }
313
+
314
+        if (m_protectRW)
315
+        {
316
+            if (!m_readSem->take())
317
+            {
318
+                return false;
319
+            }
320
+        }
321
+
322
+        int32_t index = ATOMIC_ADD(&m_shrMem->m_read, 1) % m_itemCnt;
323
+        (*callback)(dst, reinterpret_cast<uint8_t *>(m_dataPool) + index * m_itemSize, m_itemSize);
324
+
325
+        if (m_protectRW)
326
+        {
327
+            m_writeSem->give(1);
328
+        }
329
+
330
+        return true;
331
+    }
332
+    ///< data write
333
+    bool RingMem::writeData(void *data, fnRWSharedData callback)
334
+    {
335
+        if (!m_initialized || !data || !callback)
336
+        {
337
+            return false;
338
+        }
339
+
340
+        if (m_protectRW)
341
+        {
342
+            if (!m_writeSem->take())
343
+            {
344
+                return false;
345
+            }
346
+        }
347
+
348
+        int32_t index = ATOMIC_ADD(&m_shrMem->m_write, 1) % m_itemCnt;
349
+        (*callback)(reinterpret_cast<uint8_t *>(m_dataPool) + index * m_itemSize, data, m_itemSize);
350
+
351
+        if (m_protectRW)
352
+        {
353
+            m_readSem->give(1);
354
+        }
355
+
356
+        return true;
357
+    }
358
+}
359
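writeData() and readNext() above advance free-running write/read counters and select the slot with counter % itemCnt; the named semaphores only gate fullness and emptiness. The core indexing can be sketched in a single process, without shared memory or semaphores (illustrative only):

    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct Ring
    {
        int itemSize, itemCnt;
        int readIdx, writeIdx;
        std::vector<unsigned char> pool;

        Ring(int size, int cnt)
            : itemSize(size), itemCnt(cnt), readIdx(0), writeIdx(0), pool(size * cnt) {}

        bool write(const void* src)
        {
            if (writeIdx - readIdx >= itemCnt)        // full: RingMem would block on m_writeSem here
                return false;
            std::memcpy(&pool[(writeIdx++ % itemCnt) * itemSize], src, itemSize);
            return true;
        }
        bool read(void* dst)
        {
            if (readIdx == writeIdx)                  // empty: RingMem would block on m_readSem here
                return false;
            std::memcpy(dst, &pool[(readIdx++ % itemCnt) * itemSize], itemSize);
            return true;
        }
    };

    int main()
    {
        Ring ring(sizeof(int), 4);
        for (int i = 0; i < 6; i++)
            ring.write(&i);                           // the 5th and 6th writes are refused (ring full)
        int v;
        while (ring.read(&v))
            printf("%d\n", v);                        // prints 0 1 2 3
        return 0;
    }
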
x265_3.6.tar.gz/source/common/ringmem.h Added
92
 
1
@@ -0,0 +1,90 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_RINGMEM_H
26
+#define X265_RINGMEM_H
27
+
28
+#include "common.h"
29
+#include "threading.h"
30
+
31
+#if _MSC_VER
32
+#define snprintf _snprintf
33
+#define strdup _strdup
34
+#endif
35
+
36
+namespace X265_NS {
37
+
38
+#define MAX_SHR_NAME_LEN                         256
39
+
40
+    class RingMem {
41
+    public:
42
+        RingMem();
43
+        ~RingMem();
44
+
45
+        bool skipRead(int32_t cnt);
46
+
47
+        bool skipWrite(int32_t cnt);
48
+
49
+        ///< initialize
50
+        ///< protectRW: if use the semaphore the protect the write and read operation.
51
+        bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
52
+        ///< finalize
53
+        void release();
54
+
55
+        typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
56
+
57
+        ///< data read
58
+        bool readNext(void* dst, fnRWSharedData callback);
59
+        ///< data write
60
+        bool writeData(void *data, fnRWSharedData callback);
61
+
62
+    private:        
63
+        bool    m_initialized;
64
+        bool    m_protectRW;
65
+
66
+        int32_t m_itemSize;
67
+        int32_t m_itemCnt;
68
+        ///< data pool
69
+        void   *m_dataPool;
70
+        typedef struct {
71
+            ///< index to write
72
+            int32_t m_write;
73
+            ///< index to read
74
+            int32_t m_read;
75
+            
76
+        }ShrMemCtrl;
77
+
78
+        ShrMemCtrl *m_shrMem;
79
+#ifdef _WIN32
80
+        void       *m_handle;
81
+#else // _WIN32
82
+        char       *m_filepath;
83
+#endif // _WIN32
84
+
85
+        ///< Semaphores
86
+        NamedSemaphore *m_writeSem;
87
+        NamedSemaphore *m_readSem;
88
+    };
89
+};
90
+
91
+#endif // ifndef X265_RINGMEM_H
92
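A hypothetical caller of the RingMem class declared above (the payload struct and ring name are invented for illustration; only member functions from this header are used, and the copy callback matches the fnRWSharedData typedef):

    #include "ringmem.h"
    #include <cstring>

    using namespace X265_NS;

    struct StatItem { int poc; double qp; };

    /* matches RingMem::fnRWSharedData: raw byte copy between ring slot and caller */
    static void copyItem(void* dst, void* src, int32_t size)
    {
        memcpy(dst, src, size);
    }

    int main()
    {
        RingMem ring;
        if (!ring.init(sizeof(StatItem), 64, "demo_ring", true))  /* 64 slots, reader/writer semaphores */
            return 1;

        StatItem out = { 0, 32.0 };
        ring.writeData(&out, copyItem);      /* producer side */

        StatItem in = { 0, 0.0 };
        ring.readNext(&in, copyItem);        /* consumer side, possibly in another process */

        ring.release();
        return 0;
    }
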
x265_3.5.tar.gz/source/common/slice.h -> x265_3.6.tar.gz/source/common/slice.h Changed
35
 
1
@@ -156,9 +156,9 @@
2
     HRDInfo          hrdParameters;
3
     ProfileTierLevel ptl;
4
     uint32_t         maxTempSubLayers;
5
-    uint32_t         numReorderPics;
6
-    uint32_t         maxDecPicBuffering;
7
-    uint32_t         maxLatencyIncrease;
8
+    uint32_t         numReorderPics[MAX_T_LAYERS];
9
+    uint32_t         maxDecPicBuffering[MAX_T_LAYERS];
10
+    uint32_t         maxLatencyIncrease[MAX_T_LAYERS];
11
 };
12
 
13
 struct Window
14
@@ -235,9 +235,9 @@
15
     uint32_t maxAMPDepth;
16
 
17
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
18
-    uint32_t maxDecPicBuffering; // these are dups of VPS values
19
-    uint32_t maxLatencyIncrease;
20
-    int      numReorderPics;
21
+    uint32_t maxDecPicBuffering[MAX_T_LAYERS]; // these are dups of VPS values
22
+    uint32_t maxLatencyIncrease[MAX_T_LAYERS];
23
+    int      numReorderPics[MAX_T_LAYERS];
24
 
25
     RPS      spsrps[MAX_NUM_SHORT_TERM_RPS];
26
     int      spsrpsNum;
27
@@ -363,6 +363,7 @@
28
     int         m_iNumRPSInSPS;
29
     const x265_param *m_param;
30
     int         m_fieldNum;
31
+    Frame*      m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
32
 
33
     Slice()
34
     {
35
x265_3.6.tar.gz/source/common/temporalfilter.cpp Added
1019
 
1
@@ -0,0 +1,1017 @@
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+#include "common.h"
25
+#include "temporalfilter.h"
26
+#include "primitives.h"
27
+
28
+#include "frame.h"
29
+#include "slice.h"
30
+#include "framedata.h"
31
+#include "analysis.h"
32
+
33
+using namespace X265_NS;
34
+
35
+void OrigPicBuffer::addPicture(Frame* inFrame)
36
+{
37
+    m_mcstfPicList.pushFrontMCSTF(*inFrame);
38
+}
39
+
40
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
41
+{
42
+    m_mcstfOrigPicFreeList.pushFrontMCSTF(*inFrame);
43
+}
44
+
45
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
46
+{
47
+    m_mcstfOrigPicList.pushFrontMCSTF(*inFrame);
48
+}
49
+
50
+OrigPicBuffer::~OrigPicBuffer()
51
+{
52
+    while (!m_mcstfOrigPicList.empty())
53
+    {
54
+        Frame* curFrame = m_mcstfOrigPicList.popBackMCSTF();
55
+        curFrame->destroy();
56
+        delete curFrame;
57
+    }
58
+
59
+    while (!m_mcstfOrigPicFreeList.empty())
60
+    {
61
+        Frame* curFrame = m_mcstfOrigPicFreeList.popBackMCSTF();
62
+        curFrame->destroy();
63
+        delete curFrame;
64
+    }
65
+}
66
+
67
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
68
+{
69
+    Slice* slice = inFrame->m_encData->m_slice;
70
+    uint8_t j = 0;
71
+    for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
72
+        iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
73
+    {
74
+        if (iterPOC != inFrame->m_poc)
75
+        {
76
+            if (iterPOC < 0)
77
+                continue;
78
+            if (iterPOC >= frameCnt)
79
+                break;
80
+
81
+            Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
82
+            X265_CHECK(iterFrame, "Reference frame not found in OPB");
83
+            if (iterFrame != NULL)
84
+            {
85
+                slice->m_mcstfRefFrameList[1][j] = iterFrame;
86
+                iterFrame->m_refPicCnt[1]--;
87
+            }
88
+
89
+            iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
90
+            if (iterFrame != NULL)
91
+            {
92
+
93
+                slice->m_mcstfRefFrameList[1][j] = iterFrame;
94
+
95
+                iterFrame->m_refPicCnt[1]--;
96
+                Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
97
+                X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
98
+                cFrame->m_refPicCnt[1]--;
99
+            }
100
+            j++;
101
+        }
102
+    }
103
+}
104
+
105
+void OrigPicBuffer::recycleOrigPicList()
106
+{
107
+    Frame *iterFrame = m_mcstfPicList.first();
108
+
109
+    while (iterFrame)
110
+    {
111
+        Frame *curFrame = iterFrame;
112
+        iterFrame = iterFrame->m_nextMCSTF;
113
+        if (!curFrame->m_refPicCnt[1])
114
+        {
115
+            m_mcstfPicList.removeMCSTF(*curFrame);
116
+            iterFrame = m_mcstfPicList.first();
117
+        }
118
+    }
119
+
120
+    iterFrame = m_mcstfOrigPicList.first();
121
+
122
+    while (iterFrame)
123
+    {
124
+        Frame *curFrame = iterFrame;
125
+        iterFrame = iterFrame->m_nextMCSTF;
126
+        if (!curFrame->m_refPicCnt[1])
127
+        {
128
+            m_mcstfOrigPicList.removeMCSTF(*curFrame);
129
+            *curFrame->m_isSubSampled = false;
130
+            m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
131
+            iterFrame = m_mcstfOrigPicList.first();
132
+        }
133
+    }
134
+}
135
+
136
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
137
+{
138
+    m_mcstfOrigPicFreeList.pushBack(*inFrame);
139
+}
140
+
141
+TemporalFilter::TemporalFilter()
142
+{
143
+    m_sourceWidth = 0;
144
+    m_sourceHeight = 0,
145
+    m_QP = 0;
146
+    m_sliceTypeConfig = 3;
147
+    m_numRef = 0;
148
+    m_useSADinME = 1;
149
+
150
+    m_range = 2;
151
+    m_chromaFactor = 0.55;
152
+    m_sigmaMultiplier = 9.0;
153
+    m_sigmaZeroPoint = 10.0;
154
+    m_motionVectorFactor = 16;
155
+}
156
+
157
+void TemporalFilter::init(const x265_param* param)
158
+{
159
+    m_param = param;
160
+    m_bitDepth = param->internalBitDepth;
161
+    m_sourceWidth = param->sourceWidth;
162
+    m_sourceHeight = param->sourceHeight;
163
+    m_internalCsp = param->internalCsp;
164
+    m_numComponents = (m_internalCsp != X265_CSP_I400) ? MAX_NUM_COMPONENT : 1;
165
+
166
+    m_metld = new MotionEstimatorTLD;
167
+
168
+    predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
169
+}
170
+
171
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
172
+{
173
+    CHECKED_MALLOC_ZERO(refFrame->mvs, MV, sizeof(MV)* ((m_sourceWidth ) / 4) * ((m_sourceHeight ) / 4));
174
+    refFrame->mvsStride = m_sourceWidth / 4;
175
+    CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
176
+    refFrame->mvsStride0 = m_sourceWidth / 16;
177
+    CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
178
+    refFrame->mvsStride1 = m_sourceWidth / 16;
179
+    CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, sizeof(MV)* ((m_sourceWidth ) / 16)*((m_sourceHeight ) / 16));
180
+    refFrame->mvsStride2 = m_sourceWidth / 16;
181
+
182
+    CHECKED_MALLOC_ZERO(refFrame->noise, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
183
+    CHECKED_MALLOC_ZERO(refFrame->error, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
184
+
185
+    refFrame->slicetype = X265_TYPE_AUTO;
186
+
187
+    refFrame->compensatedPic = new PicYuv;
188
+    refFrame->compensatedPic->create(param, true);
189
+
190
+    return 1;
191
+fail:
192
+    return 0;
193
+}
194
+
195
+int TemporalFilter::motionErrorLumaSAD(
196
+    PicYuv *orig,
197
+    PicYuv *buffer,
198
+    int x,
199
+    int y,
200
+    int dx,
201
+    int dy,
202
+    int bs,
203
+    int besterror)
204
+{
205
+
206
+    pixel* origOrigin = orig->m_picOrg[0];
207
+    intptr_t origStride = orig->m_stride;
208
+    pixel *buffOrigin = buffer->m_picOrg[0];
209
+    intptr_t buffStride = buffer->m_stride;
210
+    int error = 0;// dx * 10 + dy * 10;
211
+    if (((dx | dy) & 0xF) == 0)
212
+    {
213
+        dx /= m_motionVectorFactor;
214
+        dy /= m_motionVectorFactor;
215
+
216
+        const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);
217
+#if 0
218
+        const pixel* origRowStart = origOrigin + y *origStride + x;
219
+
220
+        for (int y1 = 0; y1 < bs; y1++)
221
+        {
222
+            for (int x1 = 0; x1 < bs; x1++)
223
+            {
224
+                int diff = origRowStart[x1] - bufferRowStart[x1];
225
+                error += abs(diff);
226
+            }
227
+
228
+            origRowStart += origStride;
229
+            bufferRowStart += buffStride;
230
+        }
231
+#else
232
+        int partEnum = partitionFromSizes(bs, bs);
233
+        /* copy PU block into cache */
234
+        primitives.pu[partEnum].copy_pp(predPUYuv.m_buf[0], FENC_STRIDE, bufferRowStart, buffStride);
235
+
236
+        error = m_metld->me.bufSAD(predPUYuv.m_buf[0], FENC_STRIDE);
237
+#endif
238
+        if (error > besterror)
239
+        {
240
+            return error;
241
+        }
242
+    }
243
+    else
244
+    {
245
+        const int *xFilter = s_interpolationFilter[dx & 0xF];
246
+        const int *yFilter = s_interpolationFilter[dy & 0xF];
247
+        int tempArray[64 + 8][64];
248
+
249
+        int iSum, iBase;
250
+        for (int y1 = 1; y1 < bs + 7; y1++)
251
+        {
252
+            const int yOffset = y + y1 + (dy >> 4) - 3;
253
+            const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
254
+            for (int x1 = 0; x1 < bs; x1++)
255
+            {
256
+                iSum = 0;
257
+                iBase = x + x1 + (dx >> 4) - 3;
258
+                const pixel *rowStart = sourceRow + iBase;
259
+
260
+                iSum += xFilter[1] * rowStart[1];
261
+                iSum += xFilter[2] * rowStart[2];
262
+                iSum += xFilter[3] * rowStart[3];
263
+                iSum += xFilter[4] * rowStart[4];
264
+                iSum += xFilter[5] * rowStart[5];
265
+                iSum += xFilter[6] * rowStart[6];
266
+
267
+                tempArray[y1][x1] = iSum;
268
+            }
269
+        }
270
+
271
+        const pixel maxSampleValue = (1 << m_bitDepth) - 1;
272
+        for (int y1 = 0; y1 < bs; y1++)
273
+        {
274
+            const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
275
+            for (int x1 = 0; x1 < bs; x1++)
276
+            {
277
+                iSum = 0;
278
+                iSum += yFilter[1] * tempArray[y1 + 1][x1];
279
+                iSum += yFilter[2] * tempArray[y1 + 2][x1];
280
+                iSum += yFilter[3] * tempArray[y1 + 3][x1];
281
+                iSum += yFilter[4] * tempArray[y1 + 4][x1];
282
+                iSum += yFilter[5] * tempArray[y1 + 5][x1];
283
+                iSum += yFilter[6] * tempArray[y1 + 6][x1];
284
+
285
+                iSum = (iSum + (1 << 11)) >> 12;
286
+                iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);
287
+
288
+                error += abs(iSum - origRow[x + x1]);
289
+            }
290
+            if (error > besterror)
291
+            {
292
+                return error;
293
+            }
294
+        }
295
+    }
296
+    return error;
297
+}
298
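Both error functions share the fractional-pel path above: when the motion vector has a sub-16 remainder, a 7-tap horizontal pass followed by a 7-tap vertical pass from s_interpolationFilter builds the reference block, each pass adding 6 bits of scale, so the sum is renormalised with (iSum + (1 << 11)) >> 12 and clamped before the per-pixel difference is accumulated. A minimal standalone sketch of that rounding/clamping step (the helper name is illustrative):

    #include <algorithm>

    // Bring a 12-bit-scaled interpolation sum back to sample precision.
    static inline int roundAndClampSample(int iSum, int bitDepth)
    {
        const int maxSampleValue = (1 << bitDepth) - 1;   // e.g. 255 or 1023
        iSum = (iSum + (1 << 11)) >> 12;                  // two 6-bit filter passes
        return std::min(std::max(iSum, 0), maxSampleValue);
    }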
+
299
+int TemporalFilter::motionErrorLumaSSD(
300
+    PicYuv *orig,
301
+    PicYuv *buffer,
302
+    int x,
303
+    int y,
304
+    int dx,
305
+    int dy,
306
+    int bs,
307
+    int besterror)
308
+{
309
+
310
+    pixel* origOrigin = orig->m_picOrg[0];
311
+    intptr_t origStride = orig->m_stride;
312
+    pixel *buffOrigin = buffer->m_picOrg[0];
313
+    intptr_t buffStride = buffer->m_stride;
314
+    int error = 0;// dx * 10 + dy * 10;
315
+    if (((dx | dy) & 0xF) == 0)
316
+    {
317
+        dx /= m_motionVectorFactor;
318
+        dy /= m_motionVectorFactor;
319
+
320
+        const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);
321
+#if 0
322
+        const pixel* origRowStart = origOrigin + y * origStride + x;
323
+
324
+        for (int y1 = 0; y1 < bs; y1++)
325
+        {
326
+            for (int x1 = 0; x1 < bs; x1++)
327
+            {
328
+                int diff = origRowStart[x1] - bufferRowStart[x1];
329
+                error += diff * diff;
330
+            }
331
+
332
+            origRowStart += origStride;
333
+            bufferRowStart += buffStride;
334
+        }
335
+#else
336
+        int partEnum = partitionFromSizes(bs, bs);
337
+        /* copy PU block into cache */
338
+        primitives.pupartEnum.copy_pp(predPUYuv.m_buf0, FENC_STRIDE, bufferRowStart, buffStride);
339
+
340
+        error = (int)primitives.cu[partEnum].sse_pp(m_metld->me.fencPUYuv.m_buf[0], FENC_STRIDE, predPUYuv.m_buf[0], FENC_STRIDE);
341
+
342
+#endif
343
+        if (error > besterror)
344
+        {
345
+            return error;
346
+        }
347
+    }
348
+    else
349
+    {
350
+        const int *xFilter = s_interpolationFilter[dx & 0xF];
351
+        const int *yFilter = s_interpolationFilter[dy & 0xF];
352
+        int tempArray[64 + 8][64];
353
+
354
+        int iSum, iBase;
355
+        for (int y1 = 1; y1 < bs + 7; y1++)
356
+        {
357
+            const int yOffset = y + y1 + (dy >> 4) - 3;
358
+            const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
359
+            for (int x1 = 0; x1 < bs; x1++)
360
+            {
361
+                iSum = 0;
362
+                iBase = x + x1 + (dx >> 4) - 3;
363
+                const pixel *rowStart = sourceRow + iBase;
364
+
365
+                iSum += xFilter[1] * rowStart[1];
366
+                iSum += xFilter[2] * rowStart[2];
367
+                iSum += xFilter[3] * rowStart[3];
368
+                iSum += xFilter[4] * rowStart[4];
369
+                iSum += xFilter[5] * rowStart[5];
370
+                iSum += xFilter[6] * rowStart[6];
371
+
372
+                tempArray[y1][x1] = iSum;
373
+            }
374
+        }
375
+
376
+        const pixel maxSampleValue = (1 << m_bitDepth) - 1;
377
+        for (int y1 = 0; y1 < bs; y1++)
378
+        {
379
+            const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
380
+            for (int x1 = 0; x1 < bs; x1++)
381
+            {
382
+                iSum = 0;
383
+                iSum += yFilter[1] * tempArray[y1 + 1][x1];
384
+                iSum += yFilter[2] * tempArray[y1 + 2][x1];
385
+                iSum += yFilter[3] * tempArray[y1 + 3][x1];
386
+                iSum += yFilter[4] * tempArray[y1 + 4][x1];
387
+                iSum += yFilter[5] * tempArray[y1 + 5][x1];
388
+                iSum += yFilter[6] * tempArray[y1 + 6][x1];
389
+
390
+                iSum = (iSum + (1 << 11)) >> 12;
391
+                iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);
392
+
393
+                error += (iSum - origRow[x + x1]) * (iSum - origRow[x + x1]);
394
+            }
395
+            if (error > besterror)
396
+            {
397
+                return error;
398
+            }
399
+        }
400
+    }
401
+    return error;
402
+}
403
+
404
+void TemporalFilter::applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output)
405
+{
406
+    static const int lumaBlockSize = 8;
407
+    int srcStride = 0;
408
+    int dstStride = 0;
409
+    int csx = 0, csy = 0;
410
+    for (int c = 0; c < m_numComponents; c++)
411
+    {
412
+        const pixel maxValue = (1 << X265_DEPTH) - 1;
413
+
414
+        const pixel *pSrcImage = input->m_picOrg[c];
415
+        pixel *pDstImage = output->m_picOrg[c];
416
+
417
+        if (!c)
418
+        {
419
+            srcStride = (int)input->m_stride;
420
+            dstStride = (int)output->m_stride;
421
+        }
422
+        else
423
+        {
424
+            srcStride = (int)input->m_strideC;
425
+            dstStride = (int)output->m_strideC;
426
+            csx = CHROMA_H_SHIFT(m_internalCsp);
427
+            csy = CHROMA_V_SHIFT(m_internalCsp);
428
+        }
429
+        const int blockSizeX = lumaBlockSize >> csx;
430
+        const int blockSizeY = lumaBlockSize >> csy;
431
+        const int height = input->m_picHeight >> csy;
432
+        const int width = input->m_picWidth >> csx;
433
+
434
+        for (int y = 0, blockNumY = 0; y + blockSizeY <= height; y += blockSizeY, blockNumY++)
435
+        {
436
+            for (int x = 0, blockNumX = 0; x + blockSizeX <= width; x += blockSizeX, blockNumX++)
437
+            {
438
+                int mvIdx = blockNumY * mvsStride + blockNumX;
439
+                const MV &mv = mvs[mvIdx];
440
+                const int dx = mv.x >> csx;
441
+                const int dy = mv.y >> csy;
442
+                const int xInt = mv.x >> (4 + csx);
443
+                const int yInt = mv.y >> (4 + csy);
444
+
445
+                const int *xFilter = s_interpolationFilter[dx & 0xf];
446
+                const int *yFilter = s_interpolationFilter[dy & 0xf]; // will add 6 bit.
447
+                const int numFilterTaps = 7;
448
+                const int centreTapOffset = 3;
449
+
450
+                int tempArray[lumaBlockSize + numFilterTaps][lumaBlockSize];
451
+
452
+                for (int by = 1; by < blockSizeY + numFilterTaps; by++)
453
+                {
454
+                    const int yOffset = y + by + yInt - centreTapOffset;
455
+                    const pixel *sourceRow = pSrcImage + yOffset * srcStride;
456
+                    for (int bx = 0; bx < blockSizeX; bx++)
457
+                    {
458
+                        int iBase = x + bx + xInt - centreTapOffset;
459
+                        const pixel *rowStart = sourceRow + iBase;
460
+
461
+                        int iSum = 0;
462
+                        iSum += xFilter[1] * rowStart[1];
463
+                        iSum += xFilter[2] * rowStart[2];
464
+                        iSum += xFilter[3] * rowStart[3];
465
+                        iSum += xFilter[4] * rowStart[4];
466
+                        iSum += xFilter[5] * rowStart[5];
467
+                        iSum += xFilter[6] * rowStart[6];
468
+
469
+                        tempArray[by][bx] = iSum;
470
+                    }
471
+                }
472
+
473
+                pixel *pDstRow = pDstImage + y * dstStride;
474
+                for (int by = 0; by < blockSizeY; by++, pDstRow += dstStride)
475
+                {
476
+                    pixel *pDstPel = pDstRow + x;
477
+                    for (int bx = 0; bx < blockSizeX; bx++, pDstPel++)
478
+                    {
479
+                        int iSum = 0;
480
+
481
+                        iSum += yFilter[1] * tempArray[by + 1][bx];
482
+                        iSum += yFilter[2] * tempArray[by + 2][bx];
483
+                        iSum += yFilter[3] * tempArray[by + 3][bx];
484
+                        iSum += yFilter[4] * tempArray[by + 4][bx];
485
+                        iSum += yFilter[5] * tempArray[by + 5][bx];
486
+                        iSum += yFilter[6] * tempArray[by + 6][bx];
487
+
488
+                        iSum = (iSum + (1 << 11)) >> 12;
489
+                        iSum = iSum < 0 ? 0 : (iSum > maxValue ? maxValue : iSum);
490
+                        *pDstPel = (pixel)iSum;
491
+                    }
492
+                }
493
+            }
494
+        }
495
+    }
496
+}
497
+
498
+void TemporalFilter::bilateralFilter(Frame* frame,
499
+    TemporalFilterRefPicInfo* m_mcstfRefList,
500
+    double overallStrength)
501
+{
502
+
503
+    const int numRefs = frame->m_mcstf->m_numRef;
504
+
505
+    for (int i = 0; i < numRefs; i++)
506
+    {
507
+        TemporalFilterRefPicInfo *ref = &m_mcstfRefList[i];
508
+        applyMotion(m_mcstfRefList[i].mvs, m_mcstfRefList[i].mvsStride, m_mcstfRefList[i].picBuffer, ref->compensatedPic);
509
+    }
510
+
511
+    int refStrengthRow = 2;
512
+    if (numRefs == m_range * 2)
513
+    {
514
+        refStrengthRow = 0;
515
+    }
516
+    else if (numRefs == m_range)
517
+    {
518
+        refStrengthRow = 1;
519
+    }
520
+
521
+    const double lumaSigmaSq = (m_QP - m_sigmaZeroPoint) * (m_QP - m_sigmaZeroPoint) * m_sigmaMultiplier;
522
+    const double chromaSigmaSq = 30 * 30;
523
+
524
+    PicYuv* orgPic = frame->m_fencPic;
525
+
526
+    for (int c = 0; c < m_numComponents; c++)
527
+    {
528
+        int height, width;
529
+        pixel *srcPelRow = NULL;
530
+        intptr_t srcStride, correctedPicsStride = 0;
531
+
532
+        if (!c)
533
+        {
534
+            height = orgPic->m_picHeight;
535
+            width = orgPic->m_picWidth;
536
+            srcPelRow = orgPic->m_picOrg[c];
537
+            srcStride = orgPic->m_stride;
538
+        }
539
+        else
540
+        {
541
+            int csx = CHROMA_H_SHIFT(m_internalCsp);
542
+            int csy = CHROMA_V_SHIFT(m_internalCsp);
543
+
544
+            height = orgPic->m_picHeight >> csy;
545
+            width = orgPic->m_picWidth >> csx;
546
+            srcPelRow = orgPic->m_picOrg[c];
547
+            srcStride = (int)orgPic->m_strideC;
548
+        }
549
+
550
+        const double sigmaSq = (!c)  ? lumaSigmaSq : chromaSigmaSq;
551
+        const double weightScaling = overallStrength * ( (!c) ? 0.4 : m_chromaFactor);
552
+
553
+        const double maxSampleValue = (1 << m_bitDepth) - 1;
554
+        const double bitDepthDiffWeighting = 1024.0 / (maxSampleValue + 1);
555
+
556
+        const int blkSize = (!c) ? 8 : 4;
557
+
558
+        for (int y = 0; y < height; y++, srcPelRow += srcStride)
559
+        {
560
+            pixel *srcPel = srcPelRow;
561
+
562
+            for (int x = 0; x < width; x++, srcPel++)
563
+            {
564
+                const int orgVal = (int)*srcPel;
565
+                double temporalWeightSum = 1.0;
566
+                double newVal = (double)orgVal;
567
+
568
+                if ((y % blkSize == 0) && (x % blkSize == 0))
569
+                {
570
+                    for (int i = 0; i < numRefs; i++)
571
+                    {
572
+                        TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefList[i];
573
+
574
+                        if (!c)
575
+                            correctedPicsStride = refPicInfo->compensatedPic->m_stride;
576
+                        else
577
+                            correctedPicsStride = refPicInfo->compensatedPic->m_strideC;
578
+
579
+                        double variance = 0, diffsum = 0;
580
+                        for (int y1 = 0; y1 < blkSize - 1; y1++)
581
+                        {
582
+                            for (int x1 = 0; x1 < blkSize - 1; x1++)
583
+                            {
584
+                                int pix = *(srcPel + x1);
585
+                                int pixR = *(srcPel + x1 + 1);
586
+                                int pixD = *(srcPel + x1 + srcStride);
587
+
588
+                                int ref = *(refPicInfo->compensatedPic->m_picOrg[c] + ((y + y1) * correctedPicsStride + x + x1));
590
+                                int refR = *(refPicInfo->compensatedPic->m_picOrg[c] + ((y + y1) * correctedPicsStride + x + x1 + 1));
591
+                                int refD = *(refPicInfo->compensatedPic->m_picOrg[c] + ((y + y1 + 1) * correctedPicsStride + x + x1));
591
+
592
+                                int diff = pix - ref;
593
+                                int diffR = pixR - refR;
594
+                                int diffD = pixD - refD;
595
+
596
+                                variance += diff * diff;
597
+                                diffsum += (diffR - diff) * (diffR - diff);
598
+                                diffsum += (diffD - diff) * (diffD - diff);
599
+                            }
600
+                        }
601
+
602
+                        refPicInfo->noise[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)] = (int)round((300 * variance + 50) / (10 * diffsum + 50));
603
+                    }
604
+                }
605
+
606
+                double minError = 9999999;
607
+                for (int i = 0; i < numRefs; i++)
608
+                {
609
+                    TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefList[i];
610
+                    minError = X265_MIN(minError, (double)refPicInfo->error[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)]);
611
+                }
612
+
613
+                for (int i = 0; i < numRefs; i++)
614
+                {
615
+                    TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefList[i];
616
+
617
+                    const int error = refPicInfo->error[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)];
618
+                    const int noise = refPicInfo->noise[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)];
619
+
620
+                    const pixel *pCorrectedPelPtr = refPicInfo->compensatedPic->m_picOrg[c] + (y * correctedPicsStride + x);
621
+                    const int refVal = (int)*pCorrectedPelPtr;
622
+                    double diff = (double)(refVal - orgVal);
623
+                    diff *= bitDepthDiffWeighting;
624
+                    double diffSq = diff * diff;
625
+
626
+                    const int index = X265_MIN(3, std::abs(refPicInfo->origOffset) - 1);
627
+                    double ww = 1, sw = 1;
628
+                    ww *= (noise < 25) ? 1 : 1.2;
629
+                    sw *= (noise < 25) ? 1.3 : 0.8;
630
+                    ww *= (error < 50) ? 1.2 : ((error > 100) ? 0.8 : 1);
631
+                    sw *= (error < 50) ? 1.3 : 1;
632
+                    ww *= ((minError + 1) / (error + 1));
633
+                    const double weight = weightScaling * s_refStrengths[refStrengthRow][index] * ww * exp(-diffSq / (2 * sw * sigmaSq));
634
+
635
+                    newVal += weight * refVal;
636
+                    temporalWeightSum += weight;
637
+                }
638
+                newVal /= temporalWeightSum;
639
+                double sampleVal = round(newVal);
640
+                sampleVal = (sampleVal < 0 ? 0 : (sampleVal > maxSampleValue ? maxSampleValue : sampleVal));
641
+                *srcPel = (pixel)sampleVal;
642
+            }
643
+        }
644
+    }
645
+}
646
+
647
+void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
648
+    MV *previous, uint32_t prevMvStride, int factor)
649
+{
650
+
651
+    int range = 5;
652
+
653
+
654
+    const int stepSize = blockSize;
655
+
656
+    const int origWidth = orig->m_picWidth;
657
+    const int origHeight = orig->m_picHeight;
658
+
659
+    int error;
660
+
661
+    for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)
662
+    {
663
+        for (int blockX = 0; blockX + blockSize <= origWidth; blockX += stepSize)
664
+        {
665
+            const intptr_t pelOffset = blockY * orig->m_stride + blockX;
666
+            m_metld->me.setSourcePU(orig->m_picOrg[0], orig->m_stride, pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
667
+
668
+
669
+            MV best(0, 0);
670
+            int leastError = INT_MAX;
671
+
672
+            if (previous == NULL)
673
+            {
674
+                range = 8;
675
+            }
676
+            else
677
+            {
678
+
679
+                for (int py = -1; py <= 1; py++)
680
+                {
681
+                    int testy = blockY / (2 * blockSize) + py;
682
+
683
+                    for (int px = -1; px <= 1; px++)
684
+                    {
685
+
686
+                        int testx = blockX / (2 * blockSize) + px;
687
+                        if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize)))
688
+                        {
689
+                            int mvIdx = testy * prevMvStride + testx;
690
+                            MV old = previous[mvIdx];
691
+
692
+                            if (m_useSADinME)
693
+                                error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
694
+                            else
695
+                                error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
696
+
697
+                            if (error < leastError)
698
+                            {
699
+                                best.set(old.x * factor, old.y * factor);
700
+                                leastError = error;
701
+                            }
702
+                        }
703
+                    }
704
+                }
705
+
706
+                if (m_useSADinME)
707
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
708
+                else
709
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
710
+
711
+                if (error < leastError)
712
+                {
713
+                    best.set(0, 0);
714
+                    leastError = error;
715
+                }
716
+
717
+            }
718
+
719
+            MV prevBest = best;
720
+            for (int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2++)
721
+            {
722
+                for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
723
+                {
724
+                    if (m_useSADinME)
725
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
726
+                    else
727
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
728
+                    if (error < leastError)
729
+                    {
730
+                        best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);
731
+                        leastError = error;
732
+                    }
733
+                }
734
+            }
735
+
736
+            if (blockY > 0)
737
+            {
738
+                int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);
739
+                MV aboveMV = mvs[idx];
740
+
741
+                if (m_useSADinME)
742
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
743
+                else
744
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
745
+
746
+                if (error < leastError)
747
+                {
748
+                    best.set(aboveMV.x, aboveMV.y);
749
+                    leastError = error;
750
+                }
751
+            }
752
+
753
+            if (blockX > 0)
754
+            {
755
+                int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);
756
+                MV leftMV = mvs[idx];
757
+
758
+                if (m_useSADinME)
759
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
760
+                else
761
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
762
+
763
+                if (error < leastError)
764
+                {
765
+                    best.set(leftMV.x, leftMV.y);
766
+                    leastError = error;
767
+                }
768
+            }
769
+
770
+            // calculate average
771
+            double avg = 0.0;
772
+            for (int x1 = 0; x1 < blockSize; x1++)
773
+            {
774
+                for (int y1 = 0; y1 < blockSize; y1++)
775
+                {
776
+                    avg = avg + *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
777
+                }
778
+            }
779
+            avg = avg / (blockSize * blockSize);
780
+
781
+            // calculate variance
782
+            double variance = 0;
783
+            for (int x1 = 0; x1 < blockSize; x1++)
784
+            {
785
+                for (int y1 = 0; y1 < blockSize; y1++)
786
+                {
787
+                    int pix = *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
788
+                    variance = variance + (pix - avg) * (pix - avg);
789
+                }
790
+            }
791
+
792
+            leastError = (int)(20 * ((leastError + 5.0) / (variance + 5.0)) + (leastError / (blockSize * blockSize)) / 50);
793
+
794
+            int mvIdx = (blockY / stepSize) * mvStride + (blockX / stepSize);
795
+            mvs[mvIdx] = best;
796
+        }
797
+    }
798
+}
799
+
800
+
801
+void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
802
+    MV *previous, uint32_t prevMvStride, int factor, int* minError)
803
+{
804
+
805
+    int range = 0;
806
+
807
+
808
+    const int stepSize = blockSize;
809
+
810
+    const int origWidth = orig->m_picWidth;
811
+    const int origHeight = orig->m_picHeight;
812
+
813
+    int error;
814
+
815
+    for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)
816
+    {
817
+        for (int blockX = 0; blockX + blockSize <= origWidth; blockX += stepSize)
818
+        {
819
+
820
+            const intptr_t pelOffset = blockY * orig->m_stride + blockX;
821
+            m_metld->me.setSourcePU(orig->m_picOrg[0], orig->m_stride, pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
822
+
823
+            MV best(0, 0);
824
+            int leastError = INT_MAX;
825
+
826
+            if (previous == NULL)
827
+            {
828
+                range = 8;
829
+            }
830
+            else
831
+            {
832
+
833
+                for (int py = -1; py <= 1; py++)
834
+                {
835
+                    int testy = blockY / (2 * blockSize) + py;
836
+
837
+                    for (int px = -1; px <= 1; px++)
838
+                    {
839
+
840
+                        int testx = blockX / (2 * blockSize) + px;
841
+                        if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize)))
842
+                        {
843
+                            int mvIdx = testy * prevMvStride + testx;
844
+                            MV old = previous[mvIdx];
845
+
846
+                            if (m_useSADinME)
847
+                                error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
848
+                            else
849
+                                error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
850
+
851
+                            if (error < leastError)
852
+                            {
853
+                                best.set(old.x * factor, old.y * factor);
854
+                                leastError = error;
855
+                            }
856
+                        }
857
+                    }
858
+                }
859
+
860
+                if (m_useSADinME)
861
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
862
+                else
863
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
864
+
865
+                if (error < leastError)
866
+                {
867
+                    best.set(0, 0);
868
+                    leastError = error;
869
+                }
870
+
871
+            }
872
+
873
+            MV prevBest = best;
874
+            for (int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2++)
875
+            {
876
+                for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
877
+                {
878
+                    if (m_useSADinME)
879
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
880
+                    else
881
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
882
+
883
+                    if (error < leastError)
884
+                    {
885
+                        best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);
886
+                        leastError = error;
887
+                    }
888
+                }
889
+            }
890
+
891
+            prevBest = best;
892
+            int doubleRange = 3 * 4;
893
+            for (int y2 = prevBest.y - doubleRange; y2 <= prevBest.y + doubleRange; y2 += 4)
894
+            {
895
+                for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2 += 4)
896
+                {
897
+                    if (m_useSADinME)
898
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
899
+                    else
900
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
901
+
902
+                    if (error < leastError)
903
+                    {
904
+                        best.set(x2, y2);
905
+                        leastError = error;
906
+                    }
907
+                }
908
+            }
909
+
910
+            prevBest = best;
911
+            doubleRange = 3;
912
+            for (int y2 = prevBest.y - doubleRange; y2 <= prevBest.y + doubleRange; y2++)
913
+            {
914
+                for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2++)
915
+                {
916
+                    if (m_useSADinME)
917
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
918
+                    else
919
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
920
+
921
+                    if (error < leastError)
922
+                    {
923
+                        best.set(x2, y2);
924
+                        leastError = error;
925
+                    }
926
+                }
927
+            }
928
+
929
+
930
+            if (blockY > 0)
931
+            {
932
+                int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);
933
+                MV aboveMV = mvs[idx];
934
+
935
+                if (m_useSADinME)
936
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
937
+                else
938
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
939
+
940
+                if (error < leastError)
941
+                {
942
+                    best.set(aboveMV.x, aboveMV.y);
943
+                    leastError = error;
944
+                }
945
+            }
946
+
947
+            if (blockX > 0)
948
+            {
949
+                int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);
950
+                MV leftMV = mvs[idx];
951
+
952
+                if (m_useSADinME)
953
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
954
+                else
955
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
956
+
957
+                if (error < leastError)
958
+                {
959
+                    best.set(leftMV.x, leftMV.y);
960
+                    leastError = error;
961
+                }
962
+            }
963
+
964
+            // calculate average
965
+            double avg = 0.0;
966
+            for (int x1 = 0; x1 < blockSize; x1++)
967
+            {
968
+                for (int y1 = 0; y1 < blockSize; y1++)
969
+                {
970
+                    avg = avg + *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
971
+                }
972
+            }
973
+            avg = avg / (blockSize * blockSize);
974
+
975
+            // calculate variance
976
+            double variance = 0;
977
+            for (int x1 = 0; x1 < blockSize; x1++)
978
+            {
979
+                for (int y1 = 0; y1 < blockSize; y1++)
980
+                {
981
+                    int pix = *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
982
+                    variance = variance + (pix - avg) * (pix - avg);
983
+                }
984
+            }
985
+
986
+            leastError = (int)(20 * ((leastError + 5.0) / (variance + 5.0)) + (leastError / (blockSize * blockSize)) / 50);
987
+
988
+            int mvIdx = (blockY / stepSize) * mvStride + (blockX / stepSize);
989
+            mvs[mvIdx] = best;
989
+            minError[mvIdx] = leastError;
991
+        }
992
+    }
993
+}
994
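Both search routines finish a block by normalising the best error against the block's activity before it is used (only the double-resolution variant stores the value, in minError):

    leastError = 20 * (leastError + 5.0) / (variance + 5.0) + (leastError / (blockSize * blockSize)) / 50

so a given residual counts for more on flat blocks than on textured ones; bilateralFilter later turns this per-block error, together with the noise estimate, into its ww/sw weight adjustments.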
+
995
+void TemporalFilter::destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame)
996
+{
997
+    if (curFrame)
998
+    {
999
+        if (curFrame->compensatedPic)
1000
+        {
1001
+            curFrame->compensatedPic->destroy();
1002
+            delete curFrame->compensatedPic;
1003
+        }
1004
+
1005
+        if (curFrame->mvs)
1006
+            X265_FREE(curFrame->mvs);
1007
+        if (curFrame->mvs0)
1008
+            X265_FREE(curFrame->mvs0);
1009
+        if (curFrame->mvs1)
1010
+            X265_FREE(curFrame->mvs1);
1011
+        if (curFrame->mvs2)
1012
+            X265_FREE(curFrame->mvs2);
1013
+        if (curFrame->noise)
1014
+            X265_FREE(curFrame->noise);
1015
+        if (curFrame->error)
1016
+            X265_FREE(curFrame->error);
1017
+    }
1018
+}
1019
x265_3.6.tar.gz/source/common/temporalfilter.h Added
187
 
1
@@ -0,0 +1,185 @@
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+
25
+#ifndef X265_TEMPORAL_FILTER_H
26
+#define X265_TEMPORAL_FILTER_H
27
+
28
+#include "x265.h"
29
+#include "picyuv.h"
30
+#include "mv.h"
31
+#include "piclist.h"
32
+#include "yuv.h"
33
+#include "motion.h"
34
+
35
+const int s_interpolationFilter[16][8] =
36
+{
37
+    {   0,   0,   0,  64,   0,   0,   0,   0 },   //0
38
+    {   0,   1,  -3,  64,   4,  -2,   0,   0 },   //1 -->-->
39
+    {   0,   1,  -6,  62,   9,  -3,   1,   0 },   //2 -->
40
+    {   0,   2,  -8,  60,  14,  -5,   1,   0 },   //3 -->-->
41
+    {   0,   2,  -9,  57,  19,  -7,   2,   0 },   //4
42
+    {   0,   3, -10,  53,  24,  -8,   2,   0 },   //5 -->-->
43
+    {   0,   3, -11,  50,  29,  -9,   2,   0 },   //6 -->
44
+    {   0,   3, -11,  44,  35, -10,   3,   0 },   //7 -->-->
45
+    {   0,   1,  -7,  38,  38,  -7,   1,   0 },   //8
46
+    {   0,   3, -10,  35,  44, -11,   3,   0 },   //9 -->-->
47
+    {   0,   2,  -9,  29,  50, -11,   3,   0 },   //10-->
48
+    {   0,   2,  -8,  24,  53, -10,   3,   0 },   //11-->-->
49
+    {   0,   2,  -7,  19,  57,  -9,   2,   0 },   //12
50
+    {   0,   1,  -5,  14,  60,  -8,   2,   0 },   //13-->-->
51
+    {   0,   1,  -3,   9,  62,  -6,   1,   0 },   //14-->
52
+    {   0,   0,  -2,   4,  64,  -3,   1,   0 }    //15-->-->
53
+};
54
+
55
+const double s_refStrengths[3][4] =
56
+{ // abs(POC offset)
57
+  //  1,    2     3     4
58
+  {0.85, 0.57, 0.41, 0.33},  // m_range * 2
59
+  {1.13, 0.97, 0.81, 0.57},  // m_range
60
+  {0.30, 0.30, 0.30, 0.30}   // otherwise
61
+};
62
+
63
+namespace X265_NS {
64
+    class OrigPicBuffer
65
+    {
66
+    public:
67
+        PicList    m_mcstfPicList;
68
+        PicList    m_mcstfOrigPicFreeList;
69
+        PicList    m_mcstfOrigPicList;
70
+
71
+        ~OrigPicBuffer();
72
+        void addPicture(Frame*);
73
+        void addEncPicture(Frame*);
74
+        void setOrigPicList(Frame*, int);
75
+        void recycleOrigPicList();
76
+        void addPictureToFreelist(Frame*);
77
+        void addEncPictureToPicList(Frame*);
78
+    };
79
+
80
+    struct MotionEstimatorTLD
81
+    {
82
+        MotionEstimate  me;
83
+
84
+        MotionEstimatorTLD()
85
+        {
86
+            me.init(X265_CSP_I400);
87
+            me.setQP(X265_LOOKAHEAD_QP);
88
+        }
89
+
90
+        ~MotionEstimatorTLD() {}
91
+    };
92
+
93
+    struct TemporalFilterRefPicInfo
94
+    {
95
+        PicYuv*    picBuffer;
96
+        PicYuv*    picBufferSubSampled2;
97
+        PicYuv*    picBufferSubSampled4;
98
+        MV*        mvs;
99
+        MV*        mvs0;
100
+        MV*        mvs1;
101
+        MV*        mvs2;
102
+        uint32_t   mvsStride;
103
+        uint32_t   mvsStride0;
104
+        uint32_t   mvsStride1;
105
+        uint32_t   mvsStride2;
106
+        int*       error;
107
+        int*       noise;
108
+
109
+        int16_t    origOffset;
110
+        bool       isFilteredFrame;
111
+        PicYuv*    compensatedPic;
112
+
113
+        int*       isSubsampled;
114
+
115
+        int        slicetype;
116
+    };
117
+
118
+    class TemporalFilter
119
+    {
120
+    public:
121
+        TemporalFilter();
122
+        ~TemporalFilter() {}
123
+
124
+        void init(const x265_param* param);
125
+
126
+        //private:
127
+            // Private static member variables
128
+        const x265_param *m_param;
129
+        int32_t  m_bitDepth;
130
+        int m_range;
131
+        uint8_t m_numRef;
132
+        double m_chromaFactor;
133
+        double m_sigmaMultiplier;
134
+        double m_sigmaZeroPoint;
135
+        int m_motionVectorFactor;
136
+        int m_padding;
137
+
138
+        // Private member variables
139
+
140
+        int m_sourceWidth;
141
+        int m_sourceHeight;
142
+        int m_QP;
143
+
144
+        int m_internalCsp;
145
+        int m_numComponents;
146
+        uint8_t m_sliceTypeConfig;
147
+
148
+        MotionEstimatorTLD* m_metld;
149
+        Yuv  predPUYuv;
150
+        int m_useSADinME;
151
+
152
+        int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
153
+
154
+        void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
155
+
156
+        void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
157
+            MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
158
+
159
+        void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
160
+            MV *previous, uint32_t prevMvStride, int factor, int* minError);
161
+
162
+        int motionErrorLumaSSD(PicYuv *orig,
163
+            PicYuv *buffer,
164
+            int x,
165
+            int y,
166
+            int dx,
167
+            int dy,
168
+            int bs,
169
+            int besterror = 8 * 8 * 1024 * 1024);
170
+
171
+        int motionErrorLumaSAD(PicYuv *orig,
172
+            PicYuv *buffer,
173
+            int x,
174
+            int y,
175
+            int dx,
176
+            int dy,
177
+            int bs,
178
+            int besterror = 8 * 8 * 1024 * 1024);
179
+
180
+        void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
181
+
182
+        void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
183
+
184
+    };
185
+}
186
+#endif
187
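For orientation, the encoder is expected to drive this interface per filtered frame roughly as below: a coarse-to-fine luma motion search over subsampled copies of the original pictures, a final half-block-size pass that also fills the per-block error map, then the bilateral blend. This is a simplified sketch, not the upstream call site (which lives in the encoder code outside this hunk); the subsampled planes of the current picture are passed in as hypothetical parameters, and the block sizes per level are illustrative.

    #include "temporalfilter.h"
    using namespace X265_NS;

    void filterFrameSketch(TemporalFilter &tf, Frame *cur,
                           PicYuv *curHalf, PicYuv *curQuarter,   // assumed pre-subsampled copies
                           TemporalFilterRefPicInfo *refs, int numRefs, double strength)
    {
        for (int i = 0; i < numRefs; i++)
        {
            TemporalFilterRefPicInfo *ref = &refs[i];
            // 1/4-res, 1/2-res, full-res searches, each level seeding the next
            tf.motionEstimationLuma(ref->mvs0, ref->mvsStride0, curQuarter, ref->picBufferSubSampled4, 16);
            tf.motionEstimationLuma(ref->mvs1, ref->mvsStride1, curHalf, ref->picBufferSubSampled2, 16, ref->mvs0, ref->mvsStride0, 2);
            tf.motionEstimationLuma(ref->mvs2, ref->mvsStride2, cur->m_fencPic, ref->picBuffer, 16, ref->mvs1, ref->mvsStride1, 2);
            // final pass at 8x8 granularity also records the per-block error
            tf.motionEstimationLumaDoubleRes(ref->mvs, ref->mvsStride, cur->m_fencPic, ref->picBuffer, 8, ref->mvs2, ref->mvsStride2, 2, ref->error);
        }
        tf.bilateralFilter(cur, refs, strength);
    }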
x265_3.5.tar.gz/source/common/threading.h -> x265_3.6.tar.gz/source/common/threading.h Changed
340
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Steve Borho <steve@borho.org>
4
  *          Min Chen <chenm003@163.com>
5
+            liwei <liwei@multicorewareinc.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -253,6 +254,47 @@
10
     int                m_val;
11
 };
12
 
13
+class NamedSemaphore
14
+{
15
+public:
16
+    NamedSemaphore() : m_sem(NULL)
17
+    {
18
+    }
19
+
20
+    ~NamedSemaphore()
21
+    {
22
+    }
23
+
24
+    bool create(const char* name, const int initcnt, const int maxcnt)
25
+    {
26
+        if(!m_sem)
27
+        {
28
+            m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
29
+        }
30
+        return m_sem != NULL;
31
+    }
32
+
33
+    bool give(const int32_t cnt)
34
+    {
35
+        return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
36
+    }
37
+
38
+    bool take(const uint32_t time_out = INFINITE)
39
+    {
40
+        int32_t rt = WaitForSingleObject(m_sem, time_out);
41
+        return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
42
+    }
43
+
44
+    void release()
45
+    {
46
+        CloseHandle(m_sem);
47
+        m_sem = NULL;
48
+    }
49
+
50
+private:
51
+    HANDLE m_sem;
52
+};
53
+
54
 #else /* POSIX / pthreads */
55
 
56
 typedef pthread_t ThreadHandle;
57
@@ -459,6 +501,282 @@
58
     int             m_val;
59
 };
60
 
61
+#define TIMEOUT_INFINITE 0xFFFFFFFF
62
+
63
+class NamedSemaphore
64
+{
65
+public:
66
+    NamedSemaphore() 
67
+        : m_sem(NULL)
68
+#ifndef __APPLE__
69
+        , m_name(NULL)
70
+#endif //__APPLE__
71
+    {
72
+    }
73
+
74
+    ~NamedSemaphore()
75
+    {
76
+    }
77
+
78
+    bool create(const char* name, const int initcnt, const int maxcnt)
79
+    {
80
+        bool ret = false;
81
+
82
+        if (initcnt >= maxcnt)
83
+        {
84
+            return false;
85
+        }
86
+
87
+#ifdef __APPLE__
88
+        do
89
+        {
90
+            int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
91
+
92
+            m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
93
+            if (!m_sem)
94
+            {
95
+                break;
96
+            }
97
+
98
+            if (pthread_mutexattr_init(&m_sem->mutexAttr))
99
+            {
100
+                break;
101
+            }
102
+
103
+            if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
104
+            {
105
+                break;
106
+            }
107
+
108
+            if (pthread_condattr_init(&m_sem->condAttr))
109
+            {
110
+                break;
111
+            }
112
+
113
+            if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
114
+            {
115
+                break;
116
+            }
117
+
118
+            if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
119
+            {
120
+                break;
121
+            }
122
+
123
+            if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
124
+            {
125
+                break;
126
+            }
127
+
128
+            m_sem->curCnt = initcnt;
129
+            m_sem->maxCnt = maxcnt;
130
+
131
+            ret = true;
132
+        } while (0);
133
+        
134
+        if (!ret)
135
+        {
136
+            release();
137
+        }
138
+
139
+#else  //__APPLE__
140
+        m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
141
+        if (m_sem != SEM_FAILED) 
142
+        {
143
+            m_name = strdup(name);
144
+            ret = true;
145
+        }
146
+        else 
147
+        {
148
+            if (EEXIST == errno) 
149
+            {
150
+                m_sem = sem_open(name, 0);
151
+                if (m_sem != SEM_FAILED) 
152
+                {
153
+                    m_name = strdup(name);
154
+                    ret = true;
155
+                }
156
+            }
157
+        }
158
+#endif //__APPLE__
159
+
160
+        return ret;
161
+    }
162
+
163
+    bool give(const int32_t cnt)
164
+    {
165
+        if (!m_sem)
166
+        {
167
+            return false;
168
+        }
169
+
170
+#ifdef __APPLE__
171
+        if (pthread_mutex_lock(&m_sem->mutex))
172
+        {
173
+            return false;
174
+        }
175
+
176
+        int oldCnt = m_sem->curCnt;
177
+        m_sem->curCnt += cnt;
178
+        if (m_sem->curCnt > m_sem->maxCnt)
179
+        {
180
+            m_sem->curCnt = m_sem->maxCnt;
181
+        }
182
+
183
+        bool ret = true;
184
+        if (!oldCnt)
185
+        {
186
+            ret = 0 == pthread_cond_broadcast(&m_sem->cond);
187
+        }
188
+
189
+        if (pthread_mutex_unlock(&m_sem->mutex))
190
+        {
191
+            return false;
192
+        }
193
+
194
+        return ret;
195
+#else //__APPLE__
196
+        int ret = 0;
197
+        int32_t curCnt = cnt;
198
+        while (curCnt-- && !ret) {
199
+            ret = sem_post(m_sem);
200
+        }
201
+
202
+        return 0 == ret;
203
+#endif //_APPLE__
204
+    }
205
+
206
+    bool take(const uint32_t time_out = TIMEOUT_INFINITE)
207
+    {
208
+        if (!m_sem)
209
+        {
210
+            return false;
211
+        }
212
+
213
+#ifdef __APPLE__
214
+
215
+        if (pthread_mutex_lock(&m_sem->mutex))
216
+        {
217
+            return false;
218
+        }
219
+
220
+        bool ret = true;
221
+        if (TIMEOUT_INFINITE == time_out) 
222
+        {
223
+            if (!m_sem->curCnt)
224
+            {
225
+                if (pthread_cond_wait(&m_sem->cond, &m_sem->mutex))
226
+                {
227
+                    ret = false;
228
+                } 
229
+            }
230
+
231
+            if (m_sem->curCnt && ret)
232
+            {
233
+                m_sem->curCnt--;
234
+            }
235
+        }
236
+        else
237
+        {
238
+            if (0 == time_out)
239
+            {
240
+                if (m_sem->curCnt)
241
+                {
242
+                    m_sem->curCnt--;
243
+                }
244
+                else
245
+                {
246
+                    ret = false;
247
+                }
248
+            }
249
+            else
250
+            {
251
+                if (!m_sem->curCnt)
252
+                {
253
+                    struct timespec ts;
254
+                    ts.tv_sec = time_out / 1000L;
255
+                    ts.tv_nsec = (time_out * 1000000L) - ts.tv_sec * 1000 * 1000 * 1000;
256
+
257
+                    if (pthread_cond_timedwait(&m_sem->cond, &m_sem->mutex, &ts))
258
+                    {
259
+                        ret = false;
260
+                    }
261
+                }
262
+
263
+                if (m_sem->curCnt && ret)
264
+                {
265
+                    m_sem->curCnt--;
266
+                }
267
+            }
268
+        }
269
+
270
+        if (pthread_mutex_unlock(&m_sem->mutex))
271
+        {
272
+            return false;
273
+        }
274
+
275
+        return ret;
276
+#else //__APPLE__
277
+        if (TIMEOUT_INFINITE == time_out) 
278
+        {
279
+            return 0 == sem_wait(m_sem);
280
+        }
281
+        else 
282
+        {
283
+            if (0 == time_out)
284
+            {
285
+                return 0 == sem_trywait(m_sem);
286
+            }
287
+            else
288
+            {
289
+                struct timespec ts;
290
+                ts.tv_sec = time_out / 1000L;
291
+                ts.tv_nsec = (time_out * 1000000L) - ts.tv_sec * 1000 * 1000 * 1000;
292
+                return 0 == sem_timedwait(m_sem, &ts);
293
+            }
294
+        }
295
+#endif //_APPLE__
296
+    }
297
+
298
+    void release()
299
+    {
300
+        if (m_sem)
301
+        {
302
+#ifdef __APPLE__
303
+            pthread_condattr_destroy(&m_sem->condAttr);
304
+            pthread_mutexattr_destroy(&m_sem->mutexAttr);
305
+            pthread_mutex_destroy(&m_sem->mutex);
306
+            pthread_cond_destroy(&m_sem->cond);
307
+            free(m_sem);
308
+            m_sem = NULL;
309
+#else //__APPLE__
310
+            sem_close(m_sem);
311
+            sem_unlink(m_name);
312
+            m_sem = NULL;
313
+            free(m_name);
314
+            m_name = NULL;
315
+#endif //__APPLE__
316
+        }
317
+    }
318
+
319
+private:
320
+#ifdef __APPLE__
321
+    typedef struct
322
+    {
323
+        pthread_mutex_t     mutex;
324
+        pthread_cond_t      cond;
325
+        pthread_mutexattr_t mutexAttr;
326
+        pthread_condattr_t  condAttr;
327
+        uint32_t            curCnt;
328
+        uint32_t            maxCnt;
329
+    }mac_sem_t;
330
+    mac_sem_t *m_sem;
331
+#else // __APPLE__
332
+    sem_t *m_sem;
333
+    char  *m_name;
334
+#endif // __APPLE_
335
+};
336
+
337
 #endif // ifdef _WIN32
338
 
339
 class ScopedLock
340
x265_3.5.tar.gz/source/common/threadpool.cpp -> x265_3.6.tar.gz/source/common/threadpool.cpp Changed
10
 
1
@@ -301,7 +301,7 @@
2
     /* limit threads based on param->numaPools
3
      * For windows because threads can't be allocated to live across sockets
4
      * changing the default behavior to be per-socket pools -- FIXME */
5
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
7
     if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
8
     {
9
          char poolString[50] = "";
10
x265_3.5.tar.gz/source/common/version.cpp -> x265_3.6.tar.gz/source/common/version.cpp Changed
10
 
1
@@ -71,7 +71,7 @@
2
 #define ONOS    "Unk-OS"
3
 #endif
4
 
5
-#if X86_64
6
+#if defined(_LP64) || defined(_WIN64)
7
 #define BITS    "64 bit"
8
 #else
9
 #define BITS    "32 bit"
10
x265_3.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/x86/asm-primitives.cpp Changed
85
 
1
@@ -1091,6 +1091,7 @@
2
 
3
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
4
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
5
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
6
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it 
7
         //p.planecopy_sp = PFX(downShift_16_sse2);
8
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
9
@@ -1121,6 +1122,7 @@
10
     {
11
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
12
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
13
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
14
 
15
        // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
16
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
17
@@ -1462,6 +1464,7 @@
18
        p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
19
        p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
20
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
21
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
22
     }
23
     if (cpuMask & X265_CPU_XOP)
24
     {
25
@@ -1473,6 +1476,7 @@
26
         LUMA_VAR(xop);
27
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
28
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
29
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
30
     }
31
     if (cpuMask & X265_CPU_AVX2)
32
     {
33
@@ -2301,6 +2305,9 @@
34
 
35
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
36
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
37
+
38
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
39
+
40
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
41
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
42
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
43
@@ -3300,6 +3307,7 @@
44
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
45
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
46
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
47
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
48
 
49
        ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
50
        ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
51
@@ -3424,6 +3432,8 @@
52
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
53
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
54
 
55
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
56
+
57
        ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
58
        ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
59
        ASSIGN2(p.pu[LUMA_8x16].convert_p2s, filterPixelToShort_8x16_ssse3);
60
@@ -3691,6 +3701,7 @@
61
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
62
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
63
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
64
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
65
     }
66
     if (cpuMask & X265_CPU_XOP)
67
     {
68
@@ -3702,6 +3713,7 @@
69
        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
70
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
71
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
72
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
73
 
74
     }
75
 #if X86_64
76
@@ -4684,6 +4696,8 @@
77
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
78
         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
79
 
80
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
81
+
82
         if (cpuMask & X265_CPU_BMI2)
83
         {
84
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
85
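Each hunk above registers the new frameSubSampleLuma entry point at a progressively higher SIMD level, so the most capable branch matching cpuMask ends up owning the slot. A reduced sketch of that dispatch pattern (the struct and setup function are illustrative, not x265's real EncoderPrimitives):

#include <cstdint>
#include <cstdio>

typedef void (*subsample_t)(const uint8_t*, uint8_t*, intptr_t, intptr_t, int, int);

static void subsample_c(const uint8_t*, uint8_t*, intptr_t, intptr_t, int, int)    { std::puts("C"); }
static void subsample_sse2(const uint8_t*, uint8_t*, intptr_t, intptr_t, int, int) { std::puts("SSE2"); }
static void subsample_avx2(const uint8_t*, uint8_t*, intptr_t, intptr_t, int, int) { std::puts("AVX2"); }

enum { CPU_SSE2 = 1 << 0, CPU_AVX2 = 1 << 1 };

struct Primitives { subsample_t frameSubSampleLuma; };

void setupPrimitives(Primitives& p, uint32_t cpuMask)
{
    p.frameSubSampleLuma = subsample_c;          // portable fallback
    if (cpuMask & CPU_SSE2)
        p.frameSubSampleLuma = subsample_sse2;   // overwritten when supported
    if (cpuMask & CPU_AVX2)
        p.frameSubSampleLuma = subsample_avx2;   // highest matching level wins
}

int main()
{
    Primitives p;
    setupPrimitives(p, CPU_SSE2 | CPU_AVX2);
    p.frameSubSampleLuma(nullptr, nullptr, 0, 0, 0, 0);   // prints "AVX2"
    return 0;
}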
x265_3.5.tar.gz/source/common/x86/const-a.asm -> x265_3.6.tar.gz/source/common/x86/const-a.asm Changed
10
 
1
@@ -100,7 +100,7 @@
2
 const pw_2000,              times 16 dw 0x2000
3
 const pw_8000,              times  8 dw 0x8000
4
 const pw_3fff,              times 16 dw 0x3fff
5
-const pw_32_0,              times  4 dw 32,
6
+const pw_32_0,              times  4 dw 32
7
                             times  4 dw 0
8
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
9
 
10
x265_3.5.tar.gz/source/common/x86/h-ipfilter8.asm -> x265_3.6.tar.gz/source/common/x86/h-ipfilter8.asm Changed
20
 
1
@@ -125,6 +125,9 @@
2
 ALIGN 32
3
 interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
4
 
5
+ALIGN 32
6
+const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
7
+
8
 SECTION .text
9
 
10
 cextern pw_1
11
@@ -1459,8 +1462,6 @@
12
 
13
     RET
14
 
15
-ALIGN 32
16
-const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
17
 
18
 %macro FILTER_H4_w6 3
19
     movu        %1, srcq - 1
20
x265_3.5.tar.gz/source/common/x86/mc-a2.asm -> x265_3.6.tar.gz/source/common/x86/mc-a2.asm Changed
264
 
1
@@ -992,6 +992,262 @@
2
 FRAME_INIT_LOWRES
3
 %endif
4
 
5
+%macro SUBSAMPLEFILT8x4 7
6
+    mova      %3, r0+%7
7
+    mova      %4, r0+r2+%7
8
+    pavgb     %3, %4
9
+    pavgb     %4, r0+r2*2+%7
10
+    PALIGNR   %1, %3, 1, m6
11
+    PALIGNR   %2, %4, 1, m6
12
+%if cpuflag(xop)
13
+    pavgb     %1, %3
14
+    pavgb     %2, %4
15
+%else
16
+    pavgb     %1, %3
17
+    pavgb     %2, %4
18
+    psrlw     %5, %1, 8
19
+    psrlw     %6, %2, 8
20
+    pand      %1, m7
21
+    pand      %2, m7
22
+%endif
23
+%endmacro
24
+
25
+%macro SUBSAMPLEFILT32x4U 1
26
+    movu      m1, r0+r2
27
+    pavgb     m0, m1, r0
28
+    movu      m3, r0+r2+1
29
+    pavgb     m2, m3, r0+1
30
+    pavgb     m1, r0+r2*2
31
+    pavgb     m3, r0+r2*2+1
32
+    pavgb     m0, m2
33
+    pavgb     m1, m3
34
+
35
+    movu      m3, r0+r2+mmsize
36
+    pavgb     m2, m3, r0+mmsize
37
+    movu      m5, r0+r2+1+mmsize
38
+    pavgb     m4, m5, r0+1+mmsize
39
+    pavgb     m2, m4
40
+
41
+    pshufb    m0, m7
42
+    pshufb    m2, m7
43
+    punpcklqdq m0, m0, m2
44
+    vpermq    m0, m0, q3120
45
+    movu    %1, m0
46
+%endmacro
47
+
48
+%macro SUBSAMPLEFILT16x2 3
49
+    mova      m3, r0+%3+mmsize
50
+    mova      m2, r0+%3
51
+    pavgb     m3, r0+%3+r2+mmsize
52
+    pavgb     m2, r0+%3+r2
53
+    PALIGNR   %1, m3, 1, m6
54
+    pavgb     %1, m3
55
+    PALIGNR   m3, m2, 1, m6
56
+    pavgb     m3, m2
57
+%if cpuflag(xop)
58
+    vpperm    m3, m3, %1, m6
59
+%else
60
+    pand      m3, m7
61
+    pand      %1, m7
62
+    packuswb  m3, %1
63
+%endif
64
+    mova    %2, m3
65
+    mova      %1, m2
66
+%endmacro
67
+
68
+%macro SUBSAMPLEFILT8x2U 2
69
+    mova      m2, r0+%2
70
+    pavgb     m2, r0+%2+r2
71
+    mova      m0, r0+%2+1
72
+    pavgb     m0, r0+%2+r2+1
73
+    pavgb     m1, m3
74
+    pavgb     m0, m2
75
+    pand      m1, m7
76
+    pand      m0, m7
77
+    packuswb  m0, m1
78
+    mova    %1, m0
79
+%endmacro
80
+
81
+%macro SUBSAMPLEFILT8xU 2
82
+    mova      m3, r0+%2+8
83
+    mova      m2, r0+%2
84
+    pavgw     m3, r0+%2+r2+8
85
+    pavgw     m2, r0+%2+r2
86
+    movu      m1, r0+%2+10
87
+    movu      m0, r0+%2+2
88
+    pavgw     m1, r0+%2+r2+10
89
+    pavgw     m0, r0+%2+r2+2
90
+    pavgw     m1, m3
91
+    pavgw     m0, m2
92
+    psrld     m3, m1, 16
93
+    pand      m1, m7
94
+    pand      m0, m7
95
+    packssdw  m0, m1
96
+    movu    %1, m0
97
+%endmacro
98
+
99
+%macro SUBSAMPLEFILT8xA 3
100
+    movu      m3, r0+%3+mmsize
101
+    movu      m2, r0+%3
102
+    pavgw     m3, r0+%3+r2+mmsize
103
+    pavgw     m2, r0+%3+r2
104
+    PALIGNR   %1, m3, 2, m6
105
+    pavgw     %1, m3
106
+    PALIGNR   m3, m2, 2, m6
107
+    pavgw     m3, m2
108
+%if cpuflag(xop)
109
+    vpperm    m3, m3, %1, m6
110
+%else
111
+    pand      m3, m7
112
+    pand      %1, m7
113
+    packssdw  m3, %1
114
+%endif
115
+%if cpuflag(avx2)
116
+    vpermq     m3, m3, q3120
117
+%endif
118
+    movu    %2, m3
119
+    movu      %1, m2
120
+%endmacro
121
+
122
+;-----------------------------------------------------------------------------
123
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
124
+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
125
+;-----------------------------------------------------------------------------
126
+
127
+%macro FRAME_SUBSAMPLE_LUMA 0
128
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
129
+%if HIGH_BIT_DEPTH
130
+    shl   dword r3m, 1
131
+    FIX_STRIDES r2
132
+    shl   dword r4m, 1
133
+%endif
134
+%if mmsize >= 16
135
+    add   dword r4m, mmsize-1
136
+    and   dword r4m, ~(mmsize-1)
137
+%endif
138
+    ; src += 2*(height-1)*stride + 2*width
139
+    mov      r6d, r5m
140
+    dec      r6d
141
+    imul     r6d, r2d
142
+    add      r6d, r4m
143
+    lea       r0, r0+r6*2
144
+    ; dst += (height-1)*stride + width
145
+    mov      r6d, r5m
146
+    dec      r6d
147
+    imul     r6d, r3m
148
+    add      r6d, r4m
149
+    add       r1, r6
150
+    ; gap = stride - width
151
+    mov      r6d, r3m
152
+    sub      r6d, r4m
153
+    PUSH      r6
154
+    %define dst_gap rsp+gprsize
155
+    mov      r6d, r2d
156
+    sub      r6d, r4m
157
+    shl      r6d, 1
158
+    PUSH      r6
159
+    %define src_gap rsp
160
+%if HIGH_BIT_DEPTH
161
+%if cpuflag(xop)
162
+    mova      m6, deinterleave_shuf32a
163
+    mova      m7, deinterleave_shuf32b
164
+%else
165
+    pcmpeqw   m7, m7
166
+    psrld     m7, 16
167
+%endif
168
+.vloop:
169
+    mov      r6d, r4m
170
+%ifnidn cpuname, mmx2
171
+    movu      m0, r0
172
+    movu      m1, r0+r2
173
+    pavgw     m0, m1
174
+    pavgw     m1, r0+r2*2
175
+%endif
176
+.hloop:
177
+    sub       r0, mmsize*2
178
+    sub       r1, mmsize
179
+%ifidn cpuname, mmx2
180
+    SUBSAMPLEFILT8xU r1, 0
181
+%else
182
+    SUBSAMPLEFILT8xA m0, r1, 0
183
+%endif
184
+    sub      r6d, mmsize
185
+    jg .hloop
186
+%else ; !HIGH_BIT_DEPTH
187
+%if cpuflag(avx2)
188
+    mova      m7, deinterleave_shuf
189
+%elif cpuflag(xop)
190
+    mova      m6, deinterleave_shuf32a
191
+    mova      m7, deinterleave_shuf32b
192
+%else
193
+    pcmpeqb   m7, m7
194
+    psrlw     m7, 8
195
+%endif
196
+.vloop:
197
+    mov      r6d, r4m
198
+%ifnidn cpuname, mmx2
199
+%if mmsize <= 16
200
+    mova      m0, r0
201
+    mova      m1, r0+r2
202
+    pavgb     m0, m1
203
+    pavgb     m1, r0+r2*2
204
+%endif
205
+%endif
206
+.hloop:
207
+    sub       r0, mmsize*2
208
+    sub       r1, mmsize
209
+%if mmsize==32
210
+    SUBSAMPLEFILT32x4U r1
211
+%elifdef m8
212
+    SUBSAMPLEFILT8x4   m0, m1, m2, m3, m10, m11, mmsize
213
+    mova      m8, m0
214
+    mova      m9, m1
215
+    SUBSAMPLEFILT8x4   m2, m3, m0, m1, m4, m5, 0
216
+%if cpuflag(xop)
217
+    vpperm    m4, m2, m8, m7
218
+    vpperm    m2, m2, m8, m6
219
+%else
220
+    packuswb  m2, m8
221
+%endif
222
+    mova    r1, m2
223
+%elifidn cpuname, mmx2
224
+    SUBSAMPLEFILT8x2U  r1, 0
225
+%else
226
+    SUBSAMPLEFILT16x2  m0, r1, 0
227
+%endif
228
+    sub      r6d, mmsize
229
+    jg .hloop
230
+%endif ; HIGH_BIT_DEPTH
231
+.skip:
232
+    mov       r3, dst_gap
233
+    sub       r0, src_gap
234
+    sub       r1, r3
235
+    dec    dword r5m
236
+    jg .vloop
237
+    ADD      rsp, 2*gprsize
238
+    emms
239
+    RET
240
+%endmacro ; FRAME_SUBSAMPLE_LUMA
241
+
242
+INIT_MMX mmx2
243
+FRAME_SUBSAMPLE_LUMA
244
+%if ARCH_X86_64 == 0
245
+INIT_MMX cache32, mmx2
246
+FRAME_SUBSAMPLE_LUMA
247
+%endif
248
+INIT_XMM sse2
249
+FRAME_SUBSAMPLE_LUMA
250
+INIT_XMM ssse3
251
+FRAME_SUBSAMPLE_LUMA
252
+INIT_XMM avx
253
+FRAME_SUBSAMPLE_LUMA
254
+INIT_XMM xop
255
+FRAME_SUBSAMPLE_LUMA
256
+%if ARCH_X86_64 == 1
257
+INIT_YMM avx2
258
+FRAME_SUBSAMPLE_LUMA
259
+%endif
260
+
261
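FRAME_SUBSAMPLE_LUMA generates the 2:1 luma downscalers used by the new temporal filter, one instantiation per instruction set. A plain C++ reference of the arithmetic being vectorised (a sketch of the intent; the pavgb-based asm may round slightly differently than this exact 2x2 mean):

#include <cstddef>
#include <cstdint>

// Each output pixel is the rounded average of a 2x2 block of the source.
void frameSubsampleLumaRef(const uint8_t* src, uint8_t* dst,
                           ptrdiff_t srcStride, ptrdiff_t dstStride,
                           int dstWidth, int dstHeight)
{
    for (int y = 0; y < dstHeight; y++)
    {
        const uint8_t* row0 = src + (2 * y) * srcStride;
        const uint8_t* row1 = row0 + srcStride;
        for (int x = 0; x < dstWidth; x++)
        {
            int sum = row0[2 * x] + row0[2 * x + 1] + row1[2 * x] + row1[2 * x + 1];
            dst[y * dstStride + x] = (uint8_t)((sum + 2) >> 2);   // round to nearest
        }
    }
}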
 ;-----------------------------------------------------------------------------
262
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
263
 ;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
264
x265_3.5.tar.gz/source/common/x86/mc.h -> x265_3.6.tar.gz/source/common/x86/mc.h Changed
19
 
1
@@ -36,6 +36,17 @@
2
 
3
 #undef LOWRES
4
 
5
+#define SUBSAMPLELUMA(cpu) \
6
+    void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
+SUBSAMPLELUMA(mmx2)
8
+SUBSAMPLELUMA(sse2)
9
+SUBSAMPLELUMA(ssse3)
10
+SUBSAMPLELUMA(avx)
11
+SUBSAMPLELUMA(avx2)
12
+SUBSAMPLELUMA(xop)
13
+
14
+#undef SUBSAMPLELUMA
15
+
16
 #define PROPAGATE_COST(cpu) \
17
     void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
18
                                               const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
19
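SUBSAMPLELUMA only stamps out one extern prototype per SIMD flavour so that asm-primitives.cpp can take the symbols' addresses. Roughly, SUBSAMPLELUMA(sse2) expands to the declaration below (the exact prefix added by PFX depends on EXPORT_C_API and the configured bit depth, so treat it as an assumption):

#include <cstdint>

typedef uint8_t pixel;   // 8-bit build assumed for this sketch

void x265_frame_subsample_luma_sse2(const pixel* src0, pixel* dst0,
                                    intptr_t src_stride, intptr_t dst_stride,
                                    int width, int height);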
x265_3.5.tar.gz/source/common/x86/x86inc.asm -> x265_3.6.tar.gz/source/common/x86/x86inc.asm Changed
96
 
1
@@ -401,16 +401,6 @@
2
     %endif
3
 %endmacro
4
 
5
-%macro DEFINE_ARGS_INTERNAL 3+
6
-    %ifnum %2
7
-        DEFINE_ARGS %3
8
-    %elif %1 == 4
9
-        DEFINE_ARGS %2
10
-    %elif %1 > 4
11
-        DEFINE_ARGS %2, %3
12
-    %endif
13
-%endmacro
14
-
15
 %if WIN64 ; Windows x64 ;=================================================
16
 
17
 DECLARE_REG 0,  rcx
18
@@ -429,7 +419,7 @@
19
 DECLARE_REG 13, R12, 112
20
 DECLARE_REG 14, R13, 120
21
 
22
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
23
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
24
     %assign num_args %1
25
     %assign regs_used %2
26
     ASSERT regs_used >= num_args
27
@@ -441,7 +431,15 @@
28
         WIN64_SPILL_XMM %3
29
     %endif
30
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
31
-    DEFINE_ARGS_INTERNAL %0, %4, %5
32
+    %if %0 > 4
33
+         %ifnum %4
34
+             DEFINE_ARGS %5
35
+         %else
36
+             DEFINE_ARGS %4, %5
37
+         %endif
38
+     %elifnnum %4
39
+         DEFINE_ARGS %4
40
+     %endif
41
 %endmacro
42
 
43
 %macro WIN64_PUSH_XMM 0
44
@@ -537,7 +535,7 @@
45
 DECLARE_REG 13, R12, 64
46
 DECLARE_REG 14, R13, 72
47
 
48
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
49
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
50
     %assign num_args %1
51
     %assign regs_used %2
52
     %assign xmm_regs_used %3
53
@@ -547,7 +545,15 @@
54
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
55
     ALLOC_STACK %4
56
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
57
-    DEFINE_ARGS_INTERNAL %0, %4, %5
58
+    %if %0 > 4
59
+         %ifnum %4
60
+             DEFINE_ARGS %5
61
+         %else
62
+             DEFINE_ARGS %4, %5
63
+         %endif
64
+     %elifnnum %4
65
+         DEFINE_ARGS %4
66
+     %endif
67
 %endmacro
68
 
69
 %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
70
@@ -588,7 +594,7 @@
71
 
72
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
73
 
74
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
75
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
76
     %assign num_args %1
77
     %assign regs_used %2
78
     ASSERT regs_used >= num_args
79
@@ -603,7 +609,15 @@
80
     PUSH_IF_USED 3, 4, 5, 6
81
     ALLOC_STACK %4
82
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
83
-    DEFINE_ARGS_INTERNAL %0, %4, %5
84
+    %if %0 > 4
85
+         %ifnum %4
86
+             DEFINE_ARGS %5
87
+         %else
88
+             DEFINE_ARGS %4, %5
89
+         %endif
90
+     %elifnnum %4
91
+         DEFINE_ARGS %4
92
+     %endif
93
 %endmacro
94
 
95
 %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
96
x265_3.5.tar.gz/source/common/x86/x86util.asm -> x265_3.6.tar.gz/source/common/x86/x86util.asm Changed
13
 
1
@@ -578,8 +578,10 @@
2
     %elif %1==2
3
         %if mmsize==8
4
             SBUTTERFLY dq, %3, %4, %5
5
-        %else
6
+        %elif %0==6
7
             TRANS q, ORDER, %3, %4, %5, %6
8
+        %else
9
+            TRANS q, ORDER, %3, %4, %5
10
         %endif
11
     %elif %1==4
12
         SBUTTERFLY qdq, %3, %4, %5
13
x265_3.5.tar.gz/source/encoder/analysis.cpp -> x265_3.6.tar.gz/source/encoder/analysis.cpp Changed
10
 
1
@@ -3645,7 +3645,7 @@
2
             qp += distortionData->offsetctu.m_cuAddr;
3
     }
4
 
5
-    if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
6
+    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
7
     {
8
         int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
9
         if (ctu.m_slice->m_sliceType == I_SLICE)
10
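Relaxing the guard from == 10 to >= 2 lets the cuTree offsets stored with the analysis data be reapplied at every load-reuse level that carries them, not only the highest one. A hedged sketch of the per-CU lookup implied by the indexing above (field names are illustrative, not the exact x265 structures):

#include <cstdint>

struct CuTreeAnalysis
{
    const double* intraQpOffsets;   // one entry per partition, used for I slices
    const double* interQpOffsets;   // one entry per partition, used for P/B slices
};

double applyCuTreeOffset(double baseQp, const CuTreeAnalysis& a,
                         uint32_t ctuAddr, uint32_t numPartitions,
                         uint32_t absPartIdx, bool isISlice)
{
    uint32_t cuIdx = ctuAddr * numPartitions + absPartIdx;   // same indexing as above
    return baseQp + (isISlice ? a.intraQpOffsets[cuIdx] : a.interQpOffsets[cuIdx]);
}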
x265_3.5.tar.gz/source/encoder/api.cpp -> x265_3.6.tar.gz/source/encoder/api.cpp Changed
50
 
1
@@ -208,7 +208,6 @@
2
     memcpy(zoneParam, param, sizeof(x265_param));
3
     for (int i = 0; i < param->rc.zonefileCount; i++)
4
     {
5
-        param->rc.zones[i].startFrame = -1;
6
        encoder->configureZone(zoneParam, param->rc.zones[i].zoneParam);
7
     }
8
 
9
@@ -608,6 +607,14 @@
10
     if (numEncoded < 0)
11
         encoder->m_aborted = true;
12
 
13
+    if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
14
+    {
15
+        Bitstream bs;
16
+        encoder->getEndNalUnits(encoder->m_nalList, bs);
17
+        *pp_nal = &encoder->m_nalList.m_nal[0];
18
+        if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
19
+    }
20
+
21
     return numEncoded;
22
 }
23
 
24
@@ -1042,6 +1049,7 @@
25
     &PARAM_NS::x265_param_free,
26
     &PARAM_NS::x265_param_default,
27
     &PARAM_NS::x265_param_parse,
28
+    &PARAM_NS::x265_scenecut_aware_qp_param_parse,
29
     &PARAM_NS::x265_param_apply_profile,
30
     &PARAM_NS::x265_param_default_preset,
31
     &x265_picture_alloc,
32
@@ -1288,6 +1296,8 @@
33
             if (param->csvLogLevel)
34
             {
35
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
36
+                if (!!param->bEnableTemporalSubLayers)
37
+                    fprintf(csvfp, "Temporal Sub Layer ID, ");
38
                 if (param->csvLogLevel >= 2)
39
                     fprintf(csvfp, "I/P cost ratio, ");
40
                 if (param->rc.rateControlMode == X265_RC_CRF)
41
@@ -1401,6 +1411,8 @@
42
     const x265_frame_stats* frameStats = &pic->frameData;
43
     fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
44
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
45
+    if (!!param->bEnableTemporalSubLayers)
46
+        fprintf(param->csvfpt, "%d,", frameStats->tLayer);
47
     if (param->csvLogLevel >= 2)
48
         fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
49
     if (param->rc.rateControlMode == X265_RC_CRF)
50
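The new block in x265_encoder_encode lets a flush call that produced no frames still hand back end-of-sequence / end-of-bitstream NAL units. A sketch of the matching caller-side drain loop (standard public API calls; bEnableEndOfSequence and bEnableEndOfBitstream are the option fields referenced above):

#include <cstdint>
#include <cstdio>
#include <x265.h>

void drainAndClose(x265_encoder* enc, FILE* out)
{
    x265_nal* nal = nullptr;
    uint32_t  numNal = 0;

    // Passing NULL as pic_in flushes the delayed frames one call at a time.
    while (x265_encoder_encode(enc, &nal, &numNal, nullptr, nullptr) > 0)
        for (uint32_t i = 0; i < numNal; i++)
            fwrite(nal[i].payload, 1, nal[i].sizeBytes, out);

    // The terminating call returned 0 encoded frames; with EOS/EOB enabled it
    // may still have filled nal/numNal with the trailing units.
    for (uint32_t i = 0; i < numNal; i++)
        fwrite(nal[i].payload, 1, nal[i].sizeBytes, out);
}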
x265_3.5.tar.gz/source/encoder/dpb.cpp -> x265_3.6.tar.gz/source/encoder/dpb.cpp Changed
258
 
1
@@ -70,10 +70,18 @@
2
     {
3
         Frame *curFrame = iterFrame;
4
         iterFrame = iterFrame->m_next;
5
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
6
+        bool isMCSTFReferenced = false;
7
+
8
+        if (curFrame->m_param->bEnableTemporalFilter)
9
+            isMCSTFReferenced =!!(curFrame->m_refPicCnt1);
10
+
11
+        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
12
         {
13
             curFrame->m_bChromaExtended = false;
14
 
15
+            if (curFrame->m_param->bEnableTemporalFilter)
16
+                *curFrame->m_isSubSampled = false;
17
+
18
             // Reset column counter
19
             X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
20
             X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
21
@@ -142,12 +150,13 @@
22
     {
23
         newFrame->m_encData->m_bHasReferences = false;
24
 
25
+        newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
26
         // Adjust NAL type for unreferenced B frames (change from _R "referenced"
27
         // to _N "non-referenced" NAL unit type)
28
         switch (slice->m_nalUnitType)
29
         {
30
         case NAL_UNIT_CODED_SLICE_TRAIL_R:
31
-            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
32
+            slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
33
             break;
34
         case NAL_UNIT_CODED_SLICE_RADL_R:
35
             slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
36
@@ -168,13 +177,94 @@
37
 
38
     m_picList.pushFront(*newFrame);
39
 
40
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
41
+    {
42
+        switch (slice->m_nalUnitType)
43
+        {
44
+        case NAL_UNIT_CODED_SLICE_TRAIL_R:
45
+            slice->m_nalUnitType =  NAL_UNIT_CODED_SLICE_TRAIL_N;
46
+            break;
47
+        case NAL_UNIT_CODED_SLICE_RADL_R:
48
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
49
+            break;
50
+        case NAL_UNIT_CODED_SLICE_RASL_R:
51
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
52
+            break;
53
+        default:
54
+            break;
55
+        }
56
+    }
57
     // Do decoding refresh marking if any
58
     decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
59
 
60
-    computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
61
-
62
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
63
+    bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
64
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
65
-    applyReferencePictureSet(&slice->m_rps, pocCurr);
66
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
67
+
68
+
69
+    if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
70
+        && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N     // Check if not a leading picture
71
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
72
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
73
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
74
+        )
75
+    {
76
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
77
+        {
78
+            if (getTemporalLayerNonReferenceFlag())
79
+            {
80
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
81
+            }
82
+            else
83
+            {
84
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
85
+            }
86
+        }
87
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
88
+        {
89
+            bool isSTSA = true;
90
+            int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
91
+            for (int ii = id; (ii < x265_gop_ra_length[newFrame->m_gopId] && isSTSA == true); ii++)
92
+            {
93
+                int tempIdRef = x265_gop_ra[newFrame->m_gopId][ii].layer;
94
+                if (tempIdRef == newFrame->m_tempLayer)
95
+                {
96
+                    for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
97
+                    {
98
+                        if (slice->m_rps.bUsed[jj])
99
+                        {
100
+                            int refPoc = x265_gop_ra[newFrame->m_gopId][ii].poc_offset + slice->m_rps.deltaPOC[jj];
101
+                            int kk = 0;
102
+                            for (kk = 0; kk < x265_gop_ra_length[newFrame->m_gopId]; kk++)
103
+                            {
104
+                                if (x265_gop_ra[newFrame->m_gopId][kk].poc_offset == refPoc)
105
+                                {
106
+                                    break;
107
+                                }
108
+                            }
109
+                            if (x265_gop_ra[newFrame->m_gopId][kk].layer >= newFrame->m_tempLayer)
110
+                            {
111
+                                isSTSA = false;
112
+                                break;
113
+                            }
114
+                        }
115
+                    }
116
+                }
117
+            }
118
+            if (isSTSA == true)
119
+            {
120
+                if (getTemporalLayerNonReferenceFlag())
121
+                {
122
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
123
+                }
124
+                else
125
+                {
126
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
127
+                }
128
+            }
129
+        }
130
+    }
131
 
132
     if (slice->m_sliceType != I_SLICE)
133
        slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
134
@@ -218,7 +308,7 @@
135
     }
136
 }
137
 
138
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
139
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
140
 {
141
     unsigned int poci = 0, numNeg = 0, numPos = 0;
142
 
143
@@ -228,7 +318,7 @@
144
     {
145
         if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
146
         {
147
-            if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
148
+            if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
149
             {
150
                    rps->poc[poci] = iterPic->m_poc;
151
                    rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
152
@@ -247,6 +337,18 @@
153
     rps->sortDeltaPOC();
154
 }
155
 
156
+bool DPB::getTemporalLayerNonReferenceFlag()
157
+{
158
+    Frame* curFrame = m_picList.first();
159
+    if (curFrame->m_encData->m_bHasReferences)
160
+    {
161
+        curFrame->m_sameLayerRefPic = true;
162
+        return false;
163
+    }
164
+    else
165
+        return true;
166
+}
167
+
168
 /* Marking reference pictures when an IDR/CRA is encountered. */
169
 void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
170
 {
171
@@ -296,7 +398,7 @@
172
 }
173
 
174
 /** Function for applying picture marking based on the Reference Picture Set */
175
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
176
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
177
 {
178
     // loop through all pictures in the reference picture buffer
179
     Frame* iterFrame = m_picList.first();
180
@@ -317,9 +419,68 @@
181
             }
182
             if (!referenced)
183
                 iterFrame->m_encData->m_bHasReferences = false;
184
+
185
+            if (m_bTemporalSublayer)
186
+            {
187
+                //check that pictures of higher temporal layers are not used
188
+                assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
189
+
190
+                //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
191
+                if (isTSAPicture)
192
+                {
193
+                    assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
194
+                }
195
+                //check that pictures marked as temporal layer non-reference pictures are not used for reference
196
+                if (iterFrame->m_tempLayer == tempId)
197
+                {
198
+                    assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
199
+                }
200
+            }
201
+        }
202
+        iterFrame = iterFrame->m_next;
203
+    }
204
+}
205
+
206
+bool DPB::isTemporalLayerSwitchingPoint(int curPoc, int tempId)
207
+{
208
+    // loop through all pictures in the reference picture buffer
209
+    Frame* iterFrame = m_picList.first();
210
+    while (iterFrame)
211
+    {
212
+        if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences)
213
+        {
214
+            if (iterFrame->m_tempLayer >= tempId)
215
+            {
216
+                return false;
217
+            }
218
+        }
219
+        iterFrame = iterFrame->m_next;
220
+    }
221
+    return true;
222
+}
223
+
224
+bool DPB::isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId)
225
+{
226
+    // loop through all pictures in the reference picture buffer
227
+    Frame* iterFrame = m_picList.first();
228
+    while (iterFrame)
229
+    {
230
+        if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences)
231
+        {
232
+            for (int i = 0; i < rps->numberOfPositivePictures + rps->numberOfNegativePictures; i++)
233
+            {
234
+                if ((iterFrame->m_poc == curPoc + rps->deltaPOC[i]) && rps->bUsed[i])
235
+                {
236
+                    if (iterFrame->m_tempLayer >= tempId)
237
+                    {
238
+                        return false;
239
+                    }
240
+                }
241
+            }
242
         }
243
         iterFrame = iterFrame->m_next;
244
     }
245
+    return true;
246
 }
247
 
248
 /* deciding the nal_unit_type */
249
@@ -328,7 +489,7 @@
250
     if (!curPOC)
251
         return NAL_UNIT_CODED_SLICE_IDR_N_LP;
252
     if (bIsKeyFrame)
253
-        return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
254
+        return (m_bOpenGOP || m_craNal) ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
255
     if (m_pocCRA && curPOC < m_pocCRA)
256
         // All leading pictures are being marked as TFD pictures here since
257
         // current encoder uses all reference pictures while encoding leading
258
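The added marking logic only promotes a picture to TSA/STSA when no picture still usable as a reference sits at the same or a higher temporal layer, which is the HEVC constraint on up-switch points. A compact sketch of that test over a simplified DPB (stand-in types, not x265's Frame/RPS):

#include <vector>

struct DpbPicture
{
    int  poc;
    int  tempLayer;
    bool hasReferences;   // still referenced by pictures yet to be coded
};

// True when a picture at curTempLayer may be marked as a switching point.
bool isTemporalSwitchingPoint(const std::vector<DpbPicture>& dpb,
                              int curPoc, int curTempLayer)
{
    for (const DpbPicture& pic : dpb)
        if (pic.poc != curPoc && pic.hasReferences && pic.tempLayer >= curTempLayer)
            return false;   // a same-or-higher layer reference blocks the switch
    return true;
}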
x265_3.5.tar.gz/source/encoder/dpb.h -> x265_3.6.tar.gz/source/encoder/dpb.h Changed
35
 
1
@@ -40,6 +40,7 @@
2
     int                m_lastIDR;
3
     int                m_pocCRA;
4
     int                m_bOpenGOP;
5
+   int                m_craNal;
6
     int                m_bhasLeadingPicture;
7
     bool               m_bRefreshPending;
8
     bool               m_bTemporalSublayer;
9
@@ -66,7 +67,8 @@
10
         m_bRefreshPending = false;
11
         m_frameDataFreeList = NULL;
12
         m_bOpenGOP = param->bOpenGOP;
13
-        m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
14
+       m_craNal = param->craNal;
15
+        m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
16
     }
17
 
18
     ~DPB();
19
@@ -77,10 +79,13 @@
20
 
21
 protected:
22
 
23
-    void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
24
+    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
25
 
26
-    void applyReferencePictureSet(RPS *rps, int curPoc);
27
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
28
+    bool getTemporalLayerNonReferenceFlag();
29
     void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
30
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
31
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
32
 
33
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
34
 };
35
x265_3.5.tar.gz/source/encoder/encoder.cpp -> x265_3.6.tar.gz/source/encoder/encoder.cpp Changed
1237
 
1
@@ -72,7 +72,40 @@
2
 {
3
     { 1, 1, 1, 1, 1, 5, 1,  2, 2, 2, 50 },
4
     { 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
5
-    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 }
6
+    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 },
7
+    { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
8
+};
9
+
10
+typedef struct
11
+{
12
+    int bEnableVideoSignalTypePresentFlag;
13
+    int bEnableColorDescriptionPresentFlag;
14
+    int bEnableChromaLocInfoPresentFlag;
15
+    int colorPrimaries;
16
+    int transferCharacteristics;
17
+    int matrixCoeffs;
18
+    int bEnableVideoFullRangeFlag;
19
+    int chromaSampleLocTypeTopField;
20
+    int chromaSampleLocTypeBottomField;
21
+    const char* systemId;
22
+}VideoSignalTypePresets;
23
+
24
+VideoSignalTypePresets vstPresets[] =
25
+{
26
+    {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
27
+    {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
28
+    {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
29
+    {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
30
+    {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
31
+    {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
32
+    {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
33
+    {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
34
+    {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
35
+    {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
36
+    {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
37
+    {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
38
+    {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
39
+    {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
40
 };
41
 }
42
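The vstPresets[] table maps one system ID string onto the whole set of VUI colour description and chroma location fields. A sketch of the lookup such a video-signal-type preset option implies (the helper and the reduced struct below are illustrative; the real wiring happens later in encoder.cpp):

#include <cstring>

struct VstPreset
{
    int colorPrimaries, transferCharacteristics, matrixCoeffs;
    int fullRange, chromaLocTop, chromaLocBottom;
    const char* systemId;
};

// A few rows copied from the table above.
static const VstPreset kPresets[] = {
    { 1,  1, 1, 0, 0, 0, "BT709_YCC" },
    { 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC" },
    { 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC" },
};

const VstPreset* findPreset(const char* systemId)
{
    for (const VstPreset& p : kPresets)
        if (!strcmp(p.systemId, systemId))
            return &p;
    return nullptr;   // unknown ID: keep the explicitly configured VUI flags
}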
 
43
@@ -109,6 +142,7 @@
44
     m_threadPool = NULL;
45
     m_analysisFileIn = NULL;
46
     m_analysisFileOut = NULL;
47
+    m_filmGrainIn = NULL;
48
     m_naluFile = NULL;
49
     m_offsetEmergency = NULL;
50
     m_iFrameNum = 0;
51
@@ -134,12 +168,8 @@
52
     m_prevTonemapPayload.payload = NULL;
53
     m_startPoint = 0;
54
     m_saveCTUSize = 0;
55
-    m_edgePic = NULL;
56
-    m_edgeHistThreshold = 0;
57
-    m_chromaHistThreshold = 0.0;
58
-    m_scaledEdgeThreshold = 0.0;
59
-    m_scaledChromaThreshold = 0.0;
60
     m_zoneIndex = 0;
61
+    m_origPicBuffer = 0;
62
 }
63
 
64
 inline char *strcatFilename(const char *input, const char *suffix)
65
@@ -216,34 +246,6 @@
66
         }
67
     }
68
 
69
-    if (m_param->bHistBasedSceneCut)
70
-    {
71
-        m_planeSizes0 = (m_param->sourceWidth >> x265_cli_cspsp->internalCsp.width0) * (m_param->sourceHeight >> x265_cli_cspsm_param->internalCsp.height0);
72
-        uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
73
-        m_edgePic = X265_MALLOC(pixel, m_planeSizes0 * pixelbytes);
74
-        m_edgeHistThreshold = m_param->edgeTransitionThreshold;
75
-        m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
76
-        m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
77
-        m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
78
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
79
-        {
80
-            int size = m_param->sourceWidth * m_param->sourceHeight;
81
-            int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
82
-            int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
83
-            int widthC = m_param->sourceWidth >> hshift;
84
-            int heightC = m_param->sourceHeight >> vshift;
85
-
86
-            m_inputPic0 = X265_MALLOC(pixel, size);
87
-            if (m_param->internalCsp != X265_CSP_I400)
88
-            {
89
-                for (int j = 1; j < 3; j++)
90
-                {
91
-                    m_inputPicj = X265_MALLOC(pixel, widthC * heightC);
92
-                }
93
-            }
94
-        }
95
-    }
96
-
97
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
98
     if (rows == 1 || cols < 3)
99
     {
100
@@ -357,6 +359,10 @@
101
             lookAheadThreadPooli.start();
102
     m_lookahead->m_numPools = pools;
103
     m_dpb = new DPB(m_param);
104
+
105
+    if (m_param->bEnableTemporalFilter)
106
+        m_origPicBuffer = new OrigPicBuffer();
107
+
108
     m_rateControl = new RateControl(*m_param, this);
109
     if (!m_param->bResetZoneConfig)
110
     {
111
@@ -518,6 +524,15 @@
112
             }
113
         }
114
     }
115
+    if (m_param->filmGrain)
116
+    {
117
+        m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
118
+        if (!m_filmGrainIn)
119
+        {
120
+            x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
121
+        }
122
+    }
123
+
124
     m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
125
     m_aborted |= parseLambdaFile(m_param);
126
 
127
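--film-grain points at a binary file of film grain characteristics that the encoder later emits as an SEI message; the code above only opens it and logs a failure. A hedged sketch of loading such a payload as an opaque blob (how x265 actually parses it is not shown here):

#include <cstdint>
#include <cstdio>
#include <vector>

bool loadFilmGrainPayload(const char* path, std::vector<uint8_t>& payload)
{
    FILE* f = fopen(path, "rb");
    if (!f)
        return false;                    // mirrors the error log above
    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);
    if (size <= 0) { fclose(f); return false; }
    payload.resize((size_t)size);
    size_t got = fread(payload.data(), 1, payload.size(), f);
    fclose(f);
    return got == payload.size();        // caller attaches the blob as an SEI payload
}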
@@ -879,26 +894,6 @@
128
         }
129
     }
130
 
131
-    if (m_param->bHistBasedSceneCut)
132
-    {
133
-        if (m_edgePic != NULL)
134
-        {
135
-            X265_FREE_ZERO(m_edgePic);
136
-        }
137
-
138
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
139
-        {
140
-            X265_FREE_ZERO(m_inputPic0);
141
-            if (m_param->internalCsp != X265_CSP_I400)
142
-            {
143
-                for (int i = 1; i < 3; i++)
144
-                {
145
-                    X265_FREE_ZERO(m_inputPici);
146
-                }
147
-            }
148
-        }
149
-    }
150
-
151
     for (int i = 0; i < m_param->frameNumThreads; i++)
152
     {
153
         if (m_frameEncoderi)
154
@@ -924,6 +919,10 @@
155
         delete zoneReadCount;
156
         delete zoneWriteCount;
157
     }
158
+
159
+    if (m_param->bEnableTemporalFilter)
160
+        delete m_origPicBuffer;
161
+
162
     if (m_rateControl)
163
     {
164
         m_rateControl->destroy();
165
@@ -963,6 +962,8 @@
166
      }
167
     if (m_naluFile)
168
         fclose(m_naluFile);
169
+    if (m_filmGrainIn)
170
+        x265_fclose(m_filmGrainIn);
171
 
172
 #ifdef SVT_HEVC
173
     X265_FREE(m_svtAppData);
174
@@ -974,6 +975,7 @@
175
         /* release string arguments that were strdup'd */
176
         free((char*)m_param->rc.lambdaFileName);
177
         free((char*)m_param->rc.statFileName);
178
+        free((char*)m_param->rc.sharedMemName);
179
         free((char*)m_param->analysisReuseFileName);
180
         free((char*)m_param->scalingLists);
181
         free((char*)m_param->csvfn);
182
@@ -982,6 +984,7 @@
183
         free((char*)m_param->toneMapFile);
184
         free((char*)m_param->analysisSave);
185
         free((char*)m_param->analysisLoad);
186
+        free((char*)m_param->videoSignalTypePreset);
187
         PARAM_NS::x265_param_free(m_param);
188
     }
189
 }
190
@@ -1358,215 +1361,90 @@
191
     dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
192
 }
193
 
194
-bool Encoder::computeHistograms(x265_picture *pic)
195
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
196
 {
197
-    pixel *src = NULL, *planeV = NULL, *planeU = NULL;
198
-    uint32_t widthC, heightC;
199
-    int hshift, vshift;
200
-
201
-    hshift = CHROMA_H_SHIFT(pic->colorSpace);
202
-    vshift = CHROMA_V_SHIFT(pic->colorSpace);
203
-    widthC = pic->width >> hshift;
204
-    heightC = pic->height >> vshift;
205
-
206
-    if (pic->bitDepth == X265_DEPTH)
207
+    uint8_t newSliceType = 0;
208
+    switch (curSliceType)
209
     {
210
-        src = (pixel*)pic->planes0;
211
-        if (m_param->internalCsp != X265_CSP_I400)
212
-        {
213
-            planeU = (pixel*)pic->planes1;
214
-            planeV = (pixel*)pic->planes2;
215
-        }
216
-    }
217
-    else if (pic->bitDepth == 8 && X265_DEPTH > 8)
218
-    {
219
-        int shift = (X265_DEPTH - 8);
220
-        uint8_t *yChar, *uChar, *vChar;
221
-
222
-        yChar = (uint8_t*)pic->planes0;
223
-        primitives.planecopy_cp(yChar, pic->stride0 / sizeof(*yChar), m_inputPic0, pic->stride0 / sizeof(*yChar), pic->width, pic->height, shift);
224
-        src = m_inputPic0;
225
-        if (m_param->internalCsp != X265_CSP_I400)
226
-        {
227
-            uChar = (uint8_t*)pic->planes1;
228
-            vChar = (uint8_t*)pic->planes2;
229
-            primitives.planecopy_cp(uChar, pic->stride1 / sizeof(*uChar), m_inputPic1, pic->stride1 / sizeof(*uChar), widthC, heightC, shift);
230
-            primitives.planecopy_cp(vChar, pic->stride2 / sizeof(*vChar), m_inputPic2, pic->stride2 / sizeof(*vChar), widthC, heightC, shift);
231
-            planeU = m_inputPic1;
232
-            planeV = m_inputPic2;
233
-        }
234
-    }
235
-    else
236
-    {
237
-        uint16_t *yShort, *uShort, *vShort;
238
-        /* mask off bits that are supposed to be zero */
239
-        uint16_t mask = (1 << X265_DEPTH) - 1;
240
-        int shift = abs(pic->bitDepth - X265_DEPTH);
241
-
242
-        yShort = (uint16_t*)pic->planes0;
243
-        uShort = (uint16_t*)pic->planes1;
244
-        vShort = (uint16_t*)pic->planes2;
245
-
246
-        if (pic->bitDepth > X265_DEPTH)
247
-        {
248
-            /* shift right and mask pixels to final size */
249
-            primitives.planecopy_sp(yShort, pic->stride0 / sizeof(*yShort), m_inputPic0, pic->stride0 / sizeof(*yShort), pic->width, pic->height, shift, mask);
250
-            if (m_param->internalCsp != X265_CSP_I400)
251
-            {
252
-                primitives.planecopy_sp(uShort, pic->stride1 / sizeof(*uShort), m_inputPic1, pic->stride1 / sizeof(*uShort), widthC, heightC, shift, mask);
253
-                primitives.planecopy_sp(vShort, pic->stride2 / sizeof(*vShort), m_inputPic2, pic->stride2 / sizeof(*vShort), widthC, heightC, shift, mask);
254
-            }
255
-        }
256
-        else /* Case for (pic.bitDepth < X265_DEPTH) */
257
-        {
258
-            /* shift left and mask pixels to final size */
259
-            primitives.planecopy_sp_shl(yShort, pic->stride0 / sizeof(*yShort), m_inputPic0, pic->stride0 / sizeof(*yShort), pic->width, pic->height, shift, mask);
260
-            if (m_param->internalCsp != X265_CSP_I400)
261
-            {
262
-                primitives.planecopy_sp_shl(uShort, pic->stride1 / sizeof(*uShort), m_inputPic1, pic->stride1 / sizeof(*uShort), widthC, heightC, shift, mask);
263
-                primitives.planecopy_sp_shl(vShort, pic->stride2 / sizeof(*vShort), m_inputPic2, pic->stride2 / sizeof(*vShort), widthC, heightC, shift, mask);
264
-            }
265
-        }
266
-
267
-        src = m_inputPic0;
268
-        planeU = m_inputPic1;
269
-        planeV = m_inputPic2;
270
-    }
271
-
272
-    size_t bufSize = sizeof(pixel) * m_planeSizes0;
273
-    memset(m_edgePic, 0, bufSize);
274
-
275
-    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false, 1))
276
-    {
277
-        x265_log(m_param, X265_LOG_ERROR, "Failed to compute edge!");
278
-        return false;
279
-    }
280
-
281
-    pixel pixelVal;
282
-    int32_t *edgeHist = m_curEdgeHist;
283
-    memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t));
284
-    for (uint32_t i = 0; i < m_planeSizes0; i++)
285
-    {
286
-        if (m_edgePici)
287
-            edgeHist1++;
288
-        else
289
-            edgeHist0++;
290
-    }
291
-
292
-    /* Y Histogram Calculation */
293
-    int32_t *yHist = m_curYUVHist0;
294
-    memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t));
295
-    for (uint32_t i = 0; i < m_planeSizes0; i++)
296
-    {
297
-        pixelVal = srci;
298
-        yHistpixelVal++;
299
+    case 1: newSliceType |= 1 << 0;
300
+        break;
301
+    case 2: newSliceType |= 1 << 0;
302
+        break;
303
+    case 3: newSliceType |= 1 << 1;
304
+        break;
305
+    case 4: newSliceType |= 1 << 2;
306
+        break;
307
+    case 5: newSliceType |= 1 << 3;
308
+        break;
309
+    default: return 0;
310
     }
311
+    return ((sliceTypeConfig & newSliceType) != 0);
312
+}
313
 
314
-    if (pic->colorSpace != X265_CSP_I400)
315
-    {
316
-        /* U Histogram Calculation */
317
-        int32_t *uHist = m_curYUVHist1;
318
-        memset(uHist, 0, sizeof(m_curYUVHist1));
319
-        for (uint32_t i = 0; i < m_planeSizes1; i++)
320
-        {
321
-            pixelVal = planeUi;
322
-            uHistpixelVal++;
323
-        }
324
+inline int enqueueRefFrame(FrameEncoder* curframeEncoder, Frame* iterFrame, Frame* curFrame, bool isPreFiltered, int16_t i)
325
+{
326
+    TemporalFilterRefPicInfo* dest = &curframeEncoder->m_mcstfRefList[curFrame->m_mcstf->m_numRef];
327
+    dest->picBuffer = iterFrame->m_fencPic;
328
+    dest->picBufferSubSampled2 = iterFrame->m_fencPicSubsampled2;
329
+    dest->picBufferSubSampled4 = iterFrame->m_fencPicSubsampled4;
330
+    dest->isFilteredFrame = isPreFiltered;
331
+    dest->isSubsampled = iterFrame->m_isSubSampled;
332
+    dest->origOffset = i;
333
+    curFrame->m_mcstf->m_numRef++;
334
 
335
-        /* V Histogram Calculation */
336
-        pixelVal = 0;
337
-        int32_t *vHist = m_curYUVHist2;
338
-        memset(vHist, 0, sizeof(m_curYUVHist2));
339
-        for (uint32_t i = 0; i < m_planeSizes2; i++)
340
-        {
341
-            pixelVal = planeVi;
342
-            vHistpixelVal++;
343
-        }
344
-    }
345
-    return true;
346
+    return 1;
347
 }
348
 
349
-void Encoder::computeHistogramSAD(double *normalizedMaxUVSad, double *normalizedEdgeSad, int curPoc)
350
+bool Encoder::generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder)
351
 {
352
+    frameEnc->m_mcstf->m_numRef = 0;
353
 
354
-    if (curPoc == 0)
355
-    {   /* first frame is scenecut by default no sad computation for the same. */
356
-        *normalizedMaxUVSad = 0.0;
357
-        *normalizedEdgeSad = 0.0;
358
-    }
359
-    else
360
+    for (int iterPOC = (frameEnc->m_poc - frameEnc->m_mcstf->m_range);
361
+        iterPOC <= (frameEnc->m_poc + frameEnc->m_mcstf->m_range); iterPOC++)
362
     {
363
-        /* compute sum of absolute differences of histogram bins of chroma and luma edge response between the current and prev pictures. */
364
-        int32_t edgeHistSad = 0;
365
-        int32_t uHistSad = 0;
366
-        int32_t vHistSad = 0;
367
-        double normalizedUSad = 0.0;
368
-        double normalizedVSad = 0.0;
369
-
370
-        for (int j = 0; j < HISTOGRAM_BINS; j++)
371
+        bool isFound = false;
372
+        if (iterPOC != frameEnc->m_poc)
373
         {
374
-            if (j < 2)
375
+            //search for the reference frame in the Original Picture Buffer
376
+            if (!isFound)
377
             {
378
-                edgeHistSad += abs(m_curEdgeHistj - m_prevEdgeHistj);
379
-            }
380
-            uHistSad += abs(m_curYUVHist1j - m_prevYUVHist1j);
381
-            vHistSad += abs(m_curYUVHist2j - m_prevYUVHist2j);
382
-        }
383
-        *normalizedEdgeSad = normalizeRange(edgeHistSad, 0, 2 * m_planeSizes0, 0.0, 1.0);
384
-        normalizedUSad = normalizeRange(uHistSad, 0, 2 * m_planeSizes1, 0.0, 1.0);
385
-        normalizedVSad = normalizeRange(vHistSad, 0, 2 * m_planeSizes2, 0.0, 1.0);
386
-        *normalizedMaxUVSad = x265_max(normalizedUSad, normalizedVSad);
387
-    }
388
-
389
-    /* store histograms of previous frame for reference */
390
-    memcpy(m_prevEdgeHist, m_curEdgeHist, sizeof(m_curEdgeHist));
391
-    memcpy(m_prevYUVHist, m_curYUVHist, sizeof(m_curYUVHist));
392
-}
393
+                for (int j = 0; j < (2 * frameEnc->m_mcstf->m_range); j++)
394
+                {
395
+                    if (iterPOC < 0)
396
+                        continue;
397
+                    if (iterPOC >= m_pocLast)
398
+                    {
399
 
400
-double Encoder::normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd)
401
-{
402
-    return (double)(value - minValue) * (rangeEnd - rangeStart) / (maxValue - minValue) + rangeStart;
403
-}
404
+                        TemporalFilter* mcstf = frameEnc->m_mcstf;
405
+                        while (mcstf->m_numRef)
406
+                        {
407
+                            memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs0,  0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
408
+                            memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs1,  0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
409
+                            memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs2,  0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
410
+                            memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs,   0, sizeof(MV) * ((mcstf->m_sourceWidth /  4) * (mcstf->m_sourceHeight /  4)));
411
+                            memset(currEncoder->m_mcstfRefListmcstf->m_numRef.noise, 0, sizeof(int) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
412
+                            memset(currEncoder->m_mcstfRefListmcstf->m_numRef.error, 0, sizeof(int) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
413
 
414
-void Encoder::findSceneCuts(x265_picture *pic, bool& bDup, double maxUVSad, double edgeSad, bool& isMaxThres, bool& isHardSC)
415
-{
416
-    double minEdgeT = m_edgeHistThreshold * MIN_EDGE_FACTOR;
417
-    double minChromaT = minEdgeT * SCENECUT_CHROMA_FACTOR;
418
-    double maxEdgeT = m_edgeHistThreshold * MAX_EDGE_FACTOR;
419
-    double maxChromaT = maxEdgeT * SCENECUT_CHROMA_FACTOR;
420
-    pic->frameData.bScenecut = false;
421
+                            mcstf->m_numRef--;
422
+                        }
423
 
424
-    if (pic->poc == 0)
425
-    {
426
-        /* for first frame */
427
-        pic->frameData.bScenecut = false;
428
-        bDup = false;
429
-    }
430
-    else
431
-    {
432
-        if (edgeSad == 0.0 && maxUVSad == 0.0)
433
-        {
434
-            bDup = true;
435
-        }
436
-        else if (edgeSad < minEdgeT && maxUVSad < minChromaT)
437
-        {
438
-            pic->frameData.bScenecut = false;
439
-        }
440
-        else if (edgeSad > maxEdgeT && maxUVSad > maxChromaT)
441
-        {
442
-            pic->frameData.bScenecut = true;
443
-            isMaxThres = true;
444
-            isHardSC = true;
445
-        }
446
-        else if (edgeSad > m_scaledEdgeThreshold || maxUVSad >= m_scaledChromaThreshold
447
-                 || (edgeSad > m_edgeHistThreshold && maxUVSad >= m_chromaHistThreshold))
448
-        {
449
-            pic->frameData.bScenecut = true;
450
-            bDup = false;
451
-            if (edgeSad > m_scaledEdgeThreshold || maxUVSad >= m_scaledChromaThreshold)
452
-                isHardSC = true;
453
+                        break;
454
+                    }
455
+                    Frame* iterFrame = frameEnc->m_encData->m_slice->m_mcstfRefFrameList[1][j];
456
+                    if (iterFrame->m_poc == iterPOC)
457
+                    {
458
+                        if (!enqueueRefFrame(currEncoder, iterFrame, frameEnc, false, (int16_t)(iterPOC - frameEnc->m_poc)))
459
+                        {
460
+                            return false;
461
+                        };
462
+                        break;
463
+                    }
464
+                }
465
+            }
466
         }
467
     }
468
+
469
+    return true;
470
 }
471
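generateMcstfRef gathers the original pictures within +/- m_range POCs of the current frame so the motion-compensated temporal filter can average across them. A sketch of just the window selection, with the same clipping at the sequence boundaries (simplified; no frame buffers or motion fields here):

#include <vector>

std::vector<int> mcstfReferenceWindow(int poc, int range, int totalFrames)
{
    std::vector<int> refs;
    for (int p = poc - range; p <= poc + range; p++)
    {
        if (p == poc || p < 0)
            continue;                    // skip the current picture and pre-stream POCs
        if (totalFrames && p >= totalFrames)
            break;                       // clipped at the end of the sequence
        refs.push_back(p);
    }
    return refs;
}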
 
472
 /**
473
@@ -1595,40 +1473,24 @@
474
     const x265_picture* inputPic = NULL;
475
     static int written = 0, read = 0;
476
     bool dontRead = false;
477
-    bool bdropFrame = false;
478
     bool dropflag = false;
479
-    bool isMaxThres = false;
480
-    bool isHardSC = false;
481
 
482
     if (m_exportedPic)
483
     {
484
         if (!m_param->bUseAnalysisFile && m_param->analysisSave)
485
             x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
486
+
487
         ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
488
+
489
         m_exportedPic = NULL;
490
         m_dpb->recycleUnreferenced();
491
+
492
+        if (m_param->bEnableTemporalFilter)
493
+            m_origPicBuffer->recycleOrigPicList();
494
     }
495
+
496
     if ((pic_in && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd))) || (m_param->bEnableFrameDuplication && !pic_in && (read < written)))
497
     {
498
-        if (m_param->bHistBasedSceneCut && pic_in)
499
-        {
500
-            x265_picture *pic = (x265_picture *) pic_in;
501
-
502
-            if (pic->poc == 0)
503
-            {
504
-                /* for entire encode compute the chroma plane sizes only once */
505
-                for (int i = 1; i < x265_cli_cspsm_param->internalCsp.planes; i++)
506
-                    m_planeSizesi = (pic->width >> x265_cli_cspsm_param->internalCsp.widthi) * (pic->height >> x265_cli_cspsm_param->internalCsp.heighti);
507
-            }
508
-
509
-            if (computeHistograms(pic))
510
-            {
511
-                double maxUVSad = 0.0, edgeSad = 0.0;
512
-                computeHistogramSAD(&maxUVSad, &edgeSad, pic_in->poc);
513
-                findSceneCuts(pic, bdropFrame, maxUVSad, edgeSad, isMaxThres, isHardSC);
514
-            }
515
-        }
516
-
517
         if ((m_param->bEnableFrameDuplication && !pic_in && (read < written)))
518
             dontRead = true;
519
         else
520
@@ -1672,20 +1534,7 @@
521
                     written++;
522
                 }
523
 
524
-                if (m_param->bEnableFrameDuplication && m_param->bHistBasedSceneCut)
525
-                {
526
-                    if (!bdropFrame && m_dupBuffer1->dupPic->frameData.bScenecut == false)
527
-                    {
528
-                        psnrWeight = ComputePSNR(m_dupBuffer0->dupPic, m_dupBuffer1->dupPic, m_param);
529
-                        if (psnrWeight >= m_param->dupThreshold)
530
-                            dropflag = true;
531
-                    }
532
-                    else
533
-                    {
534
-                        dropflag = true;
535
-                    }
536
-                }
537
-                else if (m_param->bEnableFrameDuplication)
538
+                if (m_param->bEnableFrameDuplication)
539
                 {
540
                     psnrWeight = ComputePSNR(m_dupBuffer0->dupPic, m_dupBuffer1->dupPic, m_param);
541
                     if (psnrWeight >= m_param->dupThreshold)
542
@@ -1768,12 +1617,6 @@
543
                         }
544
                     }
545
                 }
546
-                if (m_param->recursionSkipMode == EDGE_BASED_RSKIP && m_param->bHistBasedSceneCut)
547
-                {
548
-                    pixel* src = m_edgePic;
549
-                    primitives.planecopy_pp_shr(src, inFrame->m_fencPic->m_picWidth, inFrame->m_edgeBitPic, inFrame->m_fencPic->m_stride,
550
-                        inFrame->m_fencPic->m_picWidth, inFrame->m_fencPic->m_picHeight, 0);
551
-                }
552
             }
553
             else
554
             {
555
@@ -1794,6 +1637,8 @@
556
             inFrame->m_lowres.satdCost = (int64_t)-1;
557
             inFrame->m_lowresInit = false;
558
             inFrame->m_isInsideWindow = 0;
559
+            inFrame->m_tempLayer = 0;
560
+            inFrame->m_sameLayerRefPic = 0;
561
         }
562
 
563
         /* Copy input picture into a Frame and PicYuv, send to lookahead */
564
@@ -1802,13 +1647,6 @@
565
         inFrame->m_poc       = ++m_pocLast;
566
         inFrame->m_userData  = inputPic->userData;
567
         inFrame->m_pts       = inputPic->pts;
568
-        if (m_param->bHistBasedSceneCut)
569
-        {
570
-            inFrame->m_lowres.bScenecut = (inputPic->frameData.bScenecut == 1) ? true : false;
571
-            inFrame->m_lowres.m_bIsMaxThres = isMaxThres;
572
-            if (m_param->radl && m_param->keyframeMax != m_param->keyframeMin)
573
-                inFrame->m_lowres.m_bIsHardScenecut = isHardSC;
574
-        }
575
 
576
         if ((m_param->bEnableSceneCutAwareQp & BACKWARD) && m_param->rc.bStatRead)
577
         {
578
@@ -1816,7 +1654,7 @@
579
             rcEntry = &(m_rateControl->m_rce2PassinFrame->m_poc);
580
             if(rcEntry->scenecut)
581
             {
582
-                int backwardWindow = X265_MIN(int((m_param->bwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom)), p->lookaheadDepth);
583
+                int backwardWindow = X265_MIN(int((m_param->bwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom)), p->lookaheadDepth);
584
                 for (int i = 1; i <= backwardWindow; i++)
585
                 {
586
                     int frameNum = inFrame->m_poc - i;
587
@@ -1826,16 +1664,7 @@
588
                 }
589
             }
590
         }
591
-        if (m_param->bHistBasedSceneCut && m_param->analysisSave)
592
-        {
593
-            memcpy(inFrame->m_analysisData.edgeHist, m_curEdgeHist, EDGE_BINS * sizeof(int32_t));
594
-            memcpy(inFrame->m_analysisData.yuvHist[0], m_curYUVHist[0], HISTOGRAM_BINS *sizeof(int32_t));
595
-            if (inputPic->colorSpace != X265_CSP_I400)
596
-            {
597
-                memcpy(inFrame->m_analysisData.yuvHist[1], m_curYUVHist[1], HISTOGRAM_BINS * sizeof(int32_t));
598
-                memcpy(inFrame->m_analysisData.yuvHist[2], m_curYUVHist[2], HISTOGRAM_BINS * sizeof(int32_t));
599
-            }
600
-        }
601
+
602
         inFrame->m_forceqp   = inputPic->forceqp;
603
         inFrame->m_param     = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
604
         inFrame->m_picStruct = inputPic->picStruct;
605
@@ -1881,7 +1710,8 @@
606
         }
607
 
608
         /* Use the frame types from the first pass, if available */
609
-        int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : inputPic->sliceType;
610
+        int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : X265_TYPE_AUTO;
611
+        inFrame->m_lowres.sliceTypeReq = inputPic->sliceType;
612
 
613
         /* In analysisSave mode, x265_analysis_data is allocated in inputPic and inFrame points to this */
614
         /* Load analysis data before lookahead->addPicture, since sliceType has been decided */
615
@@ -1977,6 +1807,59 @@
616
         if (m_reconfigureRc)
617
             inFrame->m_reconfigureRc = true;
618
 
619
+        if (m_param->bEnableTemporalFilter)
620
+        {
621
+            if (!m_pocLast)
622
+            {
623
+                /*One shot allocation of frames in OriginalPictureBuffer*/
624
+                int numFramesinOPB = X265_MAX(m_param->bframes, (inFrame->m_mcstf->m_range << 1)) + 1;
625
+                for (int i = 0; i < numFramesinOPB; i++)
626
+                {
627
+                    Frame* dupFrame = new Frame;
628
+                    if (!(dupFrame->create(m_param, pic_in->quantOffsets)))
629
+                    {
630
+                        m_aborted = true;
631
+                        x265_log(m_param, X265_LOG_ERROR, "Memory allocation failure, aborting encode\n");
632
+                        fflush(stderr);
633
+                        dupFrame->destroy();
634
+                        delete dupFrame;
635
+                        return -1;
636
+                    }
637
+                    else
638
+                    {
639
+                        if (m_sps.cuOffsetY)
640
+                        {
641
+                            dupFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
642
+                            dupFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
643
+                            dupFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
644
+                            dupFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
645
+                            if (m_param->internalCsp != X265_CSP_I400)
646
+                            {
647
+                                dupFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
648
+                                dupFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
649
+                            }
650
+                            m_origPicBuffer->addEncPicture(dupFrame);
651
+                        }
652
+                    }
653
+                }
654
+            }
655
+
656
+            inFrame->m_refPicCnt1 = 2 * inFrame->m_mcstf->m_range + 1;
657
+            if (inFrame->m_poc < inFrame->m_mcstf->m_range)
658
+                inFrame->m_refPicCnt1 -= (uint8_t)(inFrame->m_mcstf->m_range - inFrame->m_poc);
659
+            if (m_param->totalFrames && (inFrame->m_poc >= (m_param->totalFrames - inFrame->m_mcstf->m_range)))
660
+                inFrame->m_refPicCnt1 -= (uint8_t)(inFrame->m_poc + inFrame->m_mcstf->m_range - m_param->totalFrames + 1);
661
+
662
+            //Extend full-res original picture border
663
+            PicYuv *orig = inFrame->m_fencPic;
664
+            extendPicBorder(orig->m_picOrg[0], orig->m_stride, orig->m_picWidth, orig->m_picHeight, orig->m_lumaMarginX, orig->m_lumaMarginY);
665
+            extendPicBorder(orig->m_picOrg[1], orig->m_strideC, orig->m_picWidth >> orig->m_hChromaShift, orig->m_picHeight >> orig->m_vChromaShift, orig->m_chromaMarginX, orig->m_chromaMarginY);
666
+            extendPicBorder(orig->m_picOrg[2], orig->m_strideC, orig->m_picWidth >> orig->m_hChromaShift, orig->m_picHeight >> orig->m_vChromaShift, orig->m_chromaMarginX, orig->m_chromaMarginY);
667
+
668
+            //TODO: Add subsampling here if required
669
+            m_origPicBuffer->addPicture(inFrame);
670
+        }
671
+
672
         m_lookahead->addPicture(*inFrame, sliceType);
673
         m_numDelayedPic++;
674
     }
675
@@ -2019,6 +1902,7 @@
676
                 pic_out->bitDepth = X265_DEPTH;
677
                 pic_out->userData = outFrame->m_userData;
678
                 pic_out->colorSpace = m_param->internalCsp;
679
+                pic_out->frameData.tLayer = outFrame->m_tempLayer;
680
                 frameData = &(pic_out->frameData);
681
 
682
                 pic_out->pts = outFrame->m_pts;
683
@@ -2041,16 +1925,6 @@
684
                     pic_out->analysisData.poc = pic_out->poc;
685
                     pic_out->analysisData.sliceType = pic_out->sliceType;
686
                     pic_out->analysisData.bScenecut = outFrame->m_lowres.bScenecut;
687
-                    if (m_param->bHistBasedSceneCut)
688
-                    {
689
-                        memcpy(pic_out->analysisData.edgeHist, outFrame->m_analysisData.edgeHist, EDGE_BINS * sizeof(int32_t));
690
-                        memcpy(pic_out->analysisData.yuvHist[0], outFrame->m_analysisData.yuvHist[0], HISTOGRAM_BINS * sizeof(int32_t));
691
-                        if (pic_out->colorSpace != X265_CSP_I400)
692
-                        {
693
-                            memcpy(pic_out->analysisData.yuvHist[1], outFrame->m_analysisData.yuvHist[1], HISTOGRAM_BINS * sizeof(int32_t));
694
-                            memcpy(pic_out->analysisData.yuvHist[2], outFrame->m_analysisData.yuvHist[2], HISTOGRAM_BINS * sizeof(int32_t));
695
-                        }
696
-                    }
697
                     pic_out->analysisData.satdCost  = outFrame->m_lowres.satdCost;
698
                     pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
699
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
700
@@ -2198,7 +2072,7 @@
701
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
702
                     m_aborted = true;
703
             if (pic_out)
704
-            { 
705
+            {
706
                 /* m_rcData is allocated for every frame */
707
                 pic_out->rcData = outFrame->m_rcData;
708
                 outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
709
@@ -2216,6 +2090,18 @@
710
                 outFrame->m_rcData->iCuCount = outFrame->m_encData->m_frameStats.percent8x8Intra * m_rateControl->m_ncu;
711
                 outFrame->m_rcData->pCuCount = outFrame->m_encData->m_frameStats.percent8x8Inter * m_rateControl->m_ncu;
712
                 outFrame->m_rcData->skipCuCount = outFrame->m_encData->m_frameStats.percent8x8Skip  * m_rateControl->m_ncu;
713
+                outFrame->m_rcData->currentSatd = curEncoder->m_rce.coeffBits;
714
+            }
715
+
716
+            if (m_param->bEnableTemporalFilter)
717
+            {
718
+                Frame *curFrame = m_origPicBuffer->m_mcstfPicList.getPOCMCSTF(outFrame->m_poc);
719
+                X265_CHECK(curFrame, "Outframe not found in DPB's mcstfPicList");
720
+                curFrame->m_refPicCnt0--;
721
+                curFrame->m_refPicCnt1--;
722
+                curFrame = m_origPicBuffer->m_mcstfOrigPicList.getPOCMCSTF(outFrame->m_poc);
723
+                X265_CHECK(curFrame, "Outframe not found in OPB's mcstfOrigPicList");
724
+                curFrame->m_refPicCnt1--;
725
             }
726
 
727
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
728
@@ -2223,6 +2109,8 @@
729
             {
730
                 ATOMIC_DEC(&outFrame->m_countRefEncoders);
731
                 m_dpb->recycleUnreferenced();
732
+                if (m_param->bEnableTemporalFilter)
733
+                    m_origPicBuffer->recycleOrigPicList();
734
             }
735
             else
736
                 m_exportedPic = outFrame;
737
@@ -2253,7 +2141,7 @@
738
                         m_rateControl->m_lastScenecut = frameEnc->m_poc;
739
                     else
740
                     {
741
-                        int maxWindowSize = int((m_param->fwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
742
+                        int maxWindowSize = int((m_param->fwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
743
                         if (frameEnc->m_poc > (m_rateControl->m_lastScenecut + maxWindowSize))
744
                             m_rateControl->m_lastScenecut = frameEnc->m_poc;
745
                     }
746
@@ -2422,8 +2310,36 @@
747
                 analysis->numPartitions  = m_param->num4x4Partitions;
748
                 x265_alloc_analysis_data(m_param, analysis);
749
             }
750
+            if (m_param->bEnableTemporalSubLayers > 2)
751
+            {
752
+                //Re-assign temporalid if the current frame is at the end of encode or when I slice is encountered
753
+                if ((frameEnc->m_poc == (m_param->totalFrames - 1)) || (frameEnc->m_lowres.sliceType == X265_TYPE_I) || (frameEnc->m_lowres.sliceType == X265_TYPE_IDR))
754
+                {
755
+                    frameEnc->m_tempLayer = (int8_t)0;
756
+                }
757
+            }
758
             /* determine references, setup RPS, etc */
759
             m_dpb->prepareEncode(frameEnc);
760
+
761
+            if (m_param->bEnableTemporalFilter)
762
+            {
763
+                X265_CHECK(!m_origPicBuffer->m_mcstfOrigPicFreeList.empty(), "Frames not available in Encoded OPB");
764
+
765
+                Frame *dupFrame = m_origPicBuffer->m_mcstfOrigPicFreeList.popBackMCSTF();
766
+                dupFrame->m_fencPic->copyFromFrame(frameEnc->m_fencPic);
767
+                dupFrame->m_poc = frameEnc->m_poc;
768
+                dupFrame->m_encodeOrder = frameEnc->m_encodeOrder;
769
+                dupFrame->m_refPicCnt1 = 2 * dupFrame->m_mcstf->m_range + 1;
770
+
771
+                if (dupFrame->m_poc < dupFrame->m_mcstf->m_range)
772
+                    dupFrame->m_refPicCnt1 -= (uint8_t)(dupFrame->m_mcstf->m_range - dupFrame->m_poc);
773
+                if (m_param->totalFrames && (dupFrame->m_poc >= (m_param->totalFrames - dupFrame->m_mcstf->m_range)))
774
+                    dupFrame->m_refPicCnt1 -= (uint8_t)(dupFrame->m_poc + dupFrame->m_mcstf->m_range - m_param->totalFrames + 1);
775
+
776
+                m_origPicBuffer->addEncPictureToPicList(dupFrame);
777
+                m_origPicBuffer->setOrigPicList(frameEnc, m_pocLast);
778
+            }
779
+
780
             if (!!m_param->selectiveSAO)
781
             {
782
                 Slice* slice = frameEnc->m_encData->m_slice;
783
@@ -2449,9 +2365,72 @@
784
 
785
             if (m_param->rc.rateControlMode != X265_RC_CQP)
786
                 m_lookahead->getEstimatedPictureCost(frameEnc);
787
+
788
             if (m_param->bIntraRefresh)
789
                  calcRefreshInterval(frameEnc);
790
 
791
+            // Generate MCSTF References and perform HME
792
+            if (m_param->bEnableTemporalFilter && isFilterThisframe(frameEnc->m_mcstf->m_sliceTypeConfig, frameEnc->m_lowres.sliceType))
793
+            {
794
+
795
+                if (!generateMcstfRef(frameEnc, curEncoder))
796
+                {
797
+                    m_aborted = true;
798
+                    x265_log(m_param, X265_LOG_ERROR, "Failed to initialize MCSTFReferencePicInfo at POC %d\n", frameEnc->m_poc);
799
+                    fflush(stderr);
800
+                    return -1;
801
+                }
802
+
803
+
804
+                if (!*frameEnc->m_isSubSampled)
805
+                {
806
+                    primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPic->m_picOrg[0],frameEnc->m_fencPicSubsampled2->m_picOrg[0], frameEnc->m_fencPic->m_stride, frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight);
807
+                    extendPicBorder(frameEnc->m_fencPicSubsampled2->m_picOrg[0], frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight, frameEnc->m_fencPicSubsampled2->m_lumaMarginX, frameEnc->m_fencPicSubsampled2->m_lumaMarginY);
808
+                    primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPicSubsampled2->m_picOrg[0],frameEnc->m_fencPicSubsampled4->m_picOrg[0], frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight);
809
+                    extendPicBorder(frameEnc->m_fencPicSubsampled4->m_picOrg[0], frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight, frameEnc->m_fencPicSubsampled4->m_lumaMarginX, frameEnc->m_fencPicSubsampled4->m_lumaMarginY);
810
+                    *frameEnc->m_isSubSampled = true;
811
+                }
812
+
813
+                for (uint8_t i = 1; i <= frameEnc->m_mcstf->m_numRef; i++)
814
+                {
815
+                    TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i - 1];
816
+                    if (!*ref->isSubsampled)
817
+                    {
818
+                        primitives.frameSubSampleLuma((const pixel *)ref->picBuffer->m_picOrg[0], ref->picBufferSubSampled2->m_picOrg[0], ref->picBuffer->m_stride, ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight);
819
+                        extendPicBorder(ref->picBufferSubSampled2->m_picOrg[0], ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight, ref->picBufferSubSampled2->m_lumaMarginX, ref->picBufferSubSampled2->m_lumaMarginY);
820
+                        primitives.frameSubSampleLuma((const pixel *)ref->picBufferSubSampled2->m_picOrg[0],ref->picBufferSubSampled4->m_picOrg[0], ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight);
821
+                        extendPicBorder(ref->picBufferSubSampled4->m_picOrg[0], ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight, ref->picBufferSubSampled4->m_lumaMarginX, ref->picBufferSubSampled4->m_lumaMarginY);
822
+                        *ref->isSubsampled = true;
823
+                    }
824
+                }
825
+
826
+                for (uint8_t i = 1; i <= frameEnc->m_mcstf->m_numRef; i++)
827
+                {
828
+                    TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i - 1];
829
+
830
+                    curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs0, ref->mvsStride0, frameEnc->m_fencPicSubsampled4, ref->picBufferSubSampled4, 16);
831
+                    curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs1, ref->mvsStride1, frameEnc->m_fencPicSubsampled2, ref->picBufferSubSampled2, 16, ref->mvs0, ref->mvsStride0, 2);
832
+                    curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs2, ref->mvsStride2, frameEnc->m_fencPic, ref->picBuffer, 16, ref->mvs1, ref->mvsStride1, 2);
833
+                    curEncoder->m_frameEncTF->motionEstimationLumaDoubleRes(ref->mvs,  ref->mvsStride, frameEnc->m_fencPic, ref->picBuffer, 8, ref->mvs2, ref->mvsStride2, 1, ref->error);
834
+                }
835
+
836
+                for (int i = 0; i < frameEnc->m_mcstf->m_numRef; i++)
837
+                {
838
+                    TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i];
839
+                    ref->slicetype = m_lookahead->findSliceType(frameEnc->m_poc + ref->origOffset);
840
+                    Frame* dpbframePtr = m_dpb->m_picList.getPOC(frameEnc->m_poc + ref->origOffset);
841
+                    if (dpbframePtr != NULL)
842
+                    {
843
+                        if (dpbframePtr->m_encData->m_slice->m_sliceType == B_SLICE)
844
+                            ref->slicetype = X265_TYPE_B;
845
+                        else if (dpbframePtr->m_encData->m_slice->m_sliceType == P_SLICE)
846
+                            ref->slicetype = X265_TYPE_P;
847
+                        else
848
+                            ref->slicetype = X265_TYPE_I;
849
+                    }
850
+                }
851
+            }
852
+
853
             /* Allow FrameEncoder::compressFrame() to start in the frame encoder thread */
854
             if (!curEncoder->startCompressFrame(frameEnc))
855
                 m_aborted = true;
856
@@ -2523,7 +2502,11 @@
857
         encParam->dynamicRd = param->dynamicRd;
858
         encParam->bEnableTransformSkip = param->bEnableTransformSkip;
859
         encParam->bEnableAMP = param->bEnableAMP;
860
-
861
+        if (param->confWinBottomOffset == 0 && param->confWinRightOffset == 0)
862
+        {
863
+            encParam->confWinBottomOffset = param->confWinBottomOffset;
864
+            encParam->confWinRightOffset = param->confWinRightOffset;
865
+        }
866
         /* Resignal changes in params in Parameter Sets */
867
         m_sps.maxAMPDepth = (m_sps.bUseAMP = param->bEnableAMP && param->bEnableAMP) ? param->maxCUDepth : 0;
868
         m_pps.bTransformSkipEnabled = param->bEnableTransformSkip ? 1 : 0;
869
@@ -2729,18 +2712,7 @@
870
             (float)100.0 * m_numLumaWPBiFrames / m_analyzeB.m_numPics,
871
             (float)100.0 * m_numChromaWPBiFrames / m_analyzeB.m_numPics);
872
     }
873
-    int pWithB = 0;
874
-    for (int i = 0; i <= m_param->bframes; i++)
875
-        pWithB += m_lookahead->m_histogram[i];
876
 
877
-    if (pWithB)
878
-    {
879
-        int p = 0;
880
-        for (int i = 0; i <= m_param->bframes; i++)
881
-            p += sprintf(buffer + p, "%.1f%% ", 100. * m_lookahead->m_histogram[i] / pWithB);
882
-
883
-        x265_log(m_param, X265_LOG_INFO, "consecutive B-frames: %s\n", buffer);
884
-    }
885
     if (m_param->bLossless)
886
     {
887
         float frameSize = (float)(m_param->sourceWidth - m_sps.conformanceWindow.rightOffset) *
888
@@ -3341,6 +3313,19 @@
889
     }
890
 }
891
 
892
+void Encoder::getEndNalUnits(NALList& list, Bitstream& bs)
893
+{
894
+    NALList nalList;
895
+    bs.resetBits();
896
+
897
+    if (m_param->bEnableEndOfSequence)
898
+        nalList.serialize(NAL_UNIT_EOS, bs);
899
+    if (m_param->bEnableEndOfBitstream)
900
+        nalList.serialize(NAL_UNIT_EOB, bs);
901
+
902
+    list.takeContents(nalList);
903
+}
904
+
905
 void Encoder::initVPS(VPS *vps)
906
 {
907
     /* Note that much of the VPS is initialized by determineLevel() */
908
@@ -3375,10 +3360,14 @@
909
     sps->bUseAMP = m_param->bEnableAMP;
910
     sps->maxAMPDepth = m_param->bEnableAMP ? m_param->maxCUDepth : 0;
911
 
912
-    sps->maxTempSubLayers = m_param->bEnableTemporalSubLayers ? 2 : 1;
913
-    sps->maxDecPicBuffering = m_vps.maxDecPicBuffering;
914
-    sps->numReorderPics = m_vps.numReorderPics;
915
-    sps->maxLatencyIncrease = m_vps.maxLatencyIncrease = m_param->bframes;
916
+    sps->maxTempSubLayers = m_vps.maxTempSubLayers;// Getting the value from the user
917
+
918
+    for(uint8_t i = 0; i < sps->maxTempSubLayers; i++)
919
+    {
920
+        sps->maxDecPicBuffering[i] = m_vps.maxDecPicBuffering[i];
921
+        sps->numReorderPics[i] = m_vps.numReorderPics[i];
922
+        sps->maxLatencyIncrease[i] = m_vps.maxLatencyIncrease[i] = m_param->bframes;
923
+    }
924
 
925
     sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing;
926
     sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp;
927
@@ -3518,6 +3507,11 @@
928
             p->rc.aqMode = X265_AQ_NONE;
929
             p->rc.hevcAq = 0;
930
         }
931
+        if (p->rc.aqMode == 0 && p->rc.cuTree)
932
+        {
933
+            p->rc.aqMode = X265_AQ_VARIANCE;
934
+            p->rc.aqStrength = 0;
935
+        }
936
         p->radl = zone->radl;
937
     }
938
     memcpy(zone, p, sizeof(x265_param));
939
@@ -3548,6 +3542,65 @@
940
         p->crQpOffset = 3;
941
 }
942
 
943
+void Encoder::configureVideoSignalTypePreset(x265_param* p)
944
+{
945
+    char systemId[20] = {};
946
+    char colorVolume[20] = {};
947
+    sscanf(p->videoSignalTypePreset, "%[^:]:%s", systemId, colorVolume);
948
+    uint32_t sysId = 0;
949
+    while (strcmp(vstPresets[sysId].systemId, systemId))
950
+    {
951
+        if (sysId + 1 == sizeof(vstPresets) / sizeof(vstPresets[0]))
952
+        {
953
+            x265_log(NULL, X265_LOG_ERROR, "Incorrect system-id, aborting\n");
954
+            m_aborted = true;
955
+            break;
956
+        }
957
+        sysId++;
958
+    }
959
+
960
+    p->vui.bEnableVideoSignalTypePresentFlag = vstPresets[sysId].bEnableVideoSignalTypePresentFlag;
961
+    p->vui.bEnableColorDescriptionPresentFlag = vstPresets[sysId].bEnableColorDescriptionPresentFlag;
962
+    p->vui.bEnableChromaLocInfoPresentFlag = vstPresets[sysId].bEnableChromaLocInfoPresentFlag;
963
+    p->vui.colorPrimaries = vstPresets[sysId].colorPrimaries;
964
+    p->vui.transferCharacteristics = vstPresets[sysId].transferCharacteristics;
965
+    p->vui.matrixCoeffs = vstPresets[sysId].matrixCoeffs;
966
+    p->vui.bEnableVideoFullRangeFlag = vstPresets[sysId].bEnableVideoFullRangeFlag;
967
+    p->vui.chromaSampleLocTypeTopField = vstPresets[sysId].chromaSampleLocTypeTopField;
968
+    p->vui.chromaSampleLocTypeBottomField = vstPresets[sysId].chromaSampleLocTypeBottomField;
969
+
970
+    if (colorVolume[0] != '\0')
971
+    {
972
+        if (!strcmp(systemId, "BT2100_PQ_YCC") || !strcmp(systemId, "BT2100_PQ_ICTCP") || !strcmp(systemId, "BT2100_PQ_RGB"))
973
+        {
974
+            p->bEmitHDR10SEI = 1;
975
+            if (!strcmp(colorVolume, "P3D65x1000n0005"))
976
+            {
977
+                p->masteringDisplayColorVolume = strdup("G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)");
978
+            }
979
+            else if (!strcmp(colorVolume, "P3D65x4000n005"))
980
+            {
981
+                p->masteringDisplayColorVolume = strdup("G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)");
982
+            }
983
+            else if (!strcmp(colorVolume, "BT2100x108n0005"))
984
+            {
985
+                p->masteringDisplayColorVolume = strdup("G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)");
986
+            }
987
+            else
988
+            {
989
+                x265_log(NULL, X265_LOG_ERROR, "Incorrect color-volume, aborting\n");
990
+                m_aborted = true;
991
+            }
992
+        }
993
+        else
994
+        {
995
+            x265_log(NULL, X265_LOG_ERROR, "Color-volume is not supported with the given system-id, aborting\n");
996
+            m_aborted = true;
997
+        }
998
+    }
999
+
1000
+}
1001
+
1002
 void Encoder::configure(x265_param *p)
1003
 {
1004
     this->m_param = p;
1005
@@ -3610,6 +3663,12 @@
1006
     if (!p->rdoqLevel)
1007
         p->psyRdoq = 0;
1008
 
1009
+    if (p->craNal && p->keyframeMax > 1)
1010
+    {
1011
+        x265_log_file(NULL, X265_LOG_ERROR, " --cra-nal works only with keyint 1, but given keyint = %s\n", p->keyframeMax);
1012
+        m_aborted = true;
1013
+    }
1014
+
1015
     /* Disable features which are not supported by the current RD level */
1016
     if (p->rdLevel < 3)
1017
     {
1018
@@ -3848,12 +3907,37 @@
1019
         p->limitReferences = 0;
1020
     }
1021
 
1022
-    if (p->bEnableTemporalSubLayers && !p->bframes)
1023
+    if ((p->bEnableTemporalSubLayers > 2) && !p->bframes)
1024
     {
1025
         x265_log(p, X265_LOG_WARNING, "B frames not enabled, temporal sublayer disabled\n");
1026
         p->bEnableTemporalSubLayers = 0;
1027
     }
1028
 
1029
+    if (!!p->bEnableTemporalSubLayers && p->bEnableTemporalSubLayers < 2)
1030
+    {
1031
+        p->bEnableTemporalSubLayers = 0;
1032
+        x265_log(p, X265_LOG_WARNING, "No support for temporal sublayers less than 2; Disabling temporal layers\n");
1033
+    }
1034
+
1035
+    if (p->bEnableTemporalSubLayers > 5)
1036
+    {
1037
+        p->bEnableTemporalSubLayers = 5;
1038
+        x265_log(p, X265_LOG_WARNING, "No support for temporal sublayers more than 5; Reducing the temporal sublayers to 5\n");
1039
+    }
1040
+
1041
+    // Assign number of B frames for temporal layers
1042
+    if (p->bEnableTemporalSubLayers > 2)
1043
+            p->bframes = x265_temporal_layer_bframes[p->bEnableTemporalSubLayers - 1];
1044
+
1045
+    if (p->bEnableTemporalSubLayers > 2)
1046
+    {
1047
+        if (p->bFrameAdaptive)
1048
+        {
1049
+            x265_log(p, X265_LOG_WARNING, "Disabling adaptive B-frame placement to support temporal sub-layers\n");
1050
+            p->bFrameAdaptive = 0;
1051
+        }
1052
+    }
1053
+
1054
     m_bframeDelay = p->bframes ? (p->bBPyramid ? 2 : 1) : 0;
1055
 
1056
     p->bFrameBias = X265_MIN(X265_MAX(-90, p->bFrameBias), 100);
1057
@@ -3907,6 +3991,16 @@
1058
         p->rc.bStatRead = 0;
1059
     }
1060
 
1061
+    if ((p->rc.bStatWrite || p->rc.bStatRead) && p->rc.dataShareMode != X265_SHARE_MODE_FILE && p->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM)
1062
+    {
1063
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
1064
+    }
1065
+
1066
+    if (!p->rc.bStatRead || p->rc.rateControlMode != X265_RC_CRF)
1067
+    {
1068
+        p->rc.bEncFocusedFramesOnly = 0;
1069
+    }
1070
+
1071
     /* some options make no sense if others are disabled */
1072
     p->bSaoNonDeblocked &= p->bEnableSAO;
1073
     p->bEnableTSkipFast &= p->bEnableTransformSkip;
1074
@@ -4243,6 +4337,9 @@
1075
         }
1076
     }
1077
 
1078
+    if (p->videoSignalTypePreset)     // Default disabled.
1079
+        configureVideoSignalTypePreset(p);
1080
+
1081
     if (m_param->toneMapFile || p->bHDR10Opt || p->bEmitHDR10SEI)
1082
     {
1083
         if (!p->bRepeatHeaders)
1084
@@ -4313,12 +4410,26 @@
1085
             m_param->searchRange = m_param->hmeRange[2];
1086
     }
1087
 
1088
-   if (p->bHistBasedSceneCut && !p->edgeTransitionThreshold)
1089
-   {
1090
-       p->edgeTransitionThreshold = 0.03;
1091
-       x265_log(p, X265_LOG_WARNING, "using  default threshold %.2lf for scene cut detection\n", p->edgeTransitionThreshold);
1092
-   }
1093
+    if (p->bEnableSBRC && (p->rc.rateControlMode != X265_RC_CRF || (p->rc.vbvBufferSize == 0 || p->rc.vbvMaxBitrate == 0)))
1094
+    {
1095
+        x265_log(p, X265_LOG_WARNING, "SBRC can be enabled only with CRF+VBV mode. Disabling SBRC\n");
1096
+        p->bEnableSBRC = 0;
1097
+    }
1098
 
1099
+    if (p->bEnableSBRC)
1100
+    {
1101
+        p->rc.ipFactor = p->rc.ipFactor * X265_IPRATIO_STRENGTH;
1102
+        if (p->bOpenGOP)
1103
+        {
1104
+            x265_log(p, X265_LOG_WARNING, "Segment based RateControl requires closed gop structure. Enabling closed GOP.\n");
1105
+            p->bOpenGOP = 0;
1106
+        }
1107
+        if (p->keyframeMax != p->keyframeMin)
1108
+        {
1109
+            x265_log(p, X265_LOG_WARNING, "Segment based RateControl requires fixed gop length. Force set min-keyint equal to keyint.\n");
1110
+            p->keyframeMin = p->keyframeMax;
1111
+        }
1112
+    }
1113
 }
1114
 
1115
 void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes)
1116
@@ -4379,16 +4490,6 @@
1117
     analysis->frameRecordSize = frameRecordSize;
1118
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1119
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1120
-    if (m_param->bHistBasedSceneCut)
1121
-    {
1122
-        X265_FREAD(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileIn, &m_curEdgeHist);
1123
-        X265_FREAD(&analysis->yuvHist[0], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[0]);
1123
-        if (m_param->internalCsp != X265_CSP_I400)
1124
-        {
1125
-            X265_FREAD(&analysis->yuvHist[1], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[1]);
1126
-            X265_FREAD(&analysis->yuvHist[2], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[2]);
1128
-        }
1129
-    }
1130
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1131
     X265_FREAD(&numCUsLoad, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1132
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1133
@@ -4711,16 +4812,6 @@
1134
     analysis->frameRecordSize = frameRecordSize;
1135
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1136
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1137
-    if (m_param->bHistBasedSceneCut)
1138
-    {
1139
-        X265_FREAD(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileIn, &m_curEdgeHist);
1140
-        X265_FREAD(&analysis->yuvHist[0], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[0]);
1140
-        if (m_param->internalCsp != X265_CSP_I400)
1141
-        {
1142
-            X265_FREAD(&analysis->yuvHist[1], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[1]);
1143
-            X265_FREAD(&analysis->yuvHist[2], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[2]);
1145
-        }
1146
-    }
1147
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1148
     X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1149
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1150
@@ -4810,8 +4901,14 @@
1151
 
1152
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1153
     {
1154
-        if (m_param->analysisLoadReuseLevel < 2)
1155
-            return;
1156
+       if (m_param->analysisLoadReuseLevel < 2)
1157
+       {
1158
+           /* Restore to the current encode's numPartitions and numCUsInFrame */
1159
+           analysis->numPartitions = m_param->num4x4Partitions;
1160
+           analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1161
+           analysis->numCuInHeight = cuLoc.heightInCU;
1162
+           return;
1163
+       }
1164
 
1165
         uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL;
1166
         int8_t *cuQPBuf = NULL;
1167
@@ -4879,8 +4976,14 @@
1168
         uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
1169
         uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
1170
         X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFileIn, (picIn->analysisData.wt));
1171
-        if (m_param->analysisLoadReuseLevel < 2)
1172
-            return;
1173
+       if (m_param->analysisLoadReuseLevel < 2)
1174
+       {
1175
+           /* Restore to the current encode's numPartitions and numCUsInFrame */
1176
+           analysis->numPartitions = m_param->num4x4Partitions;
1177
+           analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1178
+           analysis->numCuInHeight = cuLoc.heightInCU;
1179
+           return;
1180
+       }
1181
 
1182
         uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL;
1183
         uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx2;
1184
@@ -5167,7 +5270,7 @@
1185
 
1186
         int bcutree;
1187
         X265_FREAD(&bcutree, sizeof(int), 1, m_analysisFileIn, &(saveParam->cuTree));
1188
-        if (loadLevel == 10 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
1189
+        if (loadLevel >= 2 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
1190
         {
1191
             x265_log(NULL, X265_LOG_ERROR, "Error reading cu-tree info. Disabling cutree offsets. \n");
1192
             m_param->rc.cuTree = 0;
1193
@@ -5337,6 +5440,7 @@
1194
             distortionData->highDistortionCtuCount++;
1195
     }
1196
 }
1197
+
1198
 void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, int sliceType)
1199
 {
1200
 
1201
@@ -5486,17 +5590,6 @@
1202
     /* calculate frameRecordSize */
1203
     analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
1204
                       sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
1205
-    if (m_param->bHistBasedSceneCut)
1206
-    {
1207
-        analysis->frameRecordSize += sizeof(analysis->edgeHist);
1208
-        analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1209
-        if (m_param->internalCsp != X265_CSP_I400)
1210
-        {
1211
-            analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1212
-            analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1213
-        }
1214
-    }
1215
-
1216
     if (analysis->sliceType > X265_TYPE_I)
1217
     {
1218
         numDir = (analysis->sliceType == X265_TYPE_P) ? 1 : 2;
1219
@@ -5641,17 +5734,6 @@
1220
     X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFileOut);
1221
     X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFileOut);
1222
     X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFileOut);
1223
-    if (m_param->bHistBasedSceneCut)
1224
-    {
1225
-        X265_FWRITE(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileOut);
1226
-        X265_FWRITE(&analysis->yuvHist[0], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1226
-        if (m_param->internalCsp != X265_CSP_I400)
1227
-        {
1228
-            X265_FWRITE(&analysis->yuvHist[1], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1229
-            X265_FWRITE(&analysis->yuvHist[2], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1231
-        }
1232
-    }
1233
-
1234
     X265_FWRITE(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileOut);
1235
     X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileOut);
1236
     X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFileOut);
1237
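A minimal standalone sketch of the "<system-id>:<color-volume>" split that configureVideoSignalTypePreset() performs above; this is illustrative code only, not part of the x265 sources, and the preset value is just an example taken from the names visible in the hunk:

#include <cstdio>

int main()
{
    // Example value for --video-signal-type-preset
    const char* preset = "BT2100_PQ_YCC:P3D65x1000n0005";
    char systemId[20] = {};
    char colorVolume[20] = {};
    // Same split as the encoder: everything before the first ':' is the
    // system-id, the remainder (if any) is the colour-volume name.
    if (sscanf(preset, "%[^:]:%s", systemId, colorVolume) >= 1)
        printf("system-id=%s colour-volume=%s\n", systemId, colorVolume);
    return 0;
}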
x265_3.5.tar.gz/source/encoder/encoder.h -> x265_3.6.tar.gz/source/encoder/encoder.h Changed
72
 
1
@@ -32,6 +32,7 @@
2
 #include "nal.h"
3
 #include "framedata.h"
4
 #include "svt.h"
5
+#include "temporalfilter.h"
6
 #ifdef ENABLE_HDR10_PLUS
7
     #include "dynamicHDR10/hdr10plus.h"
8
 #endif
9
@@ -256,19 +257,6 @@
10
     int                m_bToneMap; // Enables tone-mapping
11
     int                m_enableNal;
12
 
13
-    /* For histogram based scene-cut detection */
14
-    pixel*             m_edgePic;
15
-    pixel*             m_inputPic[3];
16
-    int32_t            m_curYUVHist[3][HISTOGRAM_BINS];
17
-    int32_t            m_prevYUVHist[3][HISTOGRAM_BINS];
18
-    int32_t            m_curEdgeHist[2];
19
-    int32_t            m_prevEdgeHist[2];
20
-    uint32_t           m_planeSizes[3];
21
-    double             m_edgeHistThreshold;
22
-    double             m_chromaHistThreshold;
23
-    double             m_scaledEdgeThreshold;
24
-    double             m_scaledChromaThreshold;
25
-
26
 #ifdef ENABLE_HDR10_PLUS
27
     const hdr10plus_api     *m_hdr10plus_api;
28
     uint8_t                 **m_cim;
29
@@ -295,6 +283,9 @@
30
 
31
     ThreadSafeInteger* zoneReadCount;
32
     ThreadSafeInteger* zoneWriteCount;
33
+    /* Film grain model file */
34
+    FILE* m_filmGrainIn;
35
+    OrigPicBuffer*          m_origPicBuffer;
36
 
37
     Encoder();
38
     ~Encoder()
39
@@ -327,6 +318,8 @@
40
 
41
     void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
42
 
43
+    void getEndNalUnits(NALList& list, Bitstream& bs);
44
+
45
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
46
 
47
     void printSummary();
48
@@ -373,11 +366,6 @@
49
 
50
     void copyPicture(x265_picture *dest, const x265_picture *src);
51
 
52
-    bool computeHistograms(x265_picture *pic);
53
-    void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
54
-    double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
55
-    void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
56
-
57
     void initRefIdx();
58
     void analyseRefIdx(int *numRefIdx);
59
     void updateRefIdx();
60
@@ -387,6 +375,11 @@
61
 
62
     void configureDolbyVisionParams(x265_param* p);
63
 
64
+    void configureVideoSignalTypePreset(x265_param* p);
65
+
66
+    bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
67
+    bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
68
+
69
 protected:
70
 
71
     void initVPS(VPS *vps);
72
x265_3.5.tar.gz/source/encoder/entropy.cpp -> x265_3.6.tar.gz/source/encoder/entropy.cpp Changed
41
 
1
@@ -245,9 +245,9 @@
2
 
3
     for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
4
     {
5
-        WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]");
6
-        WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_pics[i]");
7
-        WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1[i]");
8
+        WRITE_UVLC(vps.maxDecPicBuffering[i] - 1, "vps_max_dec_pic_buffering_minus1[i]");
9
+        WRITE_UVLC(vps.numReorderPics[i],         "vps_num_reorder_pics[i]");
10
+        WRITE_UVLC(vps.maxLatencyIncrease[i] + 1, "vps_max_latency_increase_plus1[i]");
11
     }
12
 
13
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
14
@@ -291,9 +291,9 @@
15
 
16
     for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
17
     {
18
-        WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
19
-        WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_pics[i]");
20
-        WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
21
+        WRITE_UVLC(sps.maxDecPicBuffering[i] - 1, "sps_max_dec_pic_buffering_minus1[i]");
22
+        WRITE_UVLC(sps.numReorderPics[i],         "sps_num_reorder_pics[i]");
23
+        WRITE_UVLC(sps.maxLatencyIncrease[i] + 1, "sps_max_latency_increase_plus1[i]");
24
     }
25
 
26
     WRITE_UVLC(sps.log2MinCodingBlockSize - 3,    "log2_min_coding_block_size_minus3");
27
@@ -418,8 +418,11 @@
28
 
29
     if (maxTempSubLayers > 1)
30
     {
31
-         WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
32
-         WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
33
+        for(int i = 0; i < maxTempSubLayers - 1; i++)
34
+        {
35
+            WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
36
+            WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
37
+        }
38
          for (int i = maxTempSubLayers - 1; i < 8 ; i++)
39
              WRITE_CODE(0, 2, "reserved_zero_2bits");
40
     }
41
x265_3.5.tar.gz/source/encoder/frameencoder.cpp -> x265_3.6.tar.gz/source/encoder/frameencoder.cpp Changed
200
 
1
@@ -34,6 +34,7 @@
2
 #include "common.h"
3
 #include "slicetype.h"
4
 #include "nal.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
9
@@ -101,6 +102,16 @@
10
         delete m_rce.picTimingSEI;
11
         delete m_rce.hrdTiming;
12
     }
13
+
14
+    if (m_param->bEnableTemporalFilter)
15
+    {
16
+        delete m_frameEncTF->m_metld;
17
+
18
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
19
+            m_frameEncTF->destroyRefPicInfo(&m_mcstfRefList[i]);
20
+
21
+        delete m_frameEncTF;
22
+    }
23
 }
24
 
25
 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
26
@@ -195,6 +206,16 @@
27
         m_sliceAddrBits = (uint16_t)(tmp + 1);
28
     }
29
 
30
+    if (m_param->bEnableTemporalFilter)
31
+    {
32
+        m_frameEncTF = new TemporalFilter();
33
+        if (m_frameEncTF)
34
+            m_frameEncTF->init(m_param);
35
+
36
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
37
+            ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefList[i], m_param);
38
+    }
39
+
40
     return ok;
41
 }
42
 
43
@@ -450,7 +471,7 @@
44
     m_ssimCnt = 0;
45
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
46
 
47
-    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
48
+    if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
49
     {
50
         int height = m_frame->m_fencPic->m_picHeight;
51
         int width = m_frame->m_fencPic->m_picWidth;
52
@@ -467,6 +488,12 @@
53
      * unit) */
54
     Slice* slice = m_frame->m_encData->m_slice;
55
 
56
+    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
57
+    {
58
+        m_bs.resetBits();
59
+        m_nalList.serialize(NAL_UNIT_EOS, m_bs);
60
+    }
61
+
62
     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
63
     {
64
         m_bs.resetBits();
65
@@ -573,6 +600,12 @@
66
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
67
     m_rce.newQp = qp;
68
 
69
+    if (m_param->bEnableTemporalFilter)
70
+    {
71
+        m_frameEncTF->m_QP = qp;
72
+        m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
73
+    }
74
+
75
     if (m_nr)
76
     {
77
         if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
78
@@ -744,7 +777,7 @@
79
             // wait after removal of the access unit with the most recent
80
             // buffering period SEI message
81
             sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
82
-            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
83
+            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics[m_frame->m_tempLayer] + poc - m_rce.encodeOrder;
84
         }
85
 
86
         sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
87
@@ -756,7 +789,14 @@
88
         m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
89
         m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
90
     }
91
-
92
+    /* Write Film grain characteristics if present */
93
+    if (this->m_top->m_filmGrainIn)
94
+    {
95
+        FilmGrainCharacteristics m_filmGrain;
96
+        /* Read the Film grain model file */
97
+        readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
98
+        m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
99
+    }
100
     /* Write user SEI */
101
     for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
102
     {
103
@@ -933,6 +973,23 @@
104
     if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
105
         collectDynDataFrame();
106
 
107
+    if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
108
+    {
109
+        //Reset the MCSTF context in Frame Encoder and Frame
110
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
111
+            memset(m_mcstfRefList[i].mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
113
+            memset(m_mcstfRefList[i].mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
114
+            memset(m_mcstfRefList[i].mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
115
+            memset(m_mcstfRefList[i].mvs,  0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
116
+            memset(m_mcstfRefList[i].noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
117
+            memset(m_mcstfRefList[i].error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
117
+            memset(m_mcstfRefListi.error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
118
+
119
+            m_frame->m_mcstf->m_numRef = 0;
120
+        }
121
+    }
122
+
123
+
124
     if (m_param->rc.bStatWrite)
125
     {
126
         int totalI = 0, totalP = 0, totalSkip = 0;
127
@@ -1041,7 +1098,7 @@
128
             
129
             m_bs.writeByteAlignment();
130
 
131
-            m_nalList.serialize(slice->m_nalUnitType, m_bs);
132
+            m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
133
         }
134
     }
135
     else
136
@@ -1062,7 +1119,7 @@
137
             m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
138
         m_bs.writeByteAlignment();
139
 
140
-        m_nalList.serialize(slice->m_nalUnitType, m_bs);
141
+        m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
142
     }
143
 
144
     if (m_param->decodedPictureHashSEI)
145
@@ -2127,6 +2184,54 @@
146
         m_nr->nrOffsetDenoise[cat][0] = 0;
147
     }
148
 }
149
+
150
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
151
+{
152
+    char const* errorMessage = "Error reading FilmGrain characteristics\n";
153
+    FilmGrain m_fg;
154
+    x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
155
+    m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
156
+    m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
157
+    m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
158
+    m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
159
+    if (m_filmGrain->m_separateColourDescriptionPresentFlag)
160
+    {
161
+        ColourDescription m_clr;
162
+        x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
163
+        m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
164
+        m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
165
+        m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
166
+        m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
167
+        m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
168
+        m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
169
+    }
170
+    FGPresent m_present;
171
+    x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
172
+    m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
173
+    m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
174
+    m_filmGrain->m_compModel[0].bPresentFlag = m_present.m_presentFlag[0];
175
+    m_filmGrain->m_compModel[1].bPresentFlag = m_present.m_presentFlag[1];
176
+    m_filmGrain->m_compModel[2].bPresentFlag = m_present.m_presentFlag[2];
177
+    for (int i = 0; i < MAX_NUM_COMPONENT; i++)
178
+    {
179
+        if (m_filmGrain->m_compModel[i].bPresentFlag)
180
+        {
181
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
182
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
183
+            m_filmGrain->m_compModel[i].intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1+1)) ;
184
+            for (int j = 0; j <= m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1; j++)
185
+            {
186
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
187
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
188
+                m_filmGrain->m_compModel[i].intensityValues[j].compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModel[i].numModelValues));
189
+                for (int k = 0; k < m_filmGrain->m_compModel[i].numModelValues; k++)
190
+                {
191
+                    x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].compModelValue[k]), sizeof(int), 1, filmgrain, errorMessage);
192
+                }
193
+            }
194
+        }
195
+    }
196
+}
197
 #if ENABLE_LIBVMAF
198
 void FrameEncoder::vmafFrameLevelScore()
199
 {
200
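readModel() above consumes the binary model file passed with --film-grain. Below is a hedged sketch of a writer that produces a file in the layout those x265_fread() calls expect; field order and sizes are inferred from the reads above, the values are arbitrary examples, and tight struct packing plus a 4-byte int are assumed. It is not an official format specification:

#include <cstdint>
#include <cstdio>

int main()
{
    FILE* f = fopen("grain.bin", "wb");
    if (!f)
        return 1;

    // FilmGrain leading fields: cancel, persistence, separate-colour flags + model id
    uint8_t header[4] = { 0, 1, 0, 0 };
    fwrite(header, sizeof(header), 1, f);

    // FGPresent: blending mode id, log2 scale factor, per-component present flags (Y only)
    uint8_t present[5] = { 0, 2, 1, 0, 0 };
    fwrite(present, sizeof(present), 1, f);

    // Component 0: one intensity interval [0,255] with a single model value
    uint8_t numIntervalsMinus1 = 0, numModelValues = 1;
    uint8_t lower = 0, upper = 255;
    int32_t modelValue = 16;
    fwrite(&numIntervalsMinus1, 1, 1, f);
    fwrite(&numModelValues, 1, 1, f);
    fwrite(&lower, 1, 1, f);
    fwrite(&upper, 1, 1, f);
    fwrite(&modelValue, sizeof(modelValue), 1, f);

    fclose(f);
    return 0;
}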
x265_3.5.tar.gz/source/encoder/frameencoder.h -> x265_3.6.tar.gz/source/encoder/frameencoder.h Changed
63
 
1
@@ -40,6 +40,7 @@
2
 #include "ratecontrol.h"
3
 #include "reference.h"
4
 #include "nal.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 // private x265 namespace
9
@@ -113,6 +114,34 @@
10
     }
11
 };
12
 
13
+/*Film grain characteristics*/
14
+struct FilmGrain
15
+{
16
+    bool    m_filmGrainCharacteristicsCancelFlag;
17
+    bool    m_filmGrainCharacteristicsPersistenceFlag;
18
+    bool    m_separateColourDescriptionPresentFlag;
19
+    uint8_t m_filmGrainModelId;
20
+    uint8_t m_blendingModeId;
21
+    uint8_t m_log2ScaleFactor;
22
+};
23
+
24
+struct ColourDescription
25
+{
26
+    bool        m_filmGrainFullRangeFlag;
27
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
28
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
29
+    uint8_t     m_filmGrainColourPrimaries;
30
+    uint8_t     m_filmGrainTransferCharacteristics;
31
+    uint8_t     m_filmGrainMatrixCoeffs;
32
+};
33
+
34
+struct FGPresent
35
+{
36
+    uint8_t     m_blendingModeId;
37
+    uint8_t     m_log2ScaleFactor;
38
+    bool        m_presentFlag[3];
39
+};
40
+
41
 // Manages the wave-front processing of a single encoding frame
42
 class FrameEncoder : public WaveFront, public Thread
43
 {
44
@@ -205,6 +234,10 @@
45
     FrameFilter              m_frameFilter;
46
     NALList                  m_nalList;
47
 
48
+    // initialization for mcstf
49
+    TemporalFilter*          m_frameEncTF;
50
+    TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
51
+
52
     class WeightAnalysis : public BondedTaskGroup
53
     {
54
     public:
55
@@ -250,6 +283,7 @@
56
     void collectDynDataFrame();
57
     void computeAvgTrainingData();
58
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
59
+    void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
60
 };
61
 }
62
 
63
x265_3.5.tar.gz/source/encoder/level.cpp -> x265_3.6.tar.gz/source/encoder/level.cpp Changed
86
 
1
@@ -72,7 +72,7 @@
2
      * for intra-only profiles (vps.ptl.intraConstraintFlag) */
3
     vps.ptl.lowerBitRateConstraintFlag = true;
4
 
5
-    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
6
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
7
     
8
     if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
9
     {
10
@@ -167,7 +167,7 @@
11
 
12
         /* The value of sps_max_dec_pic_buffering_minus1 HighestTid  + 1 shall be less than
13
          * or equal to MaxDpbSize */
14
-        if (vps.maxDecPicBuffering > maxDpbSize)
15
+        if (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize)
16
             continue;
17
 
18
         /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
19
@@ -182,8 +182,8 @@
20
         }
21
 
22
         /* The value of NumPocTotalCurr shall be less than or equal to 8 */
23
-        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
24
-        if (numPocTotalCurr > 8)
25
+        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics[vps.maxTempSubLayers - 1];
26
+        if (numPocTotalCurr > 10)
27
         {
28
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levelsi.name);
29
             vps.ptl.profileIdc = Profile::NONE;
30
@@ -289,9 +289,40 @@
31
  * circumstances it will be quite noisy */
32
 bool enforceLevel(x265_param& param, VPS& vps)
33
 {
34
-    vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
35
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
36
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
37
+    for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
38
+    {
39
+        vps.numReorderPics[i] = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
40
+        vps.maxDecPicBuffering[i] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[i] + 2, (uint32_t)param.maxNumReferences) + 1);
41
+    }
42
 
43
+    if (!!param.bEnableTemporalSubLayers)
44
+    {
45
+        for (int i = 0; i < MAX_T_LAYERS - 1; i++)
46
+        {
47
+            // a lower layer can not have higher value of numReorderPics than a higher layer
48
+            if (vps.numReorderPics[i + 1] < vps.numReorderPics[i])
49
+            {
50
+                vps.numReorderPics[i + 1] = vps.numReorderPics[i];
51
+            }
52
+            // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
53
+            if (vps.numReorderPics[i] > vps.maxDecPicBuffering[i] - 1)
54
+            {
55
+                vps.maxDecPicBuffering[i] = vps.numReorderPics[i] + 1;
56
+            }
57
+            // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
58
+            if (vps.maxDecPicBuffering[i + 1] < vps.maxDecPicBuffering[i])
59
+            {
60
+                vps.maxDecPicBuffering[i + 1] = vps.maxDecPicBuffering[i];
61
+            }
62
+        }
63
+
64
+        // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
65
+        if (vps.numReorderPics[MAX_T_LAYERS - 1] > vps.maxDecPicBuffering[MAX_T_LAYERS - 1] - 1)
66
+        {
67
+            vps.maxDecPicBuffering[MAX_T_LAYERS - 1] = vps.numReorderPics[MAX_T_LAYERS - 1] + 1;
68
+        }
69
+    }
70
     /* no level specified by user, just auto-detect from the configuration */
71
     if (param.levelIdc <= 0)
72
         return true;
73
@@ -391,10 +422,10 @@
74
     }
75
 
76
     int savedRefCount = param.maxNumReferences;
77
+    while (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize && param.maxNumReferences > 1)
78
+    while (vps.maxDecPicBufferingvps.maxTempSubLayers - 1 > maxDpbSize && param.maxNumReferences > 1)
79
     {
80
         param.maxNumReferences--;
81
+        vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[vps.maxTempSubLayers - 1] + 1, (uint32_t)param.maxNumReferences) + 1);
82
+        vps.maxDecPicBufferingvps.maxTempSubLayers - 1 = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPicsvps.maxTempSubLayers - 1 + 1, (uint32_t)param.maxNumReferences) + 1);
83
     }
84
     if (param.maxNumReferences != savedRefCount)
85
         x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
86
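A standalone sketch (illustrative only, not x265 code) of the per-sub-layer derivation enforceLevel() now performs: layer 0 keeps the old single-layer reorder value, higher layers reorder up to their layer index, and both arrays are forced monotonic with numReorderPics[i] < maxDecPicBuffering[i]. The constants stand in for MAX_NUM_REF and the user's settings:

#include <algorithm>
#include <cstdio>

int main()
{
    const int maxTempSubLayers = 4;        // e.g. --temporal-layers 4
    const unsigned maxNumReferences = 3;
    unsigned numReorderPics[8] = {};
    unsigned maxDecPicBuffering[8] = {};

    for (int i = 0; i < maxTempSubLayers; i++)
    {
        // layer 0: B-pyramid with bframes > 1 assumed; higher layers reorder up to i pictures
        numReorderPics[i] = (i == 0) ? 2 : (unsigned)i;
        maxDecPicBuffering[i] = std::min(16u, std::max(numReorderPics[i] + 2, maxNumReferences) + 1);
    }
    for (int i = 0; i < maxTempSubLayers - 1; i++)
    {
        numReorderPics[i + 1] = std::max(numReorderPics[i + 1], numReorderPics[i]);
        maxDecPicBuffering[i] = std::max(maxDecPicBuffering[i], numReorderPics[i] + 1);
        maxDecPicBuffering[i + 1] = std::max(maxDecPicBuffering[i + 1], maxDecPicBuffering[i]);
    }
    for (int i = 0; i < maxTempSubLayers; i++)
        printf("layer %d: num_reorder_pics %u, max_dec_pic_buffering %u\n",
               i, numReorderPics[i], maxDecPicBuffering[i]);
    return 0;
}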
x265_3.5.tar.gz/source/encoder/motion.cpp -> x265_3.6.tar.gz/source/encoder/motion.cpp Changed
33
 
1
@@ -190,6 +190,31 @@
2
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
3
 }
4
 
5
+/* Called by lookahead, luma only, no use of PicYuv */
6
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
7
+{
8
+    partEnum = partitionFromSizes(pwidth, pheight);
9
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
10
+    sad = primitives.pupartEnum.sad;
11
+    ads = primitives.pupartEnum.ads;
12
+    satd = primitives.pupartEnum.satd;
13
+    sad_x3 = primitives.pu[partEnum].sad_x3;
14
+    sad_x4 = primitives.pu[partEnum].sad_x4;
15
+
16
+
17
+    blockwidth = pwidth;
18
+    blockOffset = offset;
19
+    absPartIdx = ctuAddr = -1;
20
+
21
+    /* Search params */
22
+    searchMethod = method;
23
+    subpelRefine = refine;
24
+
25
+    /* copy PU block into cache */
26
+    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
27
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
28
+}
29
+
30
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
31
 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
32
 {
33
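
The new overload above caches a luma-only PU directly from a frame plane so the lookahead can run motion search without a PicYuv. A hypothetical call site, purely for illustration (the block position, size and search settings are invented here, not taken from the lookahead code):

// Illustrative only: drive the luma-only overload on a 16x16 block of a luma plane.
void searchLowresBlock(MotionEstimate& me, pixel* fencY, intptr_t stride, int puX, int puY)
{
    me.init(X265_CSP_I400);                          // luma-only, matching "no use of PicYuv"
    intptr_t offset = puY * stride + puX;            // top-left of the PU within the plane
    me.setSourcePU(fencY, stride, offset, 16, 16, X265_HEX_SEARCH, 1 /* subpel refine */);
}
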
x265_3.5.tar.gz/source/encoder/motion.h -> x265_3.6.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -77,7 +77,7 @@
2
     void init(int csp);
3
 
4
     /* Methods called at slice setup */
5
-
6
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
7
     void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
8
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
9
 
10
x265_3.5.tar.gz/source/encoder/nal.cpp -> x265_3.6.tar.gz/source/encoder/nal.cpp Changed
19
 
1
@@ -57,7 +57,7 @@
2
     other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
 }
4
 
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
7
 {
8
     static const char startCodePrefix[] = { 0, 0, 0, 1 };
9
 
10
@@ -114,7 +114,7 @@
11
      * nuh_reserved_zero_6bits  6-bits
12
      * nuh_temporal_id_plus1    3-bits */
13
     out[bytes++] = (uint8_t)nalUnitType << 1;
14
-    out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
15
+    out[bytes++] = temporalID;
16
 
17
     /* 7.4.1 ...
18
      * Within the NAL unit, the following three-byte sequences shall not occur at
19
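
With this change the two-byte NAL unit header carries the frame's temporal ID instead of a hard-coded TSA special case. A standalone sketch of the header layout that the two assignments above produce (nuh_layer_id is zero here, as in serialize()):

#include <cstdint>

// byte 0: forbidden_zero_bit(1) | nal_unit_type(6) | high bit of nuh_layer_id
// byte 1: low bits of nuh_layer_id | nuh_temporal_id_plus1(3)
void writeNalHeader(uint8_t out[2], uint8_t nalUnitType, uint8_t temporalIdPlus1)
{
    out[0] = (uint8_t)(nalUnitType << 1);   // layer id bits are all zero
    out[1] = temporalIdPlus1;               // e.g. 1 for temporal layer 0
}
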
x265_3.5.tar.gz/source/encoder/nal.h -> x265_3.6.tar.gz/source/encoder/nal.h Changed
10
 
1
@@ -56,7 +56,7 @@
2
 
3
     void takeContents(NALList& other);
4
 
5
-    void serialize(NalUnitType nalUnitType, const Bitstream& bs);
6
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
7
 
8
     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
9
 };
10
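
The temporalID parameter defaults to 1, i.e. nuh_temporal_id_plus1 for temporal layer 0, so existing two-argument calls to serialize() keep compiling. Callers that track sub-layers pass the frame's temporal layer plus one, for example (illustrative, variable names assumed): nalList.serialize(nalUnitType, bs, (uint8_t)(tLayer + 1));
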
x265_3.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.6.tar.gz/source/encoder/ratecontrol.cpp Changed
1457
 
1
@@ -41,6 +41,10 @@
2
 #define BR_SHIFT  6
3
 #define CPB_SHIFT 4
4
 
5
+#define SHARED_DATA_ALIGNMENT      4 ///< 4 byte, 32 bit
6
+#define CUTREE_SHARED_MEM_NAME     "cutree"
7
+#define GOP_CNT_CU_TREE            3
8
+
9
 using namespace X265_NS;
10
 
11
 /* Amortize the partial cost of I frames over the next N frames */
12
@@ -104,6 +108,37 @@
13
     return output;
14
 }
15
 
16
+typedef struct CUTreeSharedDataItem
17
+{
18
+    uint8_t  *type;
19
+    uint16_t *stats;
20
+}CUTreeSharedDataItem;
21
+
22
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
23
+{
24
+    CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
25
+    uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
26
+    *statsDst->type = *typeSrc;
27
+
28
+    ///< for memory alignment, the type will take 32bit in the shared memory
29
+    int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
30
+    uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
31
+    memcpy(statsDst->stats, statsSrc, size - offset);
32
+}
33
+
34
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
35
+{
36
+    CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
37
+    uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
38
+    *typeDst = *statsSrc->type;
39
+
40
+    ///< for memory alignment, the type will take 32bit in the shared memory
41
+    int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
42
+    uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
43
+    memcpy(statsDst, statsSrc->stats, size - offset);
44
+}
45
+
46
+
47
 inline double qScale2bits(RateControlEntry *rce, double qScale)
48
 {
49
     if (qScale < 0.1)
50
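
Both helpers above rely on the same round-up idiom: the one-byte slice type is padded to SHARED_DATA_ALIGNMENT (4 bytes) so that the uint16_t stats that follow it stay aligned inside a shared-memory item. The idiom in isolation (a sketch; power-of-two alignment assumed):

#include <cassert>
#include <cstdint>

// Round size up to the next multiple of align (align must be a power of two).
int32_t alignUp(int32_t size, int32_t align)
{
    assert((align & (align - 1)) == 0);
    return (size + align - 1) & ~(align - 1);
}
// alignUp(1, 4) == 4: the type byte occupies a full 32-bit slot before the stats block.
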
@@ -209,6 +244,7 @@
51
     m_lastAbrResetPoc = -1;
52
     m_statFileOut = NULL;
53
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
54
+    m_cutreeShrMem = NULL;
55
     m_rce2Pass = NULL;
56
     m_encOrder = NULL;
57
     m_lastBsliceSatdCost = 0;
58
@@ -224,6 +260,8 @@
59
     m_initVbv = false;
60
     m_singleFrameVbv = 0;
61
     m_rateTolerance = 1.0;
62
+    m_encodedSegmentBits = 0;
63
+    m_segDur = 0;
64
 
65
     if (m_param->rc.vbvBufferSize)
66
     {
67
@@ -320,47 +358,86 @@
68
         m_cuTreeStats.qpBuffer[i] = NULL;
69
 }
70
 
71
-bool RateControl::init(const SPS& sps)
72
+bool RateControl::initCUTreeSharedMem()
73
 {
74
-    if (m_isVbv && !m_initVbv)
75
-    {
76
-        /* We don't support changing the ABR bitrate right now,
77
-         * so if the stream starts as CBR, keep it CBR. */
78
-        if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
79
+    if (!m_cutreeShrMem) {
80
+        m_cutreeShrMem = new RingMem();
81
+        if (!m_cutreeShrMem)
82
         {
83
-            m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
84
-            x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
85
-                     m_param->rc.vbvBufferSize);
86
+            return false;
87
         }
88
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
89
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
90
 
91
-        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
92
+        ///< now cutree data from at most 3 gops would be stored in the shared memory at the same time
93
+        int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
94
+        if (m_param->rc.qgSize == 8)
95
         {
96
-            const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
97
-            vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
98
-            vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
99
+            itemSize += sizeof(uint16_t) * m_ncu * 4;
100
         }
101
-        m_bufferRate = vbvMaxBitrate / m_fps;
102
-        m_vbvMaxRate = vbvMaxBitrate;
103
-        m_bufferSize = vbvBufferSize;
104
-        m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
105
+        else
106
+        {
107
+            itemSize += sizeof(uint16_t) * m_ncu;
108
+        }
109
+
110
+        int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
111
+        itemCnt *= GOP_CNT_CU_TREE;
112
 
113
-        if (m_param->rc.vbvBufferInit > 1.)
114
-            m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
115
-        if (m_param->vbvBufferEnd > 1.)
116
-            m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
117
-        if (m_param->vbvEndFrameAdjust > 1.)
118
-            m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
119
-        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
120
-        m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
121
-        m_bufferFillActual = m_bufferFillFinal;
122
-        m_bufferExcess = 0;
123
-        m_minBufferFill = m_param->minVbvFullness / 100;
124
-        m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
125
-        m_initVbv = true;
126
+        char shrname[MAX_SHR_NAME_LEN] = { 0 };
127
+        strcpy(shrname, m_param->rc.sharedMemName);
128
+        strcat(shrname, CUTREE_SHARED_MEM_NAME);
129
+
130
+        if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
131
+        {
132
+            return false;
133
+        }
134
     }
135
 
136
+    return true;
137
+}
138
+
139
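
For orientation, the sizing computed in initCUTreeSharedMem() above works out as follows; the helpers below only restate that arithmetic with local names, and the example numbers are hypothetical:

#include <algorithm>
#include <cstdint>

// One ring-buffer item: aligned slice-type byte plus per-CTU uint16_t stats
// (four stats per CTU when qgSize == 8, one otherwise).
int32_t cutreeItemSize(int32_t ncu, int qgSize)
{
    int32_t size = (int32_t(sizeof(uint8_t)) + 3) & ~3;              // 4-byte aligned type
    size += int32_t(sizeof(uint16_t)) * ncu * (qgSize == 8 ? 4 : 1);
    return size;
}

// Items kept live at once: up to one GOP of frames, for GOP_CNT_CU_TREE (3) GOPs.
int32_t cutreeItemCount(int keyframeMax, double fps)
{
    return std::min(keyframeMax, int(fps + 0.5)) * 3;
}
// Example (hypothetical numbers): ncu = 510, qgSize 16, 25 fps, keyint 250
//   itemSize = 4 + 2 * 510 = 1024 bytes, itemCnt = min(250, 25) * 3 = 75 items.
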
+void RateControl::initVBV(const SPS& sps)
140
+{
141
+    /* We don't support changing the ABR bitrate right now,
142
+ * so if the stream starts as CBR, keep it CBR. */
143
+    if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
144
+    {
145
+        m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
146
+        x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
147
+            m_param->rc.vbvBufferSize);
148
+    }
149
+    int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
150
+    int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
151
+
152
+    if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
153
+    {
154
+        const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
155
+        vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
156
+        vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
157
+    }
158
+    m_bufferRate = vbvMaxBitrate / m_fps;
159
+    m_vbvMaxRate = vbvMaxBitrate;
160
+    m_bufferSize = vbvBufferSize;
161
+    m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
162
+
163
+    if (m_param->rc.vbvBufferInit > 1.)
164
+        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
165
+    if (m_param->vbvBufferEnd > 1.)
166
+        m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
167
+    if (m_param->vbvEndFrameAdjust > 1.)
168
+        m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
169
+    m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
170
+    m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
171
+    m_bufferFillActual = m_bufferFillFinal;
172
+    m_bufferExcess = 0;
173
+    m_minBufferFill = m_param->minVbvFullness / 100;
174
+    m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
175
+    m_initVbv = true;
176
+}
177
+
178
+bool RateControl::init(const SPS& sps)
179
+{
180
+    if (m_isVbv && !m_initVbv)
181
+        initVBV(sps);
182
+
183
     if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
184
     {
185
         m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
186
@@ -373,7 +450,9 @@
187
 
188
     m_totalBits = 0;
189
     m_encodedBits = 0;
190
+    m_encodedSegmentBits = 0;
191
     m_framesDone = 0;
192
+    m_segDur = 0;
193
     m_residualCost = 0;
194
     m_partialResidualCost = 0;
195
     m_amortizeFraction = 0.85;
196
@@ -421,244 +500,257 @@
197
         /* Load stat file and init 2pass algo */
198
         if (m_param->rc.bStatRead)
199
         {
200
-            m_expectedBitsSum = 0;
201
-            char *p, *statsIn, *statsBuf;
202
-            /* read 1st pass stats */
203
-            statsIn = statsBuf = x265_slurp_file(fileName);
204
-            if (!statsBuf)
205
-                return false;
206
-            if (m_param->rc.cuTree)
207
+            if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
208
             {
209
-                char *tmpFile = strcatFilename(fileName, ".cutree");
210
-                if (!tmpFile)
211
+                m_expectedBitsSum = 0;
212
+                char *p, *statsIn, *statsBuf;
213
+                /* read 1st pass stats */
214
+                statsIn = statsBuf = x265_slurp_file(fileName);
215
+                if (!statsBuf)
216
                     return false;
217
-                m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
218
-                X265_FREE(tmpFile);
219
-                if (!m_cutreeStatFileIn)
220
+                if (m_param->rc.cuTree)
221
                 {
222
-                    x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
223
-                    return false;
224
+                    char *tmpFile = strcatFilename(fileName, ".cutree");
225
+                    if (!tmpFile)
226
+                        return false;
227
+                    m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
228
+                    X265_FREE(tmpFile);
229
+                    if (!m_cutreeStatFileIn)
230
+                    {
231
+                        x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
232
+                        return false;
233
+                    }
234
                 }
235
-            }
236
 
237
-            /* check whether 1st pass options were compatible with current options */
238
-            if (strncmp(statsBuf, "#options:", 9))
239
-            {
240
-                x265_log(m_param, X265_LOG_ERROR,"options list in stats file not valid\n");
241
-                return false;
242
-            }
243
-            {
244
-                int i, j, m;
245
-                uint32_t k , l;
246
-                bool bErr = false;
247
-                char *opts = statsBuf;
248
-                statsIn = strchr(statsBuf, '\n');
249
-                if (!statsIn)
250
-                {
251
-                    x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n");
252
-                    return false;
253
-                }
254
-                *statsIn = '\0';
255
-                statsIn++;
256
-                if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
257
-                {
258
-                    x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
259
-                    return false;
260
-                }
261
-                if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
262
-                {
263
-                    x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
264
-                    return false;
265
-                }
266
-                if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
267
-                {
268
-                    x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
269
-                    return false;
270
-                }
271
-                if (k != m_param->fpsNum || l != m_param->fpsDenom)
272
+                /* check whether 1st pass options were compatible with current options */
273
+                if (strncmp(statsBuf, "#options:", 9))
274
                 {
275
-                    x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
276
-                              m_param->fpsNum, m_param->fpsDenom, k, l);
277
+                    x265_log(m_param, X265_LOG_ERROR, "options list in stats file not valid\n");
278
                     return false;
279
                 }
280
-                if (m_param->analysisMultiPassRefine)
281
                 {
282
-                    p = strstr(opts, "ref=");
283
-                    sscanf(p, "ref=%d", &i);
284
-                    if (i > m_param->maxNumReferences)
285
+                    int i, j, m;
286
+                    uint32_t k, l;
287
+                    bool bErr = false;
288
+                    char *opts = statsBuf;
289
+                    statsIn = strchr(statsBuf, '\n');
290
+                    if (!statsIn)
291
                     {
292
-                        x265_log(m_param, X265_LOG_ERROR, "maxNumReferences cannot be less than 1st pass (%d vs %d)\n",
293
-                            i, m_param->maxNumReferences);
294
+                        x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n");
295
                         return false;
296
                     }
297
-                }
298
-                if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
299
-                {
300
-                    p = strstr(opts, "ctu=");
301
-                    sscanf(p, "ctu=%u", &k);
302
-                    if (k != m_param->maxCUSize)
303
+                    *statsIn = '\0';
304
+                    statsIn++;
305
+                    if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
306
                     {
307
-                        x265_log(m_param, X265_LOG_ERROR, "maxCUSize mismatch with 1st pass (%u vs %u)\n",
308
-                            k, m_param->maxCUSize);
309
+                        x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
310
                         return false;
311
                     }
312
+                    if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
313
+                    {
314
+                        x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
315
+                        return false;
316
+                    }
317
+                    if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
318
+                    {
319
+                        x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
320
+                        return false;
321
+                    }
322
+                    if (k != m_param->fpsNum || l != m_param->fpsDenom)
323
+                    {
324
+                        x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
325
+                            m_param->fpsNum, m_param->fpsDenom, k, l);
326
+                        return false;
327
+                    }
328
+                    if (m_param->analysisMultiPassRefine)
329
+                    {
330
+                        p = strstr(opts, "ref=");
331
+                        sscanf(p, "ref=%d", &i);
332
+                        if (i > m_param->maxNumReferences)
333
+                        {
334
+                            x265_log(m_param, X265_LOG_ERROR, "maxNumReferences cannot be less than 1st pass (%d vs %d)\n",
335
+                                i, m_param->maxNumReferences);
336
+                            return false;
337
+                        }
338
+                    }
339
+                    if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
340
+                    {
341
+                        p = strstr(opts, "ctu=");
342
+                        sscanf(p, "ctu=%u", &k);
343
+                        if (k != m_param->maxCUSize)
344
+                        {
345
+                            x265_log(m_param, X265_LOG_ERROR, "maxCUSize mismatch with 1st pass (%u vs %u)\n",
346
+                                k, m_param->maxCUSize);
347
+                            return false;
348
+                        }
349
+                    }
350
+                    CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth);
351
+                    CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred);
352
+                    CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
353
+                    CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
354
+                    CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
355
+                    CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
356
+                    CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
357
+                    CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
358
+                    CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
359
+                    if (m_param->bMultiPassOptRPS)
360
+                    {
361
+                        CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
362
+                        CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
363
+                        CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
364
+                    }
365
+
366
+                    if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
367
+                    {
368
+                        m_param->bFrameAdaptive = i;
369
+                    }
370
+                    else if (m_param->bframes)
371
+                    {
372
+                        x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n");
373
+                        return false;
374
+                    }
375
+
376
+                    if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i))
377
+                        m_param->lookaheadDepth = i;
378
                 }
379
-                CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth);
380
-                CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred);
381
-                CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
382
-                CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
383
-                CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
384
-                CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
385
-                CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
386
-                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
387
-                CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
388
-                if (m_param->bMultiPassOptRPS)
389
+                /* find number of pics */
390
+                p = statsIn;
391
+                int numEntries;
392
+                for (numEntries = -1; p; numEntries++)
393
+                    p = strchr(p + 1, ';');
394
+                if (!numEntries)
395
                 {
396
-                    CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
397
-                    CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
398
-                    CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
399
+                    x265_log(m_param, X265_LOG_ERROR, "empty stats file\n");
400
+                    return false;
401
                 }
402
+                m_numEntries = numEntries;
403
 
404
-                if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
405
+                if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0)
406
                 {
407
-                    m_param->bFrameAdaptive = i;
408
+                    x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
409
+                        m_param->totalFrames, m_numEntries);
410
                 }
411
-                else if (m_param->bframes)
412
+                if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
413
                 {
414
-                    x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n");
415
+                    x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
416
+                        m_param->totalFrames, m_numEntries);
417
                     return false;
418
                 }
419
 
420
-                if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i))
421
-                    m_param->lookaheadDepth = i;
422
-            }
423
-            /* find number of pics */
424
-            p = statsIn;
425
-            int numEntries;
426
-            for (numEntries = -1; p; numEntries++)
427
-                p = strchr(p + 1, ';');
428
-            if (!numEntries)
429
-            {
430
-                x265_log(m_param, X265_LOG_ERROR, "empty stats file\n");
431
-                return false;
432
-            }
433
-            m_numEntries = numEntries;
434
-
435
-            if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0)
436
-            {
437
-                x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
438
-                         m_param->totalFrames, m_numEntries);
439
-            }
440
-            if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
441
-            {
442
-                x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
443
-                         m_param->totalFrames, m_numEntries);
444
-                return false;
445
-            }
446
-
447
-            m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries);
448
-            if (!m_rce2Pass)
449
-            {
450
-                 x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
451
-                 return false;
452
-            }
453
-            m_encOrder = X265_MALLOC(int, m_numEntries);
454
-            if (!m_encOrder)
455
-            {
456
-                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
457
-                return false;
458
-            }
459
-            /* init all to skipped p frames */
460
-            for (int i = 0; i < m_numEntries; i++)
461
-            {
462
-                RateControlEntry *rce = &m_rce2Pass[i];
463
-                rce->sliceType = P_SLICE;
464
-                rce->qScale = rce->newQScale = x265_qp2qScale(20);
465
-                rce->miscBits = m_ncu + 10;
466
-                rce->newQp = 0;
467
-            }
468
-            /* read stats */
469
-            p = statsIn;
470
-            double totalQpAq = 0;
471
-            for (int i = 0; i < m_numEntries; i++)
472
-            {
473
-                RateControlEntry *rce, *rcePocOrder;
474
-                int frameNumber;
475
-                int encodeOrder;
476
-                char picType;
477
-                int e;
478
-                char *next;
479
-                double qpRc, qpAq, qNoVbv, qRceq;
480
-                next = strstr(p, ";");
481
-                if (next)
482
-                    *next++ = 0;
483
-                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
484
-                if (frameNumber < 0 || frameNumber >= m_numEntries)
485
+                m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries);
486
+                if (!m_rce2Pass)
487
                 {
488
-                    x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
489
+                    x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
490
                     return false;
491
                 }
492
-                rce = &m_rce2Pass[encodeOrder];
493
-                rcePocOrder = &m_rce2Pass[frameNumber];
494
-                m_encOrder[frameNumber] = encodeOrder;
495
-                if (!m_param->bMultiPassOptRPS)
496
-                {
497
-                    int scenecut = 0;
498
-                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf sc:%d",
499
-                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
500
-                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
501
-                        &rce->skipCuCount, &scenecut);
502
-                    rcePocOrder->scenecut = scenecut != 0;
503
+                m_encOrder = X265_MALLOC(int, m_numEntries);
504
+                if (!m_encOrder)
505
+                {
506
+                    x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
507
+                    return false;
508
                 }
509
-                else
510
+                /* init all to skipped p frames */
511
+                for (int i = 0; i < m_numEntries; i++)
512
                 {
513
-                    char deltaPOC[128];
514
-                    char bUsed[40];
515
-                    memset(deltaPOC, 0, sizeof(deltaPOC));
516
-                    memset(bUsed, 0, sizeof(bUsed));
517
-                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
518
-                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
519
-                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
520
-                        &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
521
-                    splitdeltaPOC(deltaPOC, rce);
522
-                    splitbUsed(bUsed, rce);
523
-                    rce->rpsIdx = -1;
524
-                }
525
-                rce->keptAsRef = true;
526
-                rce->isIdr = false;
527
-                if (picType == 'b' || picType == 'p')
528
-                    rce->keptAsRef = false;
529
-                if (picType == 'I')
530
-                    rce->isIdr = true;
531
-                if (picType == 'I' || picType == 'i')
532
-                    rce->sliceType = I_SLICE;
533
-                else if (picType == 'P' || picType == 'p')
534
+                    RateControlEntry *rce = &m_rce2Pass[i];
535
                     rce->sliceType = P_SLICE;
536
-                else if (picType == 'B' || picType == 'b')
537
-                    rce->sliceType = B_SLICE;
538
-                else
539
-                    e = -1;
540
-                if (e < 10)
541
+                    rce->qScale = rce->newQScale = x265_qp2qScale(20);
542
+                    rce->miscBits = m_ncu + 10;
543
+                    rce->newQp = 0;
544
+                }
545
+                /* read stats */
546
+                p = statsIn;
547
+                double totalQpAq = 0;
548
+                for (int i = 0; i < m_numEntries; i++)
549
+                {
550
+                    RateControlEntry *rce, *rcePocOrder;
551
+                    int frameNumber;
552
+                    int encodeOrder;
553
+                    char picType;
554
+                    int e;
555
+                    char *next;
556
+                    double qpRc, qpAq, qNoVbv, qRceq;
557
+                    next = strstr(p, ";");
558
+                    if (next)
559
+                        *next++ = 0;
560
+                    e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
561
+                    if (frameNumber < 0 || frameNumber >= m_numEntries)
562
+                    {
563
+                        x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
564
+                        return false;
565
+                    }
566
+                    rce = &m_rce2Pass[encodeOrder];
567
+                    rcePocOrder = &m_rce2Pass[frameNumber];
568
+                    m_encOrder[frameNumber] = encodeOrder;
569
+                    if (!m_param->bMultiPassOptRPS)
570
+                    {
571
+                        int scenecut = 0;
572
+                        e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf sc:%d",
573
+                            &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
574
+                            &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
575
+                            &rce->skipCuCount, &scenecut);
576
+                        rcePocOrder->scenecut = scenecut != 0;
577
+                    }
578
+                    else
579
+                    {
580
+                        char deltaPOC[128];
581
+                        char bUsed[40];
582
+                        memset(deltaPOC, 0, sizeof(deltaPOC));
583
+                        memset(bUsed, 0, sizeof(bUsed));
584
+                        e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
585
+                            &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
586
+                            &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
587
+                            &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
588
+                        splitdeltaPOC(deltaPOC, rce);
589
+                        splitbUsed(bUsed, rce);
590
+                        rce->rpsIdx = -1;
591
+                    }
592
+                    rce->keptAsRef = true;
593
+                    rce->isIdr = false;
594
+                    if (picType == 'b' || picType == 'p')
595
+                        rce->keptAsRef = false;
596
+                    if (picType == 'I')
597
+                        rce->isIdr = true;
598
+                    if (picType == 'I' || picType == 'i')
599
+                        rce->sliceType = I_SLICE;
600
+                    else if (picType == 'P' || picType == 'p')
601
+                        rce->sliceType = P_SLICE;
602
+                    else if (picType == 'B' || picType == 'b')
603
+                        rce->sliceType = B_SLICE;
604
+                    else
605
+                        e = -1;
606
+                    if (e < 10)
607
+                    {
608
+                        x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
609
+                        return false;
610
+                    }
611
+                    rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
612
+                    totalQpAq += qpAq;
613
+                    rce->qpNoVbv = qNoVbv;
614
+                    rce->qpaRc = qpRc;
615
+                    rce->qpAq = qpAq;
616
+                    rce->qRceq = qRceq;
617
+                    p = next;
618
+                }
619
+                X265_FREE(statsBuf);
620
+                if (m_param->rc.rateControlMode != X265_RC_CQP)
621
+                {
622
+                    m_start = 0;
623
+                    m_isQpModified = true;
624
+                    if (!initPass2())
625
+                        return false;
626
+                } /* else we're using constant quant, so no need to run the bitrate allocation */
627
+            }
628
+            else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
629
+            {
630
+                if (m_param->rc.cuTree)
631
                 {
632
-                    x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
633
-                    return false;
634
+                    if (!initCUTreeSharedMem())
635
+                    {
636
+                        return false;
637
+                    }
638
                 }
639
-                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
640
-                totalQpAq += qpAq;
641
-                rce->qpNoVbv = qNoVbv;
642
-                rce->qpaRc = qpRc;
643
-                rce->qpAq = qpAq;
644
-                rce->qRceq = qRceq;
645
-                p = next;
646
-            }
647
-            X265_FREE(statsBuf);
648
-            if (m_param->rc.rateControlMode != X265_RC_CQP)
649
-            {
650
-                m_start = 0;
651
-                m_isQpModified = true;
652
-                if (!initPass2())
653
-                    return false;
654
-            } /* else we're using constant quant, so no need to run the bitrate allocation */
655
+            }
656
         }
657
         /* Open output file */
658
         /* If input and output files are the same, output to a temp file
659
@@ -682,19 +774,29 @@
660
             X265_FREE(p);
661
             if (m_param->rc.cuTree && !m_param->rc.bStatRead)
662
             {
663
-                statFileTmpname = strcatFilename(fileName, ".cutree.temp");
664
-                if (!statFileTmpname)
665
-                    return false;
666
-                m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
667
-                X265_FREE(statFileTmpname);
668
-                if (!m_cutreeStatFileOut)
669
+                if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
670
                 {
671
-                    x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
672
-                    return false;
673
+                    statFileTmpname = strcatFilename(fileName, ".cutree.temp");
674
+                    if (!statFileTmpname)
675
+                        return false;
676
+                    m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
677
+                    X265_FREE(statFileTmpname);
678
+                    if (!m_cutreeStatFileOut)
679
+                    {
680
+                        x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
681
+                        return false;
682
+                    }
683
+                }
684
+                else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
685
+                {
686
+                    if (!initCUTreeSharedMem())
687
+                    {
688
+                        return false;
689
+                    }
690
                 }
691
             }
692
         }
693
-        if (m_param->rc.cuTree)
694
+        if (m_param->rc.cuTree && !m_cuTreeStats.qpBuffer[0])
695
         {
696
             if (m_param->rc.qgSize == 8)
697
             {
698
@@ -714,6 +816,10 @@
699
     return true;
700
 }
701
 
702
+void RateControl::skipCUTreeSharedMemRead(int32_t cnt)
703
+{
704
+    m_cutreeShrMem->skipRead(cnt);
705
+}
706
 void RateControl::reconfigureRC()
707
 {
708
     if (m_isVbv)
709
@@ -806,7 +912,7 @@
710
 
711
     TimingInfo *time = &sps.vuiParameters.timingInfo;
712
     int maxCpbOutputDelay = (int)(X265_MIN(m_param->keyframeMax * MAX_DURATION * time->timeScale / time->numUnitsInTick, INT_MAX));
713
-    int maxDpbOutputDelay = (int)(sps.maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick);
714
+    int maxDpbOutputDelay = (int)(sps.maxDecPicBuffering[sps.maxTempSubLayers - 1] * MAX_DURATION * time->timeScale / time->numUnitsInTick);
715
     int maxDelay = (int)(90000.0 * cpbSizeUnscale / bitRateUnscale + 0.5);
716
 
717
     hrd->initialCpbRemovalDelayLength = 2 + x265_clip3(4, 22, 32 - calcLength(maxDelay));
718
@@ -1000,125 +1106,103 @@
719
 {
720
     uint64_t allConstBits = 0, allCodedBits = 0;
721
     uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
722
-    int startIndex, framesCount, endIndex;
723
+    int startIndex, endIndex;
724
     int fps = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
725
-    startIndex = endIndex = framesCount = 0;
726
-    int diffQp = 0;
727
+    int distance = fps << 1;
728
+    distance = distance > m_param->keyframeMax ? (m_param->keyframeMax << 1) : m_param->keyframeMax;
729
+    startIndex = endIndex = 0;
730
     double targetBits = 0;
731
     double expectedBits = 0;
732
-    for (startIndex = m_start, endIndex = m_start; endIndex < m_numEntries; endIndex++)
733
+    double targetBits2 = 0;
734
+    double expectedBits2 = 0;
735
+    double cpxSum = 0;
736
+    double cpxSum2 = 0;
737
+
738
+    if (m_param->rc.rateControlMode == X265_RC_ABR)
739
     {
740
-        allConstBits += m_rce2Pass[endIndex].miscBits;
741
-        allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
742
-        if (m_param->rc.rateControlMode == X265_RC_CRF)
743
+        for (endIndex = m_start; endIndex < m_numEntries; endIndex++)
744
         {
745
-            framesCount = endIndex - startIndex + 1;
746
-            diffQp += int (m_rce2Pass[endIndex].qpaRc - m_rce2Pass[endIndex].qpNoVbv);
747
-            if (framesCount > fps)
748
-                diffQp -= int (m_rce2Pass[endIndex - fps].qpaRc - m_rce2Pass[endIndex - fps].qpNoVbv);
749
-            if (framesCount >= fps)
750
-            {
751
-                if (diffQp >= 1)
752
-                {
753
-                    if (!m_isQpModified && endIndex > fps)
754
-                    {
755
-                        double factor = 2;
756
-                        double step = 0;
757
-                        if (endIndex + fps >= m_numEntries)
758
-                        {
759
-                            m_start = endIndex - (endIndex % fps);
760
-                            return true;
761
-                        }
762
-                        for (int start = endIndex + 1; start <= endIndex + fps && start < m_numEntries; start++)
763
-                        {
764
-                            RateControlEntry *rce = &m_rce2Pass[start];
765
-                            targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
766
-                            expectedBits += qScale2bits(rce, rce->qScale);
767
-                        }
768
-                        if (expectedBits < 0.95 * targetBits)
769
-                        {
770
-                            m_isQpModified = true;
771
-                            m_isGopReEncoded = true;
772
-                            while (endIndex + fps < m_numEntries)
773
-                            {
774
-                                step = pow(2, factor / 6.0);
775
-                                expectedBits = 0;
776
-                                for (int start = endIndex + 1; start <= endIndex + fps; start++)
777
-                                {
778
-                                    RateControlEntry *rce = &m_rce2Pass[start];
779
-                                    rce->newQScale = rce->qScale / step;
780
-                                    X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
781
-                                    expectedBits += qScale2bits(rce, rce->newQScale);
782
-                                    rce->newQp = x265_qScale2qp(rce->newQScale);
783
-                                }
784
-                                if (expectedBits >= targetBits && step > 1)
785
-                                    factor *= 0.90;
786
-                                else
787
-                                    break;
788
-                            }
789
-
790
-                            if (m_isVbv && endIndex + fps < m_numEntries)
791
-                                if (!vbv2Pass((uint64_t)targetBits, endIndex + fps, endIndex + 1))
792
-                                    return false;
793
-
794
-                            targetBits = 0;
795
-                            expectedBits = 0;
796
-
797
-                            for (int start = endIndex - fps + 1; start <= endIndex; start++)
798
-                            {
799
-                                RateControlEntry *rce = &m_rce2Pass[start];
800
-                                targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
801
-                            }
802
-                            while (1)
803
-                            {
804
-                                step = pow(2, factor / 6.0);
805
-                                expectedBits = 0;
806
-                                for (int start = endIndex - fps + 1; start <= endIndex; start++)
807
-                                {
808
-                                    RateControlEntry *rce = &m_rce2Pass[start];
809
-                                    rce->newQScale = rce->qScale * step;
810
-                                    X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
811
-                                    expectedBits += qScale2bits(rce, rce->newQScale);
812
-                                    rce->newQp = x265_qScale2qp(rce->newQScale);
813
-                                }
814
-                                if (expectedBits > targetBits && step > 1)
815
-                                    factor *= 1.1;
816
-                                else
817
-                                     break;
818
-                            }
819
-                            if (m_isVbv)
820
-                                if (!vbv2Pass((uint64_t)targetBits, endIndex, endIndex - fps + 1))
821
-                                    return false;
822
-                            diffQp = 0;
823
-                            m_reencode = endIndex - fps + 1;
824
-                            endIndex = endIndex + fps;
825
-                            startIndex = endIndex + 1;
826
-                            m_start = startIndex;
827
-                            targetBits = expectedBits = 0;
828
-                        }
829
-                        else
830
-                            targetBits = expectedBits = 0;
831
-                    }
832
-                }
833
-                else
834
-                    m_isQpModified = false;
835
-            }
836
+            allConstBits += m_rce2Pass[endIndex].miscBits;
837
+            allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
838
         }
839
-    }
840
 
841
-    if (m_param->rc.rateControlMode == X265_RC_ABR)
842
-    {
843
         if (allAvailableBits < allConstBits)
844
         {
845
             x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
846
-                     (int)(allConstBits * m_fps / framesCount * 1000.));
847
+                (int)(allConstBits * m_fps / (m_numEntries - m_start) * 1000.));
848
             return false;
849
         }
850
         if (!analyseABR2Pass(allAvailableBits))
851
             return false;
852
+
853
+        return true;
854
+    }
855
+
856
+    if (m_isQpModified)
857
+    {
858
+        return true;
859
+    }
860
+
861
+    if (m_start + (fps << 1) > m_numEntries)
862
+    {
863
+        return true;
864
+    }
865
+
866
+    for (startIndex = m_start, endIndex = m_numEntries - 1; startIndex < endIndex; startIndex++, endIndex--)
867
+    {
868
+        cpxSum += m_rce2Pass[startIndex].qScale / m_rce2Pass[startIndex].coeffBits;
869
+        cpxSum2 += m_rce2Pass[endIndex].qScale / m_rce2Pass[endIndex].coeffBits;
870
+
871
+        RateControlEntry *rce = &m_rce2Pass[startIndex];
872
+        targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
873
+        expectedBits += qScale2bits(rce, rce->qScale);
874
+
875
+        rce = &m_rce2Pass[endIndex];
876
+        targetBits2 += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
877
+        expectedBits2 += qScale2bits(rce, rce->qScale);
878
     }
879
 
880
-    m_start = X265_MAX(m_start, endIndex - fps);
881
+    if (expectedBits < 0.95 * targetBits || expectedBits2 < 0.95 * targetBits2)
882
+    {
883
+        if (cpxSum / cpxSum2 < 0.95 || cpxSum2 / cpxSum < 0.95)
884
+        {
885
+            m_isQpModified = true;
886
+            m_isGopReEncoded = true;
887
+
888
+            m_shortTermCplxSum = 0;
889
+            m_shortTermCplxCount = 0;
890
+            m_framesDone = m_start;
891
+
892
+            for (startIndex = m_start; startIndex < m_numEntries; startIndex++)
893
+            {
894
+                m_shortTermCplxSum *= 0.5;
895
+                m_shortTermCplxCount *= 0.5;
896
+                m_shortTermCplxSum += m_rce2Pass[startIndex].currentSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION);
897
+                m_shortTermCplxCount++;
898
+            }
899
+
900
+            m_bufferFill = m_rce2Pass[m_start - 1].bufferFill;
901
+            m_bufferFillFinal = m_rce2Pass[m_start - 1].bufferFillFinal;
902
+            m_bufferFillActual = m_rce2Pass[m_start - 1].bufferFillActual;
903
+
904
+            m_reencode = m_start;
905
+            m_start = m_numEntries;
906
+        }
907
+        else
908
+        {
909
+
910
+            m_isQpModified = false;
911
+            m_isGopReEncoded = false;
912
+        }
913
+    }
914
+    else
915
+    {
916
+
917
+        m_isQpModified = false;
918
+        m_isGopReEncoded = false;
919
+    }
920
+
921
+    m_start = X265_MAX(m_start, m_numEntries - distance + m_param->keyframeMax);
922
 
923
     return true;
924
 }
925
@@ -1271,6 +1355,16 @@
926
     m_predType = getPredictorType(curFrame->m_lowres.sliceType, m_sliceType);
927
     rce->poc = m_curSlice->m_poc;
928
 
929
+    if (m_param->bEnableSBRC)
930
+    {
931
+        if (rce->poc == 0 || (m_framesDone % m_param->keyframeMax == 0))
932
+        {
933
+            //Reset SBRC buffer
934
+            m_encodedSegmentBits = 0;
935
+            m_segDur = 0;
936
+        }
937
+    }
938
+
939
     if (!m_param->bResetZoneConfig && (rce->encodeOrder % m_param->reconfigWindowSize == 0))
940
     {
941
         int index = m_zoneBufferIdx % m_param->rc.zonefileCount;
942
@@ -1304,7 +1398,8 @@
943
             {
944
                 m_param = m_param->rc.zones[i].zoneParam;
945
                 reconfigureRC();
946
-                init(*m_curSlice->m_sps);
947
+                if (!m_param->bNoResetZoneConfig)
948
+                    init(*m_curSlice->m_sps);
949
             }
950
         }
951
     }
952
@@ -1391,15 +1486,57 @@
953
             rce->frameSizeMaximum *= m_param->maxAUSizeFactor;
954
         }
955
     }
956
+
957
+    ///< regenerate the qp
958
     if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
959
     {
960
-        rce->qpPrev = x265_qScale2qp(rce->qScale);
961
-        rce->qScale = rce->newQScale;
962
-        rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
963
-        m_qp = int(rce->qpaRc + 0.5);
964
-        rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
965
-        m_framesDone++;
966
-        return m_qp;
967
+        if (!m_param->rc.bEncFocusedFramesOnly)
968
+        {
969
+            rce->qpPrev = x265_qScale2qp(rce->qScale);
970
+            if (m_param->bEnableSceneCutAwareQp)
971
+            {
972
+                double lqmin = m_lmin[m_sliceType];
973
+                double lqmax = m_lmax[m_sliceType];
974
+                if (m_param->bEnableSceneCutAwareQp & FORWARD)
975
+                    rce->newQScale = forwardMasking(curFrame, rce->newQScale);
976
+                if (m_param->bEnableSceneCutAwareQp & BACKWARD)
977
+                    rce->newQScale = backwardMasking(curFrame, rce->newQScale);
978
+                rce->newQScale = x265_clip3(lqmin, lqmax, rce->newQScale);
979
+            }
980
+            rce->qScale = rce->newQScale;
981
+            rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
982
+            m_qp = int(rce->qpaRc + 0.5);
983
+            rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
984
+            m_framesDone++;
985
+            return m_qp;
986
+        }
987
+        else
988
+        { 
989
+            int index = m_encOrder[rce->poc];
990
+            index++;
991
+            double totalDuration = m_frameDuration;
992
+            for (int j = 0; totalDuration < 1.0 && index < m_numEntries; j++)
993
+            {
994
+                switch (m_rce2Pass[index].sliceType)
995
+                {
996
+                case B_SLICE:
997
+                    curFrame->m_lowres.plannedType[j] = m_rce2Pass[index].keptAsRef ? X265_TYPE_BREF : X265_TYPE_B;
998
+                    break;
999
+                case P_SLICE:
1000
+                    curFrame->m_lowres.plannedType[j] = X265_TYPE_P;
1001
+                    break;
1002
+                case I_SLICE:
1003
+                    curFrame->m_lowres.plannedType[j] = m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
1004
+                    break;
1005
+                default:
1006
+                    break;
1007
+                }
1008
+
1009
+                curFrame->m_lowres.plannedSatd[j] = m_rce2Pass[index].currentSatd;
1010
+                totalDuration += m_frameDuration;
1011
+                index++;
1012
+            }
1013
+        }
1014
     }
1015
 
1016
     if (m_isAbr || m_2pass) // ABR,CRF
1017
@@ -1655,10 +1792,25 @@
1018
             {
1019
                 m_cuTreeStats.qpBufPos++;
1020
 
1021
-                if (!fread(&type, 1, 1, m_cutreeStatFileIn))
1022
-                    goto fail;
1023
-                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
1024
-                    goto fail;
1025
+                if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
1026
+                {
1027
+                    if (!fread(&type, 1, 1, m_cutreeStatFileIn))
1028
+                        goto fail;
1029
+                    if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
1030
+                        goto fail;
1031
+                }
1032
+                else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
1033
+                {
1034
+                    if (!m_cutreeShrMem)
1035
+                    {
1036
+                        goto fail;
1037
+                    }
1038
+
1039
+                    CUTreeSharedDataItem shrItem;
1040
+                    shrItem.type = &type;
1041
+                    shrItem.stats = m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos];
1042
+                    m_cutreeShrMem->readNext(&shrItem, ReadSharedCUTreeData);
1043
+                }
1044
 
1045
                 if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1)
1046
                 {
1047
@@ -1785,7 +1937,7 @@
1048
         m_sliderPos++;
1049
     }
1050
 
1051
-    if (m_sliceType == B_SLICE)
1052
+    if((!m_param->bEnableSBRC && m_sliceType == B_SLICE) || (m_param->bEnableSBRC && !IS_REFERENCED(curFrame)))
1053
     {
1054
         /* B-frames don't have independent rate control, but rather get the
1055
          * average QP of the two adjacent P-frames + an offset */
1056
@@ -1836,8 +1988,16 @@
1057
             double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN); 
1058
             m_lastQScaleForP_SLICE = X265_MAX(minScenecutQscale, m_lastQScaleForP_SLICE);
1059
         }
1060
+
1061
         double qScale = x265_qp2qScale(q);
1062
         rce->qpNoVbv = q;
1063
+
1064
+        if (m_param->bEnableSBRC)
1065
+        {
1066
+            qScale = tuneQscaleForSBRC(curFrame, qScale);
1067
+            rce->qpNoVbv = x265_qScale2qp(qScale);
1068
+        }
1069
+
1070
         double lmin = 0, lmax = 0;
1071
         if (m_isGrainEnabled && m_isFirstMiniGop)
1072
         {
1073
@@ -1890,7 +2050,7 @@
1074
                 qScale = x265_clip3(lqmin, lqmax, qScale);
1075
             }
1076
 
1077
-            if (!m_2pass || m_param->bliveVBV2pass)
1078
+            if (!m_2pass || m_param->bliveVBV2pass || (m_2pass && m_param->rc.rateControlMode == X265_RC_CRF && m_param->rc.bEncFocusedFramesOnly))
1079
             {
1080
                 /* clip qp to permissible range after vbv-lookahead estimation to avoid possible 
1081
                  * mispredictions by initial frame size predictors */
1082
@@ -1927,7 +2087,7 @@
1083
     else
1084
     {
1085
         double abrBuffer = 2 * m_rateTolerance * m_bitrate;
1086
-        if (m_2pass)
1087
+        if (m_2pass && (m_param->rc.rateControlMode != X265_RC_CRF || !m_param->rc.bEncFocusedFramesOnly))
1088
         {
1089
             double lmin = m_lminm_sliceType;
1090
             double lmax = m_lmaxm_sliceType;
1091
@@ -2057,6 +2217,19 @@
1092
 
1093
             if (m_param->rc.rateControlMode == X265_RC_CRF)
1094
             {
1095
+                if (m_param->bEnableSBRC)
1096
+                {
1097
+                    double rfConstant = m_param->rc.rfConstant;
1098
+                    if (m_currentSatd < rce->movingAvgSum)
1099
+                        rfConstant += 2;
1100
+                    double ipOffset = (curFrame->m_lowres.bScenecut ? m_ipOffset : m_ipOffset / 2.0);
1101
+                    rfConstant = (rce->sliceType == I_SLICE ? rfConstant - ipOffset :
1102
+                        (rce->sliceType == B_SLICE ? rfConstant + m_pbOffset : rfConstant));
1103
+                    double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0;
1104
+                    double qComp = (m_param->rc.cuTree && !m_param->rc.hevcAq) ? 0.99 : m_param->rc.qCompress;
1105
+                    m_rateFactorConstant = pow(m_currentSatd, 1.0 - qComp) /
1106
+                        x265_qp2qScale(rfConstant + mbtree_offset);
1107
+                }
1108
                 q = getQScale(rce, m_rateFactorConstant);
1109
                 x265_zone* zone = getZone();
1110
                 if (zone)
1111
@@ -2082,7 +2255,7 @@
1112
                 }
1113
                 double tunedQScale = tuneAbrQScaleFromFeedback(initialQScale);
1114
                 overflow = tunedQScale / initialQScale;
1115
-                q = !m_partialResidualFrames? tunedQScale : initialQScale;
1116
+                q = !m_partialResidualFrames ? tunedQScale : initialQScale;
1117
                 bool isEncodeEnd = (m_param->totalFrames && 
1118
                     m_framesDone > 0.75 * m_param->totalFrames) ? 1 : 0;
1119
                 bool isEncodeBeg = m_framesDone < (int)(m_fps + 0.5);
1120
@@ -2138,6 +2311,9 @@
1121
                 q = X265_MAX(minScenecutQscale, q);
1122
                 m_lastQScaleForP_SLICE = X265_MAX(minScenecutQscale, m_lastQScaleForP_SLICE);
1123
             }
1124
+            if (m_param->bEnableSBRC)
1125
+                q = tuneQscaleForSBRC(curFrame, q);
1126
+
1127
             rce->qpNoVbv = x265_qScale2qp(q);
1128
             if (m_sliceType == P_SLICE)
1129
             {
1130
@@ -2319,6 +2495,43 @@
1131
     return (p->coeff * var + p->offset) / (q * p->count);
1132
 }
1133
 
1134
+double RateControl::tuneQscaleForSBRC(Frame* curFrame, double q)
1135
+{
1136
+    int depth = 0;
1137
+    int framesDoneInSeg = m_framesDone % m_param->keyframeMax;
1138
+    if (framesDoneInSeg + m_param->lookaheadDepth <= m_param->keyframeMax)
1139
+        depth = m_param->lookaheadDepth;
1140
+    else
1141
+        depth = m_param->keyframeMax - framesDoneInSeg;
1142
+    for (int iterations = 0; iterations < 1000; iterations++)
1143
+    {
1144
+        double totalDuration = m_segDur;
1145
+        double frameBitsTotal = m_encodedSegmentBits + predictSize(&m_pred[m_predType], q, (double)m_currentSatd);
1146
+        for (int i = 0; i < depth; i++)
1147
+        {
1148
+            int type = curFrame->m_lowres.plannedType[i];
1149
+            if (type == X265_TYPE_AUTO)
1150
+                break;
1151
+            int64_t satd = curFrame->m_lowres.plannedSatd[i] >> (X265_DEPTH - 8);
1152
+            type = IS_X265_TYPE_I(curFrame->m_lowres.plannedType[i]) ? I_SLICE : IS_X265_TYPE_B(curFrame->m_lowres.plannedType[i]) ? B_SLICE : P_SLICE;
1153
+            int predType = getPredictorType(curFrame->m_lowres.plannedType[i], type);
1154
+            double curBits = predictSize(&m_pred[predType], q, (double)satd);
1155
+            frameBitsTotal += curBits;
1156
+            totalDuration += m_frameDuration;
1157
+        }
1158
+        //Check for segment buffer overflow and adjust QP accordingly
1159
+        double segDur = m_param->keyframeMax / m_fps;
1160
+        double allowedSize = m_vbvMaxRate * segDur;
1161
+        double remDur = segDur - totalDuration;
1162
+        double remainingBits = frameBitsTotal / totalDuration * remDur;
1163
+        if (frameBitsTotal + remainingBits > 0.9 * allowedSize)
1164
+            q = q * 1.01;
1165
+        else
1166
+            break;
1167
+    }
1168
+    return q;
1169
+}
1170
+
1171
 double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
1172
 {
1173
     // B-frames are not directly subject to VBV,
1174
@@ -2395,7 +2608,7 @@
1175
                     {
1176
                         finalDur = x265_clip3(0.4, 1.0, totalDuration);
1177
                     }
1178
-                    targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (1 - m_minBufferFill * finalDur));
1179
+                    targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (m_minBufferFill * finalDur));
1180
                     if (bufferFillCur < targetFill)
1181
                     {
1182
                         q *= 1.01;
1183
@@ -2828,7 +3041,7 @@
1184
 
1185
     if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion || bEnableDistOffset)
1186
     {
1187
-        if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
1188
+        if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF && !m_param->rc.bEncFocusedFramesOnly))
1189
         {
1190
             double avgQpRc = 0;
1191
             /* determine avg QP decided by VBV rate control */
1192
@@ -2862,8 +3075,9 @@
1193
     if (m_param->rc.rateControlMode == X265_RC_CRF)
1194
     {
1195
         double crfVal, qpRef = curEncData.m_avgQpRc;
1196
+
1197
         bool is2passCrfChange = false;
1198
-        if (m_2pass)
1199
+        if (m_2pass && !m_param->rc.bEncFocusedFramesOnly)
1200
         {
1201
             if (fabs(curEncData.m_avgQpRc - rce->qpPrev) > 0.1)
1202
             {
1203
@@ -2921,6 +3135,8 @@
1204
         m_wantedBitsWindow += m_frameDuration * m_bitrate;
1205
         m_totalBits += bits - rce->rowTotalBits;
1206
         m_encodedBits += actualBits;
1207
+        m_encodedSegmentBits += actualBits;
1208
+        m_segDur += m_frameDuration;
1209
         int pos = m_sliderPos - m_param->frameNumThreads;
1210
         if (pos >= 0)
1211
             m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
1212
@@ -3048,10 +3264,26 @@
1213
     {
1214
         uint8_t sliceType = (uint8_t)rce->sliceType;
1215
         primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, ncu);
1216
-        if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
1217
-            goto writeFailure;
1218
-        if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
1219
-            goto writeFailure;
1220
+
1221
+        if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
1222
+        {
1223
+            if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
1224
+                goto writeFailure;
1225
+            if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
1226
+                goto writeFailure;
1227
+        }
1228
+        else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
1229
+        {
1230
+            if (!m_cutreeShrMem)
1231
+            {
1232
+                goto writeFailure;
1233
+            }
1234
+
1235
+            CUTreeSharedDataItem shrItem;
1236
+            shrItem.type = &sliceType;
1237
+            shrItem.stats = m_cuTreeStats.qpBuffer[0];
1238
+            m_cutreeShrMem->writeData(&shrItem, WriteSharedCUTreeData);
1239
+        } 
1240
     }
1241
     return 0;
1242
 
1243
@@ -3127,6 +3359,13 @@
1244
     if (m_cutreeStatFileIn)
1245
         fclose(m_cutreeStatFileIn);
1246
 
1247
+    if (m_cutreeShrMem)
1248
+    {
1249
+        m_cutreeShrMem->release();
1250
+        delete m_cutreeShrMem;
1251
+        m_cutreeShrMem = NULL;
1252
+    }
1253
+
1254
     X265_FREE(m_rce2Pass);
1255
     X265_FREE(m_encOrder);
1256
     for (int i = 0; i < 2; i++)
1257
@@ -3186,13 +3425,20 @@
1258
 double RateControl::forwardMasking(Frame* curFrame, double q)
1259
 {
1260
     double qp = x265_qScale2qp(q);
1261
-    uint32_t maxWindowSize = uint32_t((m_param->fwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
1262
-    uint32_t windowSize = maxWindowSize / 3;
1263
+    uint32_t maxWindowSize = uint32_t((m_param->fwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
1264
+    uint32_t windowSize[6], prevWindow = 0;
1265
     int lastScenecut = m_top->m_rateControl->m_lastScenecut;
1266
-    int lastIFrame = m_top->m_rateControl->m_lastScenecutAwareIFrame;
1267
-    double fwdRefQpDelta = double(m_param->fwdRefQpDelta);
1268
-    double fwdNonRefQpDelta = double(m_param->fwdNonRefQpDelta);
1269
-    double sliceTypeDelta = SLICE_TYPE_DELTA * fwdRefQpDelta;
1270
+
1271
+    double fwdRefQpDelta[6], fwdNonRefQpDelta[6], sliceTypeDelta[6];
1272
+    for (int i = 0; i < 6; i++)
1273
+    {
1274
+        windowSize[i] = prevWindow + (uint32_t((m_param->fwdScenecutWindow[i] / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5));
1275
+        fwdRefQpDelta[i] = double(m_param->fwdRefQpDelta[i]);
1276
+        fwdNonRefQpDelta[i] = double(m_param->fwdNonRefQpDelta[i]);
1277
+        sliceTypeDelta[i] = SLICE_TYPE_DELTA * fwdRefQpDelta[i];
1278
+        prevWindow = windowSize[i];
1279
+    }
1280
+
1281
 
1282
     //Check whether the current frame is within the forward window
1283
     if (curFrame->m_poc > lastScenecut && curFrame->m_poc <= (lastScenecut + int(maxWindowSize)))
1284
@@ -3205,45 +3451,51 @@
1285
         }
1286
         else if (curFrame->m_lowres.sliceType == X265_TYPE_P)
1287
         {
1288
-            if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1289
-                && curFrame->m_poc >= lastIFrame))
1290
-            {
1291
-                //Add offsets corresponding to the window in which the P-frame occurs
1292
-                if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1293
-                    qp += WINDOW1_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1294
-                else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1295
-                    qp += WINDOW2_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1296
-                else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1297
-                    qp += WINDOW3_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1298
-            }
1299
+            //Add offsets corresponding to the window in which the P-frame occurs
1300
+            if (curFrame->m_poc <= (lastScenecut + int(windowSize[0])))
1301
+                qp += fwdRefQpDelta[0] - sliceTypeDelta[0];
1302
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[0]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[1]))))
1303
+                qp += fwdRefQpDelta[1] - sliceTypeDelta[1];
1304
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[1]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[2]))))
1305
+                qp += fwdRefQpDelta[2] - sliceTypeDelta[2];
1306
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[2]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[3]))))
1307
+                qp += fwdRefQpDelta[3] - sliceTypeDelta[3];
1308
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[3]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[4]))))
1309
+                qp += fwdRefQpDelta[4] - sliceTypeDelta[4];
1310
+            else if (curFrame->m_poc > lastScenecut + int(windowSize[4]))
1311
+                qp += fwdRefQpDelta[5] - sliceTypeDelta[5];
1312
         }
1313
         else if (curFrame->m_lowres.sliceType == X265_TYPE_BREF)
1314
         {
1315
-            if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1316
-                && curFrame->m_poc >= lastIFrame))
1317
-            {
1318
-                //Add offsets corresponding to the window in which the B-frame occurs
1319
-                if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1320
-                    qp += WINDOW1_DELTA * fwdRefQpDelta;
1321
-                else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1322
-                    qp += WINDOW2_DELTA * fwdRefQpDelta;
1323
-                else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1324
-                    qp += WINDOW3_DELTA * fwdRefQpDelta;
1325
-            }
1326
+            //Add offsets corresponding to the window in which the B-frame occurs
1327
+            if (curFrame->m_poc <= (lastScenecut + int(windowSize[0])))
1328
+                qp += fwdRefQpDelta[0];
1329
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[0]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[1]))))
1330
+                qp += fwdRefQpDelta[1];
1331
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[1]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[2]))))
1332
+                qp += fwdRefQpDelta[2];
1333
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[2]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[3]))))
1334
+                qp += fwdRefQpDelta[3];
1335
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[3]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[4]))))
1336
+                qp += fwdRefQpDelta[4];
1337
+            else if (curFrame->m_poc > lastScenecut + int(windowSize[4]))
1338
+                qp += fwdRefQpDelta[5];
1339
         }
1340
         else if (curFrame->m_lowres.sliceType == X265_TYPE_B)
1341
         {
1342
-            if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1343
-                && curFrame->m_poc >= lastIFrame))
1344
-            {
1345
-                //Add offsets corresponding to the window in which the b-frame occurs
1346
-                if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1347
-                    qp += WINDOW1_DELTA * fwdNonRefQpDelta;
1348
-                else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1349
-                    qp += WINDOW2_DELTA * fwdNonRefQpDelta;
1350
-                else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1351
-                    qp += WINDOW3_DELTA * fwdNonRefQpDelta;
1352
-            }
1353
+            //Add offsets corresponding to the window in which the b-frame occurs
1354
+            if (curFrame->m_poc <= (lastScenecut + int(windowSize[0])))
1355
+                qp += fwdNonRefQpDelta[0];
1356
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[0]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[1]))))
1357
+                qp += fwdNonRefQpDelta[1];
1358
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[1]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[2]))))
1359
+                qp += fwdNonRefQpDelta[2];
1360
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[2]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[3]))))
1361
+                qp += fwdNonRefQpDelta[3];
1362
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[3]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[4]))))
1363
+                qp += fwdNonRefQpDelta[4];
1364
+            else if (curFrame->m_poc > lastScenecut + int(windowSize[4]))
1365
+                qp += fwdNonRefQpDelta[5];
1366
         }
1367
     }
1368
 
1369
@@ -3252,24 +3504,75 @@
1370
 double RateControl::backwardMasking(Frame* curFrame, double q)
1371
 {
1372
     double qp = x265_qScale2qp(q);
1373
-    double fwdRefQpDelta = double(m_param->fwdRefQpDelta);
1374
-    double bwdRefQpDelta = double(m_param->bwdRefQpDelta);
1375
-    double bwdNonRefQpDelta = double(m_param->bwdNonRefQpDelta);
1376
+    uint32_t windowSize[6], prevWindow = 0;
1377
+    int lastScenecut = m_top->m_rateControl->m_lastScenecut;
1378
 
1379
-    if (curFrame->m_isInsideWindow == BACKWARD_WINDOW)
1380
+    double bwdRefQpDelta[6], bwdNonRefQpDelta[6], sliceTypeDelta[6];
1381
+    for (int i = 0; i < 6; i++)
1382
     {
1383
-        if (bwdRefQpDelta < 0)
1384
-            bwdRefQpDelta = WINDOW3_DELTA * fwdRefQpDelta;
1385
-        double sliceTypeDelta = SLICE_TYPE_DELTA * bwdRefQpDelta;
1386
-        if (bwdNonRefQpDelta < 0)
1387
-            bwdNonRefQpDelta = bwdRefQpDelta + sliceTypeDelta;
1388
+        windowSize[i] = prevWindow + (uint32_t((m_param->bwdScenecutWindow[i] / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5));
1389
+        prevWindow = windowSize[i];
1390
+        bwdRefQpDelta[i] = double(m_param->bwdRefQpDelta[i]);
1391
+        bwdNonRefQpDelta[i] = double(m_param->bwdNonRefQpDelta[i]);
1392
+
1393
+        if (bwdRefQpDelta[i] < 0)
1394
+            bwdRefQpDelta[i] = BWD_WINDOW_DELTA * m_param->fwdRefQpDelta[i];
1395
+        sliceTypeDelta[i] = SLICE_TYPE_DELTA * bwdRefQpDelta[i];
1396
+
1397
+        if (bwdNonRefQpDelta[i] < 0)
1398
+            bwdNonRefQpDelta[i] = bwdRefQpDelta[i] + sliceTypeDelta[i];
1399
+    }
1400
 
1401
+    if (curFrame->m_isInsideWindow == BACKWARD_WINDOW)
1402
+    {
1403
         if (curFrame->m_lowres.sliceType == X265_TYPE_P)
1404
-            qp += bwdRefQpDelta - sliceTypeDelta;
1405
+        {
1406
+            //Add offsets corresponding to the window in which the P-frame occurs
1407
+            if (curFrame->m_poc >= (lastScenecut - int(windowSize[0])))
1408
+                qp += bwdRefQpDelta[0] - sliceTypeDelta[0];
1409
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[0]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[1]))))
1410
+                qp += bwdRefQpDelta[1] - sliceTypeDelta[1];
1411
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[1]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[2]))))
1412
+                qp += bwdRefQpDelta[2] - sliceTypeDelta[2];
1413
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[2]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[3]))))
1414
+                qp += bwdRefQpDelta[3] - sliceTypeDelta[3];
1415
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[3]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[4]))))
1416
+                qp += bwdRefQpDelta[4] - sliceTypeDelta[4];
1417
+            else if (curFrame->m_poc < lastScenecut - int(windowSize[4]))
1418
+                qp += bwdRefQpDelta[5] - sliceTypeDelta[5];
1419
+        }
1420
         else if (curFrame->m_lowres.sliceType == X265_TYPE_BREF)
1421
-            qp += bwdRefQpDelta;
1422
+        {
1423
+            //Add offsets corresponding to the window in which the B-frame occurs
1424
+            if (curFrame->m_poc >= (lastScenecut - int(windowSize[0])))
1425
+                qp += bwdRefQpDelta[0];
1426
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[0]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[1]))))
1427
+                qp += bwdRefQpDelta[1];
1428
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[1]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[2]))))
1429
+                qp += bwdRefQpDelta[2];
1430
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[2]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[3]))))
1431
+                qp += bwdRefQpDelta[3];
1432
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[3]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[4]))))
1433
+                qp += bwdRefQpDelta[4];
1434
+            else if (curFrame->m_poc < lastScenecut - int(windowSize[4]))
1435
+                qp += bwdRefQpDelta[5];
1436
+        }
1437
         else if (curFrame->m_lowres.sliceType == X265_TYPE_B)
1438
-            qp += bwdNonRefQpDelta;
1439
+        {
1440
+            //Add offsets corresponding to the window in which the b-frame occurs
1441
+            if (curFrame->m_poc >= (lastScenecut - int(windowSize[0])))
1442
+                qp += bwdNonRefQpDelta[0];
1443
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[0]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[1]))))
1444
+                qp += bwdNonRefQpDelta[1];
1445
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[1]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[2]))))
1446
+                qp += bwdNonRefQpDelta[2];
1447
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[2]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[3]))))
1448
+                qp += bwdNonRefQpDelta[3];
1449
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[3]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[4]))))
1450
+                qp += bwdNonRefQpDelta[4];
1451
+            else if (curFrame->m_poc < lastScenecut - int(windowSize[4]))
1452
+                qp += bwdNonRefQpDelta[5];
1453
+        }
1454
     }
1455
 
1456
     return x265_qp2qScale(qp);
1457
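
For reference: the reworked scenecut-aware QP path above drops the fixed WINDOW1/2/3_DELTA offsets and instead takes six per-window durations and QP deltas from --masking-strength, accumulates the durations (given in milliseconds) into cumulative frame-count boundaries, and applies the delta of the window that contains the frame's distance from the last scenecut. A minimal standalone sketch of that selection logic follows; the names, the fps value and the example window/delta tables are placeholders, not the x265 API.

#include <cstdint>
#include <cstdio>

// Sketch only: pick the forward-masking QP delta for a frame that is
// pocOffset frames past the last scenecut, given six window durations in
// milliseconds (as with --masking-strength) and six per-window deltas.
static double pickFwdDelta(int pocOffset, const int windowMs[6],
                           const double qpDelta[6], double fps)
{
    uint32_t windowSize[6], prev = 0;
    for (int i = 0; i < 6; i++)
    {
        windowSize[i] = prev + uint32_t(windowMs[i] / 1000.0 * fps + 0.5);
        prev = windowSize[i];
    }
    for (int i = 0; i < 5; i++)
        if (pocOffset <= int(windowSize[i]))
            return qpDelta[i];
    return qpDelta[5];            // past the fifth boundary: last window applies
}

int main()
{
    const int    win[6]   = { 1000, 1000, 1000, 1000, 1000, 1000 }; // example values
    const double delta[6] = { 5.0, 4.0, 3.0, 2.0, 1.0, 0.5 };       // example values
    printf("delta at +10 frames (25 fps): %.1f\n", pickFwdDelta(10, win, delta, 25.0));
    return 0;
}
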
x265_3.5.tar.gz/source/encoder/ratecontrol.h -> x265_3.6.tar.gz/source/encoder/ratecontrol.h Changed
90
 
1
@@ -28,6 +28,7 @@
2
 
3
 #include "common.h"
4
 #include "sei.h"
5
+#include "ringmem.h"
6
 
7
 namespace X265_NS {
8
 // encoder namespace
9
@@ -46,11 +47,6 @@
10
 #define MIN_AMORTIZE_FRACTION 0.2
11
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
12
 
13
-/*Scenecut Aware QP*/
14
-#define WINDOW1_DELTA           1.0 /* The offset for the frames coming in the window-1*/
15
-#define WINDOW2_DELTA           0.7 /* The offset for the frames coming in the window-2*/
16
-#define WINDOW3_DELTA           0.4 /* The offset for the frames coming in the window-3*/
17
-
18
 struct Predictor
19
 {
20
     double coeffMin;
21
@@ -73,6 +69,7 @@
22
     Predictor  rowPreds[3][2];
23
     Predictor* rowPred2;
24
 
25
+    int64_t currentSatd;
26
     int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
27
     int64_t leadingNoBSatd;
28
     int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
29
@@ -87,6 +84,8 @@
30
     double  rowCplxrSum;
31
     double  qpNoVbv;
32
     double  bufferFill;
33
+    double  bufferFillFinal;
34
+    double  bufferFillActual;
35
     double  targetFill;
36
     bool    vbvEndAdj;
37
     double  frameDuration;
38
@@ -192,6 +191,8 @@
39
     double  m_qCompress;
40
     int64_t m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
41
     int64_t m_encodedBits;      /* bits used for encoded frames (without ammortization) */
42
+    int64_t m_encodedSegmentBits;      /* bits used for encoded frames in a segment*/
43
+    double  m_segDur;
44
     double  m_fps;
45
     int64_t m_satdCostWindow[50];
46
     int64_t m_encodedBitsWindow[50];
47
@@ -237,6 +238,8 @@
48
     FILE*   m_statFileOut;
49
     FILE*   m_cutreeStatFileOut;
50
     FILE*   m_cutreeStatFileIn;
51
+    ///< store the cutree data in memory instead of file
52
+    RingMem *m_cutreeShrMem;
53
     double  m_lastAccumPNorm;
54
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
55
     int64_t m_predictedBits;
56
@@ -254,6 +257,7 @@
57
     RateControl(x265_param& p, Encoder *enc);
58
     bool init(const SPS& sps);
59
     void initHRD(SPS& sps);
60
+    void initVBV(const SPS& sps);
61
     void reconfigureRC();
62
 
63
     void setFinalFrameCount(int count);
64
@@ -271,6 +275,9 @@
65
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
66
     bool   initPass2();
67
 
68
+    bool initCUTreeSharedMem();
69
+    void skipCUTreeSharedMemRead(int32_t cnt);
70
+
71
     double forwardMasking(Frame* curFrame, double q);
72
     double backwardMasking(Frame* curFrame, double q);
73
 
74
@@ -291,6 +298,7 @@
75
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
76
     double tuneAbrQScaleFromFeedback(double qScale);
77
     double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
78
+    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
79
     void   accumPQpUpdate();
80
 
81
     int    getPredictorType(int lowresSliceType, int sliceType);
82
@@ -311,6 +319,7 @@
83
     double tuneQScaleForGrain(double rcOverflow);
84
     void   splitdeltaPOC(char deltapoc, RateControlEntry *rce);
85
     void   splitbUsed(char deltapoc, RateControlEntry *rce);
86
+    void   checkAndResetCRF(RateControlEntry* rce);
87
 };
88
 }
89
 #endif // ifndef X265_RATECONTROL_H
90
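
The declarations above add tuneQscaleForSBRC(), the hook behind segment-based rate control (SBRC): for each segment of keyframeMax frames the encoder projects the bits already spent plus the bits the lookahead frames would take at the candidate qScale, extrapolates over the rest of the segment, and raises the qScale by 1% per iteration while the projection exceeds 90% of the segment budget (vbv-maxrate times the segment length). A simplified, self-contained sketch of that loop, assuming a toy bit predictor in place of x265's predictSize():

#include <cstdio>

// Sketch only: raise qScale until the projected segment size fits the budget.
static double tuneForSegment(double qScale, double bitsEncodedSoFar,
                             double durationSoFar, double segLengthSec,
                             double maxRateBps, double (*predictBits)(double))
{
    for (int iter = 0; iter < 1000; iter++)
    {
        double total = bitsEncodedSoFar + predictBits(qScale);   // projected bits at qScale
        double remaining = total / durationSoFar * (segLengthSec - durationSoFar);
        if (total + remaining > 0.9 * maxRateBps * segLengthSec)
            qScale *= 1.01;                                      // spend less on coming frames
        else
            break;
    }
    return qScale;
}

int main()
{
    double (*toyPredictor)(double) = [](double q) { return 4.0e6 / q; };  // placeholder model
    printf("tuned qScale: %.3f\n",
           tuneForSegment(20.0, 2.0e6, 1.0, 2.0, 5.0e6, toyPredictor));
    return 0;
}
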
x265_3.5.tar.gz/source/encoder/sei.cpp -> x265_3.6.tar.gz/source/encoder/sei.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
     {
3
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
4
             bs.writeByteAlignment();
5
-        list.serialize(nalUnitType, bs);
6
+        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
7
     }
8
 }
9
 
10
x265_3.5.tar.gz/source/encoder/sei.h -> x265_3.6.tar.gz/source/encoder/sei.h Changed
103
 
1
@@ -73,6 +73,101 @@
2
     }
3
 };
4
 
5
+/* Film grain characteristics */
6
+class FilmGrainCharacteristics : public SEI
7
+{
8
+  public:
9
+
10
+    FilmGrainCharacteristics()
11
+    {
12
+        m_payloadType = FILM_GRAIN_CHARACTERISTICS;
13
+        m_payloadSize = 0;
14
+    }
15
+
16
+    struct CompModelIntensityValues
17
+    {
18
+        uint8_t intensityIntervalLowerBound;
19
+        uint8_t intensityIntervalUpperBound;
20
+        int*    compModelValue;
21
+    };
22
+
23
+    struct CompModel
24
+    {
25
+        bool    bPresentFlag;
26
+        uint8_t numModelValues;
27
+        uint8_t m_filmGrainNumIntensityIntervalMinus1;
28
+        CompModelIntensityValues* intensityValues;
29
+    };
30
+
31
+    CompModel   m_compModel[MAX_NUM_COMPONENT];
32
+    bool        m_filmGrainCharacteristicsPersistenceFlag;
33
+    bool        m_filmGrainCharacteristicsCancelFlag;
34
+    bool        m_separateColourDescriptionPresentFlag;
35
+    bool        m_filmGrainFullRangeFlag;
36
+    uint8_t     m_filmGrainModelId;
37
+    uint8_t     m_blendingModeId;
38
+    uint8_t     m_log2ScaleFactor;
39
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
40
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
41
+    uint8_t     m_filmGrainColourPrimaries;
42
+    uint8_t     m_filmGrainTransferCharacteristics;
43
+    uint8_t     m_filmGrainMatrixCoeffs;
44
+
45
+    void writeSEI(const SPS&)
46
+    {
47
+        WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
48
+
49
+        if (!m_filmGrainCharacteristicsCancelFlag)
50
+        {
51
+            WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
52
+            WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
53
+            if (m_separateColourDescriptionPresentFlag)
54
+            {
55
+                WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
56
+                WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
57
+                WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
58
+                WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
59
+                WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
60
+                WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
61
+            }
62
+            WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
63
+            WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
64
+            for (uint8_t c = 0; c < 3; c++)
65
+            {
66
+                WRITE_FLAG(m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0, "comp_model_present_flag[c]");
67
+            }
68
+            for (uint8_t c = 0; c < 3; c++)
69
+            {
70
+                if (m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0)
71
+                {
72
+                    assert(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
73
+                    assert(m_compModel[c].numModelValues <= X265_BYTE);
74
+                    WRITE_CODE(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1, X265_BYTE, "num_intensity_intervals_minus1[c]");
75
+                    WRITE_CODE(m_compModel[c].numModelValues - 1, 3, "num_model_values_minus1[c]");
76
+                    for (uint8_t interval = 0; interval < m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
77
+                    {
78
+                        WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_bound[c][i]");
79
+                        WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_bound[c][i]");
80
+                        for (uint8_t j = 0; j < m_compModel[c].numModelValues; j++)
81
+                        {
82
+                            WRITE_SVLC(m_compModel[c].intensityValues[interval].compModelValue[j], "comp_model_value[c][i]");
83
+                        }
84
+                    }
85
+                }
86
+            }
87
+            WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
88
+        }
89
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
90
+        {
91
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
92
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
93
+            {
94
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
95
+            }
96
+        }
97
+    }
98
+};
99
+
100
 static const uint32_t ISO_IEC_11578_LEN = 16;
101
 
102
 class SEIuserDataUnregistered : public SEI
103
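
The FilmGrainCharacteristics SEI added above writes the film_grain_characteristics() syntax from the HEVC specification: everything after the cancel flag is conditional, and each colour component carries a list of intensity intervals with per-interval model values (signed Exp-Golomb in the real payload). A rough, illustrative bit-count walk of that nesting; the Counter type, the example sizes and the fixed 8-bit treatment of the model values are placeholders, not the encoder's code.

#include <cstdint>
#include <cstdio>

struct Counter { uint64_t bits = 0; void u(int n) { bits += n; } void flag() { bits += 1; } };

int main()
{
    const int numIntensityIntervals = 8;   // example values only
    const int numModelValues = 3;
    Counter bs;
    bs.flag();                             // film_grain_characteristics_cancel_flag (0 = grain present)
    bs.u(2);                               // film_grain_model_id
    bs.flag();                             // separate_colour_description_present_flag (0 here)
    bs.u(2);                               // blending_mode_id
    bs.u(4);                               // log2_scale_factor
    for (int c = 0; c < 3; c++)
        bs.flag();                         // comp_model_present_flag[c]
    // assume a single present component
    bs.u(8);                               // num_intensity_intervals_minus1[c]
    bs.u(3);                               // num_model_values_minus1[c]
    for (int i = 0; i < numIntensityIntervals; i++)
    {
        bs.u(8); bs.u(8);                  // intensity interval lower/upper bound
        for (int j = 0; j < numModelValues; j++)
            bs.u(8);                       // comp_model_value[c][i] (se(v) in the real syntax)
    }
    bs.flag();                             // film_grain_characteristics_persistence_flag
    printf("approx payload bits before byte alignment: %llu\n", (unsigned long long)bs.bits);
    return 0;
}
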
x265_3.5.tar.gz/source/encoder/slicetype.cpp -> x265_3.6.tar.gz/source/encoder/slicetype.cpp Changed
1444
 
1
@@ -87,6 +87,14 @@
2
 
3
 namespace X265_NS {
4
 
5
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)
6
+{
7
+    uint32_t sum = (uint32_t)sum_ssd;
8
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
9
+
10
+    return ssd - ((uint64_t)sum * sum >> shift);
11
+}
12
+
13
 bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
14
 {
15
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
16
@@ -184,7 +192,7 @@
17
     {
18
         for (int colNum = 0; colNum < width; colNum++)
19
         {
20
-            if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
21
+            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
22
             {
23
                 /*  5x5 Gaussian filter
24
                     2   4   5   4   2
25
@@ -519,7 +527,7 @@
26
                 if (param->rc.aqMode == X265_AQ_EDGE)
27
                     edgeFilter(curFrame, param);
28
 
29
-                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
30
+                if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
31
                 {
32
                     pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
33
                     primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
34
@@ -1050,7 +1058,48 @@
35
     m_countPreLookahead = 0;
36
 #endif
37
 
38
-    memset(m_histogram, 0, sizeof(m_histogram));
39
+    m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
40
+    m_accHistDiffRunningAvgCb[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
41
+    memset(m_accHistDiffRunningAvgCb[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
42
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
43
+        m_accHistDiffRunningAvgCb[w] = m_accHistDiffRunningAvgCb[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
44
+    }
45
+
46
+    m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
47
+    m_accHistDiffRunningAvgCr[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
48
+    memset(m_accHistDiffRunningAvgCr[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
49
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
50
+        m_accHistDiffRunningAvgCr[w] = m_accHistDiffRunningAvgCr[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
51
+    }
52
+
53
+    m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
54
+    m_accHistDiffRunningAvg[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
55
+    memset(m_accHistDiffRunningAvg[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
56
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
57
+        m_accHistDiffRunningAvg[w] = m_accHistDiffRunningAvg[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
58
+    }
59
+
60
+    m_resetRunningAvg = true;
61
+
62
+    m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
63
+
64
+    if (m_param->bEnableTemporalSubLayers > 2)
65
+    {
66
+        switch (m_param->bEnableTemporalSubLayers)
67
+        {
68
+        case 3:
69
+            m_gopId = 0;
70
+            break;
71
+        case 4:
72
+            m_gopId = 1;
73
+            break;
74
+        case 5:
75
+            m_gopId = 2;
76
+            break;
77
+        default:
78
+            break;
79
+        }
80
+    }
81
 }
82
 
83
 #if DETAILED_CU_STATS
84
@@ -1098,6 +1147,7 @@
85
             m_pool[i].stopWorkers();
86
     }
87
 }
88
+
89
 void Lookahead::destroy()
90
 {
91
     // these two queues will be empty unless the encode was aborted
92
@@ -1309,32 +1359,32 @@
93
     default:
94
         return;
95
     }
96
-    if (!m_param->analysisLoad || !m_param->bDisableLookahead)
97
+    if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
98
     {
99
         X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
100
 
101
-        if (m_param->rc.cuTree && !m_param->rc.bStatRead)
102
+        if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
103
             /* update row satds based on cutree offsets */
104
             curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
105
-        else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
106
+        else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
107
         {
108
-            if (m_param->rc.aqMode)
109
+            if (curFrame->m_param->rc.aqMode)
110
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
111
             else
112
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
113
         }
114
-        if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
115
+        if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
116
         {
117
             /* aggregate lowres row satds to CTU resolution */
118
             curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
119
             uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
120
-            uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
121
-            uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
122
+            uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
123
+            uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
124
             uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
125
             double *qp_offset = 0;
126
             /* Factor in qpoffsets based on Aq/Cutree in CU costs */
127
-            if (m_param->rc.aqMode || m_param->bAQMotion)
128
-                qp_offset = (framesb->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
129
+            if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
130
+                qp_offset = (framesb->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
131
 
132
             for (uint32_t row = 0; row < numCuInHeight; row++)
133
             {
134
@@ -1350,7 +1400,7 @@
135
                         if (qp_offset)
136
                         {
137
                             double qpOffset;
138
-                            if (m_param->rc.qgSize == 8)
139
+                            if (curFrame->m_param->rc.qgSize == 8)
140
                                 qpOffset = (qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 +
141
                                 qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1 +
142
                                 qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes +
143
@@ -1361,7 +1411,7 @@
144
                             int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
146
                             curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
146
                         }
147
-                        if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
148
+                        if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
149
                             for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
150
                                 diff += curFrame->m_lowres.intraCostlowresCuIdx - lowresCuCost;
151
                         curFrame->m_lowres.lowresCostForRclowresCuIdx = lowresCuCost;
152
@@ -1377,6 +1427,291 @@
153
     }
154
 }
155
 
156
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)
157
+{
158
+    pixel* src = inpSrc + blockOffset;
159
+
160
+    uint32_t var;
161
+    if (!plane)
162
+        var = acEnergyVarHist(primitives.cu[BLOCK_8x8].var(src, stride), 6);
163
+    else
164
+        var = acEnergyVarHist(primitives.cu[BLOCK_4x4].var(src, stride), 4);
165
+
166
+    x265_emms();
167
+    return var;
168
+}
169
+
170
+/*
171
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
172
+*/
173
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
174
+{
175
+    int maxCol = curFrame->m_fencPic->m_picWidth;
176
+    int maxRow = curFrame->m_fencPic->m_picHeight;
177
+    intptr_t inpStride = curFrame->m_fencPic->m_stride;
178
+
179
+    // Variance
180
+    uint64_t picTotVariance = 0;
181
+    uint32_t variance;
182
+
183
+    uint64_t blockXY = 0;
184
+    pixel* src = curFrame->m_fencPic->m_picOrg[0];
185
+
186
+    for (int blockY = 0; blockY < maxRow; blockY += 8)
187
+    {
188
+        uint64_t rowVariance = 0;
189
+        for (int blockX = 0; blockX < maxCol; blockX += 8)
190
+        {
191
+            intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
192
+
193
+            variance = calcVariance(
194
+                src,
195
+                inpStride,
196
+                blockOffsetLuma, 0);
197
+
198
+            rowVariance += variance;
199
+            blockXY++;
200
+        }
201
+        picTotVariance += (uint16_t)(rowVariance / maxCol);
202
+    }
203
+
204
+    curFrame->m_lowres.picAvgVariance = (uint16_t)(picTotVariance / maxRow);
205
+
206
+    // Collect chroma variance
207
+    int hShift = curFrame->m_fencPic->m_hChromaShift;
208
+    int vShift = curFrame->m_fencPic->m_vChromaShift;
209
+
210
+    int maxColChroma = curFrame->m_fencPic->m_picWidth >> hShift;
211
+    int maxRowChroma = curFrame->m_fencPic->m_picHeight >> vShift;
212
+    intptr_t cStride = curFrame->m_fencPic->m_strideC;
213
+
214
+    pixel* srcCb = curFrame->m_fencPic->m_picOrg[1];
215
+
216
+    picTotVariance = 0;
217
+    for (int blockY = 0; blockY < maxRowChroma; blockY += 4)
218
+    {
219
+        uint64_t rowVariance = 0;
220
+        for (int blockX = 0; blockX < maxColChroma; blockX += 4)
221
+        {
222
+            intptr_t blockOffsetChroma = blockX + blockY * cStride;
223
+
224
+            variance = calcVariance(
225
+                srcCb,
226
+                cStride,
227
+                blockOffsetChroma, 1);
228
+
229
+            rowVariance += variance;
230
+            blockXY++;
231
+        }
232
+        picTotVariance += (uint16_t)(rowVariance / maxColChroma);
233
+    }
234
+
235
+    curFrame->m_lowres.picAvgVarianceCb = (uint16_t)(picTotVariance / maxRowChroma);
236
+
237
+
238
+    pixel* srcCr = curFrame->m_fencPic->m_picOrg[2];
239
+
240
+    picTotVariance = 0;
241
+    for (int blockY = 0; blockY < maxRowChroma; blockY += 4)
242
+    {
243
+        uint64_t rowVariance = 0;
244
+        for (int blockX = 0; blockX < maxColChroma; blockX += 4)
245
+        {
246
+            intptr_t blockOffsetChroma = blockX + blockY * cStride;
247
+
248
+            variance = calcVariance(
249
+                srcCr,
250
+                cStride,
251
+                blockOffsetChroma, 2);
252
+
253
+            rowVariance += variance;
254
+            blockXY++;
255
+        }
256
+        picTotVariance += (uint16_t)(rowVariance / maxColChroma);
257
+    }
258
+
259
+    curFrame->m_lowres.picAvgVarianceCr = (uint16_t)(picTotVariance / maxRowChroma);
260
+}
261
+
262
+/*
263
+* Compute histogram of n-bins for the input
264
+*/
265
+void LookaheadTLD::calculateHistogram(
266
+    pixel     *inputSrc,
267
+    uint32_t   inputWidth,
268
+    uint32_t   inputHeight,
269
+    intptr_t   stride,
270
+    uint8_t    dsFactor,
271
+    uint32_t  *histogram,
272
+    uint64_t  *sum)
273
+
274
+{
275
+    *sum = 0;
276
+
277
+    for (uint32_t verticalIdx = 0; verticalIdx < inputHeight; verticalIdx += dsFactor)
278
+    {
279
+        for (uint32_t horizontalIdx = 0; horizontalIdx < inputWidth; horizontalIdx += dsFactor)
280
+        {
281
+            ++(histogram[inputSrc[horizontalIdx]]);
282
+            *sum += inputSrc[horizontalIdx];
283
+        }
284
+        inputSrc += (stride << (dsFactor >> 1));
285
+    }
286
+
287
+    return;
288
+}
289
+
290
+/*
291
+* Compute histogram bins and chroma pixel intensity *
292
+*/
293
+void LookaheadTLD::computeIntensityHistogramBinsChroma(
294
+    Frame    *curFrame,
295
+    uint64_t *sumAverageIntensityCb,
296
+    uint64_t *sumAverageIntensityCr)
297
+{
298
+    uint64_t    sum;
299
+    uint8_t     dsFactor = 4;
300
+
301
+    uint32_t segmentWidth = curFrame->m_lowres.widthFullRes / NUMBER_OF_SEGMENTS_IN_WIDTH;
302
+    uint32_t segmentHeight = curFrame->m_lowres.heightFullRes / NUMBER_OF_SEGMENTS_IN_HEIGHT;
303
+
304
+    for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
305
+    {
306
+        for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
307
+        {
308
+            // Initialize bins to 1
309
+            for (uint32_t cuIndex = 0; cuIndex < 256; cuIndex++) {
310
+                curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1cuIndex = 1;
311
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][cuIndex] = 1;
311
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][cuIndex] = 1;
313
+
314
+            uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
315
+                curFrame->m_lowres.widthFullRes - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
316
+
317
+            uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
318
+                curFrame->m_lowres.heightFullRes - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
319
+
320
+
321
+            // U Histogram
322
+            calculateHistogram(
323
+                curFrame->m_fencPic->m_picOrg[1] + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
324
+                (segmentWidth + segmentWidthOffset) >> 1,
325
+                (segmentHeight + segmentHeightOffset) >> 1,
326
+                curFrame->m_fencPic->m_strideC,
327
+                dsFactor,
328
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1],
329
+                &sum);
330
+
331
+            sum = (sum << dsFactor);
332
+            *sumAverageIntensityCb += sum;
333
+            curFrame->m_lowres.averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1] =
334
+                (uint8_t)((sum + (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 3)) / (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 2));
335
+
336
+            for (uint16_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++) {
337
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][histogramBin] =
338
+                    curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][histogramBin] << dsFactor;
339
+            }
340
+
341
+            // V Histogram
342
+                curFrame->m_fencPic->m_picOrg[2] + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
343
+                curFrame->m_fencPic->m_picOrg2 + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
344
+                (segmentWidth + segmentWidthOffset) >> 1,
345
+                (segmentHeight + segmentHeightOffset) >> 1,
346
+                curFrame->m_fencPic->m_strideC,
347
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2],
348
+                curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2,
349
+                &sum);
350
+
351
+            sum = (sum << dsFactor);
352
+            curFrame->m_lowres.averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2] =
353
+            curFrame->m_lowres.averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex2 =
354
+                (uint8_t)((sum + (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 3)) / (((segmentWidth + segmentHeightOffset) * (segmentHeight + segmentHeightOffset)) >> 2));
355
+
356
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][histogramBin] =
358
+                    curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][histogramBin] << dsFactor;
358
+                    curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2histogramBin << dsFactor;
359
+            }
360
+        }
361
+    }
362
+    return;
363
+
364
+}
365
+
366
+/*
367
+* Compute histogram bins and luma pixel intensity *
368
+*/
369
+void LookaheadTLD::computeIntensityHistogramBinsLuma(
370
+    Frame    *curFrame,
371
+    uint64_t *sumAvgIntensityTotalSegmentsLuma)
372
+{
373
+    uint64_t sum;
374
+
375
+    uint32_t segmentWidth = curFrame->m_lowres.quarterSampleLowResWidth / NUMBER_OF_SEGMENTS_IN_WIDTH;
376
+    uint32_t segmentHeight = curFrame->m_lowres.quarterSampleLowResHeight / NUMBER_OF_SEGMENTS_IN_HEIGHT;
377
+
378
+    for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
379
+    {
380
+        for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
381
+        {
382
+            // Initialize bins to 1
383
+            for (uint32_t cuIndex = 0; cuIndex < 256; cuIndex++) {
384
+                curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0cuIndex = 1;
385
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][cuIndex] = 1;
386
+
387
+            uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
388
+                curFrame->m_lowres.quarterSampleLowResWidth - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
389
+
390
+            uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
391
+                curFrame->m_lowres.quarterSampleLowResHeight - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
392
+
393
+            // Y Histogram
394
+            calculateHistogram(
395
+                curFrame->m_lowres.quarterSampleLowResBuffer + (curFrame->m_lowres.quarterSampleLowResOriginX + segmentInFrameWidthIndex * segmentWidth) + ((curFrame->m_lowres.quarterSampleLowResOriginY + segmentInFrameHeightIndex * segmentHeight) * curFrame->m_lowres.quarterSampleLowResStrideY),
396
+                segmentWidth + segmentWidthOffset,
397
+                segmentHeight + segmentHeightOffset,
398
+                curFrame->m_lowres.quarterSampleLowResStrideY,
399
+                1,
400
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0],
401
+                &sum);
402
+
403
+            curFrame->m_lowres.averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0] = (uint8_t)((sum + (((segmentWidth + segmentWidthOffset)*(segmentWidth + segmentHeightOffset)) >> 1)) / ((segmentWidth + segmentWidthOffset)*(segmentHeight + segmentHeightOffset)));
404
+            (*sumAvgIntensityTotalSegmentsLuma) += (sum << 4);
405
+            for (uint32_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++)
406
+            {
407
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][histogramBin] =
408
+                    curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][histogramBin] << 4;
409
+            }
410
+        }
411
+    }
412
+}
413
+
414
+void LookaheadTLD::collectPictureStatistics(Frame *curFrame)
415
+{
416
+
417
+    uint64_t sumAverageIntensityCb = 0;
418
+    uint64_t sumAverageIntensityCr = 0;
419
+    uint64_t sumAverageIntensity = 0;
420
+
421
+    // Histogram bins for Luma
422
+    computeIntensityHistogramBinsLuma(
423
+        curFrame,
424
+        &sumAverageIntensity);
425
+
426
+    // Histogram bins for Chroma
427
+    computeIntensityHistogramBinsChroma(
428
+        curFrame,
429
+        &sumAverageIntensityCb,
430
+        &sumAverageIntensityCr);
431
+
432
+    curFrame->m_lowres.averageIntensity[0] = (uint8_t)((sumAverageIntensity + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 1)) / (curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes));
433
+    curFrame->m_lowres.averageIntensity[1] = (uint8_t)((sumAverageIntensityCb + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 3)) / ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 2));
434
+    curFrame->m_lowres.averageIntensity[2] = (uint8_t)((sumAverageIntensityCr + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 3)) / ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 2));
435
+
436
+    computePictureStatistics(curFrame);
437
+
438
+    curFrame->m_lowres.bHistScenecutAnalyzed = false;
439
+}
440
+
441
 void PreLookaheadGroup::processTasks(int workerThreadID)
442
 {
443
     if (workerThreadID < 0)
444
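
The functions above feed the histogram-based scene-change detection (--hist-scenecut): per-segment intensity histograms are built from downsampled luma and chroma planes with every bin seeded to 1, and consecutive frames are later compared segment by segment against a threshold on the number of changed segments. A small self-contained sketch of the histogram build and a SAD-style comparison; names, sizes and the synthetic frames are illustrative only.

#include <cstdint>
#include <cstdio>

// Build a 256-bin histogram of one plane, sampling every dsFactor-th pixel.
static void buildHistogram(const uint8_t* src, int width, int height,
                           int stride, int dsFactor, uint32_t hist[256])
{
    for (int b = 0; b < 256; b++)
        hist[b] = 1;                                  // seed bins, as in the diff
    for (int y = 0; y < height; y += dsFactor)
        for (int x = 0; x < width; x += dsFactor)
            ++hist[src[y * stride + x]];
}

// Sum of absolute differences between two histograms of the same segment.
static uint64_t histogramSad(const uint32_t a[256], const uint32_t b[256])
{
    uint64_t sad = 0;
    for (int i = 0; i < 256; i++)
        sad += (a[i] > b[i]) ? (a[i] - b[i]) : (b[i] - a[i]);
    return sad;
}

int main()
{
    const int w = 64, h = 64;
    static uint8_t prev[w * h], cur[w * h];
    for (int i = 0; i < w * h; i++) { prev[i] = uint8_t(i & 0xff); cur[i] = uint8_t(~i & 0xff); }
    uint32_t hPrev[256], hCur[256];
    buildHistogram(prev, w, h, w, 1, hPrev);
    buildHistogram(cur, w, h, w, 1, hCur);
    printf("segment histogram SAD: %llu\n", (unsigned long long)histogramSad(hPrev, hCur));
    return 0;
}
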
@@ -1393,6 +1728,10 @@
445
         preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc);
446
         if (m_lookahead.m_bAdaptiveQuant)
447
             tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param);
448
+
449
+        if (m_lookahead.m_param->bHistBasedSceneCut)
450
+            tld.collectPictureStatistics(preFrame);
451
+
452
         tld.lowresIntraEstimate(preFrame->m_lowres, m_lookahead.m_param->rc.qgSize);
453
         preFrame->m_lowresInit = true;
454
 
455
@@ -1401,6 +1740,53 @@
456
     m_lock.release();
457
 }
458
 
459
+
460
+void Lookahead::placeBref(Frame** frames, int start, int end, int num, int *brefs)
461
+{
462
+    int avg = (start + end) / 2;
463
+    if (m_param->bEnableTemporalSubLayers < 2)
464
+    {
465
+        (*frames[avg]).m_lowres.sliceType = X265_TYPE_BREF;
466
+        (*brefs)++;
467
+        return;
468
+    }
469
+    else
470
+    {
471
+        if (num <= 2)
472
+            return;
473
+        else
474
+        {
475
+            (*frames[avg]).m_lowres.sliceType = X265_TYPE_BREF;
476
+            (*brefs)++;
477
+            placeBref(frames, start, avg, avg - start, brefs);
478
+            placeBref(frames, avg + 1, end, end - avg, brefs);
479
+            return;
480
+        }
481
+    }
482
+}
483
+
484
+
485
+void Lookahead::compCostBref(Lowres **frames, int start, int end, int num)
486
+{
487
+    CostEstimateGroup estGroup(*this, frames);
488
+    int avg = (start + end) / 2;
489
+    if (num <= 2)
490
+    {
491
+        for (int i = start; i < end; i++)
492
+        {
493
+            estGroup.singleCost(start, end + 1, i + 1);
494
+        }
495
+        return;
496
+    }
497
+    else
498
+    {
499
+        estGroup.singleCost(start, end + 1, avg + 1);
500
+        compCostBref(frames, start, avg, avg - start);
501
+        compCostBref(frames, avg + 1, end, end - avg);
502
+        return;
503
+    }
504
+}
505
+
506
 /* called by API thread or worker thread with inputQueueLock acquired */
507
 void Lookahead::slicetypeDecide()
508
 {
509
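
placeBref() above builds the hierarchical B-frame (temporal sub-layer) structure by recursively promoting the middle frame of each half of the mini-GOP to a reference B while more than two frames remain, and compCostBref() estimates costs in the same order. A toy standalone illustration of that recursion; the vector-of-chars GOP model is a simplification, not the encoder's data structures.

#include <cstdio>
#include <string>
#include <vector>

// 'b' = non-reference B, 'R' = reference B (X265_TYPE_BREF in the diff).
static void placeBrefSketch(std::vector<char>& gop, int start, int end, int num, int& brefs)
{
    if (num <= 2)
        return;                               // nothing to promote in a pair
    int mid = (start + end) / 2;
    gop[mid] = 'R';
    brefs++;
    placeBrefSketch(gop, start, mid, mid - start, brefs);
    placeBrefSketch(gop, mid + 1, end, end - mid, brefs);
}

int main()
{
    std::vector<char> gop(8, 'b');            // eight B frames before the next P
    int brefs = 0;
    placeBrefSketch(gop, 0, (int)gop.size() - 1, (int)gop.size(), brefs);
    printf("mini-GOP: %s (brefs=%d)\n", std::string(gop.begin(), gop.end()).c_str(), brefs);
    return 0;
}
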
@@ -1416,6 +1802,18 @@
510
         ScopedLock lock(m_inputLock);
511
 
512
         Frame *curFrame = m_inputQueue.first();
513
+        if (m_param->bResetZoneConfig)
514
+        {
515
+            for (int i = 0; i < m_param->rc.zonefileCount; i++)
516
+            {
517
+                if (m_param->rc.zones[i].startFrame == curFrame->m_poc)
518
+                    m_param = m_param->rc.zones[i].zoneParam;
519
+                int nextZoneStart = m_param->rc.zones[i].startFrame;
520
+                nextZoneStart += nextZoneStart ? m_param->rc.zones[i].zoneParam->radl : 0;
521
+                if (nextZoneStart < curFrame->m_poc + maxSearch && curFrame->m_poc < nextZoneStart)
522
+                    maxSearch = nextZoneStart - curFrame->m_poc;
523
+            }
524
+        }
525
         int j;
526
         for (j = 0; j < m_param->bframes + 2; j++)
527
         {
528
@@ -1502,7 +1900,7 @@
529
          m_param->rc.cuTree || m_param->scenecutThreshold || m_param->bHistBasedSceneCut ||
530
          (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
531
     {
532
-        if(!m_param->rc.bStatRead)
533
+        if (!m_param->rc.bStatRead)
534
             slicetypeAnalyse(frames, false);
535
         bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
536
         if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
537
@@ -1526,6 +1924,8 @@
538
         {
539
             Lowres& frm = list[bframes]->m_lowres;
540
 
541
+            if (frm.sliceTypeReq != X265_TYPE_AUTO && frm.sliceTypeReq != frm.sliceType)
542
+                frm.sliceType = frm.sliceTypeReq;
543
             if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
544
             {
545
                 frm.sliceType = X265_TYPE_B;
546
@@ -1583,12 +1983,9 @@
547
             }
548
             if (frm.sliceType == X265_TYPE_IDR && frm.bScenecut && isClosedGopRadl)
549
             {
550
-                if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frm.m_bIsHardScenecut))
551
-                {
552
-                    for (int i = bframes; i < bframes + m_param->radl; i++)
553
-                        list[i]->m_lowres.sliceType = X265_TYPE_B;
554
-                    list[(bframes + m_param->radl)]->m_lowres.sliceType = X265_TYPE_IDR;
555
-                }
556
+                for (int i = bframes; i < bframes + m_param->radl; i++)
557
+                    list[i]->m_lowres.sliceType = X265_TYPE_B;
558
+                list[(bframes + m_param->radl)]->m_lowres.sliceType = X265_TYPE_IDR;
559
             }
560
             if (frm.sliceType == X265_TYPE_IDR)
561
             {
562
@@ -1649,138 +2046,454 @@
563
                 break;
564
         }
565
     }
566
-    if (bframes)
567
-        list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
568
-    list[bframes]->m_lowres.leadingBframes = bframes;
569
-    m_lastNonB = &list[bframes]->m_lowres;
570
-    m_histogram[bframes]++;
571
-
572
-    /* insert a bref into the sequence */
573
-    if (m_param->bBPyramid && bframes > 1 && !brefs)
574
-    {
575
-        listbframes / 2->m_lowres.sliceType = X265_TYPE_BREF;
576
-        brefs++;
577
-    }
578
-    /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
579
-    if (m_param->rc.rateControlMode != X265_RC_CQP)
580
-    {
581
-        int p0, p1, b;
582
-        /* For zero latency tuning, calculate frame cost to be used later in RC */
583
-        if (!maxSearch)
584
+
585
+    if (m_param->bEnableTemporalSubLayers > 2)
586
+    {
587
+        //Split the partial mini GOP into sub mini GOPs when temporal sub layers are enabled
588
+        if (bframes < m_param->bframes)
589
         {
590
-            for (int i = 0; i <= bframes; i++)
591
-               framesi + 1 = &listi->m_lowres;
592
-        }
593
+            int leftOver = bframes + 1;
594
+            int8_t gopId = m_gopId - 1;
595
+            int gopLen = x265_gop_ra_lengthgopId;
596
+            int listReset = 0;
597
 
598
-        /* estimate new non-B cost */
599
-        p1 = b = bframes + 1;
600
-        p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
601
+            m_outputLock.acquire();
602
 
603
-        CostEstimateGroup estGroup(*this, frames);
604
+            while ((gopId >= 0) && (leftOver > 3))
605
+            {
606
+                if (leftOver < gopLen)
607
+                {
608
+                    gopId = gopId - 1;
609
+                    gopLen = x265_gop_ra_lengthgopId;
610
+                    continue;
611
+                }
612
+                else
613
+                {
614
+                    int newbFrames = listReset + gopLen - 1;
615
+                    //Re-assign GOP
616
+                    listnewbFrames->m_lowres.sliceType = IS_X265_TYPE_I(listnewbFrames->m_lowres.sliceType) ? listnewbFrames->m_lowres.sliceType : X265_TYPE_P;
617
+                    if (newbFrames)
618
+                        listnewbFrames - 1->m_lowres.bLastMiniGopBFrame = true;
619
+                    listnewbFrames->m_lowres.leadingBframes = newbFrames;
620
+                    m_lastNonB = &listnewbFrames->m_lowres;
621
+
622
+                    /* insert a bref into the sequence */
623
+                    if (m_param->bBPyramid && newbFrames)
624
+                    {
625
+                        placeBref(list, listReset, newbFrames, newbFrames + 1, &brefs);
626
+                    }
627
+                    if (m_param->rc.rateControlMode != X265_RC_CQP)
628
+                    {
629
+                        int p0, p1, b;
630
+                        /* For zero latency tuning, calculate frame cost to be used later in RC */
631
+                        if (!maxSearch)
632
+                        {
633
+                            for (int i = listReset; i <= newbFrames; i++)
634
+                                framesi + 1 = &listlistReset + i->m_lowres;
635
+                        }
636
 
637
-        estGroup.singleCost(p0, p1, b);
638
+                        /* estimate new non-B cost */
639
+                        p1 = b = newbFrames + 1;
640
+                        p0 = (IS_X265_TYPE_I(framesnewbFrames + 1->sliceType)) ? b : listReset;
641
 
642
-        if (bframes)
643
+                        CostEstimateGroup estGroup(*this, frames);
644
+
645
+                        estGroup.singleCost(p0, p1, b);
646
+
647
+                        if (newbFrames)
648
+                            compCostBref(frames, listReset, newbFrames, newbFrames + 1);
649
+                    }
650
+
651
+                    m_inputLock.acquire();
652
+                    /* dequeue all frames from inputQueue that are about to be enqueued
653
+                     * in the output queue. The order is important because Frame can
654
+                     * only be in one list at a time */
655
+                    int64_t ptsX265_BFRAME_MAX + 1;
656
+                    for (int i = 0; i < gopLen; i++)
657
+                    {
658
+                        Frame *curFrame;
659
+                        curFrame = m_inputQueue.popFront();
660
+                        ptsi = curFrame->m_pts;
661
+                        maxSearch--;
662
+                    }
663
+                    m_inputLock.release();
664
+
665
+                    int idx = 0;
666
+                    /* add non-B to output queue */
667
+                    listnewbFrames->m_reorderedPts = ptsidx++;
668
+                    listnewbFrames->m_gopOffset = 0;
669
+                    listnewbFrames->m_gopId = gopId;
670
+                    listnewbFrames->m_tempLayer = x265_gop_ragopId0.layer;
671
+                    m_outputQueue.pushBack(*listnewbFrames);
672
+
673
+                    /* add B frames to output queue */
674
+                    int i = 1, j = 1;
675
+                    while (i < gopLen)
676
+                    {
677
+                        int offset = listReset + (x265_gop_ragopIdj.poc_offset - 1);
678
+                        if (!listoffset || offset == newbFrames)
679
+                            continue;
680
+
681
+                        // Assign gop offset and temporal layer of frames
682
+                        listoffset->m_gopOffset = j;
683
+                        listbframes->m_gopId = gopId;
684
+                        listoffset->m_tempLayer = x265_gop_ragopIdj++.layer;
685
+
686
+                        listoffset->m_reorderedPts = ptsidx++;
687
+                        m_outputQueue.pushBack(*listoffset);
688
+                        i++;
689
+                    }
690
+
691
+                    listReset += gopLen;
692
+                    leftOver = leftOver - gopLen;
693
+                    gopId -= 1;
694
+                    gopLen = (gopId >= 0) ? x265_gop_ra_lengthgopId : 0;
695
+                }
696
+            }
697
+
698
+            if (leftOver > 0 && leftOver < 4)
699
+            {
700
+                int64_t ptsX265_BFRAME_MAX + 1;
701
+                int idx = 0;
702
+
703
+                int newbFrames = listReset + leftOver - 1;
704
+                listnewbFrames->m_lowres.sliceType = IS_X265_TYPE_I(listnewbFrames->m_lowres.sliceType) ? listnewbFrames->m_lowres.sliceType : X265_TYPE_P;
705
+                if (newbFrames)
706
+                        listnewbFrames - 1->m_lowres.bLastMiniGopBFrame = true;
707
+                listnewbFrames->m_lowres.leadingBframes = newbFrames;
708
+                m_lastNonB = &listnewbFrames->m_lowres;
709
+
710
+                /* insert a bref into the sequence */
711
+                if (m_param->bBPyramid && (newbFrames- listReset) > 1)
712
+                    placeBref(list, listReset, newbFrames, newbFrames + 1, &brefs);
713
+
714
+                if (m_param->rc.rateControlMode != X265_RC_CQP)
715
+                {
716
+                    int p0, p1, b;
717
+                    /* For zero latency tuning, calculate frame cost to be used later in RC */
718
+                    if (!maxSearch)
719
+                    {
720
+                        for (int i = listReset; i <= newbFrames; i++)
721
+                            framesi + 1 = &listlistReset + i->m_lowres;
722
+                    }
723
+
724
+                        /* estimate new non-B cost */
725
+                    p1 = b = newbFrames + 1;
726
+                    p0 = (IS_X265_TYPE_I(framesnewbFrames + 1->sliceType)) ? b : listReset;
727
+
728
+                    CostEstimateGroup estGroup(*this, frames);
729
+
730
+                    estGroup.singleCost(p0, p1, b);
731
+
732
+                    if (newbFrames)
733
+                        compCostBref(frames, listReset, newbFrames, newbFrames + 1);
734
+                }
735
+
736
+                m_inputLock.acquire();
737
+                /* dequeue all frames from inputQueue that are about to be enqueued
738
+                 * in the output queue. The order is important because Frame can
739
+                 * only be in one list at a time */
740
+                for (int i = 0; i < leftOver; i++)
741
+                {
742
+                    Frame *curFrame;
743
+                    curFrame = m_inputQueue.popFront();
744
+                    ptsi = curFrame->m_pts;
745
+                    maxSearch--;
746
+                }
747
+                m_inputLock.release();
748
+
749
+                m_lastNonB = &listnewbFrames->m_lowres;
750
+                listnewbFrames->m_reorderedPts = ptsidx++;
751
+                listnewbFrames->m_gopOffset = 0;
752
+                listnewbFrames->m_gopId = -1;
753
+                listnewbFrames->m_tempLayer = 0;
754
+                m_outputQueue.pushBack(*listnewbFrames);
755
+                if (brefs)
756
+                {
757
+                    for (int i = listReset; i < newbFrames; i++)
758
+                    {
759
+                        if (listi->m_lowres.sliceType == X265_TYPE_BREF)
760
+                        {
761
+                            listi->m_reorderedPts = ptsidx++;
762
+                            listi->m_gopOffset = 0;
763
+                            listi->m_gopId = -1;
764
+                            listi->m_tempLayer = 0;
765
+                            m_outputQueue.pushBack(*listi);
766
+                        }
767
+                    }
768
+                }
769
+
770
+                /* add B frames to output queue */
771
+                for (int i = listReset; i < newbFrames; i++)
772
+                {
773
+                    /* push all the B frames into output queue except B-ref, which already pushed into output queue */
774
+                    if (listi->m_lowres.sliceType != X265_TYPE_BREF)
775
+                    {
776
+                        listi->m_reorderedPts = ptsidx++;
777
+                        listi->m_gopOffset = 0;
778
+                        listi->m_gopId = -1;
779
+                        listi->m_tempLayer = 1;
780
+                        m_outputQueue.pushBack(*listi);
781
+                    }
782
+                }
783
+            }
784
+        }
785
+        else
786
+        // Fill the complete mini GOP when temporal sub layers are enabled
787
         {
788
-            p0 = 0; // last nonb
789
-            bool isp0available = framesbframes + 1->sliceType == X265_TYPE_IDR ? false : true;
790
 
791
-            for (b = 1; b <= bframes; b++)
792
+            listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
793
+            listbframes->m_lowres.leadingBframes = bframes;
794
+            m_lastNonB = &listbframes->m_lowres;
795
+
796
+            /* insert a bref into the sequence */
797
+            if (m_param->bBPyramid && !brefs)
798
             {
799
-                if (!isp0available)
800
-                    p0 = b;
801
+                placeBref(list, 0, bframes, bframes + 1, &brefs);
802
+            }
803
 
804
-                if (framesb->sliceType == X265_TYPE_B)
805
-                    for (p1 = b; framesp1->sliceType == X265_TYPE_B; p1++)
806
-                        ; // find new nonb or bref
807
-                else
808
-                    p1 = bframes + 1;
809
+            /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
810
+            if (m_param->rc.rateControlMode != X265_RC_CQP)
811
+            {
812
+                int p0, p1, b;
813
+                /* For zero latency tuning, calculate frame cost to be used later in RC */
814
+                if (!maxSearch)
815
+                {
816
+                    for (int i = 0; i <= bframes; i++)
817
+                        framesi + 1 = &listi->m_lowres;
818
+                }
819
 
820
+                /* estimate new non-B cost */
821
+                p1 = b = bframes + 1;
822
+                p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
823
+
824
+                CostEstimateGroup estGroup(*this, frames);
825
                 estGroup.singleCost(p0, p1, b);
826
 
827
-                if (framesb->sliceType == X265_TYPE_BREF)
828
+                compCostBref(frames, 0, bframes, bframes + 1);
829
+            }
830
+
831
+            m_inputLock.acquire();
832
+            /* dequeue all frames from inputQueue that are about to be enqueued
833
+            * in the output queue. The order is important because Frame can
834
+            * only be in one list at a time */
835
+            int64_t ptsX265_BFRAME_MAX + 1;
836
+            for (int i = 0; i <= bframes; i++)
837
+            {
838
+                Frame *curFrame;
839
+                curFrame = m_inputQueue.popFront();
840
+                ptsi = curFrame->m_pts;
841
+                maxSearch--;
842
+            }
843
+            m_inputLock.release();
844
+
845
+            m_outputLock.acquire();
846
+
847
+            int idx = 0;
848
+            /* add non-B to output queue */
849
+            listbframes->m_reorderedPts = ptsidx++;
850
+            listbframes->m_gopOffset = 0;
851
+            listbframes->m_gopId = m_gopId;
852
+            listbframes->m_tempLayer = x265_gop_ram_gopId0.layer;
853
+            m_outputQueue.pushBack(*listbframes);
854
+
855
+            int i = 1, j = 1;
856
+            while (i <= bframes)
857
+            {
858
+                int offset = x265_gop_ram_gopIdj.poc_offset - 1;
859
+                if (!listoffset || offset == bframes)
860
+                    continue;
861
+
862
+                // Assign gop offset and temporal layer of frames
863
+                listoffset->m_gopOffset = j;
864
+                listoffset->m_gopId = m_gopId;
865
+                listoffset->m_tempLayer = x265_gop_ram_gopIdj++.layer;
866
+
867
+                /* add B frames to output queue */
868
+                listoffset->m_reorderedPts = ptsidx++;
869
+                m_outputQueue.pushBack(*listoffset);
870
+                i++;
871
+            }
872
+        }
873
+
874
+        bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
875
+        if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
876
+        {
877
+            m_inputLock.acquire();
878
+            Frame *curFrame = m_inputQueue.first();
879
+            frames0 = m_lastNonB;
880
+            int j;
881
+            for (j = 0; j < maxSearch; j++)
882
+            {
883
+                framesj + 1 = &curFrame->m_lowres;
884
+                curFrame = curFrame->m_next;
885
+            }
886
+            m_inputLock.release();
887
+
888
+            framesj + 1 = NULL;
889
+            if (!m_param->rc.bStatRead)
890
+                slicetypeAnalyse(frames, true);
891
+            bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
892
+            if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
893
+            {
894
+                int numFrames;
895
+                for (numFrames = 0; numFrames < maxSearch; numFrames++)
896
                 {
897
-                    p0 = b;
898
-                    isp0available = true;
899
+                    Lowres *fenc = framesnumFrames + 1;
900
+                    if (!fenc)
901
+                        break;
902
                 }
903
+                vbvLookahead(frames, numFrames, true);
904
             }
905
         }
906
-    }
907
 
908
-    m_inputLock.acquire();
909
-    /* dequeue all frames from inputQueue that are about to be enqueued
910
-     * in the output queue. The order is important because Frame can
911
-     * only be in one list at a time */
912
-    int64_t ptsX265_BFRAME_MAX + 1;
913
-    for (int i = 0; i <= bframes; i++)
914
-    {
915
-        Frame *curFrame;
916
-        curFrame = m_inputQueue.popFront();
917
-        ptsi = curFrame->m_pts;
918
-        maxSearch--;
919
-    }
920
-    m_inputLock.release();
921
 
922
-    m_outputLock.acquire();
923
-    /* add non-B to output queue */
924
-    int idx = 0;
925
-    listbframes->m_reorderedPts = ptsidx++;
926
-    m_outputQueue.pushBack(*listbframes);
927
-    /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
928
-    if (brefs)
929
+        m_outputLock.release();
930
+    }
931
+    else
932
     {
933
-        for (int i = 0; i < bframes; i++)
934
+
935
+        if (bframes)
936
+            listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
937
+        listbframes->m_lowres.leadingBframes = bframes;
938
+        m_lastNonB = &listbframes->m_lowres;
939
+
940
+        /* insert a bref into the sequence */
941
+        if (m_param->bBPyramid && bframes > 1 && !brefs)
942
         {
943
-            if (listi->m_lowres.sliceType == X265_TYPE_BREF)
944
+            placeBref(list, 0, bframes, bframes + 1, &brefs);
945
+        }
946
+        /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
947
+        if (m_param->rc.rateControlMode != X265_RC_CQP)
948
+        {
949
+            int p0, p1, b;
950
+            /* For zero latency tuning, calculate frame cost to be used later in RC */
951
+            if (!maxSearch)
952
             {
953
-                listi->m_reorderedPts = ptsidx++;
954
-                m_outputQueue.pushBack(*listi);
955
+                for (int i = 0; i <= bframes; i++)
956
+                    framesi + 1 = &listi->m_lowres;
957
+            }
958
+
959
+            /* estimate new non-B cost */
960
+            p1 = b = bframes + 1;
961
+            p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
962
+
963
+            CostEstimateGroup estGroup(*this, frames);
964
+            estGroup.singleCost(p0, p1, b);
965
+
966
+            if (m_param->bEnableTemporalSubLayers > 1 && bframes)
967
+            {
968
+                compCostBref(frames, 0, bframes, bframes + 1);
969
+            }
970
+            else
971
+            {
972
+                if (bframes)
973
+                {
974
+                    p0 = 0; // last nonb
975
+                    bool isp0available = framesbframes + 1->sliceType == X265_TYPE_IDR ? false : true;
976
+
977
+                    for (b = 1; b <= bframes; b++)
978
+                    {
979
+                        if (!isp0available)
980
+                            p0 = b;
981
+
982
+                        if (framesb->sliceType == X265_TYPE_B)
983
+                            for (p1 = b; framesp1->sliceType == X265_TYPE_B; p1++)
984
+                                ; // find new nonb or bref
985
+                        else
986
+                            p1 = bframes + 1;
987
+
988
+                        estGroup.singleCost(p0, p1, b);
989
+
990
+                        if (framesb->sliceType == X265_TYPE_BREF)
991
+                        {
992
+                            p0 = b;
993
+                            isp0available = true;
994
+                        }
995
+                    }
996
+                }
997
             }
998
         }
999
-    }
1000
 
1001
-    /* add B frames to output queue */
1002
-    for (int i = 0; i < bframes; i++)
1003
-    {
1004
-        /* push all the B frames into output queue except B-ref, which already pushed into output queue */
1005
-        if (listi->m_lowres.sliceType != X265_TYPE_BREF)
1006
+        m_inputLock.acquire();
1007
+        /* dequeue all frames from inputQueue that are about to be enqueued
1008
+         * in the output queue. The order is important because Frame can
1009
+         * only be in one list at a time */
1010
+        int64_t ptsX265_BFRAME_MAX + 1;
1011
+        for (int i = 0; i <= bframes; i++)
1012
+        {
1013
+            Frame *curFrame;
1014
+            curFrame = m_inputQueue.popFront();
1015
+            ptsi = curFrame->m_pts;
1016
+            maxSearch--;
1017
+        }
1018
+        m_inputLock.release();
1019
+
1020
+        m_outputLock.acquire();
1021
+
1022
+        /* add non-B to output queue */
1023
+        int idx = 0;
1024
+        listbframes->m_reorderedPts = ptsidx++;
1025
+        m_outputQueue.pushBack(*listbframes);
1026
+
1027
+        /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
1028
+        if (brefs)
1029
         {
1030
-            listi->m_reorderedPts = ptsidx++;
1031
-            m_outputQueue.pushBack(*listi);
1032
+            for (int i = 0; i < bframes; i++)
1033
+            {
1034
+                if (listi->m_lowres.sliceType == X265_TYPE_BREF)
1035
+                {
1036
+                    listi->m_reorderedPts = ptsidx++;
1037
+                    m_outputQueue.pushBack(*listi);
1038
+                }
1039
+            }
1040
         }
1041
-    }
1042
 
1043
-    bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
1044
-    if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
1045
-    {
1046
-        m_inputLock.acquire();
1047
-        Frame *curFrame = m_inputQueue.first();
1048
-        frames0 = m_lastNonB;
1049
-        int j;
1050
-        for (j = 0; j < maxSearch; j++)
1051
+        /* add B frames to output queue */
1052
+        for (int i = 0; i < bframes; i++)
1053
         {
1054
-            framesj + 1 = &curFrame->m_lowres;
1055
-            curFrame = curFrame->m_next;
1056
+            /* push all the B frames into output queue except B-ref, which already pushed into output queue */
1057
+            if (listi->m_lowres.sliceType != X265_TYPE_BREF)
1058
+            {
1059
+                listi->m_reorderedPts = ptsidx++;
1060
+                m_outputQueue.pushBack(*listi);
1061
+            }
1062
         }
1063
-        m_inputLock.release();
1064
 
1065
-        framesj + 1 = NULL;
1066
-        if (!m_param->rc.bStatRead)
1067
-            slicetypeAnalyse(frames, true);
1068
-        bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1069
-        if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
1070
+
1071
+        bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
1072
+        if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
1073
         {
1074
-            int numFrames;
1075
-            for (numFrames = 0; numFrames < maxSearch; numFrames++)
1076
+            m_inputLock.acquire();
1077
+            Frame *curFrame = m_inputQueue.first();
1078
+            frames0 = m_lastNonB;
1079
+            int j;
1080
+            for (j = 0; j < maxSearch; j++)
1081
+            {
1082
+                framesj + 1 = &curFrame->m_lowres;
1083
+                curFrame = curFrame->m_next;
1084
+            }
1085
+            m_inputLock.release();
1086
+
1087
+            framesj + 1 = NULL;
1088
+            if (!m_param->rc.bStatRead)
1089
+                slicetypeAnalyse(frames, true);
1090
+            bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1091
+            if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
1092
             {
1093
-                Lowres *fenc = framesnumFrames + 1;
1094
-                if (!fenc)
1095
-                    break;
1096
+                int numFrames;
1097
+                for (numFrames = 0; numFrames < maxSearch; numFrames++)
1098
+                {
1099
+                    Lowres *fenc = framesnumFrames + 1;
1100
+                    if (!fenc)
1101
+                        break;
1102
+                }
1103
+                vbvLookahead(frames, numFrames, true);
1104
             }
1105
-            vbvLookahead(frames, numFrames, true);
1106
         }
1107
+
1108
+        m_outputLock.release();
1109
     }
1110
-    m_outputLock.release();
1111
 }
1112
 
1113
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
1114
@@ -1909,6 +2622,8 @@
1115
             nextZoneStart += (i + 1 < m_param->rc.zonefileCount) ? m_param->rc.zonesi + 1.startFrame + m_param->rc.zonesi + 1.zoneParam->radl : m_param->totalFrames;
1116
             if (curZoneStart <= frames0->frameNum && nextZoneStart > frames0->frameNum)
1117
                 m_param->keyframeMax = nextZoneStart - curZoneStart;
1118
+            if (m_param->rc.zonesm_param->rc.zonefileCount - 1.startFrame <= frames0->frameNum && nextZoneStart == 0)
1119
+                m_param->keyframeMax = m_param->rc.zones0.keyframeMax;
1120
         }
1121
     }
1122
     int keylimit = m_param->keyframeMax;
1123
@@ -2013,44 +2728,13 @@
1124
     int numAnalyzed = numFrames;
1125
     bool isScenecut = false;
1126
 
1127
-    /* Temporal computations for scenecut detection */
1128
     if (m_param->bHistBasedSceneCut)
1129
-    {
1130
-        for (int i = numFrames - 1; i > 0; i--)
1131
-        {
1132
-            if (framesi->interPCostPercDiff > 0.0)
1133
-                continue;
1134
-            int64_t interCost = framesi->costEst10;
1135
-            int64_t intraCost = framesi->costEst00;
1136
-            if (interCost < 0 || intraCost < 0)
1137
-                continue;
1138
-            int times = 0;
1139
-            double averagePcost = 0.0, averageIcost = 0.0;
1140
-            for (int j = i - 1; j >= 0 && times < 5; j--, times++)
1141
-            {
1142
-                if (framesj->costEst00 > 0 && framesj->costEst10 > 0)
1143
-                {
1144
-                    averageIcost += framesj->costEst00;
1145
-                    averagePcost += framesj->costEst10;
1146
-                }
1147
-                else
1148
-                    times--;
1149
-            }
1150
-            if (times)
1151
-            {
1152
-                averageIcost = averageIcost / times;
1153
-                averagePcost = averagePcost / times;
1154
-                framesi->interPCostPercDiff = abs(interCost - averagePcost) / X265_MIN(interCost, averagePcost) * 100;
1155
-                framesi->intraCostPercDiff = abs(intraCost - averageIcost) / X265_MIN(intraCost, averageIcost) * 100;
1156
-            }
1157
-        }
1158
-    }
1159
-
1160
-    /* When scenecut threshold is set, use scenecut detection for I frame placements */
1161
-    if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frames1->bScenecut))
1162
+        isScenecut = histBasedScenecut(frames, 0, 1, origNumFrames);
1163
+    else
1164
         isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
1165
 
1166
-    if (isScenecut && (m_param->bHistBasedSceneCut || m_param->scenecutThreshold))
1167
+    /* When scenecut threshold is set, use scenecut detection for I frame placements */
1168
+    if (m_param->scenecutThreshold && isScenecut)
1169
     {
1170
         frames1->sliceType = X265_TYPE_I;
1171
         return;
1172
@@ -2061,8 +2745,7 @@
1173
         m_extendGopBoundary = false;
1174
         for (int i = m_param->bframes + 1; i < origNumFrames; i += m_param->bframes + 1)
1175
         {
1176
-            if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && framesi + 1->bScenecut))
1177
-                scenecut(frames, i, i + 1, true, origNumFrames);
1178
+            scenecut(frames, i, i + 1, true, origNumFrames);
1179
 
1180
             for (int j = i + 1; j <= X265_MIN(i + m_param->bframes + 1, origNumFrames); j++)
1181
             {
1182
@@ -2175,10 +2858,8 @@
1183
         {
1184
             for (int j = 1; j < numBFrames + 1; j++)
1185
             {
1186
-                bool isNextScenecut = false;
1187
-                if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && framesj + 1->bScenecut))
1188
-                    isNextScenecut = scenecut(frames, j, j + 1, false, origNumFrames);
1189
-                if (isNextScenecut || (bForceRADL && framesj->frameNum == preRADL))
1190
+                if (scenecut(frames, j, j + 1, false, origNumFrames) ||
1191
+                    (bForceRADL && (framesj->frameNum == preRADL)))
1192
                 {
1193
                     framesj->sliceType = X265_TYPE_P;
1194
                     numAnalyzed = j;
1195
@@ -2244,9 +2925,10 @@
1196
         /* Where A and B are scenes: AAAAAABBBAAAAAA
1197
          * If BBB is shorter than (maxp1-p0), it is detected as a flash
1198
          * and not considered a scenecut. */
1199
+
1200
         for (int cp1 = p1; cp1 <= maxp1; cp1++)
1201
         {
1202
-            if (!scenecutInternal(frames, p0, cp1, false) && !m_param->bHistBasedSceneCut)
1203
+            if (!scenecutInternal(frames, p0, cp1, false))
1204
             {
1205
                 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
1206
                 for (int i = cp1; i > p0; i--)
1207
@@ -2255,7 +2937,7 @@
1208
                     noScenecuts = false;
1209
                 }
1210
             }
1211
-            else if ((m_param->bHistBasedSceneCut && framescp1->m_bIsMaxThres) || scenecutInternal(frames, cp1 - 1, cp1, false))
1212
+            else if (scenecutInternal(frames, cp1 - 1, cp1, false))
1213
             {
1214
                 /* If current frame is a Scenecut from p0 frame as well as Scenecut from
1215
                  * preceeding frame, mark it as a Scenecut */
1216
@@ -2316,9 +2998,6 @@
1217
 
1218
     if (!framesp1->bScenecut)
1219
         return false;
1220
-    /* Check only scene transitions if max threshold */
1221
-    if (m_param->bHistBasedSceneCut && framesp1->m_bIsMaxThres)
1222
-        return framesp1->bScenecut;
1223
 
1224
     return scenecutInternal(frames, p0, p1, bRealScenecut);
1225
 }
1226
@@ -2336,19 +3015,8 @@
1227
     /* magic numbers pulled out of thin air */
1228
     float threshMin = (float)(threshMax * 0.25);
1229
     double bias = m_param->scenecutBias;
1230
-    if (m_param->bHistBasedSceneCut)
1231
-    {
1232
-        double minT = TEMPORAL_SCENECUT_THRESHOLD * (1 + m_param->edgeTransitionThreshold);
1233
-        if (frame->interPCostPercDiff > minT || frame->intraCostPercDiff > minT)
1234
-        {
1235
-            if (bRealScenecut && frame->bScenecut)
1236
-                x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d \n", frame->frameNum);
1237
-            return frame->bScenecut;
1238
-        }
1239
-        else
1240
-            return false;
1241
-    }
1242
-    else if (bRealScenecut)
1243
+
1244
+    if (bRealScenecut)
1245
     {
1246
         if (m_param->keyframeMin == m_param->keyframeMax)
1247
             threshMin = threshMax;
1248
@@ -2375,6 +3043,167 @@
1249
     return res;
1250
 }
1251
 
1252
+bool Lookahead::detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2)
1253
+{
1254
+    bool isAbruptChange;
1255
+    bool isSceneChange;
1256
+
1257
+    Lowres  *previousFrame = framesp0;
1258
+    Lowres  *currentFrame = framesp1;
1259
+    Lowres  *futureFrame = framesp2;
1260
+
1261
+    currentFrame->bHistScenecutAnalyzed = true;
1262
+
1263
+    uint32_t **accHistDiffRunningAvgCb = m_accHistDiffRunningAvgCb;
1264
+    uint32_t **accHistDiffRunningAvgCr = m_accHistDiffRunningAvgCr;
1265
+    uint32_t **accHistDiffRunningAvg = m_accHistDiffRunningAvg;
1266
+
1267
+    uint8_t absIntDiffFuturePast = 0;
1268
+    uint8_t absIntDiffFuturePresent = 0;
1269
+    uint8_t absIntDiffPresentPast = 0;
1270
+
1271
+    uint32_t abruptChangeCount = 0;
1272
+    uint32_t sceneChangeCount = 0;
1273
+
1274
+    uint32_t segmentWidth = frames1->widthFullRes / NUMBER_OF_SEGMENTS_IN_WIDTH;
1275
+    uint32_t segmentHeight = frames1->heightFullRes / NUMBER_OF_SEGMENTS_IN_HEIGHT;
1276
+
1277
+    for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
1278
+    {
1279
+        for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
1280
+        {
1281
+            isAbruptChange = false;
1282
+            isSceneChange = false;
1283
+
1284
+            // accumulative absolute histogram differences between the past and current frame
1285
+            uint32_t accHistDiff = 0;
1286
+            uint32_t accHistDiffCb = 0;
1287
+            uint32_t accHistDiffCr = 0;
1288
+
1289
+            uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
1290
+                frames1->widthFullRes - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
1291
+
1292
+            uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
1293
+                frames1->heightFullRes - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
1294
+
1295
+            segmentWidth += segmentWidthOffset;
1296
+            segmentHeight += segmentHeightOffset;
1297
+
1298
+            uint32_t segmentThreshHold = (
1299
+                ((X265_ABS((int64_t)currentFrame->picAvgVariance - (int64_t)previousFrame->picAvgVariance)) > PICTURE_DIFF_VARIANCE_TH) &&
1300
+                (currentFrame->picAvgVariance > PICTURE_VARIANCE_TH || previousFrame->picAvgVariance > PICTURE_VARIANCE_TH)) ?
1301
+                HIGH_VAR_SCENE_CHANGE_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
1302
+
1303
+            uint32_t segmentThreshHoldCb = (
1304
+                ((X265_ABS((int64_t)currentFrame->picAvgVarianceCb - (int64_t)previousFrame->picAvgVarianceCb)) > PICTURE_DIFF_VARIANCE_CHROMA_TH) &&
1305
+                (currentFrame->picAvgVarianceCb > PICTURE_VARIANCE_CHROMA_TH || previousFrame->picAvgVarianceCb > PICTURE_VARIANCE_CHROMA_TH)) ?
1306
+                HIGH_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
1307
+
1308
+            uint32_t segmentThreshHoldCr = (
1309
+                ((X265_ABS((int64_t)currentFrame->picAvgVarianceCr - (int64_t)previousFrame->picAvgVarianceCr)) > PICTURE_DIFF_VARIANCE_CHROMA_TH) &&
1310
+                (currentFrame->picAvgVarianceCr > PICTURE_VARIANCE_CHROMA_TH || previousFrame->picAvgVarianceCr > PICTURE_VARIANCE_CHROMA_TH)) ?
1311
+                HIGH_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
1312
+
1313
+            for (uint32_t bin = 0; bin < HISTOGRAM_NUMBER_OF_BINS; ++bin) {
1314
+                accHistDiff += X265_ABS((int32_t)currentFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0bin - (int32_t)previousFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0bin);
1315
+                accHistDiffCb += X265_ABS((int32_t)currentFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1bin - (int32_t)previousFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1bin);
1316
+                accHistDiffCr += X265_ABS((int32_t)currentFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2bin - (int32_t)previousFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2bin);
1317
+            }
1318
+
1319
+            if (m_resetRunningAvg) {
1320
+                accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex = accHistDiff;
1321
+                accHistDiffRunningAvgCbsegmentInFrameWidthIndexsegmentInFrameHeightIndex = accHistDiffCb;
1322
+                accHistDiffRunningAvgCrsegmentInFrameWidthIndexsegmentInFrameHeightIndex = accHistDiffCr;
1323
+            }
1324
+
1325
+            // difference between accumulative absolute histogram differences and the running average at the current frame.
1326
+            uint32_t accHistDiffError = X265_ABS((int32_t)accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex - (int32_t)accHistDiff);
1327
+            uint32_t accHistDiffErrorCb = X265_ABS((int32_t)accHistDiffRunningAvgCbsegmentInFrameWidthIndexsegmentInFrameHeightIndex - (int32_t)accHistDiffCb);
1328
+            uint32_t accHistDiffErrorCr = X265_ABS((int32_t)accHistDiffRunningAvgCrsegmentInFrameWidthIndexsegmentInFrameHeightIndex - (int32_t)accHistDiffCr);
1329
+
1330
+            if ((accHistDiffError > segmentThreshHold     && accHistDiff >= accHistDiffError) ||
1331
+                (accHistDiffErrorCb > segmentThreshHoldCb && accHistDiffCb >= accHistDiffErrorCb) ||
1332
+                (accHistDiffErrorCr > segmentThreshHoldCr && accHistDiffCr >= accHistDiffErrorCr)) {
1333
+
1334
+                isAbruptChange = true;
1335
+            }
1336
+
1337
+            if (isAbruptChange)
1338
+            {
1339
+                absIntDiffFuturePast = (uint8_t)X265_ABS((int16_t)futureFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 - (int16_t)previousFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0);
1340
+                absIntDiffFuturePresent = (uint8_t)X265_ABS((int16_t)futureFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 - (int16_t)currentFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0);
1341
+                absIntDiffPresentPast = (uint8_t)X265_ABS((int16_t)currentFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 - (int16_t)previousFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0);
1342
+
1343
+                if (absIntDiffFuturePresent >= FLASH_TH * absIntDiffFuturePast && absIntDiffPresentPast >= FLASH_TH * absIntDiffFuturePast) {
1344
+                    x265_log(m_param, X265_LOG_DEBUG, "Flash in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1345
+                }
1346
+                else if (absIntDiffFuturePresent < FADE_TH && absIntDiffPresentPast < FADE_TH) {
1347
+                    x265_log(m_param, X265_LOG_DEBUG, "Fade in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1348
+                }
1349
+                else if (X265_ABS(absIntDiffFuturePresent - absIntDiffPresentPast) < INTENSITY_CHANGE_TH && absIntDiffFuturePresent + absIntDiffPresentPast >= absIntDiffFuturePast) {
1350
+                    x265_log(m_param, X265_LOG_DEBUG, "Intensity Change in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1351
+                }
1352
+                else {
1353
+                    isSceneChange = true;
1354
+                    x265_log(m_param, X265_LOG_DEBUG, "Scene change in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1355
+                }
1356
+
1357
+            }
1358
+            else {
1359
+                accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex = (3 * accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex + accHistDiff) / 4;
1360
+            }
1361
+
1362
+            abruptChangeCount += isAbruptChange;
1363
+            sceneChangeCount += isSceneChange;
1364
+        }
1365
+    }
1366
+
1367
+    if (abruptChangeCount >= m_segmentCountThreshold) {
1368
+        m_resetRunningAvg = true;
1369
+    }
1370
+    else {
1371
+        m_resetRunningAvg = false;
1372
+    }
1373
+
1374
+    if ((sceneChangeCount >= m_segmentCountThreshold)) {
1375
+        x265_log(m_param, X265_LOG_DEBUG, "Scene Change in Pic Number# %i\n", currentFrame->frameNum);
1376
+
1377
+        return true;
1378
+    }
1379
+    else {
1380
+        return false;
1381
+    }
1382
+
1383
+}
1384
+
1385
+bool Lookahead::histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames)
1386
+{
1387
+    /* Only do analysis during a normal scenecut check. */
1388
+    if (m_param->bframes)
1389
+    {
1390
+        int origmaxp1 = p0 + 1;
1391
+        /* Look ahead to avoid coding short flashes as scenecuts. */
1392
+        origmaxp1 += m_param->bframes;
1393
+        int maxp1 = X265_MIN(origmaxp1, numFrames);
1394
+
1395
+        for (int cp1 = p0; cp1 < maxp1; cp1++)
1396
+        {
1397
+            if (framescp1 + 1->bHistScenecutAnalyzed == true)
1398
+                continue;
1399
+
1400
+            if (framescp1 + 2 != NULL && detectHistBasedSceneChange(frames, cp1, cp1 + 1, cp1 + 2))
1401
+            {
1402
+                /* If current frame is a Scenecut from p0 frame as well as Scenecut from
1403
+                 * preceeding frame, mark it as a Scenecut */
1404
+                framescp1+1->bScenecut = true;
1405
+            }
1406
+        }
1407
+
1408
+    }
1409
+
1410
+    return framesp1->bScenecut;
1411
+}
1412
+
1413
 void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)X265_LOOKAHEAD_MAX + 1)
1414
 {
1415
     char paths2X265_LOOKAHEAD_MAX + 1;
1416
@@ -2404,6 +3233,27 @@
1417
     memcpy(best_pathslength % (X265_BFRAME_MAX + 1), pathsidx ^ 1, length);
1418
 }
1419
 
1420
+// Find slicetype of the frame with poc # in lookahead buffer
1421
+int Lookahead::findSliceType(int poc)
1422
+{
1423
+    int out_slicetype = X265_TYPE_AUTO;
1424
+    if (m_filled)
1425
+    {
1426
+        m_outputLock.acquire();
1427
+        Frame* out = m_outputQueue.first();
1428
+        while (out != NULL) {
1429
+            if (poc == out->m_poc)
1430
+            {
1431
+                out_slicetype = out->m_lowres.sliceType;
1432
+                break;
1433
+            }
1434
+            out = out->m_next;
1435
+        }
1436
+        m_outputLock.release();
1437
+    }
1438
+    return out_slicetype;
1439
+}
1440
+
1441
 int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold)
1442
 {
1443
     int64_t cost = 0;
1444
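Note on the slicetype.cpp hunk above: with bEnableTemporalSubLayers > 2, a partial mini GOP at the end of the stream is no longer reordered as one flat unit; the while ((gopId >= 0) && (leftOver > 3)) loop peels it into predefined random-access sub-GOPs, largest first, and any remainder of fewer than four frames is coded as a flat tail without a pyramid. A minimal standalone sketch of that splitting policy; the gopSizes values are assumptions for illustration, since x265 walks gopId downward through its x265_gop_ra_length[] table rather than indexing up through an array like this:

    #include <cstdio>

    int main()
    {
        int gopSizes[] = { 8, 4, 2 };   // hypothetical predefined RA GOP lengths
        int leftOver = 13;              // frames in the partial mini GOP (bframes + 1)
        int idx = 0, start = 0;
        while (idx < 3 && leftOver > 3)
        {
            if (leftOver < gopSizes[idx]) { idx++; continue; } // try the next smaller GOP
            printf("sub-GOP: frames %d..%d\n", start, start + gopSizes[idx] - 1);
            start += gopSizes[idx];
            leftOver -= gopSizes[idx];
        }
        if (leftOver > 0)               // remainder (< 4 frames): flat mini GOP
            printf("tail: frames %d..%d (no pyramid)\n", start, start + leftOver - 1);
        return 0;
    }

For 13 leftover frames this prints an 8-frame and a 4-frame sub-GOP plus a 1-frame flat tail, matching the listReset/leftOver bookkeeping in the hunk.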
x265_3.5.tar.gz/source/encoder/slicetype.h -> x265_3.6.tar.gz/source/encoder/slicetype.h Changed
 
@@ -44,6 +44,24 @@
 #define EDGE_INCLINATION 45
 #define TEMPORAL_SCENECUT_THRESHOLD 50
 
+#define X265_ABS(a)                        (((a) < 0) ? (-(a)) : (a))
+
+#define PICTURE_DIFF_VARIANCE_TH            390
+#define PICTURE_VARIANCE_TH                 1500
+#define LOW_VAR_SCENE_CHANGE_TH             2250
+#define HIGH_VAR_SCENE_CHANGE_TH            3500
+
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH     10
+#define PICTURE_VARIANCE_CHROMA_TH          20
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH      2250/4
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH     3500/4
+
+#define FLASH_TH                            1.5
+#define FADE_TH                             4
+#define INTENSITY_CHANGE_TH                 4
+
+#define NUM64x64INPIC(w,h)                  ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
+
 #if HIGH_BIT_DEPTH
 #define EDGE_THRESHOLD 1023.0
 #else
@@ -93,7 +111,29 @@
 
     ~LookaheadTLD() { X265_FREE(wbuffer[0]); }
 
+    void collectPictureStatistics(Frame *curFrame);
+    void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
+
+    void computeIntensityHistogramBinsChroma(
+        Frame    *curFrame,
+        uint64_t *sumAverageIntensityCb,
+        uint64_t *sumAverageIntensityCr);
+
+    void calculateHistogram(
+        pixel    *inputSrc,
+        uint32_t  inputWidth,
+        uint32_t  inputHeight,
+        intptr_t  stride,
+        uint8_t   dsFactor,
+        uint32_t *histogram,
+        uint64_t *sum);
+
+    void computePictureStatistics(Frame *curFrame);
+
+    uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
+
     void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void calcFrameSegment(Frame *curFrame);
     void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
 
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
@@ -124,7 +164,6 @@
 
     /* pre-lookahead */
     int           m_fullQueueSize;
-    int           m_histogram[X265_BFRAME_MAX + 1];
     int           m_lastKeyframe;
     int           m_8x8Width;
     int           m_8x8Height;
@@ -153,6 +192,16 @@
     bool          m_isFadeIn;
     uint64_t      m_fadeCount;
     int           m_fadeStart;
+
+    uint32_t    **m_accHistDiffRunningAvgCb;
+    uint32_t    **m_accHistDiffRunningAvgCr;
+    uint32_t    **m_accHistDiffRunningAvg;
+
+    bool          m_resetRunningAvg;
+    uint32_t      m_segmentCountThreshold;
+
+    int8_t                  m_gopId;
+
     Lookahead(x265_param *param, ThreadPool *pool);
 #if DETAILED_CU_STATS
     int64_t       m_slicetypeDecideElapsedTime;
@@ -174,6 +223,7 @@
 
     void    getEstimatedPictureCost(Frame *pic);
     void    setLookaheadQueue();
+    int     findSliceType(int poc);
 
 protected:
 
@@ -184,6 +234,10 @@
     /* called by slicetypeAnalyse() to make slice decisions */
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
     bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
+
+    bool    histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
+    bool    detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
+
     void    slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
     int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
     int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
@@ -199,6 +253,9 @@
 
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
+    /*Compute index for positioning B-Ref frames*/
+    void     placeBref(Frame** frames, int start, int end, int num, int *brefs);
+    void     compCostBref(Lowres **frame, int start, int end, int num);
 };
 
 class PreLookaheadGroup : public BondedTaskGroup
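The new slicetype.h thresholds scale per segment with how many 64x64 blocks the segment contains, via NUM64x64INPIC(w,h). A worked example of the luma abrupt-change threshold; the 4x4 segment grid and MAX_LOG2_CU_SIZE = 6 are assumptions here, since NUMBER_OF_SEGMENTS_IN_WIDTH/HEIGHT are defined outside this hunk:

    #include <cstdio>

    int main()
    {
        const int MAX_LOG2_CU_SIZE = 6;                          // 64x64 CUs (assumed)
        unsigned w = 1920 / 4, h = 1080 / 4;                     // one segment of a 1080p frame: 480x270
        unsigned num64x64 = (w * h) >> (MAX_LOG2_CU_SIZE << 1);  // NUM64x64INPIC(w,h) = 31
        printf("low-variance luma threshold:  %u\n", 2250 * num64x64);  // 69750
        printf("high-variance luma threshold: %u\n", 3500 * num64x64);  // 108500
        return 0;
    }

The variance comparison against PICTURE_DIFF_VARIANCE_TH/PICTURE_VARIANCE_TH selects the high or low multiplier, so busier content needs a larger histogram difference before a segment votes for a scene change.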
x265_3.5.tar.gz/source/output/output.cpp -> x265_3.6.tar.gz/source/output/output.cpp Changed
 
@@ -30,14 +30,14 @@
 
 using namespace X265_NS;
 
-ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth)
 {
     const char * s = strrchr(fname, '.');
 
     if (s && !strcmp(s, ".y4m"))
-        return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp);
+        return new Y4MOutput(fname, width, height, bitdepth, fpsNum, fpsDenom, csp, sourceBitDepth);
     else
-        return new YUVOutput(fname, width, height, bitdepth, csp);
+        return new YUVOutput(fname, width, height, bitdepth, csp, sourceBitDepth);
 }
 
 OutputFile* OutputFile::open(const char *fname, InputFileInfo& inputInfo)
x265_3.5.tar.gz/source/output/output.h -> x265_3.6.tar.gz/source/output/output.h Changed
 
@@ -42,7 +42,7 @@
     ReconFile()           {}
 
     static ReconFile* open(const char *fname, int width, int height, uint32_t bitdepth,
-                           uint32_t fpsNum, uint32_t fpsDenom, int csp);
+                           uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth);
 
     virtual bool isFail() const = 0;
 
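The extra sourceBitDepth parameter threads the input depth through to the recon writers, which now choose their write path at run time instead of at compile time. A hypothetical call site (the argument values are illustrative; only the signature comes from the diff):

    // Open a Y4M recon writer for 10-bit 1080p25 4:2:0 where the source was
    // also 10-bit, so samples are written through without down-shifting.
    ReconFile* recon = ReconFile::open("recon.y4m", 1920, 1080,
                                       10,      /* recon bit depth */
                                       25, 1,   /* fps numerator/denominator */
                                       X265_CSP_I420,
                                       10);     /* sourceBitDepth (new argument) */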
x265_3.5.tar.gz/source/output/y4m.cpp -> x265_3.6.tar.gz/source/output/y4m.cpp Changed
 
@@ -28,11 +28,13 @@
 using namespace X265_NS;
 using namespace std;
 
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
     : width(w)
     , height(h)
+    , bitDepth(bitdepth)
     , colorSpace(csp)
     , frameSize(0)
+    , inputDepth(inputdepth)
 {
     ofs.open(filename, ios::binary | ios::out);
     buf = new char[width];
@@ -41,7 +43,13 @@
 
     if (ofs)
     {
-        ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+        if (bitDepth == 10)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
+        else if (bitDepth == 12)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
+        else
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+
         header = ofs.tellp();
     }
 
@@ -58,52 +66,81 @@
 bool Y4MOutput::writePicture(const x265_picture& pic)
 {
     std::ofstream::pos_type outPicPos = header;
-    outPicPos += (uint64_t)pic.poc * (6 + frameSize);
+    if (pic.bitDepth > 8)
+        outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
+    else
+        outPicPos += (uint64_t)pic.poc * (6 + frameSize);
     ofs.seekp(outPicPos);
     ofs << "FRAME\n";
 
-#if HIGH_BIT_DEPTH
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
-#else
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
-#endif
+    if (inputDepth > 8)
+    {
+        if (pic.bitDepth == 8 && pic.poc == 0)
+            x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
+    }
 
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
 
-#if HIGH_BIT_DEPTH
-
-    // encoder gave us short pixels, downshift, then write
-    X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
-    int shift = pic.bitDepth - 8;
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+    if (inputDepth > 8)//if HIGH_BIT_DEPTH
     {
-        uint16_t *src = (uint16_t*)pic.planes[i];
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+        if (pic.bitDepth == 8)
         {
-            for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
-                buf[w] = (char)(src[w] >> shift);
-
-            ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
-            src += pic.stride[i] / sizeof(*src);
+            // encoder gave us short pixels, downshift, then write
+            X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+            int shift = pic.bitDepth - 8;
+            for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+            {
+                char *src = (char*)pic.planes[i];
+                for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+                {
+                    for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
+                        buf[w] = (char)(src[w] >> shift);
+
+                    ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
+                    src += pic.stride[i] / sizeof(*src);
+                }
+            }
+        }
+        else
+        {
+            X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+            for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+            {
+                uint16_t *src = (uint16_t*)pic.planes[i];
+                for (int h = 0; h < (height * 1) >> x265_cli_csps[colorSpace].height[i]; h++)
+                {
+                    ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
+                    src += pic.stride[i] / sizeof(*src);
+                }
+            }
         }
     }
-
-#else // if HIGH_BIT_DEPTH
-
-    X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+    else if (inputDepth == 8 && pic.bitDepth > 8)
     {
-        char *src = (char*)pic.planes[i];
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+        X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
         {
-            ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
-            src += pic.stride[i] / sizeof(*src);
+            uint16_t* src = (uint16_t*)pic.planes[i];
+            for (int h = 0; h < (height * 1) >> x265_cli_csps[colorSpace].height[i]; h++)
+            {
+                ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
+                src += pic.stride[i] / sizeof(*src);
+            }
+        }
+    }
+    else
+    {
+        X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+        {
+            char *src = (char*)pic.planes[i];
+            for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+            {
+                ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
+                src += pic.stride[i] / sizeof(*src);
+            }
         }
     }
-
-#endif // if HIGH_BIT_DEPTH
 
     return true;
 }
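With the header change above, a 10- or 12-bit recon now gets a depth-tagged Y4M header instead of being forced down to 8 bits. A self-contained sketch reproducing the exact header the new code emits for a 10-bit 4:2:0 stream (note the literal spaces around '=' in the XYSCSS tag, which is what the stream insertions in the diff produce):

    #include <iostream>

    int main()
    {
        const char *cf = "420";              // chroma tag, as derived from csp
        int width = 1920, height = 1080;
        unsigned fpsNum = 25, fpsDenom = 1;  // example frame rate
        std::cout << "YUV4MPEG2 W" << width << " H" << height
                  << " F" << fpsNum << ":" << fpsDenom << " Ip"
                  << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
        // prints: YUV4MPEG2 W1920 H1080 F25:1 Ip C420p10 XYSCSS = 420P10
        return 0;
    }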
x265_3.5.tar.gz/source/output/y4m.h -> x265_3.6.tar.gz/source/output/y4m.h Changed
 
@@ -38,10 +38,14 @@
 
     int height;
 
+    uint32_t bitDepth;
+
     int colorSpace;
 
     uint32_t frameSize;
 
+    int inputDepth;
+
     std::ofstream ofs;
 
     std::ofstream::pos_type header;
@@ -52,7 +56,7 @@
 
 public:
 
-    Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp);
+    Y4MOutput(const char *filename, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputDepth);
 
     virtual ~Y4MOutput();
 
x265_3.5.tar.gz/source/output/yuv.cpp -> x265_3.6.tar.gz/source/output/yuv.cpp Changed
107
 
1
@@ -28,12 +28,13 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 
5
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
6
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
7
     : width(w)
8
     , height(h)
9
     , depth(d)
10
     , colorSpace(csp)
11
     , frameSize(0)
12
+    , inputDepth(inputdepth)
13
 {
14
     ofs.open(filename, ios::binary | ios::out);
15
     buf = new charwidth;
16
@@ -56,50 +57,52 @@
17
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
18
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
19
 
20
-#if HIGH_BIT_DEPTH
21
-    if (depth == 8)
22
+    if (inputDepth > 8)
23
     {
24
-        int shift = pic.bitDepth - 8;
25
-        ofs.seekp((std::streamoff)fileOffset);
26
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
27
-        {
28
-            uint16_t *src = (uint16_t*)pic.planesi;
29
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
30
-            {
31
-                for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
32
-                    bufw = (char)(srcw >> shift);
33
+   if (depth == 8)
34
+   {
35
+       int shift = pic.bitDepth - 8;
36
+       ofs.seekp((std::streamoff)fileOffset);
37
+       for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
38
+       {
39
+           uint16_t *src = (uint16_t*)pic.planesi;
40
+           for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
41
+           {
42
+               for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
43
+                   bufw = (char)(srcw >> shift);
44
 
45
-                ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
46
-                src += pic.stridei / sizeof(*src);
47
-            }
48
-        }
49
+               ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
50
+               src += pic.stridei / sizeof(*src);
51
+           }
52
+       }
53
+   }
54
+   else
55
+   {
56
+       ofs.seekp((std::streamoff)(fileOffset * 2));
57
+       for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
58
+       {
59
+           uint16_t *src = (uint16_t*)pic.planes[i];
60
+           for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
61
+           {
62
+               ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
63
+               src += pic.stride[i] / sizeof(*src);
64
+           }
65
+       }
66
+   }
67
     }
68
     else
69
     {
70
-        ofs.seekp((std::streamoff)(fileOffset * 2));
71
-        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
72
-        {
73
-            uint16_t *src = (uint16_t*)pic.planes[i];
74
-            for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
75
-            {
76
-                ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
77
-                src += pic.stride[i] / sizeof(*src);
78
-            }
79
-        }
80
+   ofs.seekp((std::streamoff)fileOffset);
81
+   for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
82
+   {
83
+       char *src = (char*)pic.planes[i];
84
+       for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
85
+       {
86
+           ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
87
+           src += pic.stride[i] / sizeof(*src);
88
+       }
89
+   }
90
     }
91
-#else // if HIGH_BIT_DEPTH
92
-    ofs.seekp((std::streamoff)fileOffset);
93
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
94
-    {
95
-        char *src = (char*)pic.planes[i];
96
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
97
-        {
98
-            ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
99
-            src += pic.stride[i] / sizeof(*src);
100
-        }
101
-    }
102
-
103
-#endif // if HIGH_BIT_DEPTH
104
 
105
     return true;
106
 }
107
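YUVOutput::writeFrame now branches on the run-time inputDepth instead of the HIGH_BIT_DEPTH compile-time switch: high-depth input is either shifted down to an 8-bit file or written as raw 16-bit samples, and 8-bit input is copied byte for byte. A condensed, self-contained sketch of that per-plane choice (simplified to one plane, stride counted in samples here, helper name invented):

    #include <fstream>
    #include <vector>
    #include <cstdint>

    // Illustrative restatement of the branch structure above, not a drop-in replacement.
    static void writePlaneSketch(std::ofstream& ofs, const uint16_t* src, int width, int height,
                                 intptr_t stride, int inputDepth, int depth, int bitDepth)
    {
        if (inputDepth > 8 && depth == 8)
        {
            std::vector<char> row(width);
            int shift = bitDepth - 8;                        // shift high-depth samples down to 8 bits
            for (int h = 0; h < height; h++, src += stride)
            {
                for (int w = 0; w < width; w++)
                    row[w] = (char)(src[w] >> shift);
                ofs.write(row.data(), width);
            }
        }
        else if (inputDepth > 8)
        {
            for (int h = 0; h < height; h++, src += stride)  // two bytes per sample, written as-is
                ofs.write(reinterpret_cast<const char*>(src), width * 2);
        }
        else
        {
            const char* p = reinterpret_cast<const char*>(src);
            for (int h = 0; h < height; h++, p += stride)    // 8-bit end to end
                ofs.write(p, width);
        }
    }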
x265_3.5.tar.gz/source/output/yuv.h -> x265_3.6.tar.gz/source/output/yuv.h Changed
18
 
1
@@ -46,13 +46,15 @@
2
 
3
     uint32_t frameSize;
4
 
5
+    int inputDepth;
6
+
7
     char *buf;
8
 
9
     std::ofstream ofs;
10
 
11
 public:
12
 
13
-    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp);
14
+    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp, int inputDepth);
15
 
16
     virtual ~YUVOutput();
17
 
18
x265_3.5.tar.gz/source/test/CMakeLists.txt -> x265_3.6.tar.gz/source/test/CMakeLists.txt Changed
24
 
1
@@ -23,15 +23,13 @@
2
 
3
 # add ARM assembly files
4
 if(ARM OR CROSS_COMPILE_ARM)
5
-    if(NOT ARM64)
6
-        enable_language(ASM)
7
-        set(NASM_SRC checkasm-arm.S)
8
-        add_custom_command(
9
-            OUTPUT checkasm-arm.obj
10
-            COMMAND ${CMAKE_CXX_COMPILER}
11
-            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
12
-            DEPENDS checkasm-arm.S)
13
-    endif()
14
+    enable_language(ASM)
15
+    set(NASM_SRC checkasm-arm.S)
16
+    add_custom_command(
17
+        OUTPUT checkasm-arm.obj
18
+        COMMAND ${CMAKE_CXX_COMPILER}
19
+        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
20
+        DEPENDS checkasm-arm.S)
21
 endif(ARM OR CROSS_COMPILE_ARM)
22
 
23
 # add PowerPC assembly files
24
x265_3.5.tar.gz/source/test/pixelharness.cpp -> x265_3.6.tar.gz/source/test/pixelharness.cpp Changed
63
 
1
@@ -406,6 +406,32 @@
2
     return true;
3
 }
4
 
5
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
6
+{
7
+    ALIGN_VAR_16(pixel, ref_destf[32 * 32]);
8
+    ALIGN_VAR_16(pixel, opt_destf[32 * 32]);
9
+
10
+    intptr_t src_stride = 64;
11
+    intptr_t dst_stride = 32;
12
+    int bx = 32;
13
+    int by = 32;
14
+    int j = 0;
15
+    for (int i = 0; i < ITERS; i++)
16
+    {
17
+        int index = i % TEST_CASES;
18
+        ref(pixel_test_buff[index] + j, ref_destf, src_stride, dst_stride, bx, by);
19
+        checked(opt, pixel_test_buff[index] + j, opt_destf, src_stride, dst_stride, bx, by);
20
+
21
+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
22
+            return false;
23
+
24
+        reportfail();
25
+        j += INCR;
26
+    }
27
+
28
+    return true;
29
+}
30
+
31
 bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
32
 {
33
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
34
@@ -2793,6 +2819,15 @@
35
         }
36
     }
37
 
38
+    if (opt.frameSubSampleLuma)
39
+    {
40
+        if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
41
+        {
42
+            printf("SubSample Luma failed!\n");
43
+            return false;
44
+        }
45
+    }
46
+
47
     if (opt.scale1D_128to64NONALIGNED)
48
     {
49
         if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
50
@@ -3492,6 +3527,12 @@
51
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
52
     }
53
 
54
+    if (opt.frameSubSampleLuma)
55
+    {
56
+        HEADER0("downscaleluma");
57
+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
58
+    }
59
+
60
     if (opt.scale1D_128to64NONALIGNED)
61
     {
62
         HEADER0("scale1D_128to64");
63
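check_downscaleluma_t validates the new frameSubSampleLuma primitive (the luma subsampling step used by the motion-compensated temporal filter, now also covered by x86 assembly) by running the C reference and the optimized kernel over the shared random test buffers and memcmp-ing the 32x32 outputs. A plausible scalar reference, assuming plain 2x2 averaging with rounding; the exact rounding and the typedef of downscaleluma_t live in the encoder sources and may differ:

    #include <cstdint>

    // Assumed semantics only: average each 2x2 luma block into one output sample.
    static void frame_subsample_luma_sketch(const uint16_t* src, uint16_t* dst,
                                            intptr_t srcStride, intptr_t dstStride,
                                            int outWidth, int outHeight)
    {
        for (int y = 0; y < outHeight; y++, src += 2 * srcStride, dst += dstStride)
            for (int x = 0; x < outWidth; x++)
            {
                const uint16_t* s = src + 2 * x;
                dst[x] = (uint16_t)((s[0] + s[1] + s[srcStride] + s[srcStride + 1] + 2) >> 2);
            }
    }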
x265_3.5.tar.gz/source/test/pixelharness.h -> x265_3.6.tar.gz/source/test/pixelharness.h Changed
9
 
1
@@ -138,6 +138,7 @@
2
     bool check_integral_inith(integralh_t ref, integralh_t opt);
3
     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
4
     bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
5
+    bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
6
 
7
 public:
8
 
9
x265_3.5.tar.gz/source/test/rate-control-tests.txt -> x265_3.6.tar.gz/source/test/rate-control-tests.txt Changed
10
 
1
@@ -15,7 +15,7 @@
2
 112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
3
 Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
4
 Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
5
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
6
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
7
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
8
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000  --tune grain
9
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
10
x265_3.5.tar.gz/source/test/regression-tests.txt -> x265_3.6.tar.gz/source/test/regression-tests.txt Changed
91
 
1
@@ -18,12 +18,12 @@
2
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
3
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
4
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
5
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
6
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
7
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
8
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
9
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
10
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
11
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
12
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
13
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
14
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
15
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
16
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
17
@@ -33,7 +33,7 @@
18
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
19
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
20
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
21
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
22
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
23
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
24
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
25
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
26
@@ -41,7 +41,7 @@
27
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
28
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
29
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
30
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
31
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
32
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
33
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
34
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
35
@@ -49,11 +49,11 @@
36
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
37
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
38
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
39
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
40
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
41
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
42
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
43
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
44
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
45
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
46
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
47
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
48
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
49
@@ -158,13 +158,10 @@
50
 ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
51
 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
52
 Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
53
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
54
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
55
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
56
 crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
57
 crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
58
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
59
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
60
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
61
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
62
  
63
 # Main12 intraCost overflow bug test
64
 720p50_parkrun_ter.y4m,--preset medium
65
@@ -182,14 +179,22 @@
66
 
67
 #scaled save/load test
68
 crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000 
69
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
70
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
71
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
72
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
73
 crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
74
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
75
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
76
 ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat --refine-intra 4 --refine-inter 2
77
 #save/load with ctu distortion refinement
78
 CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
79
 #segment encoding
80
 BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
81
 
82
+#Test FG SEI message addition
83
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
84
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
85
+
86
+#Temporal layers tests
87
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
88
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
89
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
90
 # vim: tw=200
91
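--temporal-layers is no longer a bare switch; the updated entries pass an explicit sub-layer count (the tests above use 2 through 5) and pair it with --b-adapt 0 so the mini-GOP can follow the fixed hierarchical-B pattern. The equivalent request through the library API, as an illustrative fragment (option names are the CLI spellings, which x265_param_parse also accepts):

    x265_param* p = x265_param_alloc();
    x265_param_default_preset(p, "medium", NULL);
    x265_param_parse(p, "temporal-layers", "3");   // three temporal sub-layers
    x265_param_parse(p, "b-adapt", "0");           // fixed B-frame placement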
x265_3.5.tar.gz/source/test/save-load-tests.txt -> x265_3.6.tar.gz/source/test/save-load-tests.txt Changed
16
 
1
@@ -12,10 +12,10 @@
2
 # not auto-detected.
3
 crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
4
 crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
5
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
6
-crowd_run_1080p50.y4m,  --preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
7
-crowd_run_1080p50.y4m,   --preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
8
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
9
+crowd_run_1080p50.y4m,  --preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
10
+crowd_run_1080p50.y4m,   --preset medium --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
11
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
12
-crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
13
+crowd_run_540p50.y4m,   --preset veryslow --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
14
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
15
 News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
16
x265_3.5.tar.gz/source/test/smoke-tests.txt -> x265_3.6.tar.gz/source/test/smoke-tests.txt Changed
9
 
1
@@ -23,3 +23,7 @@
2
 # Main12 intraCost overflow bug test
3
 720p50_parkrun_ter.y4m,--preset medium
4
 720p50_parkrun_ter.y4m,--preset=fast --hevc-aq --no-cutree
5
+# Test FG SEI message addition
6
+# CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
7
+# DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
8
+# NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
9
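The commented-out entries exercise the new film-grain SEI path, which embeds the characteristics read from a pre-analysed grain file (--film-grain <filename>) into the bitstream for Film Grain Synthesis. A hedged API-side fragment; the file name is a placeholder, and it is assumed that x265_param_parse accepts the same option name as the CLI:

    x265_param* p = x265_param_alloc();
    x265_param_default(p);
    x265_param_parse(p, "film-grain", "grain_characteristics.bin");  // placeholder path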
x265_3.5.tar.gz/source/test/testbench.cpp -> x265_3.6.tar.gz/source/test/testbench.cpp Changed
43
 
1
@@ -174,6 +174,8 @@
2
         { "AVX512", X265_CPU_AVX512 },
3
         { "ARMv6", X265_CPU_ARMV6 },
4
         { "NEON", X265_CPU_NEON },
5
+        { "SVE2", X265_CPU_SVE2 },
6
+        { "SVE", X265_CPU_SVE },
7
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
         { "", 0 },
9
     };
10
@@ -208,15 +210,8 @@
11
 
12
         EncoderPrimitives asmprim;
13
         memset(&asmprim, 0, sizeof(asmprim));
14
-        setupAssemblyPrimitives(asmprim, test_arch[i].flag);
15
-
16
-#if X265_ARCH_ARM64
17
-        /* Temporary workaround because luma_vsp assembly primitive has not been completed
18
-         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
19
-         * Otherwise, segment fault occurs. */
20
-        setupAliasCPrimitives(cprim, asmprim, test_archi.flag);
21
-#endif
22
 
23
+        setupAssemblyPrimitives(asmprim, test_arch[i].flag);
24
         setupAliasPrimitives(asmprim);
25
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
26
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
27
@@ -239,14 +234,8 @@
28
 #if X265_ARCH_X86
29
     setupInstrinsicPrimitives(optprim, cpuid);
30
 #endif
31
-    setupAssemblyPrimitives(optprim, cpuid);
32
 
33
-#if X265_ARCH_ARM64
34
-    /* Temporary workaround because luma_vsp assembly primitive has not been completed
35
-     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
36
-     * Otherwise, segment fault occurs. */
37
-    setupAliasCPrimitives(cprim, optprim, cpuid);
38
-#endif
39
+    setupAssemblyPrimitives(optprim, cpuid);
40
 
41
     /* Note that we do not setup aliases for performance tests, that would be
42
      * redundant. The testbench only verifies they are correctly aliased */
43
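With SVE and SVE2 added to the architecture table, the corresponding AArch64 kernels can be selected explicitly from the testbench command line. The flags are ordinary bit masks from x265.h, so a capability check is just a mask test; illustrative fragment, where cpuid would come from the parsed --cpuid argument or from run-time detection:

    uint32_t cpuid = 0;                              // e.g. filled from the parsed --cpuid value
    bool haveSVE  = (cpuid & X265_CPU_SVE)  != 0;
    bool haveSVE2 = (cpuid & X265_CPU_SVE2) != 0;
    if (haveSVE2)
        printf("benchmarking SVE2 primitives\n");
    else if (haveSVE)
        printf("benchmarking SVE primitives\n");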
x265_3.5.tar.gz/source/test/testharness.h -> x265_3.6.tar.gz/source/test/testharness.h Changed
48
 
1
@@ -73,7 +73,7 @@
2
 #include <x86intrin.h>
3
 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
 #include <arm_neon.h>
5
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
6
+#else
7
 /* fallback for older GCC/MinGW */
8
 static inline uint32_t __rdtsc(void)
9
 {
10
@@ -82,15 +82,13 @@
11
 #if X265_ARCH_X86
12
     asm volatile("rdtsc" : "=a" (a) ::"edx");
13
 #elif X265_ARCH_ARM
14
-#if X265_ARCH_ARM64
15
-    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
16
-#else
17
     // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
18
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
19
 
20
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
21
     a = clock();
22
-#endif
23
+#elif  X265_ARCH_ARM64
24
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
25
 #endif
26
     return a;
27
 }
28
@@ -128,8 +126,8 @@
29
         x265_emms(); \
30
         float optperf = (10.0f * cycles / runs) / 4; \
31
         float refperf = (10.0f * refcycles / refruns) / 4; \
32
-        printf("\t%3.2fx ", refperf / optperf); \
33
-        printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
34
+        printf(" | \t%3.2fx | ", refperf / optperf); \
35
+        printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
36
     }
37
 
38
 extern "C" {
39
@@ -140,7 +138,7 @@
40
  * needs an explicit asm check because it only sometimes crashes in normal use. */
41
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
42
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
43
-#elif X265_ARCH_ARM == 0
44
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
45
 #define PFX(stack_pagealign)(func, align) func()
46
 #endif
47
 
48
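After the reorder, AArch64 builds fall through to the generic-timer read instead of clock(), which keeps the REPORT_SPEEDUP ratios meaningful on those targets. A standalone sketch of the same counter read (AArch64 with a GNU-style compiler; converting ticks to seconds would additionally need cntfrq_el0):

    #if defined(__aarch64__)
    static inline uint64_t read_virtual_counter(void)
    {
        uint64_t ticks;
        asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));   // same instruction as the fallback above
        return ticks;
    }
    #endif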
x265_3.5.tar.gz/source/x265.cpp -> x265_3.6.tar.gz/source/x265.cpp Changed
18
 
1
@@ -296,6 +296,16 @@
2
 
3
     int ret = 0;
4
 
5
+    if (cliopt[0].scenecutAwareQpConfig)
6
+    {
7
+        if (!cliopt[0].parseScenecutAwareQpConfig())
8
+        {
9
+            x265_log(NULL, X265_LOG_ERROR, "Unable to parse scenecut aware qp config file \n");
10
+            fclose(cliopt[0].scenecutAwareQpConfig);
11
+            cliopt[0].scenecutAwareQpConfig = NULL;
12
+        }
13
+    }
14
+
15
     AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
16
     int threadsActive = abrEnc->m_numActiveEncodes.get();
17
     while (threadsActive)
18
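The CLI now tries to parse an optional scenecut-aware-QP configuration file up front and, on failure, logs an error and drops the handle so encoding continues without it. Library users get a parallel entry point, x265_scenecut_aware_qp_param_parse(), declared in x265.h below; an illustrative fragment, where the key and value strings are placeholders rather than documented names:

    x265_param* p = x265_param_alloc();
    x265_param_default(p);
    x265_param_parse(p, "scenecut-aware-qp", "3");                          // bidirectional masking
    x265_scenecut_aware_qp_param_parse(p, "masking-strength", "(dur,qp)");  // placeholder key/value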
x265_3.5.tar.gz/source/x265.h -> x265_3.6.tar.gz/source/x265.h Changed
470
 
1
@@ -26,6 +26,7 @@
2
 #define X265_H
3
 #include <stdint.h>
4
 #include <stdio.h>
5
+#include <sys/stat.h>
6
 #include "x265_config.h"
7
 #ifdef __cplusplus
8
 extern "C" {
9
@@ -59,7 +60,7 @@
10
     NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
11
     NAL_UNIT_CODED_SLICE_TRAIL_R,
12
     NAL_UNIT_CODED_SLICE_TSA_N,
13
-    NAL_UNIT_CODED_SLICE_TLA_R,
14
+    NAL_UNIT_CODED_SLICE_TSA_R,
15
     NAL_UNIT_CODED_SLICE_STSA_N,
16
     NAL_UNIT_CODED_SLICE_STSA_R,
17
     NAL_UNIT_CODED_SLICE_RADL_N,
18
@@ -311,6 +312,7 @@
19
     double           vmafFrameScore;
20
     double           bufferFillFinal;
21
     double           unclippedBufferFillFinal;
22
+    uint8_t          tLayer;
23
 } x265_frame_stats;
24
 
25
 typedef struct x265_ctu_info_t
26
@@ -536,6 +538,8 @@
27
 /* ARM */
28
 #define X265_CPU_ARMV6           0x0000001
29
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
30
+#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
31
+#define X265_CPU_SVE             0x0000010  /* ARM SVE2 */
32
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
33
 
34
 /* IBM Power8 */
35
@@ -613,6 +617,13 @@
36
 #define SLICE_TYPE_DELTA        0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
37
 #define BACKWARD_WINDOW         1 /* Scenecut window before a scenecut */
38
 #define FORWARD_WINDOW          2 /* Scenecut window after a scenecut */
39
+#define BWD_WINDOW_DELTA        0.4
40
+
41
+#define X265_MAX_GOP_CONFIG 3
42
+#define X265_MAX_GOP_LENGTH 16
43
+#define MAX_T_LAYERS 7
44
+
45
+#define X265_IPRATIO_STRENGTH   1.43
46
 
47
 typedef struct x265_cli_csp
48
 {
49
@@ -696,6 +707,7 @@
50
 typedef struct x265_zone
51
 {
52
     int   startFrame, endFrame; /* range of frame numbers */
53
+    int   keyframeMax;          /* it store the default/user defined keyframeMax value*/
54
     int   bForceQp;             /* whether to use qp vs bitrate factor */
55
     int   qp;
56
     float bitrateFactor;
57
@@ -747,6 +759,271 @@
58
 
59
 static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
60
 
61
+typedef struct x265_temporal_layer {
62
+    int poc_offset;      /* POC offset */
63
+    int8_t layer;        /* Current layer */
64
+    int8_t qp_offset;    /* QP offset */
65
+} x265_temporal_layer;
66
+
67
+static const int8_t x265_temporal_layer_bframes[MAX_T_LAYERS] = {-1, -1, 3, 7, 15, -1, -1};
68
+
69
+static const int8_t x265_gop_ra_length[X265_MAX_GOP_CONFIG] = { 4, 8, 16};
70
+static const x265_temporal_layer x265_gop_ra[X265_MAX_GOP_CONFIG][X265_MAX_GOP_LENGTH] = {
71
+    {
72
+        {
73
+            4,
74
+            0,
75
+            1,
76
+        },
77
+        {
78
+            2,
79
+            1,
80
+            5,
81
+        },
82
+        {
83
+            1,
84
+            2,
85
+            3,
86
+        },
87
+        {
88
+            3,
89
+            2,
90
+            5,
91
+        },
92
+        {
93
+            -1,
94
+            -1,
95
+            -1,
96
+        },
97
+        {
98
+            -1,
99
+            -1,
100
+            -1,
101
+        },
102
+        {
103
+            -1,
104
+            -1,
105
+            -1,
106
+        },
107
+        {
108
+            -1,
109
+            -1,
110
+            -1,
111
+        },
112
+        {
113
+            -1,
114
+            -1,
115
+            -1,
116
+        },
117
+        {
118
+            -1,
119
+            -1,
120
+            -1,
121
+        },
122
+        {
123
+            -1,
124
+            -1,
125
+            -1,
126
+        },
127
+        {
128
+            -1,
129
+            -1,
130
+            -1,
131
+        },
132
+        {
133
+            -1,
134
+            -1,
135
+            -1,
136
+        },
137
+        {
138
+            -1,
139
+            -1,
140
+            -1,
141
+        },
142
+        {
143
+            -1,
144
+            -1,
145
+            -1,
146
+        },
147
+        {
148
+            -1,
149
+            -1,
150
+            -1,
151
+        }
152
+    },
153
+
154
+    {
155
+        {
156
+            8,
157
+            0,
158
+            1,
159
+        },
160
+        {
161
+            4,
162
+            1,
163
+            5,
164
+        },
165
+        {
166
+            2,
167
+            2,
168
+            4,
169
+        },
170
+        {
171
+            1,
172
+            3,
173
+            5,
174
+        },
175
+        {
176
+            3,
177
+            3,
178
+            2,
179
+        },
180
+        {
181
+            6,
182
+            2,
183
+            5,
184
+        },
185
+        {
186
+            5,
187
+            3,
188
+            4,
189
+        },
190
+        {
191
+            7,
192
+            3,
193
+            5,
194
+        },
195
+        {
196
+            -1,
197
+            -1,
198
+            -1,
199
+        },
200
+        {
201
+            -1,
202
+            -1,
203
+            -1,
204
+        },
205
+        {
206
+            -1,
207
+            -1,
208
+            -1,
209
+        },
210
+        {
211
+            -1,
212
+            -1,
213
+            -1,
214
+        },
215
+        {
216
+            -1,
217
+            -1,
218
+            -1,
219
+        },
220
+        {
221
+            -1,
222
+            -1,
223
+            -1,
224
+        },
225
+        {
226
+            -1,
227
+            -1,
228
+            -1,
229
+        },
230
+        {
231
+            -1,
232
+            -1,
233
+            -1,
234
+        },
235
+    },
236
+    {
237
+        {
238
+            16,
239
+            0,
240
+            1,
241
+        },
242
+        {
243
+            8,
244
+            1,
245
+            6,
246
+        },
247
+        {
248
+            4,
249
+            2,
250
+            5,
251
+        },
252
+        {
253
+            2,
254
+            3,
255
+            6,
256
+        },
257
+        {
258
+            1,
259
+            4,
260
+            4,
261
+        },
262
+        {
263
+            3,
264
+            4,
265
+            6,
266
+        },
267
+        {
268
+            6,
269
+            3,
270
+            5,
271
+        },
272
+        {
273
+            5,
274
+            4,
275
+            6,
276
+        },
277
+        {
278
+            7,
279
+            4,
280
+            1,
281
+        },
282
+        {
283
+            12,
284
+            2,
285
+            6,
286
+        },
287
+        {
288
+            10,
289
+            3,
290
+            5,
291
+        },
292
+        {
293
+            9,
294
+            4,
295
+            6,
296
+        },
297
+        {
298
+            11,
299
+            4,
300
+            4,
301
+        },
302
+        {
303
+            14,
304
+            3,
305
+            6,
306
+        },
307
+        {
308
+            13,
309
+            4,
310
+            5,
311
+        },
312
+        {
313
+            15,
314
+            4,
315
+            6,
316
+        }
317
+    }
318
+};
319
+
320
+typedef enum
321
+{
322
+    X265_SHARE_MODE_FILE = 0,
323
+    X265_SHARE_MODE_SHAREDMEM
324
+}X265_DATA_SHARE_MODES;
325
+
326
 /* x265 input parameters
327
  *
328
  * For version safety you may use x265_param_alloc/free() to manage the
329
@@ -983,6 +1260,9 @@
330
      * performance impact, but the use case may preclude it.  Default true */
331
     int       bOpenGOP;
332
 
333
+   /*Force nal type to CRA to all frames expect first frame. Default disabled*/
334
+   int       craNal;
335
+
336
     /* Scene cuts closer together than this are coded as I, not IDR. */
337
     int       keyframeMin;
338
 
339
@@ -1433,10 +1713,10 @@
340
         double    rfConstantMin;
341
 
342
         /* Multi-pass encoding */
343
-        /* Enable writing the stats in a multi-pass encode to the stat output file */
344
+        /* Enable writing the stats in a multi-pass encode to the stat output file/memory */
345
         int       bStatWrite;
346
 
347
-        /* Enable loading data from the stat input file in a multi pass encode */
348
+        /* Enable loading data from the stat input file/memory in a multi pass encode */
349
         int       bStatRead;
350
 
351
         /* Filename of the 2pass output/input stats file, if unspecified the
352
@@ -1489,6 +1769,21 @@
353
         /* internally enable if tune grain is set */
354
         int      bEnableConstVbv;
355
 
356
+        /* if only the focused frames would be re-encode or not */
357
+        int       bEncFocusedFramesOnly;
358
+
359
+        /* Share the data with stats file or shared memory.
360
+        It must be one of the X265_DATA_SHARE_MODES enum values
361
+        Available if the bStatWrite or bStatRead is true.
362
+        Use stats file by default.
363
+        The stats file mode would be used among the encoders running in sequence.
364
+        The shared memory mode could only be used among the encoders running in parallel.
365
+        Now only the cutree data could be shared among shared memory. More data would be support in the future.*/
366
+        int       dataShareMode;
367
+
368
+        /* Unique shared memory name. Required if the shared memory mode enabled. NULL by default */
369
+        const char* sharedMemName;
370
+
371
     } rc;
372
 
373
     /*== Video Usability Information ==*/
374
@@ -1850,6 +2145,10 @@
375
       Default 1 (Enabled). API only. */
376
     int       bResetZoneConfig;
377
 
378
+    /*Flag to indicate rate-control history has not to be reset during zone reconfiguration.
379
+      Default 0 (Disabled) */
380
+    int       bNoResetZoneConfig;
381
+
382
     /* It reduces the bits spent on the inter-frames within the scenecutWindow before and / or after a scenecut
383
      * by increasing their QP in ratecontrol pass2 algorithm without any deterioration in visual quality.
384
      * 0 - Disabled (default).
385
@@ -1860,20 +2159,15 @@
386
 
387
     /* The duration(in milliseconds) for which there is a reduction in the bits spent on the inter-frames after a scenecut
388
      * by increasing their QP, when bEnableSceneCutAwareQp is 1 or 3. Default is 500ms.*/
389
-    int       fwdScenecutWindow;
390
+    int       fwdMaxScenecutWindow;
391
+    int       fwdScenecutWindow[6];
392
 
393
     /* The offset by which QP is incremented for inter-frames after a scenecut when bEnableSceneCutAwareQp is 1 or 3.
394
      * Default is +5. */
395
-    double    fwdRefQpDelta;
396
+    double    fwdRefQpDelta[6];
397
 
398
     /* The offset by which QP is incremented for non-referenced inter-frames after a scenecut when bEnableSceneCutAwareQp is 1 or 3. */
399
-    double    fwdNonRefQpDelta;
400
-
401
-    /* A genuine threshold used for histogram based scene cut detection.
402
-     * This threshold determines whether a frame is a scenecut or not
403
-     * when compared against the edge and chroma histogram sad values.
404
-     * Default 0.03. Range: Real number in the interval (0,1). */
405
-    double    edgeTransitionThreshold;
406
+    double    fwdNonRefQpDelta[6];
407
 
408
     /* Enables histogram based scenecut detection algorithm to detect scenecuts. Default disabled */
409
     int       bHistBasedSceneCut;
410
@@ -1941,13 +2235,39 @@
411
 
412
     /* The duration(in milliseconds) for which there is a reduction in the bits spent on the inter-frames before a scenecut
413
      * by increasing their QP, when bEnableSceneCutAwareQp is 2 or 3. Default is 100ms.*/
414
-    int       bwdScenecutWindow;
415
+    int       bwdMaxScenecutWindow;
416
+    int       bwdScenecutWindow[6];
417
 
418
     /* The offset by which QP is incremented for inter-frames before a scenecut when bEnableSceneCutAwareQp is 2 or 3. */
419
-    double    bwdRefQpDelta;
420
+    double    bwdRefQpDelta[6];
421
 
422
     /* The offset by which QP is incremented for non-referenced inter-frames before a scenecut when bEnableSceneCutAwareQp is 2 or 3. */
423
-    double    bwdNonRefQpDelta;
424
+    double    bwdNonRefQpDelta[6];
425
+
426
+    /* Specify combinations of color primaries, transfer characteristics, color matrix,
427
+    * range of luma and chroma signals, and chroma sample location. This has higher
428
+    * precedence than individual VUI parameters. If any individual VUI option is specified
429
+    * together with this, which changes the values set corresponding to the system-id
430
+    * or color-volume, it will be discarded. */
431
+    const char* videoSignalTypePreset;
432
+
433
+    /* Flag indicating whether the encoder should emit an End of Bitstream
434
+     * NAL at the end of bitstream. Default false */
435
+    int      bEnableEndOfBitstream;
436
+
437
+    /* Flag indicating whether the encoder should emit an End of Sequence
438
+     * NAL at the end of every Coded Video Sequence. Default false */
439
+    int      bEnableEndOfSequence;
440
+
441
+    /* Film Grain Characteristic file */
442
+    char* filmGrain;
443
+
444
+    /*Motion compensated temporal filter*/
445
+    int      bEnableTemporalFilter;
446
+    double   temporalFilterStrength;
447
+
448
+    /*SBRC*/
449
+    int      bEnableSBRC;
450
 } x265_param;
451
 
452
 /* x265_param_alloc:
453
@@ -1982,6 +2302,8 @@
454
 
455
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
456
 
457
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
458
+
459
 static const char * const x265_profile_names[] = {
460
     /* HEVC v1 */
461
     "main", "main10", "mainstillpicture", /* alias */ "msp",
462
@@ -2251,6 +2573,7 @@
463
     void          (*param_free)(x265_param*);
464
     void          (*param_default)(x265_param*);
465
     int           (*param_parse)(x265_param*, const char*, const char*);
466
+    int           (*scenecut_aware_qp_param_parse)(x265_param*, const char*, const char*);
467
     int           (*param_apply_profile)(x265_param*, const char*);
468
     int           (*param_default_preset)(x265_param*, const char*, const char *);
469
     x265_picture* (*picture_alloc)(void);
470
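The new fixed random-access GOP tables describe, for each supported mini-GOP size (x265_gop_ra_length holds 4, 8 and 16), the encode-order entries with their display offset, temporal layer and QP offset, while x265_temporal_layer_bframes maps a requested layer count to the B-frame count that realizes it. A short fragment that simply walks the 16-frame table:

    #include <cstdio>
    #include "x265.h"

    int main()
    {
        const int cfg = 2;                                   // x265_gop_ra_length[2] == 16
        for (int i = 0; i < x265_gop_ra_length[cfg]; i++)
        {
            const x265_temporal_layer& e = x265_gop_ra[cfg][i];
            printf("poc_offset=%d layer=%d qp_offset=%d\n", e.poc_offset, e.layer, e.qp_offset);
        }
        return 0;
    }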
x265_3.5.tar.gz/source/x265cli.cpp -> x265_3.6.tar.gz/source/x265cli.cpp Changed
393
 
1
@@ -28,8 +28,8 @@
2
 #include "x265cli.h"
3
 #include "svt.h"
4
 
5
-#define START_CODE 0x00000001
6
-#define START_CODE_BYTES 4
7
+#define START_CODE 0x00000001
8
+#define START_CODE_BYTES 4
9
 
10
 #ifdef __cplusplus
11
 namespace X265_NS {
12
@@ -166,6 +166,7 @@
13
         H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
14
         H0("\nSlice decision options:\n");
15
         H0("   --no-open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
16
+       H0("   --cra-nal                     Force nal type to CRA to all frames expect first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
17
         H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
18
         H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
19
         H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
20
@@ -174,7 +175,6 @@
21
         H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
22
         H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
23
         H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
24
-        H1("   --hist-threshold <0.0..1.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
25
         H0("   --no-fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
26
         H1("   --scenecut-aware-qp <0..3>    Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
27
         H1("                                 0 - Disabled\n");
28
@@ -182,6 +182,7 @@
29
         H1("                                 2 - Backward masking\n");
30
         H1("                                 3 - Bidirectional masking\n");
31
         H1("   --masking-strength <string>   Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
32
+        H1("   --scenecut-qp-config <file>   File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
33
         H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
34
         H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
35
         H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
36
@@ -262,6 +263,7 @@
37
         H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
38
         H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
39
         H0("   --no-aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
40
+        H1("   --no-sbrc                   Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
41
         H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
42
         H0("   --no-cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
43
         H0("   --no-rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
44
@@ -282,6 +284,7 @@
45
         H1("                                       q=<integer> (force QP)\n");
46
         H1("                                   or  b=<float> (bitrate multiplier)\n");
47
         H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
48
+        H0("   --no-zonefile-rc-init         This allow to use rate-control history across zones in zonefile.\n");
49
         H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
50
         H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
51
         H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
52
@@ -314,6 +317,30 @@
53
         H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
54
         H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
55
         H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
56
+        H0("   --video-signal-type-preset <string>    Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
57
+        H0("                                            format: <system-id>:<color-volume>\n");
58
+        H0("                                            This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
59
+        H0("                                            which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
60
+        H0("                                            The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
61
+        H0("                                            system-id options and their corresponding values:\n");
62
+        H0("                                              BT601_525:       --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
63
+        H0("                                              BT601_626:       --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
64
+        H0("                                              BT709_YCC:       --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
65
+        H0("                                              BT709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
66
+        H0("                                              BT2020_YCC_NCL:  --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
67
+        H0("                                              BT2020_RGB:      --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
68
+        H0("                                              BT2100_PQ_YCC:   --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
69
+        H0("                                              BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
70
+        H0("                                              BT2100_PQ_RGB:   --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
71
+        H0("                                              BT2100_HLG_YCC:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
72
+        H0("                                              BT2100_HLG_RGB:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
73
+        H0("                                              FR709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
74
+        H0("                                              FR2020_RGB:      --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
75
+        H0("                                              FRP3D65_YCC:     --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
76
+        H0("                                            color-volume options and their corresponding values:\n");
77
+        H0("                                              P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
78
+        H0("                                              P3D65x4000n005:  --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
79
+        H0("                                              BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
80
         H0("   --no-cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
81
         H0("   --no-hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
82
         H0("   --no-hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
83
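A minimal usage sketch for the --video-signal-type-preset option documented above (file names are placeholders, and the positional output file is assumed from the standard x265 command line):

    x265 --input source.y4m --video-signal-type-preset BT2100_PQ_YCC:P3D65x1000n0005 out.hevc

Per the help text, this single option sets the corresponding --colorprim/--transfer/--colormatrix/--range/--chromaloc values plus the P3D65 --master-display string, and any conflicting individual VUI option given alongside it is discarded.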
@@ -324,9 +351,11 @@
84
         H0("   --no-repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
85
         H0("   --no-info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
86
         H0("   --no-hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
87
-        H0("   --no-idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
88
-        H0("   --no-temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
89
+        H0("   --no-idr-recovery-sei       Emit recovery point infor SEI at each IDR frame \n");
90
+        H0("   --temporal-layers             Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
91
         H0("   --no-aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
92
+        H0("   --no-eob                    Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
93
+        H0("   --no-eos                    Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
94
         H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
95
         H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
96
         H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
97
@@ -344,6 +373,7 @@
98
         H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
99
         H0("   --no-frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
100
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
101
+        H0("   --no-mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
102
 #ifdef SVT_HEVC
103
         H0("   --nosvt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
104
         H0("   --no-svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
105
@@ -365,6 +395,9 @@
106
         H1("    2 - unable to open encoder\n");
107
         H1("    3 - unable to generate stream headers\n");
108
         H1("    4 - encoder abort\n");
109
+        H0("\nSEI Message Options\n");
110
+        H0("   --film-grain <filename>           File containing Film Grain Characteristics to be written as a SEI Message\n");
111
+
112
 #undef OPT
113
 #undef H0
114
 #undef H1
115
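A hedged example for the new SEI message option listed above; the exact contents of the film-grain file are not described in this diff and are assumed to carry Film Grain Characteristics data:

    x265 --input source.y4m --film-grain grain_characteristics.bin out.hevc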
@@ -484,6 +517,9 @@
116
 
117
         memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
118
 
119
+        if (zonefileCount == 0)
120
+            globalParam->rc.zones[zonefileCount].keyframeMax = globalParam->keyframeMax;
121
+
122
         for (optind = 0;;)
123
         {
124
             int long_options_index = -1;
125
@@ -708,12 +744,19 @@
126
                         return true;
127
                     }
128
                 }
129
+                OPT("scenecut-qp-config")
130
+                {
131
+                    this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
132
+                    if (!this->scenecutAwareQpConfig)
133
+                        x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
134
+                }
135
                 OPT("zonefile")
136
                 {
137
                     this->zoneFile = x265_fopen(optarg, "rb");
138
                     if (!this->zoneFile)
139
                         x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
140
                 }
141
+                OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
142
                 OPT("fullhelp")
143
                 {
144
                     param->logLevel = X265_LOG_FULL;
145
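Judging from the parseScenecutAwareQpConfig() routine added further down, each non-comment line of the config file is tokenized starting at the first '-' and run through the normal long-option parser, so the file is expected to contain CLI-style options. A hypothetical sketch (the option value is illustrative only):

    # scenecut-qp.cfg
    --scenecut-aware-qp 1

    x265 --input source.y4m --hist-scenecut --scenecut-qp-config scenecut-qp.cfg out.hevc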
@@ -875,7 +918,7 @@
146
             if (reconFileBitDepth == 0)
147
                 reconFileBitDepth = param->internalBitDepth;
148
             this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
149
-                param->fpsNum, param->fpsDenom, param->internalCsp);
150
+                param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
151
             if (this->recon->isFail())
152
             {
153
                 x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
154
@@ -973,6 +1016,7 @@
155
         param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
156
         for (int i = 0; i < param->rc.zonefileCount; i++)
157
         {
158
+            param->rc.zones[i].startFrame = -1;
159
             while (fgets(line, sizeof(line), zoneFile))
160
             {
161
                 if (*line == '#' || (strcmp(line, "\r\n") == 0))
162
@@ -1010,57 +1054,179 @@
163
         return 1;
164
     }
165
 
166
-    /* Parse the RPU file and extract the RPU corresponding to the current picture
167
-    * and fill the rpu field of the input picture */
168
-    int CLIOptions::rpuParser(x265_picture * pic)
169
-    {
170
-        uint8_t byteVal;
171
-        uint32_t code = 0;
172
-        int bytesRead = 0;
173
-        pic->rpu.payloadSize = 0;
174
-
175
-        if (!pic->pts)
176
-        {
177
-            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
178
-                code = (code << 8) | byteVal;
179
-
180
-            if (code != START_CODE)
181
-            {
182
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
183
-                return 1;
184
-            }
185
-        }
186
-
187
-        bytesRead = 0;
188
-        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
189
-        {
190
-            code = (code << 8) | byteVal;
191
-            if (bytesRead++ < 3)
192
-                continue;
193
-            if (bytesRead >= 1024)
194
-            {
195
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
196
-                return 1;
197
-            }
198
-
199
-            if (code != START_CODE)
200
-                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
201
-            else
202
-                return 0;
203
-        }
204
-
205
-        int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
206
-        int bytesLeft = bytesRead - pic->rpu.payloadSize;
207
-        code = (code << ShiftBytes * 8);
208
-        for (int i = 0; i < bytesLeft; i++)
209
-        {
210
-            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
211
-            code = (code << 8);
212
-        }
213
-        if (!pic->rpu.payloadSize)
214
-            x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
215
-        return 0;
216
-    }
217
+    /* Parse the RPU file and extract the RPU corresponding to the current picture
218
+    * and fill the rpu field of the input picture */
219
+    int CLIOptions::rpuParser(x265_picture * pic)
220
+    {
221
+        uint8_t byteVal;
222
+        uint32_t code = 0;
223
+        int bytesRead = 0;
224
+        pic->rpu.payloadSize = 0;
225
+
226
+        if (!pic->pts)
227
+        {
228
+            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
229
+                code = (code << 8) | byteVal;
230
+
231
+            if (code != START_CODE)
232
+            {
233
+                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
234
+                return 1;
235
+            }
236
+        }
237
+
238
+        bytesRead = 0;
239
+        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
240
+        {
241
+            code = (code << 8) | byteVal;
242
+            if (bytesRead++ < 3)
243
+                continue;
244
+            if (bytesRead >= 1024)
245
+            {
246
+                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
247
+                return 1;
248
+            }
249
+
250
+            if (code != START_CODE)
251
+                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
252
+            else
253
+                return 0;
254
+        }
255
+
256
+        int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
257
+        int bytesLeft = bytesRead - pic->rpu.payloadSize;
258
+        code = (code << ShiftBytes * 8);
259
+        for (int i = 0; i < bytesLeft; i++)
260
+        {
261
+            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
262
+            code = (code << 8);
263
+        }
264
+        if (!pic->rpu.payloadSize)
265
+            x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
266
+        return 0;
267
+    }
268
+
269
+    bool CLIOptions::parseScenecutAwareQpConfig()
270
+    {
271
+        char line[256];
272
+        char* argLine;
273
+        rewind(scenecutAwareQpConfig);
274
+        while (fgets(line, sizeof(line), scenecutAwareQpConfig))
275
+        {
276
+            if (*line == '#' || (strcmp(line, "\r\n") == 0))
277
+                continue;
278
+            int index = (int)strcspn(line, "\r\n");
279
+            line[index] = '\0';
280
+            argLine = line;
281
+            while (isspace((unsigned char)*argLine)) argLine++;
282
+            char* start = strchr(argLine, '-');
283
+            int argCount = 0;
284
+            char **args = (char**)malloc(256 * sizeof(char *));
285
+            //Adding a dummy string to avoid file parsing error
286
+            args[argCount++] = (char *)"x265";
287
+            char* token = strtok(start, " ");
288
+            while (token)
289
+            {
290
+                args[argCount++] = token;
291
+                token = strtok(NULL, " ");
292
+            }
293
+            args[argCount] = NULL;
294
+            CLIOptions cliopt;
295
+            if (cliopt.parseScenecutAwareQpParam(argCount, args, param))
296
+            {
297
+                cliopt.destroy();
298
+                if (cliopt.api)
299
+                    cliopt.api->param_free(cliopt.param);
300
+                exit(1);
301
+            }
302
+            break;
303
+        }
304
+        return 1;
305
+    }
306
+    bool CLIOptions::parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam)
307
+    {
308
+        bool bError = false;
309
+        int bShowHelp = false;
310
+        int outputBitDepth = 0;
311
+        const char *profile = NULL;
312
+        /* Presets are applied before all other options. */
313
+        for (optind = 0;;)
314
+        {
315
+            int c = getopt_long(argc, argv, short_options, long_options, NULL);
316
+            if (c == -1)
317
+                break;
318
+            else if (c == 'D')
319
+                outputBitDepth = atoi(optarg);
320
+            else if (c == 'P')
321
+                profile = optarg;
322
+            else if (c == '?')
323
+                bShowHelp = true;
324
+        }
325
+        if (!outputBitDepth && profile)
326
+        {
327
+            /*try to derive the output bit depth from the requested profile*/
328
+            if (strstr(profile, "10"))
329
+                outputBitDepth = 10;
330
+            else if (strstr(profile, "12"))
331
+                outputBitDepth = 12;
332
+            else
333
+                outputBitDepth = 8;
334
+        }
335
+        api = x265_api_get(outputBitDepth);
336
+        if (!api)
337
+        {
338
+            x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
339
+            api = x265_api_get(0);
340
+        }
341
+        if (bShowHelp)
342
+        {
343
+            printVersion(globalParam, api);
344
+            showHelp(globalParam);
345
+        }
346
+        for (optind = 0;;)
347
+        {
348
+            int long_options_index = -1;
349
+            int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
350
+            if (c == -1)
351
+                break;
352
+            if (long_options_index < 0 && c > 0)
353
+            {
354
+                for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
355
+                {
356
+                    if (long_options[i].val == c)
357
+                    {
358
+                        long_options_index = (int)i;
359
+                        break;
360
+                    }
361
+                }
362
+                if (long_options_index < 0)
363
+                {
364
+                    /* getopt_long might have already printed an error message */
365
+                    if (c != 63)
366
+                        x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
367
+                    return true;
368
+                }
369
+            }
370
+            if (long_options_index < 0)
371
+            {
372
+                x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
373
+                return true;
374
+            }
375
+            bError |= !!api->scenecut_aware_qp_param_parse(globalParam, long_options[long_options_index].name, optarg);
376
+            if (bError)
377
+            {
378
+                const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2];
379
+                x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
380
+                return true;
381
+            }
382
+        }
383
+        if (optind < argc)
384
+        {
385
+            x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]);
386
+            return true;
387
+        }
388
+        return false;
389
+    }
390
 
391
 #ifdef __cplusplus
392
 }
393
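A usage sketch for the zone-file changes in this file (zones.txt is a placeholder; per the option handling above, --no-zonefile-rc-init keeps rate-control history across zone boundaries instead of re-initialising it for each zone):

    x265 --input source.y4m --zonefile zones.txt --no-zonefile-rc-init out.hevc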
x265_3.5.tar.gz/source/x265cli.h -> x265_3.6.tar.gz/source/x265cli.h Changed
104
 
1
@@ -135,6 +135,7 @@
2
     { "no-fast-intra",        no_argument, NULL, 0 },
3
     { "no-open-gop",          no_argument, NULL, 0 },
4
     { "open-gop",             no_argument, NULL, 0 },
5
+    { "cra-nal",              no_argument, NULL, 0 },
6
     { "keyint",         required_argument, NULL, 'I' },
7
     { "min-keyint",     required_argument, NULL, 'i' },
8
     { "gop-lookahead",  required_argument, NULL, 0 },
9
@@ -143,7 +144,6 @@
10
     { "scenecut-bias",  required_argument, NULL, 0 },
11
     { "hist-scenecut",        no_argument, NULL, 0},
12
     { "no-hist-scenecut",     no_argument, NULL, 0},
13
-    { "hist-threshold", required_argument, NULL, 0},
14
     { "fades",                no_argument, NULL, 0 },
15
     { "no-fades",             no_argument, NULL, 0 },
16
     { "scenecut-aware-qp", required_argument, NULL, 0 },
17
@@ -182,6 +182,8 @@
18
     { "qp",             required_argument, NULL, 'q' },
19
     { "aq-mode",        required_argument, NULL, 0 },
20
     { "aq-strength",    required_argument, NULL, 0 },
21
+    { "sbrc",                 no_argument, NULL, 0 },
22
+    { "no-sbrc",              no_argument, NULL, 0 },
23
     { "rc-grain",             no_argument, NULL, 0 },
24
     { "no-rc-grain",          no_argument, NULL, 0 },
25
     { "ipratio",        required_argument, NULL, 0 },
26
@@ -244,6 +246,7 @@
27
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
28
     { "master-display", required_argument, NULL, 0 },
29
     { "max-cll",        required_argument, NULL, 0 },
30
+    {"video-signal-type-preset", required_argument, NULL, 0 },
31
     { "min-luma",       required_argument, NULL, 0 },
32
     { "max-luma",       required_argument, NULL, 0 },
33
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
34
@@ -263,11 +266,16 @@
35
     { "repeat-headers",       no_argument, NULL, 0 },
36
     { "aud",                  no_argument, NULL, 0 },
37
     { "no-aud",               no_argument, NULL, 0 },
38
+    { "eob",                  no_argument, NULL, 0 },
39
+    { "no-eob",               no_argument, NULL, 0 },
40
+    { "eos",                  no_argument, NULL, 0 },
41
+    { "no-eos",               no_argument, NULL, 0 },
42
     { "info",                 no_argument, NULL, 0 },
43
     { "no-info",              no_argument, NULL, 0 },
44
     { "zones",          required_argument, NULL, 0 },
45
     { "qpfile",         required_argument, NULL, 0 },
46
     { "zonefile",       required_argument, NULL, 0 },
47
+    { "no-zonefile-rc-init",  no_argument, NULL, 0 },
48
     { "lambda-file",    required_argument, NULL, 0 },
49
     { "b-intra",              no_argument, NULL, 0 },
50
     { "no-b-intra",           no_argument, NULL, 0 },
51
@@ -298,8 +306,7 @@
52
     { "dynamic-refine",       no_argument, NULL, 0 },
53
     { "no-dynamic-refine",    no_argument, NULL, 0 },
54
     { "strict-cbr",           no_argument, NULL, 0 },
55
-    { "temporal-layers",      no_argument, NULL, 0 },
56
-    { "no-temporal-layers",   no_argument, NULL, 0 },
57
+    { "temporal-layers",      required_argument, NULL, 0 },
58
     { "qg-size",        required_argument, NULL, 0 },
59
     { "recon-y4m-exec", required_argument, NULL, 0 },
60
     { "analyze-src-pics", no_argument, NULL, 0 },
61
@@ -349,6 +356,8 @@
62
     { "frame-dup",            no_argument, NULL, 0 },
63
     { "no-frame-dup", no_argument, NULL, 0 },
64
     { "dup-threshold", required_argument, NULL, 0 },
65
+    { "mcstf",                 no_argument, NULL, 0 },
66
+    { "no-mcstf",              no_argument, NULL, 0 },
67
 #ifdef SVT_HEVC
68
     { "svt",     no_argument, NULL, 0 },
69
     { "no-svt",  no_argument, NULL, 0 },
70
@@ -373,6 +382,8 @@
71
     { "abr-ladder", required_argument, NULL, 0 },
72
     { "min-vbv-fullness", required_argument, NULL, 0 },
73
     { "max-vbv-fullness", required_argument, NULL, 0 },
74
+    { "scenecut-qp-config", required_argument, NULL, 0 },
75
+    { "film-grain", required_argument, NULL, 0 },
76
     { 0, 0, 0, 0 },
77
     { 0, 0, 0, 0 },
78
     { 0, 0, 0, 0 },
79
@@ -388,6 +399,7 @@
80
         FILE*       qpfile;
81
         FILE*       zoneFile;
82
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
83
+        FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
84
         const char* reconPlayCmd;
85
         const x265_api* api;
86
         x265_param* param;
87
@@ -425,6 +437,7 @@
88
             qpfile = NULL;
89
             zoneFile = NULL;
90
             dolbyVisionRpu = NULL;
91
+            scenecutAwareQpConfig = NULL;
92
             reconPlayCmd = NULL;
93
             api = NULL;
94
             param = NULL;
95
@@ -455,6 +468,8 @@
96
         bool parseQPFile(x265_picture &pic_org);
97
         bool parseZoneFile();
98
         int rpuParser(x265_picture * pic);
99
+        bool parseScenecutAwareQpConfig();
100
+        bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
101
     };
102
 #ifdef __cplusplus
103
 }
104
x265_3.5.tar.gz/x265Version.txt -> x265_3.6.tar.gz/x265Version.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 #Attribute:         Values
3
-repositorychangeset: f0c1022b6
4
+repositorychangeset: aa7f602f7
5
 releasetagdistance: 1
6
-releasetag: 3.5
7
+releasetag: 3.6
8
No rpmlint log
Request History

Aloysius created request 9 months ago



Aloysius accepted request 9 months ago