Overview

Request 6062 (accepted)

Update to version 3.6

Submit package home:Aloysius:branches:Staging / x265 to package Staging / x265

x265.changes Changed
 
@@ -1,4 +1,53 @@
 -------------------------------------------------------------------
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.6
+  New features:
+  * Segment based Ratecontrol (SBRC) feature
+  * Motion-Compensated Spatio-Temporal Filtering
+  * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
+    Quantization)
+  * Histogram-Based Scene Change Detection
+  * Film-Grain characteristics as a SEI message to support Film
+    Grain Synthesis (FGS)
+  * Add temporal layer implementation (Hierarchical B-frame
+    implementation)
+  Enhancements to existing features:
+  * Added Dolby Vision 8.4 Profile Support
+  API changes:
+  * Add Segment based Ratecontrol (SBRC) feature: "--no-sbrc".
+  * Add command line parameter for mcstf feature: "--no-mcstf".
+  * Add command line parameters for the scene cut aware qp
+    feature: "--scenecut-aware-qp" and "--masking-strength".
+  * Add command line parameters for Histogram-Based Scene Change
+    Detection: "--hist-scenecut".
+  * Add film grain characteristics as a SEI message to the
+    bitstream: "--film-grain <filename>"
+  * cli: add new option --cra-nal (Force nal type to CRA for all
+    frames except for the first frame, works only with keyint 1)
+  Optimizations:
+  * ARM64 NEON optimizations:- Several time-consuming C
+    functions have been optimized for the targeted platform -
+    aarch64. The overall performance increased by around 20%.
+  * SVE/SVE2 optimizations
+  Bug fixes:
+  * Linux bug to utilize all the cores
+  * Crash with hist-scenecut build when source resolution is not
+    a multiple of minCuSize
+  * 32bit and 64bit builds generation for ARM
+  * Bugs in zonefile feature (Reflect Zonefile Parameters inside
+    Lookahead, extra IDR issue, Avg I Slice QP value issue etc.)
+  * Add x86 ASM implementation for subsampling luma
+  * Fix for abrladder segfault with load reuse level 1
+  * Reorder miniGOP based on temporal layer hierarchy and add
+    support for more B frames
+  * Add MacOS aarch64 build support
+  * Fix boundary condition issue for Gaussian filter
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
+  and 0004-Do-not-build-with-assembly-support-on-arm.patch
+  (courtesy of Debian)
+
+-------------------------------------------------------------------
 Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
 
 - Build libx265_main10 and libx265_main12 unconditionally and
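The options quoted in this changelog are plain x265 CLI switches. A hypothetical
two-pass invocation exercising several of the new 3.6 features might look like
the following sketch (flag spellings are taken from the changelog and the
cli.rst hunks below, not independently verified; input and output file names
are placeholders):

    # Pass 1: gather statistics with segment-based rate control and
    # histogram-based scenecut detection enabled.
    x265 --input source.y4m --bitrate 3000 --pass 1 --sbrc \
         --hist-scenecut -o /dev/null
    # Pass 2: per cli.rst below, --scenecut-aware-qp works only with --pass 2;
    # also attach film-grain characteristics as an SEI message.
    x265 --input source.y4m --bitrate 3000 --pass 2 --sbrc --hist-scenecut \
         --scenecut-aware-qp 1 --masking-strength 20,8,12 \
         --film-grain grain.cfg -o out.hevc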
x265.spec Changed
 
@@ -1,7 +1,7 @@
 #
 # spec file for package x265
 #
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
 # Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
 #
 # All modifications and additions to the file contributed by third parties
@@ -17,21 +17,22 @@
 #
 
 
-%define sover   199
+%define sover   209
 %define libname lib%{name}
 %define libsoname %{libname}-%{sover}
-%define uver    3_5
+%define uver    3_6
 Name:           x265
-Version:        3.5
+Version:        3.6
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
 Group:          Productivity/Multimedia/Video/Editors and Convertors
 URL:            https://bitbucket.org/multicoreware/x265_git
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
-Patch0:         arm.patch
 Patch1:         x265.pkgconfig.patch
 Patch2:         x265-fix_enable512.patch
+Patch3:         0001-Fix-arm-flags.patch
+Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
 BuildRequires:  cmake >= 2.8.8
 BuildRequires:  gcc-c++
 BuildRequires:  nasm >= 2.13
@@ -130,6 +131,8 @@
 %cmake_install
 find %{buildroot} -type f -name "*.a" -delete -print0
 
+%check
+
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig
 
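For reviewers who want to rebuild this submission locally, a typical osc
workflow would be roughly the following (the repository name and architecture
are examples only; they depend on the project's configured build targets):

    osc checkout home:Aloysius:branches:Staging/x265
    cd home:Aloysius:branches:Staging/x265
    osc build openSUSE_Tumbleweed x86_64 x265.spec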
0001-Fix-arm-flags.patch Added
 
@@ -0,0 +1,39 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Sun, 21 Jun 2020 17:54:56 +0200
+Subject: Fix arm* flags
+
+---
+ source/CMakeLists.txt | 7 ++-----
+ 1 file changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index ab5ddfe..eb9b19b 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -253,10 +253,7 @@ if(GCC)
+     elseif(ARM)
+         find_package(Neon)
+         if(CPU_HAS_NEON)
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+             add_definitions(-DHAVE_NEON)
+-        else()
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+         endif()
+     endif()
+   if(ARM64 OR CROSS_COMPILE_ARM64)
+@@ -265,13 +262,13 @@ if(GCC)
+         find_package(SVE2)
+         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+             message(STATUS "Found SVE2")
+-          set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
+             add_definitions(-DHAVE_SVE2)
+             add_definitions(-DHAVE_SVE)
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+             message(STATUS "Found SVE")
+-          set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
+             add_definitions(-DHAVE_SVE)
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+         elseif(CPU_HAS_NEON)
0004-Do-not-build-with-assembly-support-on-arm.patch Added
 
@@ -0,0 +1,28 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Fri, 31 May 2024 23:38:23 +0200
+Subject: Do not build with assembly support on arm*
+
+---
+ source/CMakeLists.txt | 9 ---------
+ 1 file changed, 9 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index 672cc2d..f112330 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
+         add_definitions(-DPPC64=1)
+         message(STATUS "Detected POWER PPC64 target processor")
+     endif()
+-elseif(ARMMATCH GREATER "-1")
+-    if(CROSS_COMPILE_ARM)
+-        message(STATUS "Cross compiling for ARM arch")
+-    else()
+-        set(CROSS_COMPILE_ARM 0)
+-    endif()
+-  message(STATUS "Detected ARM target processor")
+-    set(ARM 1)
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+ elseif(ARM64MATCH GREATER "-1")
+     #if(CROSS_COMPILE_ARM64)
+         #message(STATUS "Cross compiling for ARM64 arch")
arm.patch Deleted
 
@@ -1,108 +0,0 @@
-Index: x265_3.4/source/CMakeLists.txt
-===================================================================
---- x265_3.4.orig/source/CMakeLists.txt
-+++ x265_3.4/source/CMakeLists.txt
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
-         add_definitions(-DPPC64=1)
-         message(STATUS "Detected POWER PPC64 target processor")
-     endif()
--elseif(ARMMATCH GREATER "-1")
--    if(CROSS_COMPILE_ARM)
--        message(STATUS "Cross compiling for ARM arch")
--    else()
--        set(CROSS_COMPILE_ARM 0)
--    endif()
--    set(ARM 1)
--    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
--        message(STATUS "Detected ARM64 target processor")
--        set(ARM64 1)
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
--    else()
--        message(STATUS "Detected ARM target processor")
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
--    endif()
-+elseif(${SYSPROC} MATCHES "armv5.*")
-+    message(STATUS "Detected ARMV5 system processor")
-+    set(ARMV5 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "armv6l")
-+    message(STATUS "Detected ARMV6 system processor")
-+    set(ARMV6 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "armv7l")
-+    message(STATUS "Detected ARMV7 system processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
-+elseif(${SYSPROC} STREQUAL "aarch64")
-+    message(STATUS "Detected AArch64 system processor")
-+    set(ARMV7 1)
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
- else()
-     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
-     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
- endif()
--
- if(UNIX)
-     list(APPEND PLATFORM_LIBS pthread)
-     find_library(LIBRT rt)
-@@ -238,28 +238,9 @@ if(GCC)
-             endif()
-         endif()
-     endif()
--    if(ARM AND CROSS_COMPILE_ARM)
--        if(ARM64)
--            set(ARM_ARGS -fPIC)
--        else()
--            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
--        endif()
--        message(STATUS "cross compile arm")
--    elseif(ARM)
--        if(ARM64)
--            set(ARM_ARGS -fPIC)
--            add_definitions(-DHAVE_NEON)
--        else()
--            find_package(Neon)
--            if(CPU_HAS_NEON)
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
--                add_definitions(-DHAVE_NEON)
--            else()
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
--            endif()
--        endif()
-+    if(ARMV7)
-+        add_definitions(-fPIC)
-     endif()
--    add_definitions(${ARM_ARGS})
-     if(FPROFILE_GENERATE)
-         if(INTEL_CXX)
-             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_3.4/source/common/cpu.cpp
-===================================================================
---- x265_3.4.orig/source/common/cpu.cpp
-+++ x265_3.4/source/common/cpu.cpp
-@@ -39,7 +39,7 @@
- #include <machine/cpu.h>
- #endif
- 
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
- #include <signal.h>
- #include <setjmp.h>
- static sigjmp_buf jmpbuf;
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
-     }
- 
-     canjump = 1;
--    PFX(cpu_neon_test)();
-     canjump = 0;
-     signal(SIGILL, oldsig);
- #endif // if !HAVE_NEON
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
-     // which may result in incorrect detection and the counters stuck enabled.
-     // right now Apple does not seem to support performance counters for this test
- #ifndef __MACH__
--    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
-+    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
- #endif
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #elif X265_ARCH_ARM64
baselibs.conf Changed
 
@@ -1,1 +1,1 @@
-libx265-199
+libx265-209
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.S Deleted
 
@@ -1,414 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-
-.align 4
-
-.text
-
-
-
-.macro qpel_filter_0_32b
-    movi            v24.8h, #64
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v24.4h
-    smull2          v18.4s, v19.8h, v24.8h
-.endm
-
-.macro qpel_filter_1_32b
-    movi            v16.8h, #58
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v16.4h
-    smull2          v18.4s, v19.8h, v16.8h
-
-    movi            v24.8h, #10
-    uxtl            v21.8h, v1.8b
-    smull           v19.4s, v21.4h, v24.4h
-    smull2          v20.4s, v21.8h, v24.8h
-
-    movi            v16.8h, #17
-    uxtl            v23.8h, v2.8b
-    smull           v21.4s, v23.4h, v16.4h
-    smull2          v22.4s, v23.8h, v16.8h
-
-    movi            v24.8h, #5
-    uxtl            v1.8h, v6.8b
-    smull           v23.4s, v1.4h, v24.4h
-    smull2          v16.4s, v1.8h, v24.8h
-
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-
-    uxtl            v1.8h, v4.8b
-    sshll           v19.4s, v1.4h, #2
-    sshll2          v20.4s, v1.8h, #2
-
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-
-    uxtl            v1.8h, v0.8b
-    uxtl            v2.8h, v3.8b
-    ssubl           v21.4s, v2.4h, v1.4h
-    ssubl2          v22.4s, v2.8h, v1.8h
-
-    add             v17.4s, v17.4s, v19.4s
-    add             v18.4s, v18.4s, v20.4s
-    sub             v21.4s, v21.4s, v23.4s
-    sub             v22.4s, v22.4s, v16.4s
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-.endm
-
-.macro qpel_filter_2_32b
-    movi            v16.4s, #11
-    uxtl            v19.8h, v5.8b
-    uxtl            v20.8h, v2.8b
-    saddl           v17.4s, v19.4h, v20.4h
-    saddl2          v18.4s, v19.8h, v20.8h
-
-    uxtl            v21.8h, v1.8b
-    uxtl            v22.8h, v6.8b
-    saddl           v19.4s, v21.4h, v22.4h
-    saddl2          v20.4s, v21.8h, v22.8h
-
-    mul             v19.4s, v19.4s, v16.4s
-    mul             v20.4s, v20.4s, v16.4s
-
-    movi            v16.4s, #40
-    mul             v17.4s, v17.4s, v16.4s
-    mul             v18.4s, v18.4s, v16.4s
-
-    uxtl            v21.8h, v4.8b
-    uxtl            v22.8h, v3.8b
-    saddl           v23.4s, v21.4h, v22.4h
-    saddl2          v16.4s, v21.8h, v22.8h
-
-    uxtl            v1.8h, v0.8b
-    uxtl            v2.8h, v7.8b
-    saddl           v21.4s, v1.4h, v2.4h
-    saddl2          v22.4s, v1.8h, v2.8h
-
-    shl             v23.4s, v23.4s, #2
-    shl             v16.4s, v16.4s, #2
-
-    add             v19.4s, v19.4s, v21.4s
-    add             v20.4s, v20.4s, v22.4s
-    add             v17.4s, v17.4s, v23.4s
-    add             v18.4s, v18.4s, v16.4s
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-.endm
-
-.macro qpel_filter_3_32b
-    movi            v16.8h, #17
-    movi            v24.8h, #5
-
-    uxtl            v19.8h, v5.8b
-    smull           v17.4s, v19.4h, v16.4h
-    smull2          v18.4s, v19.8h, v16.8h
-
-    uxtl            v21.8h, v1.8b
-    smull           v19.4s, v21.4h, v24.4h
-    smull2          v20.4s, v21.8h, v24.8h
-
-    movi            v16.8h, #58
-    uxtl            v23.8h, v2.8b
-    smull           v21.4s, v23.4h, v16.4h
-    smull2          v22.4s, v23.8h, v16.8h
-
-    movi            v24.8h, #10
-    uxtl            v1.8h, v6.8b
-    smull           v23.4s, v1.4h, v24.4h
-    smull2          v16.4s, v1.8h, v24.8h
-
-    sub             v17.4s, v17.4s, v19.4s
-    sub             v18.4s, v18.4s, v20.4s
-
-    uxtl            v1.8h, v3.8b
-    sshll           v19.4s, v1.4h, #2
-    sshll2          v20.4s, v1.8h, #2
-
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-
-    uxtl            v1.8h, v4.8b
-    uxtl            v2.8h, v7.8b
-    ssubl           v21.4s, v1.4h, v2.4h
-    ssubl2          v22.4s, v1.8h, v2.8h
-
-    add             v17.4s, v17.4s, v19.4s
-    add             v18.4s, v18.4s, v20.4s
-    sub             v21.4s, v21.4s, v23.4s
-    sub             v22.4s, v22.4s, v16.4s
-    add             v17.4s, v17.4s, v21.4s
-    add             v18.4s, v18.4s, v22.4s
-.endm
-
-
-
-
-.macro vextin8
-    ld1             {v3.16b}, [x11], #16
-    mov             v7.d[0], v3.d[1]
-    ext             v0.8b, v3.8b, v7.8b, #1
-    ext             v4.8b, v3.8b, v7.8b, #2
-    ext             v1.8b, v3.8b, v7.8b, #3
-    ext             v5.8b, v3.8b, v7.8b, #4
-    ext             v2.8b, v3.8b, v7.8b, #5
-    ext             v6.8b, v3.8b, v7.8b, #6
-    ext             v3.8b, v3.8b, v7.8b, #7
-.endm
-
-
-
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-.macro HPS_FILTER a b filterhps
-    mov             w12, #8192
-    mov             w6, w10
-    sub             x3, x3, #\a
-    lsl             x3, x3, #1
-    mov             w9, #\a
-    cmp             w9, #4
-    b.eq            14f
-    cmp             w9, #12
-    b.eq            15f
-    b               7f
-14:
-    HPS_FILTER_4 \a \b \filterhps
-    b               10f
-15:
-    HPS_FILTER_12 \a \b \filterhps
-    b               10f
-7:
-    cmp             w5, #0
-    b.eq            8f
-    cmp             w5, #1
-    b.eq            9f
-8:
-loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
-    mov             w7, #\a
-    lsr             w7, w7, #3
-    mov             x11, x0
-    sub             x11, x11, #4
-loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    sub             v18.4s, v18.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    xtn2            v0.8h, v18.4s
-    st1             {v0.8h}, [x2], #16
-    subs            w7, w7, #1
-    sub             x11, x11, #8
-    b.ne            loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
-    subs            w6, w6, #1
-    add             x0, x0, x1
-    add             x2, x2, x3
-    b.ne            loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
-    b               10f
-9:
-loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
-    mov             w7, #\a
-    lsr             w7, w7, #3
-    mov             x11, x0
-    sub             x11, x11, #4
-loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    sub             v18.4s, v18.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    xtn2            v0.8h, v18.4s
-    st1             {v0.8h}, [x2], #16
-    subs            w7, w7, #1
-    sub             x11, x11, #8
-    b.ne            loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
-    subs            w6, w6, #1
-    add             x0, x0, x1
-    add             x2, x2, x3
-    b.ne            loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
-10:
-.endm
-
-.macro HPS_FILTER_4 w h filterhps
-    cmp             w5, #0
-    b.eq            11f
-    cmp             w5, #1
-    b.eq            12f
-11:
-loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
-    mov             x11, x0
-    sub             x11, x11, #4
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    st1             {v0.4h}, [x2], #8
-    sub             x11, x11, #8
-    subs            w6, w6, #1
-    add             x0, x0, x1
-    add             x2, x2, x3
-    b.ne            loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
-    b               13f
-12:
-loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
-    mov             x11, x0
-    sub             x11, x11, #4
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    st1             {v0.4h}, [x2], #8
-    sub             x11, x11, #8
-    subs            w6, w6, #1
-    add             x0, x0, x1
-    add             x2, x2, x3
-    b.ne            loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
-13:
-.endm
-
-.macro HPS_FILTER_12 w h filterhps
-    cmp             w5, #0
-    b.eq            14f
-    cmp             w5, #1
-    b.eq            15f
-14:
-loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
-    mov             x11, x0
-    sub             x11, x11, #4
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    sub             v18.4s, v18.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    xtn2            v0.8h, v18.4s
-    st1             {v0.8h}, [x2], #16
-    sub             x11, x11, #8
-
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    st1             {v0.4h}, [x2], #8
-    add             x2, x2, x3
-    subs            w6, w6, #1
-    add             x0, x0, x1
-    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
-    b               16f
-15:
-loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
-    mov             x11, x0
-    sub             x11, x11, #4
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    sub             v18.4s, v18.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    xtn2            v0.8h, v18.4s
-    st1             {v0.8h}, [x2], #16
-    sub             x11, x11, #8
-
-    vextin8
-    \filterhps
-    dup             v16.4s, w12
-    sub             v17.4s, v17.4s, v16.4s
-    xtn             v0.4h, v17.4s
-    st1             {v0.4h}, [x2], #8
-    add             x2, x2, x3
-    subs            w6, w6, #1
-    add             x0, x0, x1
-    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
-16:
-.endm
-
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-.macro LUMA_HPS w h
-function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
-    mov             w10, #\h
-    cmp             w5, #0
-    b.eq            6f
-    sub             x0, x0, x1, lsl #2
-
-    add             x0, x0, x1
-    add             w10, w10, #7
-6:
-    cmp             w4, #0
-    b.eq            0f
-    cmp             w4, #1
-    b.eq            1f
-    cmp             w4, #2
-    b.eq            2f
-    cmp             w4, #3
-    b.eq            3f
-0:
-    HPS_FILTER  \w \h qpel_filter_0_32b
-    b               5f
-1:
-    HPS_FILTER  \w \h qpel_filter_1_32b
-    b               5f
-2:
-    HPS_FILTER  \w \h qpel_filter_2_32b
-    b               5f
-3:
-    HPS_FILTER  \w \h qpel_filter_3_32b
-    b               5f
-5:
-    ret
-endfunc
-.endm
-
-LUMA_HPS    4 4
-LUMA_HPS    4 8
-LUMA_HPS    4 16
-LUMA_HPS    8 4
-LUMA_HPS    8 8
-LUMA_HPS    8 16
-LUMA_HPS    8 32
-LUMA_HPS    12 16
-LUMA_HPS    16 4
-LUMA_HPS    16 8
-LUMA_HPS    16 12
-LUMA_HPS    16 16
-LUMA_HPS    16 32
-LUMA_HPS    16 64
-LUMA_HPS    24 32
-LUMA_HPS    32 8
-LUMA_HPS    32 16
-LUMA_HPS    32 24
-LUMA_HPS    32 32
-LUMA_HPS    32 64
-LUMA_HPS    48 64
-LUMA_HPS    64 16
-LUMA_HPS    64 32
-LUMA_HPS    64 48
-LUMA_HPS    64 64
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.h Deleted
 
@@ -1,55 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_IPFILTER8_AARCH64_H
-#define X265_IPFILTER8_AARCH64_H
-
-
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-
-
-#endif // ifndef X265_IPFILTER8_AARCH64_H
x265_3.5.tar.gz/source/common/aarch64/pixel-util.h Deleted
 
@@ -1,40 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Yimeng Su <yimeng.su@huawei.com>
- *          Hongbin Liu <liuhongbin1@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_PIXEL_UTIL_AARCH64_H
-#define X265_PIXEL_UTIL_AARCH64_H
-
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
-
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
x265_3.5.tar.gz/source/common/aarch64/pixel.h Deleted
 
@@ -1,105 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
- *
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_I386_PIXEL_AARCH64_H
-#define X265_I386_PIXEL_AARCH64_H
-
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
-
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
-
-#endif // ifndef X265_I386_PIXEL_AARCH64_H
x265_3.6.tar.gz/.gitignore Added
 
@@ -0,0 +1,36 @@
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# Build directory
+build/
+
x265_3.5.tar.gz/build/README.txt -> x265_3.6.tar.gz/build/README.txt Changed
 
@@ -6,6 +6,9 @@
 
 Note: MSVC12 requires cmake 2.8.11 or later
 
+Note: When the SVE/SVE2 instruction set of the Arm AArch64 architecture is to be used,
+      GCC 10.x or later must be installed in order to compile x265.
+
 
 = Optional Prerequisites =
 
@@ -88,3 +91,25 @@
 building out of a Mercurial source repository.  If you are building out of
 a release source package, the version will not change.  If Mercurial is not
 found, the version will be "unknown".
+
+= Build Instructions for cross-compilation for Arm AArch64 Targets =
+
+When the target platform is based on the Arm AArch64 architecture, x265 can be
+built on x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
+environment variables should be set to point to the appropriate gcc cross
+compilers. For example:
+
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+
+The defaults are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
+Then, the normal build process can be followed.
+
+Moreover, if the target platform supports the SVE or SVE2 instruction set, the
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variable should be set
+to true, respectively. For example:
+
+1. export CROSS_COMPILE_SVE2=true
+2. export CROSS_COMPILE_SVE=true
+
+Then, the normal build process can be followed.
x265_3.6.tar.gz/build/aarch64-darwin Added
 
+(directory)
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake Added
 
@@ -0,0 +1,23 @@
+# CMake toolchain file for cross compiling x265 for aarch64
+# This feature is only supported as experimental. Use with caution.
+# Please report bugs on bitbucket
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+
+set(CROSS_COMPILE_ARM64 1)
+set(CMAKE_SYSTEM_NAME Darwin)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+# specify the cross compiler
+set(CMAKE_C_COMPILER gcc-12)
+set(CMAKE_CXX_COMPILER g++-12)
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+    set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+    set(CROSS_COMPILE_SVE 1)
+endif()
+
x265_3.6.tar.gz/build/aarch64-darwin/make-Makefiles.bash Added
 
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
x265_3.5.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake Changed
 
@@ -3,13 +3,29 @@
 # Please report bugs on bitbucket
 # Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
 
-set(CROSS_COMPILE_ARM 1)
+set(CROSS_COMPILE_ARM64 1)
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
 # specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+if(DEFINED ENV{CMAKE_C_COMPILER})
+    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
+else()
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+endif()
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
+    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
+else()
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+    set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+    set(CROSS_COMPILE_SVE 1)
+endif()
+
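Taken together with the README changes above, a hypothetical cross-build for an
SVE2-capable AArch64 target using this toolchain file could look like the
following sketch (the compiler triplet is an example; any matching cross gcc
works):

    export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
    export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
    # Note: the elseif above means CROSS_COMPILE_SVE2 wins if both are set.
    export CROSS_COMPILE_SVE2=true
    cd build/aarch64-linux
    cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source
    make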
x265_3.5.tar.gz/build/arm-linux/make-Makefiles.bash -> x265_3.6.tar.gz/build/arm-linux/make-Makefiles.bash Changed
 
@@ -1,4 +1,4 @@
 #!/bin/bash
 # Run this from within a bash shell
 
-cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
x265_3.5.tar.gz/doc/reST/cli.rst -> x265_3.6.tar.gz/doc/reST/cli.rst Changed
405
 
1
@@ -632,9 +632,8 @@
2
    auto-detection by the encoder. If specified, the encoder will
3
    attempt to bring the encode specifications within that specified
4
    level. If the encoder is unable to reach the level it issues a
5
-   warning and aborts the encode. If the requested requirement level is
6
-   higher than the actual level, the actual requirement level is
7
-   signaled.
8
+   warning and aborts the encode. The requested level will be signaled 
9
+   in the bitstream even if it is higher than the actual level.
10
 
11
    Beware, specifying a decoder level will force the encoder to enable
12
    VBV for constant rate factor encodes, which may introduce
13
@@ -714,11 +713,8 @@
14
    (main, main10, etc). Second, an encoder is created from this
15
    x265_param instance and the :option:`--level-idc` and
16
    :option:`--high-tier` parameters are used to reduce bitrate or other
17
-   features in order to enforce the target level. Finally, the encoder
18
-   re-examines the final set of parameters and detects the actual
19
-   minimum decoder requirement level and this is what is signaled in
20
-   the bitstream headers. The detected decoder level will only use High
21
-   tier if the user specified a High tier level.
22
+   features in order to enforce the target level. The detected decoder level
23
+   will only use High tier if the user specified a High tier level.
24
 
25
    The signaled profile will be determined by the encoder's internal
26
    bitdepth and input color space. If :option:`--keyint` is 0 or 1,
27
@@ -961,21 +957,21 @@
28
    Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
29
    with :option:`--analysis-save` and :option:`--analysis-load` respectively.
30
 
31
-   +--------------+------------------------------------------+
32
-   | Level        | Description                              |
33
-   +==============+==========================================+
34
-   | 1            | Lookahead information                    |
35
-   +--------------+------------------------------------------+
36
-   | 2 to 4       | Level 1 + intra/inter modes, ref's       |
37
-   +--------------+------------------------------------------+
38
-   | 5 and 6      | Level 2 + rect-amp                       |
39
-   +--------------+------------------------------------------+
40
-   | 7            | Level 5 + AVC size CU refinement         |
41
-   +--------------+------------------------------------------+
42
-   | 8 and 9      | Level 5 + AVC size Full CU analysis-info |
43
-   +--------------+------------------------------------------+
44
-   | 10           | Level 5 + Full CU analysis-info          |
45
-   +--------------+------------------------------------------+
46
+   +--------------+---------------------------------------------------+
47
+   | Level        | Description                                       |
48
+   +==============+===================================================+
49
+   | 1            | Lookahead information                             |
50
+   +--------------+---------------------------------------------------+
51
+   | 2 to 4       | Level 1 + intra/inter modes, depth, ref's, cutree |
52
+   +--------------+---------------------------------------------------+
53
+   | 5 and 6      | Level 2 + rect-amp                                |
54
+   +--------------+---------------------------------------------------+
55
+   | 7            | Level 5 + AVC size CU refinement                  |
56
+   +--------------+---------------------------------------------------+
57
+   | 8 and 9      | Level 5 + AVC size Full CU analysis-info          |
58
+   +--------------+---------------------------------------------------+
59
+   | 10           | Level 5 + Full CU analysis-info                   |
60
+   +--------------+---------------------------------------------------+
61
 
62
 .. option:: --refine-mv-type <string>
63
 
64
@@ -1332,6 +1328,11 @@
65
    Search range for HME level 0, 1 and 2.
66
    The Search Range for each HME level must be between 0 and 32768(excluding).
67
    Default search range is 16,32,48 for level 0,1,2 respectively.
68
+   
69
+.. option:: --mcstf, --no-mcstf
70
+
71
+    Enable Motion Compensated Temporal filtering.
72
+   Default: disabled
73
 
74
 Spatial/intra options
75
 =====================
76
@@ -1473,17 +1474,9 @@
77
 
78
 .. option:: --hist-scenecut, --no-hist-scenecut
79
 
80
-   Indicates that scenecuts need to be detected using luma edge and chroma histograms.
81
-   :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
82
-   :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
83
-   
84
-.. option:: --hist-threshold <0.0..1.0>
85
-
86
-   This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
87
-   This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value 
88
-   greater than 0.2 against the previous frame as scenecut. 
89
-   Increasing the threshold reduces the number of scenecuts detected.
90
-   Default 0.03.
91
+   Detect scenecuts based on the histogram, intensity, and variance of the picture.
92
+   :option:`--hist-scenecut` enables and :option:`--no-hist-scenecut` disables
93
+   histogram-based scenecut detection.
94
    
95
 .. option:: --radl <integer>
96
    
97
@@ -1766,6 +1759,12 @@
98
    Default 1.0.
99
    **Range of values:** 0.0 to 3.0
100
 
101
+.. option:: --sbrc, --no-sbrc
102
+
103
+   Enable or disable segment-based rate control. The segment duration depends on
104
+   the keyframe interval specified; if unspecified, the default keyframe interval is used.
105
+   Default: disabled.
106
+
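+   For example, pairing SBRC with an explicit keyframe interval that sets the
+   segment duration (a sketch; file names are illustrative)::
+
+      x265 input.y4m --sbrc --keyint 250 -o out.hevc
+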
107
 .. option:: --hevc-aq
108
 
109
    Enable adaptive quantization
110
@@ -1976,12 +1975,18 @@
111
    
112
    **CLI ONLY**
113
 
114
+.. option:: --scenecut-qp-config <filename>
115
+
116
+   Specify a text file containing the scenecut-aware QP options.
117
+   The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`.
118
+
119
+   **CLI ONLY**
120
+
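+   For example (a sketch; the file name is illustrative), a config file
+   containing::
+
+      --scenecut-aware-qp 1 --masking-strength 1000,8,12
+
+   could be applied with ``x265 input.y4m --pass 2 --scenecut-qp-config scenecut_qp_config.txt -o out.hevc``.
+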
121
 .. option:: --scenecut-aware-qp <integer>
122
 
123
    It reduces the bits spent on inter-frames within the scenecut window
124
    before and after a scenecut by increasing their QP in the second pass of the rate-control algorithm,
125
-   without any deterioration in visual quality. If a scenecut falls within the window,
126
-   the QP of the inter-frames after this scenecut will not be modified.
127
+   with no perceptible deterioration in visual quality.
128
    :option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
129
 
130
    +-------+---------------------------------------------------------------+
131
@@ -2006,48 +2011,83 @@
132
    for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
133
    is enabled.
134
 
135
-   When :option:`--scenecut-aware-qp` is::
136
+   When :option:`--scenecut-aware-qp` is:
137
+
138
    * 1 (Forward masking):
139
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
140
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
141
+   or 
142
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
143
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
144
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
145
    * 2 (Backward masking):
146
-   --masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
147
+   --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
148
+   or 
149
+   --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
150
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
151
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
152
    * 3 (Bi-directional masking):
153
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
154
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
155
+   or 
156
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
157
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
158
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
159
+                       bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
160
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
161
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
162
 
163
    +-----------------+---------------------------------------------------------------+
164
    | Parameter       | Description                                                   |
165
    +=================+===============================================================+
166
-   | fwdWindow       | The duration(in milliseconds) for which there is a reduction  |
167
-   |                 | in the bits spent on the inter-frames after a scenecut by     |
168
-   |                 | increasing their QP. Default 500ms.                           |
169
-   |                 | **Range of values:** 0 to 1000                                |
170
+   | fwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
171
+   |                 | reduction in the bits spent on the inter-frames after a       |
172
+   |                 | scenecut by increasing their QP. Default 500ms.               |
173
+   |                 | **Range of values:** 0 to 2000                                |
174
+   +-----------------+---------------------------------------------------------------+
175
+   | fwdWindow       | The duration of a sub-window(in milliseconds) for which there |
176
+   |                 | is a reduction in the bits spent on the inter-frames after a  |
177
+   |                 | scenecut by increasing their QP. Default 500ms.               |
178
+   |                 | **Range of values:** 0 to 2000                                |
179
    +-----------------+---------------------------------------------------------------+
180
    | fwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
181
    |                 | after a scenecut. Default 5.                                  |
182
-   |                 | **Range of values:** 0 to 10                                  |
183
+   |                 | **Range of values:** 0 to 20                                  |
184
    +-----------------+---------------------------------------------------------------+
185
    | fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
186
    |                 | inter-frames after a scenecut. The offset is computed from    |
187
    |                 | fwdRefQPDelta when it is not explicitly specified.            |
188
-   |                 | **Range of values:** 0 to 10                                  |
189
+   |                 | **Range of values:** 0 to 20                                  |
190
+   +-----------------+---------------------------------------------------------------+
191
+   | bwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
192
+   |                 | reduction in the bits spent on the inter-frames before a      |
193
+   |                 | scenecut by increasing their QP. Default 100ms.               |
194
+   |                 | **Range of values:** 0 to 2000                                |
195
    +-----------------+---------------------------------------------------------------+
196
-   | bwdWindow       | The duration(in milliseconds) for which there is a reduction  |
197
-   |                 | in the bits spent on the inter-frames before a scenecut by    |
198
-   |                 | increasing their QP. Default 100ms.                           |
199
-   |                 | **Range of values:** 0 to 1000                                |
200
+   | bwdWindow       | The duration of a sub-window(in milliseconds) for which there |
201
+   |                 | is a reduction in the bits spent on the inter-frames before a |
202
+   |                 | scenecut by increasing their QP. Default 100ms.               |
203
+   |                 | **Range of values:** 0 to 2000                                |
204
    +-----------------+---------------------------------------------------------------+
205
    | bwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
206
    |                 | before a scenecut. The offset is computed from                |
207
    |                 | fwdRefQPDelta when it is not explicitly specified.            |
208
-   |                 | **Range of values:** 0 to 10                                  |
209
+   |                 | **Range of values:** 0 to 20                                  |
210
    +-----------------+---------------------------------------------------------------+
211
    | bwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
212
    |                 | inter-frames before a scenecut. The offset is computed from   |
213
    |                 | bwdRefQPDelta when it is not explicitly specified.            |
214
-   |                 | **Range of values:** 0 to 10                                  |
215
+   |                 | **Range of values:** 0 to 20                                  |
216
    +-----------------+---------------------------------------------------------------+
217
 
218
-   **CLI ONLY**
219
+   The value for the :option:`--masking-strength` parameter can be specified in several ways:
220
+
+   1. If only --scenecut-aware-qp is specified, without --masking-strength, the default offset and window-size values are used.
221
+   2. If --masking-strength is given in the compact format shown above, the window, refQpDelta and nonRefQpDelta values supplied by the user are applied to window 1, and the offsets for the remaining windows are derived with a 15% difference between consecutive windows.
222
+   3. If --masking-strength is given in the expanded format shown above, the window, refQpDelta and nonRefQpDelta values supplied by the user for each of windows 1 to 6 are used directly. Note that this format can be used to specify zero offsets for any particular window.
223
+
224
+   Sample config file (forward masking)::
225
+
226
+      --scenecut-aware-qp 1 --masking-strength 1000,8,12
227
+
228
+   This sample config file is available on `the downloads page <https://bitbucket.org/multicoreware/x265_git/downloads/scenecut_qp_config.txt>`_.
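+
+   An illustrative expanded-format (per-window) forward-masking example, with
+   all six sub-window durations and offsets chosen arbitrarily::
+
+      --scenecut-aware-qp 1 --masking-strength 300,8,12,250,7,10,200,6,9,150,5,7,100,4,6,50,3,4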
229
 
230
 .. option:: --vbv-live-multi-pass, --no-vbv-live-multi-pass
231
 
232
@@ -2057,6 +2097,14 @@
233
    rate control mode.
234
 
235
    Default disabled. **Experimental feature**
236
+   
237
+
238
+.. option:: bEncFocusedFramesOnly
239
+
240
+   Used to trigger encoding of selective GOPs. Disabled by default.
241
+   
242
+   **API ONLY**
243
+   
244
 
245
 Quantization Options
246
 ====================
247
@@ -2427,6 +2475,81 @@
248
    Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.
249
    Required for HLG (Hybrid Log Gamma) signaling. Not signaled by default.
250
 
251
+.. option:: --video-signal-type-preset <string>
252
+
253
+   Specify combinations of color primaries, transfer characteristics, color matrix,
254
+   range of luma and chroma signals, and chroma sample location.
255
+   String format: <system-id>:<color-volume>
256
+   
257
+   This option takes precedence over the individual VUI parameters. If an individual
258
+   VUI option specified together with this one would change a value implied by the
259
+   system-id or color-volume, the individual option is discarded.
260
+
261
+   system-id options and their corresponding values:
+
262
+   +----------------+---------------------------------------------------------------+
263
+   | system-id      | Value                                                         |
264
+   +================+===============================================================+
265
+   | BT601_525      | --colorprim smpte170m --transfer smpte170m                    |
266
+   |                | --colormatrix smpte170m --range limited --chromaloc 0         |
267
+   +----------------+---------------------------------------------------------------+
268
+   | BT601_626      | --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg|
269
+   |                | --range limited --chromaloc 0                                 |
270
+   +----------------+---------------------------------------------------------------+
271
+   | BT709_YCC      | --colorprim bt709 --transfer bt709 --colormatrix bt709        |
272
+   |                | --range limited --chromaloc 0                                 |
273
+   +----------------+---------------------------------------------------------------+
274
+   | BT709_RGB      | --colorprim bt709 --transfer bt709 --colormatrix gbr          |
275
+   |                | --range limited                                               |
276
+   +----------------+---------------------------------------------------------------+
277
+   | BT2020_YCC_NCL | --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709   |
278
+   |                | --range limited --chromaloc 2                                 |
279
+   +----------------+---------------------------------------------------------------+
280
+   | BT2020_RGB     | --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc|
281
+   |                | --range limited                                               |
282
+   +----------------+---------------------------------------------------------------+
283
+   | BT2100_PQ_YCC  | --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc|
284
+   |                | --range limited --chromaloc 2                                 |
285
+   +----------------+---------------------------------------------------------------+
286
+   | BT2100_PQ_ICTCP| --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp   |
287
+   |                | --range limited --chromaloc 2                                 |
288
+   +----------------+---------------------------------------------------------------+
289
+   | BT2100_PQ_RGB  | --colorprim bt2020 --transfer smpte2084 --colormatrix gbr     |
290
+   |                | --range limited                                               |
291
+   +----------------+---------------------------------------------------------------+
292
+   | BT2100_HLG_YCC | --colorprim bt2020 --transfer arib-std-b67                    |
293
+   |                | --colormatrix bt2020nc --range limited --chromaloc 2          |
294
+   +----------------+---------------------------------------------------------------+
295
+   | BT2100_HLG_RGB | --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr  |
296
+   |                | --range limited                                               |
297
+   +----------------+---------------------------------------------------------------+
298
+   | FR709_RGB      | --colorprim bt709 --transfer bt709 --colormatrix gbr          |
299
+   |                | --range full                                                  |
300
+   +----------------+---------------------------------------------------------------+
301
+   | FR2020_RGB     | --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr     |
302
+   |                | --range full                                                  |
303
+   +----------------+---------------------------------------------------------------+
304
+   | FRP3D65_YCC    | --colorprim smpte432 --transfer bt709 --colormatrix smpte170m |
305
+   |                | --range full --chromaloc 1                                    |
306
+   +----------------+---------------------------------------------------------------+
307
+
308
+   color-volume options and their corresponding values:
+
309
+   +----------------+---------------------------------------------------------------+
310
+   | color-volume   | Value                                                         |
311
+   +================+===============================================================+
312
+   | P3D65x1000n0005| --master-display G(13250,34500)B(7500,3000)R(34000,16000)     |
313
+   |                |                  WP(15635,16450)L(10000000,5)                 |
314
+   +----------------+---------------------------------------------------------------+
315
+   | P3D65x4000n005 | --master-display G(13250,34500)B(7500,3000)R(34000,16000)     |
316
+   |                |                  WP(15635,16450)L(40000000,50)                |
317
+   +----------------+---------------------------------------------------------------+
318
+   | BT2100x108n0005| --master-display G(8500,39850)B(6550,2300)R(34000,146000)     |
319
+   |                |                  WP(15635,16450)L(10000000,1)                 |
320
+   +----------------+---------------------------------------------------------------+
321
+
322
+   Note: The color-volume options can be used only with the system-id options
323
+   BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB; they are incompatible
+   with the other system-id options.
324
+
325
+
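+   For example, an HDR10-style signaling could be requested as (a sketch;
+   file names are illustrative)::
+
+      x265 input.y4m --video-signal-type-preset BT2100_PQ_YCC:P3D65x1000n0005 -o out.hevc
+
+   which, per the tables above, corresponds to --colorprim bt2020 --transfer
+   smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2 plus the
+   associated --master-display string.
+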
326
 Bitstream options
327
 =================
328
 
329
@@ -2454,6 +2577,16 @@
330
    the very first AUD will be skipped since it cannot be placed at the
331
    start of the access unit, where it belongs. Default disabled
332
 
333
+.. option:: --eob, --no-eob
334
+
335
+   Emit an end of bitstream NAL unit at the end of the bitstream.
336
+   Default disabled
337
+
338
+.. option:: --eos, --no-eos
339
+
340
+   Emit an end of sequence NAL unit at the end of every coded
341
+   video sequence. Default disabled
342
+
343
 .. option:: --hrd, --no-hrd
344
 
345
    Enable the signaling of HRD parameters to the decoder. The HRD
346
@@ -2480,7 +2613,7 @@
347
     The value is specified as a float or as an integer with the profile times 10,
348
     for example profile 5 is specified as "5" or "5.0" or "50".
349
     
350
-    Currently only profile 5, profile 8.1 and profile 8.2 enabled, Default 0 (disabled)
351
+    Currently only profiles 5, 8.1, 8.2 and 8.4 are enabled. Default 0 (disabled)
352
 
353
 .. option:: --dolby-vision-rpu <filename>
354
 
355
@@ -2509,17 +2642,26 @@
356
    2. CRC
357
    3. Checksum
358
 
359
-.. option:: --temporal-layers,--no-temporal-layers
360
+.. option:: --temporal-layers <integer>
361
 
362
-   Enable a temporal sub layer. All referenced I/P/B frames are in the
363
-   base layer and all unreferenced B frames are placed in a temporal
364
-   enhancement layer. A decoder may choose to drop the enhancement layer 
365
-   and only decode and display the base layer slices.
366
-   
367
-   If used with a fixed GOP (:option:`--b-adapt` 0) and :option:`--bframes`
368
-   3 then the two layers evenly split the frame rate, with a cadence of
369
-   PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
370
-   interval that is a multiple of 4.
371
+   Enable the specified number of temporal sub-layers. For any frame in layer N,
372
+   all reference frames are in layer N or N-1. A decoder may choose to drop the
373
+   enhancement layers and decode and display only the base layer slices.
374
+   The allowed number of temporal sub-layers is 2 to 5 (inclusive).
375
+
376
+   When enabled, temporal layers 3 through 5 configure a fixed miniGOP with the
377
+   number of b-frames shown below, unless the miniGOP size is modified by lookahead
378
+   decisions. Temporal layer 2 is a special case that places all reference frames in
379
+   the base layer and non-reference frames in the enhancement layer, without any
380
+   constraint on the number of b-frames. Default disabled.
+
+   +----------------+--------+
381
+   | temporal layer | bframes|
382
+   +================+========+
383
+   | 3              | 3      |
384
+   +----------------+--------+
385
+   | 4              | 7      |
386
+   +----------------+--------+
387
+   | 5              | 15     |
388
+   +----------------+--------+
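+
+   For example (a sketch; file names are illustrative)::
+
+      x265 input.y4m --temporal-layers 3 -o out.hevc
+
+   encodes with three temporal sub-layers, using fixed miniGOPs of 3 b-frames
+   as per the table above.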
389
 
390
 .. option:: --log2-max-poc-lsb <integer>
391
 
392
@@ -2564,6 +2706,12 @@
393
    Emit SEI messages in a single NAL unit instead of multiple NALs. Default disabled.
394
    When HRD SEI is enabled the HM decoder will throw a warning.
395
 
396
+.. option:: --film-grain <filename>
397
+
398
+   Specify a file containing the film-grain model characteristics, which are
+   signalled as an SEI message to support Film Grain Synthesis (FGS).
399
+
400
+   **CLI ONLY**
401
+
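+   For example, assuming a film-grain characteristics file "grain.cfg" has
+   been prepared (a sketch; the file name is illustrative)::
+
+      x265 input.y4m --film-grain grain.cfg -o out.hevc
+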
402
 DCT Approximations
403
 ==================
404
 
405
x265_3.5.tar.gz/doc/reST/introduction.rst -> x265_3.6.tar.gz/doc/reST/introduction.rst Changed
9
 
1
@@ -77,6 +77,6 @@
2
 to start is with the `Motion Picture Experts Group - Licensing Authority
3
 - HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
4
 
5
-x265 is a registered trademark of MulticoreWare, Inc.  The x265 logo is
6
+x265 is a registered trademark of MulticoreWare, Inc.  The X265 logo is
7
 a trademark of MulticoreWare, and may only be used with explicit written
8
 permission.  All rights reserved.
9
x265_3.5.tar.gz/doc/reST/releasenotes.rst -> x265_3.6.tar.gz/doc/reST/releasenotes.rst Changed
55
 
1
@@ -2,6 +2,53 @@
2
 Release Notes
3
 *************
4
 
5
+Version 3.6
6
+===========
7
+
8
+Release date - 4th April, 2024.
9
+
10
+New features
11
+------------
12
+1. Segment based Ratecontrol (SBRC) feature
13
+2. Motion-Compensated Spatio-Temporal Filtering
14
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
15
+4. Histogram-Based Scene Change Detection
16
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
17
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
18
+ 
19
+Enhancements to existing features
20
+---------------------------------
21
+1. Added Dolby Vision 8.4 Profile Support
22
+
23
+
24
+API changes
25
+-----------
26
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
27
+2. Add command line parameter for mcstf feature: "--no-mctf".
28
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
29
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
30
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
31
+6. cli: add new option --cra-nal (forces NAL type to CRA for all frames except the first; works only with keyint 1)
32
+
33
+Optimizations
34
+-------------
35
+1. ARM64 NEON optimizations: several time-consuming C functions have been optimized for the target platform (aarch64), increasing overall performance by around 20%.
36
+2. SVE/SVE2 optimizations
37
+
38
+
39
+Bug fixes
40
+---------
41
+1. Fixed a Linux bug that prevented utilization of all cores
42
+2. Fixed a crash in hist-scenecut builds when the source resolution is not a multiple of minCuSize
43
+3. Fixed 32-bit and 64-bit build generation for ARM
44
+4. Fixed bugs in the zonefile feature (reflect zonefile parameters inside lookahead, extra IDR issue, Avg I-Slice QP value issue, etc.)
45
+5. Added x86 ASM implementation for subsampling luma
46
+6. Fixed abrladder segfault with load reuse level 1
47
+7. Reordered miniGOP based on temporal layer hierarchy and added support for more B-frames
48
+8. Added MacOS aarch64 build support
49
+9. Fixed a boundary condition issue in the Gaussian filter
50
+
51
+
52
 Version 3.5
53
 ===========
54
 
55
x265_3.5.tar.gz/readme.rst -> x265_3.6.tar.gz/readme.rst Changed
10
 
1
@@ -2,7 +2,7 @@
2
 x265 HEVC Encoder
3
 =================
4
 
5
-| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
6
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/master/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265_git/wiki/>`_
7
 | **Download:** | `releases <http://ftp.videolan.org/pub/videolan/x265/>`_ 
8
 | **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
9
 
10
x265_3.5.tar.gz/source/CMakeLists.txt -> x265_3.6.tar.gz/source/CMakeLists.txt Changed
232
 
1
@@ -29,7 +29,7 @@
2
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 199)
6
+set(X265_BUILD 209)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -38,14 +38,20 @@
11
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
12
 
13
 # System architecture detection
14
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
15
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
16
+    string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
17
+else()
18
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
19
+endif()
20
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
21
-set(ARM_ALIASES armv6l armv7l aarch64)
22
+set(ARM_ALIASES armv6l armv7l)
23
+set(ARM64_ALIASES arm64 arm64e aarch64)
24
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
25
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
26
-set(POWER_ALIASES ppc64 ppc64le)
27
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
28
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
29
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
30
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
31
+if(X86MATCH GREATER "-1")
32
     set(X86 1)
33
     add_definitions(-DX265_ARCH_X86=1)
34
     if(CMAKE_CXX_FLAGS STREQUAL "-m32")
35
@@ -70,15 +76,18 @@
36
     else()
37
         set(CROSS_COMPILE_ARM 0)
38
     endif()
39
+    message(STATUS "Detected ARM target processor")
40
     set(ARM 1)
41
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
42
-        message(STATUS "Detected ARM64 target processor")
43
-        set(ARM64 1)
44
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
45
-    else()
46
-        message(STATUS "Detected ARM target processor")
47
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
48
-    endif()
49
+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
50
+elseif(ARM64MATCH GREATER "-1")
51
+    #if(CROSS_COMPILE_ARM64)
52
+        #message(STATUS "Cross compiling for ARM64 arch")
53
+    #else()
54
+        #set(CROSS_COMPILE_ARM64 0)
55
+    #endif()
56
+    message(STATUS "Detected ARM64 target processor")
57
+    set(ARM64 1)
58
+    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
59
 else()
60
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
61
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
62
@@ -239,26 +248,43 @@
63
         endif()
64
     endif()
65
     if(ARM AND CROSS_COMPILE_ARM)
66
-        if(ARM64)
67
-            set(ARM_ARGS -fPIC)
68
-        else()
69
-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
70
-        endif()
71
         message(STATUS "cross compile arm")
72
+        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
73
     elseif(ARM)
74
-        if(ARM64)
75
-            set(ARM_ARGS -fPIC)
76
+        find_package(Neon)
77
+        if(CPU_HAS_NEON)
78
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
79
             add_definitions(-DHAVE_NEON)
80
         else()
81
-            find_package(Neon)
82
-            if(CPU_HAS_NEON)
83
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
84
-                add_definitions(-DHAVE_NEON)
85
-            else()
86
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
87
-            endif()
88
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
89
         endif()
90
     endif()
91
+    if(ARM64 OR CROSS_COMPILE_ARM64)
92
+        find_package(Neon)
93
+        find_package(SVE)
94
+        find_package(SVE2)
95
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
96
+            message(STATUS "Found SVE2")
97
+            set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
98
+            add_definitions(-DHAVE_SVE2)
99
+            add_definitions(-DHAVE_SVE)
100
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
101
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
102
+            message(STATUS "Found SVE")
103
+            set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
104
+            add_definitions(-DHAVE_SVE)
105
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
106
+        elseif(CPU_HAS_NEON)
107
+            message(STATUS "Found NEON")
108
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
109
+            add_definitions(-DHAVE_NEON)
110
+        else()
111
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
112
+        endif()
113
+    endif()
114
+    if(ENABLE_PIC)
115
+        list(APPEND ARM_ARGS -DPIC)
116
+    endif()
117
     add_definitions(${ARM_ARGS})
118
     if(FPROFILE_GENERATE)
119
         if(INTEL_CXX)
120
@@ -350,7 +376,7 @@
121
 endif(GCC)
122
 
123
 find_package(Nasm)
124
-if(ARM OR CROSS_COMPILE_ARM)
125
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
126
     option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
127
 elseif(NASM_FOUND AND X86)
128
     if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
129
@@ -384,7 +410,7 @@
130
 endif(EXTRA_LIB)
131
 mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
132
 
133
-if(X64)
134
+if(X64 OR ARM64 OR PPC64)
135
     # NOTE: We only officially support high-bit-depth compiles of x265
136
     # on 64bit architectures. Main10 plus large resolution plus slow
137
     # preset plus 32bit address space usually means malloc failure.  You
138
@@ -393,7 +419,7 @@
139
     # license" so to speak.  If it breaks you get to keep both halves.
140
     # You will need to disable assembly manually.
141
     option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
142
-endif(X64)
143
+endif(X64 OR ARM64 OR PPC64)
144
 if(HIGH_BIT_DEPTH)
145
     option(MAIN12 "Support Main12 instead of Main10" OFF)
146
     if(MAIN12)
147
@@ -440,6 +466,18 @@
148
 endif()
149
 add_definitions(-DX265_NS=${X265_NS})
150
 
151
+if(ARM64)
152
+  if(HIGH_BIT_DEPTH)
153
+    if(MAIN12)
154
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
155
+    else()
156
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
157
+    endif()
158
+  else()
159
+    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
160
+  endif()
161
+endif(ARM64)
162
+
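+# Illustrative usage (an editor's sketch, not part of the upstream change):
+# on an aarch64 host, the detection above enables NEON/SVE automatically, so
+# a Main10 build only needs the existing HIGH_BIT_DEPTH option, e.g.:
+#   cmake ../source -DHIGH_BIT_DEPTH=ON
+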
163
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
164
 if(WARNINGS_AS_ERRORS)
165
     if(GCC)
166
@@ -536,11 +574,7 @@
167
     # compile ARM arch asm files here
168
         enable_language(ASM)
169
         foreach(ASM ${ARM_ASMS})
170
-            if(ARM64)
171
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
172
-            else()
173
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
174
-            endif()
175
+           set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
176
             list(APPEND ASM_SRCS ${ASM_SRC})
177
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
178
             add_custom_command(
179
@@ -549,6 +583,52 @@
180
                 ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
181
                 DEPENDS ${ASM_SRC})
182
         endforeach()
183
+   elseif(ARM64 OR CROSS_COMPILE_ARM64)
184
+    # compile ARM64 arch asm files here
185
+        enable_language(ASM)
186
+        foreach(ASM ${ARM_ASMS})
187
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
188
+            list(APPEND ASM_SRCS ${ASM_SRC})
189
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
190
+            add_custom_command(
191
+                OUTPUT ${ASM}.${SUFFIX}
192
+                COMMAND ${CMAKE_CXX_COMPILER}
193
+                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
194
+                DEPENDS ${ASM_SRC})
195
+        endforeach()
196
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
197
+            foreach(ASM ${ARM_ASMS_SVE})
198
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
199
+                list(APPEND ASM_SRCS ${ASM_SRC})
200
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
201
+                add_custom_command(
202
+                    OUTPUT ${ASM}.${SUFFIX}
203
+                    COMMAND ${CMAKE_CXX_COMPILER}
204
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
205
+                    DEPENDS ${ASM_SRC})
206
+            endforeach()
207
+            foreach(ASM ${ARM_ASMS_SVE2})
208
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
209
+                list(APPEND ASM_SRCS ${ASM_SRC})
210
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
211
+                add_custom_command(
212
+                    OUTPUT ${ASM}.${SUFFIX}
213
+                    COMMAND ${CMAKE_CXX_COMPILER}
214
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
215
+                    DEPENDS ${ASM_SRC})
216
+            endforeach()
217
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
218
+            foreach(ASM ${ARM_ASMS_SVE})
219
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
220
+                list(APPEND ASM_SRCS ${ASM_SRC})
221
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
222
+                add_custom_command(
223
+                    OUTPUT ${ASM}.${SUFFIX}
224
+                    COMMAND ${CMAKE_CXX_COMPILER}
225
+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
226
+                    DEPENDS ${ASM_SRC})
227
+            endforeach()
228
+        endif()
229
     elseif(X86)
230
     # compile X86 arch asm files here
231
         foreach(ASM ${MSVC_ASMS})
232
x265_3.5.tar.gz/source/abrEncApp.cpp -> x265_3.6.tar.gz/source/abrEncApp.cpp Changed
2220
 
1
@@ -1,1111 +1,1111 @@
2
-/*****************************************************************************
3
-* Copyright (C) 2013-2020 MulticoreWare, Inc
4
-*
5
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
6
-*          Aruna Matheswaran <aruna@multicorewareinc.com>
7
-*
8
-* This program is free software; you can redistribute it and/or modify
9
-* it under the terms of the GNU General Public License as published by
10
-* the Free Software Foundation; either version 2 of the License, or
11
-* (at your option) any later version.
12
-*
13
-* This program is distributed in the hope that it will be useful,
14
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
-* GNU General Public License for more details.
17
-*
18
-* You should have received a copy of the GNU General Public License
19
-* along with this program; if not, write to the Free Software
20
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
-*
22
-* This program is also available under a commercial proprietary license.
23
-* For more information, contact us at license @ x265.com.
24
-*****************************************************************************/
25
-
26
-#include "abrEncApp.h"
27
-#include "mv.h"
28
-#include "slice.h"
29
-#include "param.h"
30
-
31
-#include <signal.h>
32
-#include <errno.h>
33
-
34
-#include <queue>
35
-
36
-using namespace X265_NS;
37
-
38
-/* Ctrl-C handler */
39
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
40
-static void sigint_handler(int)
41
-{
42
-    b_ctrl_c = 1;
43
-}
44
-
45
-namespace X265_NS {
46
-    // private namespace
47
-#define X265_INPUT_QUEUE_SIZE 250
48
-
49
-    AbrEncoder::AbrEncoder(CLIOptions cliopt, uint8_t numEncodes, int &ret)
50
-    {
51
-        m_numEncodes = numEncodes;
52
-        m_numActiveEncodes.set(numEncodes);
53
-        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
54
-        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
55
-
56
-        for (uint8_t i = 0; i < m_numEncodes; i++)
57
-        {
58
-            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
59
-            if (!m_passEnc[i])
60
-            {
61
-                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
62
-                ret = 4;
63
-            }
64
-            m_passEnc[i]->init(ret);
65
-        }
66
-
67
-        if (!allocBuffers())
68
-        {
69
-            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
70
-            ret = 4;
71
-        }
72
-
73
-        /* start passEncoder worker threads */
74
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
75
-            m_passEnc[pass]->startThreads();
76
-    }
77
-
78
-    bool AbrEncoder::allocBuffers()
79
-    {
80
-        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
81
-        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
82
-
83
-        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
84
-        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
85
-        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
86
-        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
87
-
88
-        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
89
-        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
90
-        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
91
-        m_readFlag = X265_MALLOC(int*, m_numEncodes);
92
-
93
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
94
-        {
95
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
96
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
97
-            {
98
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
99
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
100
-            }
101
-
102
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
103
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
104
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
105
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
106
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
107
-        }
108
-        return true;
109
-    fail:
110
-        return false;
111
-    }
112
-
113
-    void AbrEncoder::destroy()
114
-    {
115
-        x265_cleanup(); /* Free library singletons */
116
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
117
-        {
118
-            for (uint32_t index = 0; index < m_queueSize; index++)
119
-            {
120
-                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
121
-                x265_picture_free(m_inputPicBuffer[pass][index]);
122
-            }
123
-
124
-            X265_FREE(m_inputPicBuffer[pass]);
125
-            X265_FREE(m_analysisBuffer[pass]);
126
-            X265_FREE(m_readFlag[pass]);
127
-            delete[] m_picIdxReadCnt[pass];
128
-            delete[] m_analysisWrite[pass];
129
-            delete[] m_analysisRead[pass];
130
-            m_passEnc[pass]->destroy();
131
-            delete m_passEnc[pass];
132
-        }
133
-        X265_FREE(m_inputPicBuffer);
134
-        X265_FREE(m_analysisBuffer);
135
-        X265_FREE(m_readFlag);
136
-
137
-        delete[] m_picWriteCnt;
138
-        delete[] m_picReadCnt;
139
-        delete[] m_analysisWriteCnt;
140
-        delete[] m_analysisReadCnt;
141
-
142
-        X265_FREE(m_picIdxReadCnt);
143
-        X265_FREE(m_analysisWrite);
144
-        X265_FREE(m_analysisRead);
145
-
146
-        X265_FREE(m_passEnc);
147
-    }
148
-
149
-    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
150
-    {
151
-        m_id = id;
152
-        m_cliopt = cliopt;
153
-        m_parent = parent;
154
-        if(!(m_cliopt.enableScaler && m_id))
155
-            m_input = m_cliopt.input;
156
-        m_param = cliopt.param;
157
-        m_inputOver = false;
158
-        m_lastIdx = -1;
159
-        m_encoder = NULL;
160
-        m_scaler = NULL;
161
-        m_reader = NULL;
162
-        m_ret = 0;
163
-    }
164
-
165
-    int PassEncoder::init(int &result)
166
-    {
167
-        if (m_parent->m_numEncodes > 1)
168
-            setReuseLevel();
169
-                
170
-        if (!(m_cliopt.enableScaler && m_id))
171
-            m_reader = new Reader(m_id, this);
172
-        else
173
-        {
174
-            VideoDesc *src = NULL, *dst = NULL;
175
-            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
176
-            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
177
-            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
178
-            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
179
-            if (src != NULL && dst != NULL)
180
-            {
181
-                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
182
-                if (!m_scaler)
183
-                {
184
-                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
185
-                    result = 4;
186
-                }
187
-            }
188
-        }
189
-
190
-        /* note: we could try to acquire a different libx265 API here based on
191
-        * the profile found during option parsing, but it must be done before
192
-        * opening an encoder */
193
-
194
-        if (m_param)
195
-            m_encoder = m_cliopt.api->encoder_open(m_param);
196
-        if (!m_encoder)
197
-        {
198
-            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
199
-            m_ret = 2;
200
-            return -1;
201
-        }
202
-
203
-        /* get the encoder parameters post-initialization */
204
-        m_cliopt.api->encoder_parameters(m_encoder, m_param);
205
-
206
-        return 1;
207
-    }
208
-
209
-    void PassEncoder::setReuseLevel()
210
-    {
211
-        uint32_t r, padh = 0, padw = 0;
212
-
213
-        m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
214
-
215
-        m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
216
-        m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
217
-        m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
218
-        m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
219
-        m_param->bUseAnalysisFile = 0;
220
-
221
-        if (m_cliopt.loadLevel)
222
-        {
223
-            x265_param *refParam = m_parent->m_passEnc[m_cliopt.refId]->m_param;
224
-
225
-            if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
226
-                m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
227
-            {
228
-                m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
229
-                m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset;
230
-            }
231
-            else
232
-            {
233
-                int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
234
-                int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
235
-
236
-                double scaleFactorH = double(m_param->sourceHeight / srcH);
237
-                double scaleFactorW = double(m_param->sourceWidth / srcW);
238
-
239
-                int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
240
-                int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
241
-
242
-                if (absScaleFactorH == 20 && absScaleFactorW == 20)
243
-                {
244
-                    m_param->scaleFactor = 2;
245
-
246
-                    m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
247
-                    m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
248
-
249
-                }
250
-            }
251
-        }
252
-
253
-        int h = m_param->sourceHeight + m_param->confWinBottomOffset;
254
-        int w = m_param->sourceWidth + m_param->confWinRightOffset;
255
-        if (h & (m_param->minCUSize - 1))
256
-        {
257
-            r = h & (m_param->minCUSize - 1);
258
-            padh = m_param->minCUSize - r;
259
-            m_param->confWinBottomOffset += padh;
260
-
261
-        }
262
-
263
-        if (w & (m_param->minCUSize - 1))
264
-        {
265
-            r = w & (m_param->minCUSize - 1);
266
-            padw = m_param->minCUSize - r;
267
-            m_param->confWinRightOffset += padw;
268
-        }
269
-    }
270
-
271
-    void PassEncoder::startThreads()
272
-    {
273
-        /* Start slave worker threads */
274
-        m_threadActive = true;
275
-        start();
276
-        /* Start reader threads*/
277
-        if (m_reader != NULL)
278
-        {
279
-            m_reader->m_threadActive = true;
280
-            m_reader->start();
281
-        }
282
-        /* Start scaling worker threads */
283
-        if (m_scaler != NULL)
284
-        {
285
-            m_scaler->m_threadActive = true;
286
-            m_scaler->start();
287
-        }
288
-    }
289
-
290
-    void PassEncoder::copyInfo(x265_analysis_data * src)
291
-    {
292
-
293
-        uint32_t written = m_parent->m_analysisWriteCnt[m_id].get();
294
-
295
-        int index = written % m_parent->m_queueSize;
296
-        //If all streams have read analysis data, reuse that position in Queue
297
-
298
-        int read = m_parent->m_analysisRead[m_id][index].get();
299
-        int write = m_parent->m_analysisWrite[m_id][index].get();
300
-
301
-        int overwrite = written / m_parent->m_queueSize;
302
-        bool emptyIdxFound = 0;
303
-        while (!emptyIdxFound && overwrite)
304
-        {
305
-            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
306
-            {
307
-                read = m_parent->m_analysisRead[m_id][i].get();
308
-                write = m_parent->m_analysisWrite[m_id][i].get();
309
-                write *= m_cliopt.numRefs;
310
-
311
-                if (read == write)
312
-                {
313
-                    index = i;
314
-                    emptyIdxFound = 1;
315
-                }
316
-            }
317
-        }
318
-
319
-        x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBuffer[m_id][index];
320
-
321
-        x265_free_analysis_data(m_param, m_analysisInfo);
322
-        memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
323
-        x265_alloc_analysis_data(m_param, m_analysisInfo);
324
-
325
-        bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
326
-        if (m_param->bDisableLookahead && isVbv)
327
-        {
328
-            memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
329
-            memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
330
-            memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
331
-            memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
332
-        }
333
-
334
-        if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I)
335
-        {
336
-            if (m_param->analysisSaveReuseLevel < 2)
337
-                goto ret;
338
-            x265_analysis_intra_data *intraDst, *intraSrc;
339
-            intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
340
-            intraSrc = (x265_analysis_intra_data*)src->intraData;
341
-            memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
342
-            memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
343
-            memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
344
-            memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
345
-            if (m_param->rc.cuTree)
346
-                memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
347
-        }
348
-        else
349
-        {
350
-            bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
351
-            int numDir = src->sliceType == X265_TYPE_P ? 1 : 2;
352
-            memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
353
-            if (m_param->analysisSaveReuseLevel < 2)
354
-                goto ret;
355
-            x265_analysis_inter_data *interDst, *interSrc;
356
-            interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
357
-            interSrc = (x265_analysis_inter_data*)src->interData;
358
-            memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
359
-            memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
360
-            if (m_param->rc.cuTree)
361
-                memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
362
-            if (m_param->analysisSaveReuseLevel > 4)
363
-            {
364
-                memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
365
-                memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
366
-                if (m_param->analysisSaveReuseLevel == 10)
367
-                {
368
-                    memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
369
-                    for (int dir = 0; dir < numDir; dir++)
370
-                    {
371
-                        memcpy(interDst->mvpIdx[dir], interSrc->mvpIdx[dir], sizeof(uint8_t) * src->depthBytes);
372
-                        memcpy(interDst->refIdx[dir], interSrc->refIdx[dir], sizeof(int8_t) * src->depthBytes);
373
-                        memcpy(interDst->mv[dir], interSrc->mv[dir], sizeof(MV) * src->depthBytes);
374
-                    }
375
-                    if (bIntraInInter)
376
-                    {
377
-                        x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
378
-                        x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
379
-                        memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
380
-                        memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
381
-                    }
382
-               }
383
-            }
384
-            if (m_param->analysisSaveReuseLevel != 10)
385
-                memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
386
-        }
387
-
388
-ret:
389
-        //increment analysis Write counter 
390
-        m_parent->m_analysisWriteCntm_id.incr();
391
-        m_parent->m_analysisWritem_idindex.incr();
392
-        return;
393
-    }
394
-
395
-
396
-    bool PassEncoder::readPicture(x265_picture *dstPic)
397
-    {
398
-        /*Check and wait if there any input frames to read*/
399
-        int ipread = m_parent->m_picReadCnt[m_id].get();
400
-        int ipwrite = m_parent->m_picWriteCnt[m_id].get();
401
-
402
-        bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1);
403
-        while (!m_inputOver && (ipread == ipwrite))
404
-        {
405
-            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
406
-        }
407
-
408
-        if (m_threadActive && ipread < ipwrite)
409
-        {
410
-            /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
411
-            int readPos = ipread % m_parent->m_queueSize;
412
-            x265_analysis_data* analysisData = 0;
413
-
414
-            if (isAbrLoad)
415
-            {
416
-                /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
417
-                int analysisQId = m_cliopt.refId;
418
-                /*Check and wait if there any analysis Data to read*/
419
-                int analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].get();
420
-                int written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
421
-                int analysisRead = m_parent->m_analysisReadCnt[analysisQId].get();
422
-                
423
-                while (m_threadActive && written == analysisRead)
424
-                {
425
-                    analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
426
-                    written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
427
-                }
428
-
429
-                if (analysisRead < written)
430
-                {
431
-                    int analysisIdx = 0;
432
-                    if (!m_param->bDisableLookahead)
433
-                    {
434
-                        bool analysisdRead = false;
435
-                        while ((analysisRead < written) && !analysisdRead)
436
-                        {
437
-                            while (analysisWrite < ipread)
438
-                            {
439
-                                analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
440
-                                written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
441
-                            }
442
-                            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
443
-                            {
444
-                                analysisData = &m_parent->m_analysisBuffer[analysisQId][i];
445
-                                int read = m_parent->m_analysisRead[analysisQId][i].get();
446
-                                int write = m_parent->m_analysisWrite[analysisQId][i].get() * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
447
-                                if ((analysisData->poc == (uint32_t)(ipread)) && (read < write))
448
-                                {
449
-                                    analysisIdx = i;
450
-                                    analysisdRead = true;
451
-                                    break;
452
-                                }
453
-                            }
454
-                        }
455
-                    }
456
-                    else
457
-                    {
458
-                        analysisIdx = analysisRead % m_parent->m_queueSize;
459
-                        analysisData = &m_parent->m_analysisBuffer[analysisQId][analysisIdx];
460
-                        readPos = analysisData->poc % m_parent->m_queueSize;
461
-                        while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc))
462
-                        {
463
-                            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
464
-                        }
465
-                    }
466
-
467
-                    m_lastIdx = analysisIdx;
468
-                }
469
-                else
470
-                    return false;
471
-            }
472
-
473
-
474
-            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
475
-
476
-            x265_picture *pic = (x265_picture*)(dstPic);
477
-            pic->colorSpace = srcPic->colorSpace;
478
-            pic->bitDepth = srcPic->bitDepth;
479
-            pic->framesize = srcPic->framesize;
480
-            pic->height = srcPic->height;
481
-            pic->pts = srcPic->pts;
482
-            pic->dts = srcPic->dts;
483
-            pic->reorderedPts = srcPic->reorderedPts;
484
-            pic->width = srcPic->width;
485
-            pic->analysisData = srcPic->analysisData;
486
-            pic->userSEI = srcPic->userSEI;
487
-            pic->stride[0] = srcPic->stride[0];
488
-            pic->stride[1] = srcPic->stride[1];
489
-            pic->stride[2] = srcPic->stride[2];
490
-            pic->planes[0] = srcPic->planes[0];
491
-            pic->planes[1] = srcPic->planes[1];
492
-            pic->planes[2] = srcPic->planes[2];
493
-            if (isAbrLoad)
494
-                pic->analysisData = *analysisData;
495
-            return true;
496
-        }
497
-        else
498
-            return false;
499
-    }
500
-
501
-    void PassEncoder::threadMain()
502
-    {
503
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "abrEncApp.h"
+#include "mv.h"
+#include "slice.h"
+#include "param.h"
+
+#include <signal.h>
+#include <errno.h>
+
+#include <queue>
+
+using namespace X265_NS;
+
+/* Ctrl-C handler */
+static volatile sig_atomic_t b_ctrl_c /* = 0 */;
+static void sigint_handler(int)
+{
+    b_ctrl_c = 1;
+}
+
+namespace X265_NS {
+    // private namespace
+#define X265_INPUT_QUEUE_SIZE 250
+
+    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
+    {
+        m_numEncodes = numEncodes;
+        m_numActiveEncodes.set(numEncodes);
+        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
+        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
+
+        for (uint8_t i = 0; i < m_numEncodes; i++)
+        {
+            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
+            if (!m_passEnc[i])
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
+                ret = 4;
+            }
+            m_passEnc[i]->init(ret);
+        }
+
+        if (!allocBuffers())
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
+            ret = 4;
+        }
+
+        /* start passEncoder worker threads */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+            m_passEnc[pass]->startThreads();
+    }
+
+    bool AbrEncoder::allocBuffers()
+    {
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
+        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
+
+        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
+
+        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_readFlag = X265_MALLOC(int*, m_numEncodes);
+
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+        {
+            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
+            for (uint32_t idx = 0; idx < m_queueSize; idx++)
+            {
+                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
+                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
+            }
+
+            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
+            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
+            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
+            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
+            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
+        }
+        return true;
+    fail:
+        return false;
+    }
+
+    void AbrEncoder::destroy()
+    {
+        x265_cleanup(); /* Free library singletons */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+        {
+            for (uint32_t index = 0; index < m_queueSize; index++)
+            {
+                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
+                x265_picture_free(m_inputPicBuffer[pass][index]);
+            }
+
+            X265_FREE(m_inputPicBuffer[pass]);
+            X265_FREE(m_analysisBuffer[pass]);
+            X265_FREE(m_readFlag[pass]);
+            delete[] m_picIdxReadCnt[pass];
+            delete[] m_analysisWrite[pass];
+            delete[] m_analysisRead[pass];
+            m_passEnc[pass]->destroy();
+            delete m_passEnc[pass];
+        }
+        X265_FREE(m_inputPicBuffer);
+        X265_FREE(m_analysisBuffer);
+        X265_FREE(m_readFlag);
+
+        delete[] m_picWriteCnt;
+        delete[] m_picReadCnt;
+        delete[] m_analysisWriteCnt;
+        delete[] m_analysisReadCnt;
+
+        X265_FREE(m_picIdxReadCnt);
+        X265_FREE(m_analysisWrite);
+        X265_FREE(m_analysisRead);
+
+        X265_FREE(m_passEnc);
+    }
+
+    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
+    {
+        m_id = id;
+        m_cliopt = cliopt;
+        m_parent = parent;
+        if (!(m_cliopt.enableScaler && m_id))
+            m_input = m_cliopt.input;
+        m_param = cliopt.param;
+        m_inputOver = false;
+        m_lastIdx = -1;
+        m_encoder = NULL;
+        m_scaler = NULL;
+        m_reader = NULL;
+        m_ret = 0;
+    }
+
+    int PassEncoder::init(int &result)
+    {
+        if (m_parent->m_numEncodes > 1)
+            setReuseLevel();
+
+        if (!(m_cliopt.enableScaler && m_id))
+            m_reader = new Reader(m_id, this);
+        else
+        {
+            VideoDesc *src = NULL, *dst = NULL;
+            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
+            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
+            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
+            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
+            if (src != NULL && dst != NULL)
+            {
+                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
+                if (!m_scaler)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
+                    result = 4;
+                }
+            }
+        }
+
+        if (m_cliopt.zoneFile)
+        {
+            if (!m_cliopt.parseZoneFile())
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n");
+                fclose(m_cliopt.zoneFile);
+                m_cliopt.zoneFile = NULL;
+            }
+        }
+
+        /* note: we could try to acquire a different libx265 API here based on
+        * the profile found during option parsing, but it must be done before
+        * opening an encoder */
+
+        if (m_param)
+            m_encoder = m_cliopt.api->encoder_open(m_param);
+        if (!m_encoder)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
+            m_ret = 2;
+            return -1;
+        }
+
+        /* get the encoder parameters post-initialization */
+        m_cliopt.api->encoder_parameters(m_encoder, m_param);
+
+        return 1;
+    }
+
+    void PassEncoder::setReuseLevel()
+    {
+        uint32_t r, padh = 0, padw = 0;
+
+        m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
+
+        m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
+        m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
+        m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
+        m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
+        m_param->bUseAnalysisFile = 0;
+
+        if (m_cliopt.loadLevel)
+        {
+            x265_param *refParam = m_parent->m_passEnc[m_cliopt.refId]->m_param;
+
+            if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
+                m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
+            {
+                m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
+                m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset;
+            }
+            else
+            {
+                int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
+                int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
+
+                double scaleFactorH = double(m_param->sourceHeight / srcH);
+                double scaleFactorW = double(m_param->sourceWidth / srcW);
+
+                int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
+                int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
+
+                if (absScaleFactorH == 20 && absScaleFactorW == 20)
+                {
+                    m_param->scaleFactor = 2;
+
+                    m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
+                    m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
+
+                }
+            }
+        }
+
+        int h = m_param->sourceHeight + m_param->confWinBottomOffset;
+        int w = m_param->sourceWidth + m_param->confWinRightOffset;
+        if (h & (m_param->minCUSize - 1))
+        {
+            r = h & (m_param->minCUSize - 1);
+            padh = m_param->minCUSize - r;
+            m_param->confWinBottomOffset += padh;
+
+        }
+
+        if (w & (m_param->minCUSize - 1))
+        {
+            r = w & (m_param->minCUSize - 1);
+            padw = m_param->minCUSize - r;
+            m_param->confWinRightOffset += padw;
+        }
+    }
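
The scale-factor and padding arithmetic in setReuseLevel() is easiest to verify with concrete numbers; the values below are illustrative assumptions, not taken from the diff:

    // Example: a load pass at 2160p referencing a 1080p save pass, minCUSize = 8.
    //   srcH = 1080, m_param->sourceHeight = 2160
    //   scaleFactorH = double(2160 / 1080) = 2.0  (note the integer division:
    //   a 1080p load pass against a 2160p reference would yield 0, not 0.5)
    //   absScaleFactorH = (int)(10 * 2.0 + 0.5) = 20  -> scaleFactor = 2
    // Padding: h = 1082 with minCUSize = 8 gives r = 1082 & 7 = 2,
    //   padh = 8 - 2 = 6, so confWinBottomOffset grows by 6 and the padded
    //   height 1088 becomes a multiple of the minimum CU size.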
+
+    void PassEncoder::startThreads()
+    {
+        /* Start slave worker threads */
+        m_threadActive = true;
+        start();
+        /* Start reader threads*/
+        if (m_reader != NULL)
+        {
+            m_reader->m_threadActive = true;
+            m_reader->start();
+        }
+        /* Start scaling worker threads */
+        if (m_scaler != NULL)
+        {
+            m_scaler->m_threadActive = true;
+            m_scaler->start();
+        }
+    }
+
+    void PassEncoder::copyInfo(x265_analysis_data * src)
+    {
+
+        uint32_t written = m_parent->m_analysisWriteCnt[m_id].get();
+
+        int index = written % m_parent->m_queueSize;
+        //If all streams have read analysis data, reuse that position in Queue
+
+        int read = m_parent->m_analysisRead[m_id][index].get();
+        int write = m_parent->m_analysisWrite[m_id][index].get();
+
+        int overwrite = written / m_parent->m_queueSize;
+        bool emptyIdxFound = 0;
+        while (!emptyIdxFound && overwrite)
+        {
+            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
+            {
+                read = m_parent->m_analysisRead[m_id][i].get();
+                write = m_parent->m_analysisWrite[m_id][i].get();
+                write *= m_cliopt.numRefs;
+
+                if (read == write)
+                {
+                    index = i;
+                    emptyIdxFound = 1;
+                }
+            }
+        }
+
+        x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBuffer[m_id][index];
+
+        x265_free_analysis_data(m_param, m_analysisInfo);
+        memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
+        x265_alloc_analysis_data(m_param, m_analysisInfo);
+
+        bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
+        if (m_param->bDisableLookahead && isVbv)
+        {
+            memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
+            memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
+            memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
+            memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
+        }
+
+        if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I)
+        {
+            if (m_param->analysisSaveReuseLevel < 2)
+                goto ret;
+            x265_analysis_intra_data *intraDst, *intraSrc;
+            intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
+            intraSrc = (x265_analysis_intra_data*)src->intraData;
+            memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
+            memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
+            memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
+            memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
+            if (m_param->rc.cuTree)
+                memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
+        }
+        else
+        {
+            bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
+            int numDir = src->sliceType == X265_TYPE_P ? 1 : 2;
+            memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
+            if (m_param->analysisSaveReuseLevel < 2)
+                goto ret;
+            x265_analysis_inter_data *interDst, *interSrc;
+            interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
+            interSrc = (x265_analysis_inter_data*)src->interData;
+            memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
+            memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
+            if (m_param->rc.cuTree)
+                memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
+            if (m_param->analysisSaveReuseLevel > 4)
+            {
+                memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
+                memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
+                if (m_param->analysisSaveReuseLevel == 10)
+                {
+                    memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
+                    for (int dir = 0; dir < numDir; dir++)
+                    {
+                        memcpy(interDst->mvpIdx[dir], interSrc->mvpIdx[dir], sizeof(uint8_t) * src->depthBytes);
+                        memcpy(interDst->refIdx[dir], interSrc->refIdx[dir], sizeof(int8_t) * src->depthBytes);
+                        memcpy(interDst->mv[dir], interSrc->mv[dir], sizeof(MV) * src->depthBytes);
+                    }
+                    if (bIntraInInter)
+                    {
+                        x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
+                        x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
+                        memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
+                        memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
+                    }
+                }
+            }
+            if (m_param->analysisSaveReuseLevel != 10)
+                memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
+        }
+
+ret:
+        //increment analysis Write counter
+        m_parent->m_analysisWriteCnt[m_id].incr();
+        m_parent->m_analysisWrite[m_id][index].incr();
+        return;
+    }
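
The slot-reuse bookkeeping in copyInfo() can be checked with a small worked example (assumed values; X265_INPUT_QUEUE_SIZE is 250 whenever more than one encode shares the queue):

    // Assume m_queueSize = 250 and numRefs = 2 (two load streams read this pass):
    //   written   = 503           // analysis records published so far
    //   index     = 503 % 250 = 3 // candidate slot for this record
    //   overwrite = 503 / 250 = 2 // non-zero: the ring has wrapped, so a slot
    //                             // may only be reused after every consumer
    //                             // has read it
    // A slot i is considered free when
    //   m_analysisRead[m_id][i] == m_analysisWrite[m_id][i] * numRefs,
    // which is exactly the read == write test inside the scanning loop.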
+
+
+    bool PassEncoder::readPicture(x265_picture *dstPic)
+    {
+        /*Check and wait if there any input frames to read*/
+        int ipread = m_parent->m_picReadCnt[m_id].get();
+        int ipwrite = m_parent->m_picWriteCnt[m_id].get();
+
+        bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1);
+        while (!m_inputOver && (ipread == ipwrite))
+        {
+            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
+        }
+
+        if (m_threadActive && ipread < ipwrite)
+        {
+            /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
+            int readPos = ipread % m_parent->m_queueSize;
+            x265_analysis_data* analysisData = 0;
+
+            if (isAbrLoad)
+            {
+                /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
+                int analysisQId = m_cliopt.refId;
+                /*Check and wait if there any analysis Data to read*/
+                int analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].get();
+                int written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                int analysisRead = m_parent->m_analysisReadCnt[analysisQId].get();
+
+                while (m_threadActive && written == analysisRead)
+                {
+                    analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
+                    written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                }
+
+                if (analysisRead < written)
+                {
+                    int analysisIdx = 0;
+                    if (!m_param->bDisableLookahead)
+                    {
+                        bool analysisdRead = false;
+                        while ((analysisRead < written) && !analysisdRead)
+                        {
+                            while (analysisWrite < ipread)
+                            {
+                                analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
+                                written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                            }
+                            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
+                            {
+                                analysisData = &m_parent->m_analysisBuffer[analysisQId][i];
+                                int read = m_parent->m_analysisRead[analysisQId][i].get();
+                                int write = m_parent->m_analysisWrite[analysisQId][i].get() * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                                if ((analysisData->poc == (uint32_t)(ipread)) && (read < write))
+                                {
+                                    analysisIdx = i;
+                                    analysisdRead = true;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        analysisIdx = analysisRead % m_parent->m_queueSize;
+                        analysisData = &m_parent->m_analysisBuffer[analysisQId][analysisIdx];
+                        readPos = analysisData->poc % m_parent->m_queueSize;
+                        while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc))
+                        {
+                            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
+                        }
+                    }
+
+                    m_lastIdx = analysisIdx;
+                }
+                else
+                    return false;
+            }
+
+
+            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
+
+            x265_picture *pic = (x265_picture*)(dstPic);
+            pic->colorSpace = srcPic->colorSpace;
+            pic->bitDepth = srcPic->bitDepth;
+            pic->framesize = srcPic->framesize;
+            pic->height = srcPic->height;
+            pic->pts = srcPic->pts;
+            pic->dts = srcPic->dts;
+            pic->reorderedPts = srcPic->reorderedPts;
+            pic->width = srcPic->width;
+            pic->analysisData = srcPic->analysisData;
+            pic->userSEI = srcPic->userSEI;
+            pic->stride[0] = srcPic->stride[0];
+            pic->stride[1] = srcPic->stride[1];
+            pic->stride[2] = srcPic->stride[2];
+            pic->planes[0] = srcPic->planes[0];
+            pic->planes[1] = srcPic->planes[1];
+            pic->planes[2] = srcPic->planes[2];
+            if (isAbrLoad)
+                pic->analysisData = *analysisData;
+            return true;
+        }
+        else
+            return false;
+    }
+
+    void PassEncoder::threadMain()
+    {
         THREAD_NAME("PassEncoder", m_id);
 
         while (m_threadActive)
         {
-
-#if ENABLE_LIBVMAF
-            x265_vmaf_data* vmafdata = m_cliopt.vmafData;
-#endif
-            /* This allows muxers to modify bitstream format */
-            m_cliopt.output->setParam(m_param);
-            const x265_api* api = m_cliopt.api;
-            ReconPlay* reconPlay = NULL;
-            if (m_cliopt.reconPlayCmd)
-                reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
-            char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
-
-            if (m_cliopt.zoneFile)
-            {
-                if (!m_cliopt.parseZoneFile())
-                {
-                    x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n", profileName);
-                    fclose(m_cliopt.zoneFile);
-                    m_cliopt.zoneFile = NULL;
-                }
-            }
-
-            if (signal(SIGINT, sigint_handler) == SIG_ERR)
-                x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
-                    strerror(errno), profileName);
-
-            x265_picture pic_orig, pic_out;
-            x265_picture *pic_in = &pic_orig;
-            /* Allocate recon picture if analysis save/load is enabled */
-            std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
-            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
-            uint32_t inFrameCount = 0;
-            uint32_t outFrameCount = 0;
-            x265_nal *p_nal;
-            x265_stats stats;
-            uint32_t nal;
-            int16_t *errorBuf = NULL;
-            bool bDolbyVisionRPU = false;
-            uint8_t *rpuPayload = NULL;
-            int inputPicNum = 1;
-            x265_picture picField1, picField2;
-            x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
-            bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
-
-            if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
-            {
-                if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
-                {
-                    x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
-                    m_ret = 3;
-                    goto fail;
-                }
-                else
-                    m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
-            }
-
-            if (m_param->bField && m_param->interlaceMode)
-            {
-                api->picture_init(m_param, &picField1);
-                api->picture_init(m_param, &picField2);
-                // return back the original height of input
-                m_param->sourceHeight *= 2;
-                api->picture_init(m_param, &pic_orig);
-            }
-            else
-                api->picture_init(m_param, &pic_orig);
-
-            if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
-            {
-                rpuPayload = X265_MALLOC(uint8_t, 1024);
-                pic_in->rpu.payload = rpuPayload;
-                if (pic_in->rpu.payload)
-                    bDolbyVisionRPU = true;
-            }
-
-            if (m_cliopt.bDither)
-            {
-                errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
-                if (errorBuf)
-                    memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
-                else
-                    m_cliopt.bDither = false;
-            }
-
-            // main encoder loop
-            while (pic_in && !b_ctrl_c)
-            {
-                pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
-                if (m_cliopt.qpfile)
-                {
-                    if (!m_cliopt.parseQPFile(pic_orig))
-                    {
-                        x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
-                            pic_in->poc, profileName);
-                        fclose(m_cliopt.qpfile);
-                        m_cliopt.qpfile = NULL;
-                    }
-                }
-
-                if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
-                    pic_in = NULL;
-                else if (readPicture(pic_in))
-                    inFrameCount++;
-                else
-                    pic_in = NULL;
-
-                if (pic_in)
-                {
-                    if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
-                    {
-                        x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
-                        pic_in->bitDepth = m_param->internalBitDepth;
-                    }
-                    /* Overwrite PTS */
-                    pic_in->pts = pic_in->poc;
-
-                    // convert to field
-                    if (m_param->bField && m_param->interlaceMode)
-                    {
-                        int height = pic_in->height >> 1;
-
-                        int static bCreated = 0;
-                        if (bCreated == 0)
-                        {
-                            bCreated = 1;
-                            inputPicNum = 2;
-                            picField1.fieldNum = 1;
-                            picField2.fieldNum = 2;
-
-                            picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
-                            picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
-                            picField1.height = picField2.height = pic_in->height >> 1;
-                            picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
-
-                            size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
-                            char* field1Buf = X265_MALLOC(char, fieldFrameSize);
-                            char* field2Buf = X265_MALLOC(char, fieldFrameSize);
-
-                            int stride = picField1.stride[0] = picField2.stride[0] = pic_in->stride[0];
-                            uint64_t framesize = stride * (height >> x265_cli_csps[pic_in->colorSpace].height[0]);
-                            picField1.planes[0] = field1Buf;
-                            picField2.planes[0] = field2Buf;
-                            for (int i = 1; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
-                            {
-                                picField1.planes[i] = field1Buf + framesize;
-                                picField2.planes[i] = field2Buf + framesize;
-
-                                stride = picField1.stride[i] = picField2.stride[i] = pic_in->stride[i];
-                                framesize += (stride * (height >> x265_cli_csps[pic_in->colorSpace].height[i]));
-                            }
-                            assert(framesize == picField1.framesize);
-                        }
-
-                        picField1.pts = picField1.poc = pic_in->poc;
-                        picField2.pts = picField2.poc = pic_in->poc + 1;
-
-                        picField1.userSEI = picField2.userSEI = pic_in->userSEI;
-
-                        //if (pic_in->userData)
-                        //{
-                        //    // Have to handle userData here
-                        //}
-
-                        if (pic_in->framesize)
-                        {
-                            for (int i = 0; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
-                            {
-                                char* srcP1 = (char*)pic_in->planes[i];
-                                char* srcP2 = (char*)pic_in->planes[i] + pic_in->stride[i];
-                                char* p1 = (char*)picField1.planes[i];
-                                char* p2 = (char*)picField2.planes[i];
-
-                                int stride = picField1.stride[i];
-
-                                for (int y = 0; y < (height >> x265_cli_csps[pic_in->colorSpace].height[i]); y++)
-                                {
-                                    memcpy(p1, srcP1, stride);
-                                    memcpy(p2, srcP2, stride);
-                                    srcP1 += 2 * stride;
-                                    srcP2 += 2 * stride;
-                                    p1 += stride;
-                                    p2 += stride;
-                                }
-                            }
-                        }
-                    }
-
-                    if (bDolbyVisionRPU)
-                    {
-                        if (m_param->bField && m_param->interlaceMode)
-                        {
-                            if (m_cliopt.rpuParser(&picField1) > 0)
-                                goto fail;
-                            if (m_cliopt.rpuParser(&picField2) > 0)
-                                goto fail;
-                        }
-                        else
-                        {
-                            if (m_cliopt.rpuParser(pic_in) > 0)
-                                goto fail;
-                        }
-                    }
-                }
-
-                for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
-                {
-                    x265_picture *picInput = NULL;
-                    if (inputPicNum == 2)
-                        picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
-                    else
-                        picInput = pic_in;
-
-                    int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
-
-                    int idx = (inFrameCount - 1) % m_parent->m_queueSize;
-                    m_parent->m_picIdxReadCnt[m_id][idx].incr();
-                    m_parent->m_picReadCnt[m_id].incr();
-                    if (m_cliopt.loadLevel && picInput)
-                    {
-                        m_parent->m_analysisReadCnt[m_cliopt.refId].incr();
-                        m_parent->m_analysisRead[m_cliopt.refId][m_lastIdx].incr();
-                    }
-
-                    if (numEncoded < 0)
-                    {
-                        b_ctrl_c = 1;
-                        m_ret = 4;
-                        break;
-                    }
-
-                    if (reconPlay && numEncoded)
-                        reconPlay->writePicture(*pic_recon);
-
-                    outFrameCount += numEncoded;
-
-                    if (isAbrSave && numEncoded)
-                    {
-                        copyInfo(analysisInfo);
-                    }
-
-                    if (numEncoded && pic_recon && m_cliopt.recon)
-                        m_cliopt.recon->writePicture(pic_out);
-                    if (nal)
-                    {
-                        m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
-                        if (pts_queue)
-                        {
-                            pts_queue->push(-pic_out.pts);
-                            if (pts_queue->size() > 2)
-                                pts_queue->pop();
-                        }
-                    }
-                    m_cliopt.printStatus(outFrameCount);
-                }
-            }
-
-            /* Flush the encoder */
-            while (!b_ctrl_c)
-            {
-                int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
-                if (numEncoded < 0)
-                {
-                    m_ret = 4;
-                    break;
-                }
-
-                if (reconPlay && numEncoded)
-                    reconPlay->writePicture(*pic_recon);
-
-                outFrameCount += numEncoded;
-                if (isAbrSave && numEncoded)
-                {
-                    copyInfo(analysisInfo);
-                }
-
-                if (numEncoded && pic_recon && m_cliopt.recon)
-                    m_cliopt.recon->writePicture(pic_out);
-                if (nal)
-                {
-                    m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
-                    if (pts_queue)
-                    {
-                        pts_queue->push(-pic_out.pts);
-                        if (pts_queue->size() > 2)
-                            pts_queue->pop();
-                    }
-                }
-
-                m_cliopt.printStatus(outFrameCount);
-
-                if (!numEncoded)
-                    break;
-            }
-
-            if (bDolbyVisionRPU)
-            {
-                if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
-                    x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
-                        profileName);
-                x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
-                    profileName);
-            }
-
-            /* clear progress report */
-            if (m_cliopt.bProgress)
-                fprintf(stderr, "%*s\r", 80, " ");
-
-        fail:
-
-            delete reconPlay;
-
-            api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
-            if (m_param->csvfn && !b_ctrl_c)
-#if ENABLE_LIBVMAF
-                api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
-#else
-                api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
-#endif
-            api->encoder_close(m_encoder);
-
-            int64_t second_largest_pts = 0;
-            int64_t largest_pts = 0;
-            if (pts_queue && pts_queue->size() >= 2)
-            {
-                second_largest_pts = -pts_queue->top();
-                pts_queue->pop();
-                largest_pts = -pts_queue->top();
-                pts_queue->pop();
-                delete pts_queue;
-                pts_queue = NULL;
-            }
-            m_cliopt.output->closeFile(largest_pts, second_largest_pts);
-
-            if (b_ctrl_c)
-                general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
-                    m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
-
-            api->param_free(m_param);
-
-            X265_FREE(errorBuf);
-            X265_FREE(rpuPayload);
-
-            m_threadActive = false;
-            m_parent->m_numActiveEncodes.decr();
-        }
-    }
-
-    void PassEncoder::destroy()
-    {
-        stop();
-        if (m_reader)
-        {
-            m_reader->stop();
-            delete m_reader;
-        }
-        else
-        {
-            m_scaler->stop();
-            m_scaler->destroy();
-            delete m_scaler;
-        }
-    }
-
-    Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
-    {
-        m_parentEnc = parentEnc;
-        m_id = id;
-        m_srcFormat = src;
-        m_dstFormat = dst;
-        m_threadActive = false;
-        m_scaleFrameSize = 0;
-        m_filterManager = NULL;
-        m_threadId = threadId;
-        m_threadTotal = threadNum;
-
-        int csp = dst->m_csp;
-        uint32_t pixelbytes = dst->m_inputDepth > 8 ? 2 : 1;
-        for (int i = 0; i < x265_cli_csps[csp].planes; i++)
-        {
-            int w = dst->m_width >> x265_cli_csps[csp].width[i];
-            int h = dst->m_height >> x265_cli_csps[csp].height[i];
-            m_scalePlanes[i] = w * h * pixelbytes;
-            m_scaleFrameSize += m_scalePlanes[i];
-        }
-
-        if (src->m_height != dst->m_height || src->m_width != dst->m_width)
-        {
-            m_filterManager = new ScalerFilterManager;
-            m_filterManager->init(4, m_srcFormat, m_dstFormat);
-        }
-    }
-
-    bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
-    {
-        if (!destination || !source)
-            return false;
-        x265_param* param = m_parentEnc->m_param;
-        int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
-        if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
-        {
-            void **srcPlane = NULL, **dstPlane = NULL;
-            int srcStride[3], dstStride[3];
-            destination->bitDepth = source->bitDepth;
-            destination->colorSpace = source->colorSpace;
-            destination->pts = source->pts;
-            destination->dts = source->dts;
-            destination->reorderedPts = source->reorderedPts;
-            destination->poc = source->poc;
-            destination->userSEI = source->userSEI;
-            srcPlane = source->planes;
-            dstPlane = destination->planes;
-            srcStride[0] = source->stride[0];
-            destination->stride[0] = m_dstFormat->m_width * pixelBytes;
-            dstStride[0] = destination->stride[0];
-            if (param->internalCsp != X265_CSP_I400)
-            {
-                srcStride[1] = source->stride[1];
-                srcStride[2] = source->stride[2];
-                destination->stride[1] = destination->stride[0] >> x265_cli_csps[param->internalCsp].width[1];
-                destination->stride[2] = destination->stride[0] >> x265_cli_csps[param->internalCsp].width[2];
-                dstStride[1] = destination->stride[1];
-                dstStride[2] = destination->stride[2];
-            }
-            if (m_scaleFrameSize)
-            {
-                m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
-                return true;
-            }
-            else
-                x265_log(param, X265_LOG_INFO, "Empty frame received\n");
-        }
-        return false;
-    }
-
-    void Scaler::threadMain()
-    {
-        THREAD_NAME("Scaler", m_id);
-
-        /* unscaled picture is stored in the last index */
-        uint32_t srcId = m_id - 1;
-        int QDepth = m_parentEnc->m_parent->m_queueSize;
-        while (!m_parentEnc->m_inputOver)
-        {
-
-            uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
-
-            if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
-                break;
-
-            if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
-            {
-                continue;
-            }
-            uint32_t written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
-
-            /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
-            while (m_threadActive && (scaledWritten == written)) {
-                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].waitForChange(written);
-            }
-
-            if (m_threadActive && scaledWritten < written)
-            {
-
-                int scaledWriteIdx = scaledWritten % QDepth;
-                int overWritePicBuffer = scaledWritten / QDepth;
-                int read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][scaledWriteIdx].get();
-
-                while (overWritePicBuffer && read < overWritePicBuffer)
-                {
-                    read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][scaledWriteIdx].waitForChange(read);
-                }
-
-                if (!m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx])
-                {
-                    int framesize = 0;
-                    int planesize[3];
-                    int csp = m_dstFormat->m_csp;
-                    int stride[3];
-                    stride[0] = m_dstFormat->m_width;
-                    stride[1] = stride[0] >> x265_cli_csps[csp].width[1];
-                    stride[2] = stride[0] >> x265_cli_csps[csp].width[2];
-                    for (int i = 0; i < x265_cli_csps[csp].planes; i++)
-                    {
-                        uint32_t h = m_dstFormat->m_height >> x265_cli_csps[csp].height[i];
-                        planesize[i] = h * stride[i];
-                        framesize += planesize[i];
-                    }
-
-                    m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx] = x265_picture_alloc();
-                    x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx]);
-
-                    ((x265_picture*)m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWritten % QDepth])->framesize = framesize;
-                    for (int32_t j = 0; j < x265_cli_csps[csp].planes; j++)
-                    {
-                        m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWritten % QDepth]->planes[j] = X265_MALLOC(char, planesize[j]);
-                    }
-                }
-
-                x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffer[srcId][scaledWritten % QDepth];
-                x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx];
-
-                // Enqueue this picture up with the current encoder so that it will asynchronously encode
-                if (!scalePic(destPic, srcPic))
-                    x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
-                else
-                    m_parentEnc->m_parent->m_picWriteCnt[m_id].incr();
-                m_scaledWriteCnt.incr();
-                m_parentEnc->m_parent->m_picIdxReadCnt[srcId][scaledWriteIdx].incr();
-            }
-            if (m_threadTotal > 1)
-            {
-                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
-                int totalWrite = written / m_threadTotal;
-                if (written % m_threadTotal > m_threadId)
-                    totalWrite++;
-                if (totalWrite == m_scaledWriteCnt.get())
-                {
-                    m_parentEnc->m_parent->m_picWriteCnt[srcId].poke();
-                    m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
-                    break;
-                }
-            }
-            else
-            {
-                /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
-                scaledWritten = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
-                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
-                if (written == scaledWritten)
-                {
-                    m_parentEnc->m_parent->m_picWriteCnt[srcId].poke();
-                    m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
-                    break;
-                }
-            }
-
-        }
-        m_threadActive = false;
-        destroy();
-    }
-
-    Reader::Reader(int id, PassEncoder *parentEnc)
-    {
-        m_parentEnc = parentEnc;
-        m_id = id;
-        m_input = parentEnc->m_input;
-    }
-
-    void Reader::threadMain()
-    {
-        THREAD_NAME("Reader", m_id);
-
-        int QDepth = m_parentEnc->m_parent->m_queueSize;
-        x265_picture* src = x265_picture_alloc();
-        x265_picture_init(m_parentEnc->m_param, src);
-
-        while (m_threadActive)
-        {
-            uint32_t written = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
-            uint32_t writeIdx = written % QDepth;
-            uint32_t read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][writeIdx].get();
-            uint32_t overWritePicBuffer = written / QDepth;
-
-            if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
-                break;
-
-            while (overWritePicBuffer && read < overWritePicBuffer)
-            {
-                read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][writeIdx].waitForChange(read);
-            }
-
-            x265_picture* dest = m_parentEnc->m_parent->m_inputPicBuffer[m_id][writeIdx];
-            if (m_input->readPicture(*src))
-            {
-                dest->poc = src->poc;
-                dest->pts = src->pts;
-                dest->userSEI = src->userSEI;
-                dest->bitDepth = src->bitDepth;
-                dest->framesize = src->framesize;
-                dest->height = src->height;
-                dest->width = src->width;
-                dest->colorSpace = src->colorSpace;
-                dest->userSEI = src->userSEI;
-                dest->rpu.payload = src->rpu.payload;
-                dest->picStruct = src->picStruct;
-                dest->stride[0] = src->stride[0];
-                dest->stride[1] = src->stride[1];
-                dest->stride[2] = src->stride[2];
-
-                if (!dest->planes[0])
-                    dest->planes[0] = X265_MALLOC(char, dest->framesize);
-
-                memcpy(dest->planes[0], src->planes[0], src->framesize * sizeof(char));
-                dest->planes[1] = (char*)dest->planes[0] + src->stride[0] * src->height;
-                dest->planes[2] = (char*)dest->planes[1] + src->stride[1] * (src->height >> x265_cli_csps[src->colorSpace].height[1]);
-                m_parentEnc->m_parent->m_picWriteCnt[m_id].incr();
-            }
-            else
-            {
-                m_threadActive = false;
-                m_parentEnc->m_inputOver = true;
-                m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
-            }
-        }
-        x265_picture_free(src);
-    }
-}
+
+#if ENABLE_LIBVMAF
+            x265_vmaf_data* vmafdata = m_cliopt.vmafData;
+#endif
+            /* This allows muxers to modify bitstream format */
+            m_cliopt.output->setParam(m_param);
+            const x265_api* api = m_cliopt.api;
+            ReconPlay* reconPlay = NULL;
+            if (m_cliopt.reconPlayCmd)
+                reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
+            char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
+
+            if (signal(SIGINT, sigint_handler) == SIG_ERR)
+                x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
+                    strerror(errno), profileName);
+
+            x265_picture pic_orig, pic_out;
+            x265_picture *pic_in = &pic_orig;
+            /* Allocate recon picture if analysis save/load is enabled */
+            std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
+            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
+            uint32_t inFrameCount = 0;
+            uint32_t outFrameCount = 0;
+            x265_nal *p_nal;
+            x265_stats stats;
+            uint32_t nal;
+            int16_t *errorBuf = NULL;
+            bool bDolbyVisionRPU = false;
+            uint8_t *rpuPayload = NULL;
+            int inputPicNum = 1;
+            x265_picture picField1, picField2;
+            x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
+            bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
+
+            if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
+            {
+                if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
+                    m_ret = 3;
+                    goto fail;
+                }
+                else
+                    m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
+            }
+
+            if (m_param->bField && m_param->interlaceMode)
+            {
+                api->picture_init(m_param, &picField1);
+                api->picture_init(m_param, &picField2);
+                // return back the original height of input
+                m_param->sourceHeight *= 2;
+                api->picture_init(m_param, &pic_orig);
+            }
+            else
+                api->picture_init(m_param, &pic_orig);
+
+            if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
+            {
+                rpuPayload = X265_MALLOC(uint8_t, 1024);
+                pic_in->rpu.payload = rpuPayload;
+                if (pic_in->rpu.payload)
+                    bDolbyVisionRPU = true;
+            }
+
+            if (m_cliopt.bDither)
+            {
+                errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
+                if (errorBuf)
+                    memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
+                else
+                    m_cliopt.bDither = false;
+            }
+
+            // main encoder loop
+            while (pic_in && !b_ctrl_c)
+            {
+                pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
+                if (m_cliopt.qpfile)
+                {
+                    if (!m_cliopt.parseQPFile(pic_orig))
+                    {
+                        x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
+                            pic_in->poc, profileName);
+                        fclose(m_cliopt.qpfile);
+                        m_cliopt.qpfile = NULL;
+                    }
+                }
+
+                if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
+                    pic_in = NULL;
+                else if (readPicture(pic_in))
+                    inFrameCount++;
+                else
+                    pic_in = NULL;
+
+                if (pic_in)
+                {
+                    if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
+                    {
+                        x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
+                        pic_in->bitDepth = m_param->internalBitDepth;
+                    }
+                    /* Overwrite PTS */
+                    pic_in->pts = pic_in->poc;
+
+                    // convert to field
+                    if (m_param->bField && m_param->interlaceMode)
+                    {
+                        int height = pic_in->height >> 1;
+
+                        int static bCreated = 0;
+                        if (bCreated == 0)
+                        {
+                            bCreated = 1;
+                            inputPicNum = 2;
+                            picField1.fieldNum = 1;
+                            picField2.fieldNum = 2;
+
+                            picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
+                            picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
+                            picField1.height = picField2.height = pic_in->height >> 1;
+                            picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
+
+                            size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
+                            char* field1Buf = X265_MALLOC(char, fieldFrameSize);
+                            char* field2Buf = X265_MALLOC(char, fieldFrameSize);
+
+                            int stride = picField1.stride[0] = picField2.stride[0] = pic_in->stride[0];
+                            uint64_t framesize = stride * (height >> x265_cli_csps[pic_in->colorSpace].height[0]);
+                            picField1.planes[0] = field1Buf;
+                            picField2.planes[0] = field2Buf;
+                            for (int i = 1; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
+                            {
+                                picField1.planes[i] = field1Buf + framesize;
+                                picField2.planes[i] = field2Buf + framesize;
+
+                                stride = picField1.stride[i] = picField2.stride[i] = pic_in->stride[i];
+                                framesize += (stride * (height >> x265_cli_csps[pic_in->colorSpace].height[i]));
+                            }
+                            assert(framesize == picField1.framesize);
+                        }
+
+                        picField1.pts = picField1.poc = pic_in->poc;
+                        picField2.pts = picField2.poc = pic_in->poc + 1;
+
+                        picField1.userSEI = picField2.userSEI = pic_in->userSEI;
+
+                        //if (pic_in->userData)
+                        //{
+                        //    // Have to handle userData here
+                        //}
+
+                        if (pic_in->framesize)
+                        {
+                            for (int i = 0; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
+                            {
+                                char* srcP1 = (char*)pic_in->planes[i];
+                                char* srcP2 = (char*)pic_in->planes[i] + pic_in->stride[i];
+                                char* p1 = (char*)picField1.planes[i];
+                                char* p2 = (char*)picField2.planes[i];
+
+                                int stride = picField1.stride[i];
+
+                                for (int y = 0; y < (height >> x265_cli_csps[pic_in->colorSpace].height[i]); y++)
+                                {
+                                    memcpy(p1, srcP1, stride);
+                                    memcpy(p2, srcP2, stride);
+                                    srcP1 += 2 * stride;
+                                    srcP2 += 2 * stride;
+                                    p1 += stride;
+                                    p2 += stride;
1796
+                                }
1797
+                            }
1798
+                        }
1799
+                    }
1800
+
1801
+                    if (bDolbyVisionRPU)
1802
+                    {
1803
+                        if (m_param->bField && m_param->interlaceMode)
1804
+                        {
1805
+                            if (m_cliopt.rpuParser(&picField1) > 0)
1806
+                                goto fail;
1807
+                            if (m_cliopt.rpuParser(&picField2) > 0)
1808
+                                goto fail;
1809
+                        }
1810
+                        else
1811
+                        {
1812
+                            if (m_cliopt.rpuParser(pic_in) > 0)
1813
+                                goto fail;
1814
+                        }
1815
+                    }
1816
+                }
1817
+
1818
+                for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
1819
+                {
1820
+                    x265_picture *picInput = NULL;
1821
+                    if (inputPicNum == 2)
1822
+                        picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
1823
+                    else
1824
+                        picInput = pic_in;
1825
+
1826
+                    int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
1827
+
1828
+                    int idx = (inFrameCount - 1) % m_parent->m_queueSize;
1829
+                    m_parent->m_picIdxReadCntm_ididx.incr();
1830
+                    m_parent->m_picReadCntm_id.incr();
1831
+                    if (m_cliopt.loadLevel && picInput)
1832
+                    {
1833
+                        m_parent->m_analysisReadCntm_cliopt.refId.incr();
1834
+                        m_parent->m_analysisReadm_cliopt.refIdm_lastIdx.incr();
1835
+                    }
1836
+
1837
+                    if (numEncoded < 0)
1838
+                    {
1839
+                        b_ctrl_c = 1;
1840
+                        m_ret = 4;
1841
+                        break;
1842
+                    }
1843
+
1844
+                    if (reconPlay && numEncoded)
1845
+                        reconPlay->writePicture(*pic_recon);
1846
+
1847
+                    outFrameCount += numEncoded;
1848
+
1849
+                    if (isAbrSave && numEncoded)
1850
+                    {
1851
+                        copyInfo(analysisInfo);
1852
+                    }
1853
+
1854
+                    if (numEncoded && pic_recon && m_cliopt.recon)
1855
+                        m_cliopt.recon->writePicture(pic_out);
1856
+                    if (nal)
1857
+                    {
1858
+                        m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1859
+                        if (pts_queue)
1860
+                        {
1861
+                            pts_queue->push(-pic_out.pts);
1862
+                            if (pts_queue->size() > 2)
1863
+                                pts_queue->pop();
1864
+                        }
1865
+                    }
1866
+                    m_cliopt.printStatus(outFrameCount);
1867
+                }
1868
+            }
1869
+
1870
+            /* Flush the encoder */
1871
+            while (!b_ctrl_c)
1872
+            {
1873
+                int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
1874
+                if (numEncoded < 0)
1875
+                {
1876
+                    m_ret = 4;
1877
+                    break;
1878
+                }
1879
+
1880
+                if (reconPlay && numEncoded)
1881
+                    reconPlay->writePicture(*pic_recon);
1882
+
1883
+                outFrameCount += numEncoded;
1884
+                if (isAbrSave && numEncoded)
1885
+                {
1886
+                    copyInfo(analysisInfo);
1887
+                }
1888
+
1889
+                if (numEncoded && pic_recon && m_cliopt.recon)
1890
+                    m_cliopt.recon->writePicture(pic_out);
1891
+                if (nal)
1892
+                {
1893
+                    m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1894
+                    if (pts_queue)
1895
+                    {
1896
+                        pts_queue->push(-pic_out.pts);
1897
+                        if (pts_queue->size() > 2)
1898
+                            pts_queue->pop();
1899
+                    }
1900
+                }
1901
+
1902
+                m_cliopt.printStatus(outFrameCount);
1903
+
1904
+                if (!numEncoded)
1905
+                    break;
1906
+            }
1907
+
1908
+            if (bDolbyVisionRPU)
1909
+            {
1910
+                if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
1911
+                    x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
1912
+                        profileName);
1913
+                x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
1914
+                    profileName);
1915
+            }
1916
+
1917
+            /* clear progress report */
1918
+            if (m_cliopt.bProgress)
1919
+                fprintf(stderr, "%*s\r", 80, " ");
1920
+
1921
+        fail:
1922
+
1923
+            delete reconPlay;
1924
+
1925
+            api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
1926
+            if (m_param->csvfn && !b_ctrl_c)
1927
+#if ENABLE_LIBVMAF
1928
+                api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
1929
+#else
1930
+                api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
1931
+#endif
1932
+            api->encoder_close(m_encoder);
1933
+
1934
+            int64_t second_largest_pts = 0;
1935
+            int64_t largest_pts = 0;
1936
+            if (pts_queue && pts_queue->size() >= 2)
1937
+            {
1938
+                second_largest_pts = -pts_queue->top();
1939
+                pts_queue->pop();
1940
+                largest_pts = -pts_queue->top();
1941
+                pts_queue->pop();
1942
+                delete pts_queue;
1943
+                pts_queue = NULL;
1944
+            }
1945
+            m_cliopt.output->closeFile(largest_pts, second_largest_pts);
1946
+
1947
+            if (b_ctrl_c)
1948
+                general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
1949
+                    m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
1950
+
1951
+            api->param_free(m_param);
1952
+
1953
+            X265_FREE(errorBuf);
1954
+            X265_FREE(rpuPayload);
1955
+
1956
+            m_threadActive = false;
1957
+            m_parent->m_numActiveEncodes.decr();
1958
+        }
1959
+    }
1960
+
1961
+    void PassEncoder::destroy()
1962
+    {
1963
+        stop();
1964
+        if (m_reader)
1965
+        {
1966
+            m_reader->stop();
1967
+            delete m_reader;
1968
+        }
1969
+        else
1970
+        {
1971
+            m_scaler->stop();
1972
+            m_scaler->destroy();
1973
+            delete m_scaler;
1974
+        }
1975
+    }
1976
+
1977
+    Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
1978
+    {
1979
+        m_parentEnc = parentEnc;
1980
+        m_id = id;
1981
+        m_srcFormat = src;
1982
+        m_dstFormat = dst;
1983
+        m_threadActive = false;
1984
+        m_scaleFrameSize = 0;
1985
+        m_filterManager = NULL;
1986
+        m_threadId = threadId;
1987
+        m_threadTotal = threadNum;
1988
+
1989
+        int csp = dst->m_csp;
1990
+        uint32_t pixelbytes = dst->m_inputDepth > 8 ? 2 : 1;
1991
+        for (int i = 0; i < x265_cli_cspscsp.planes; i++)
1992
+        {
1993
+            int w = dst->m_width >> x265_cli_cspscsp.widthi;
1994
+            int h = dst->m_height >> x265_cli_cspscsp.heighti;
1995
+            m_scalePlanesi = w * h * pixelbytes;
1996
+            m_scaleFrameSize += m_scalePlanesi;
1997
+        }
1998
+
1999
+        if (src->m_height != dst->m_height || src->m_width != dst->m_width)
2000
+        {
2001
+            m_filterManager = new ScalerFilterManager;
2002
+            m_filterManager->init(4, m_srcFormat, m_dstFormat);
2003
+        }
2004
+    }
2005
+
2006
+    bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
2007
+    {
2008
+        if (!destination || !source)
2009
+            return false;
2010
+        x265_param* param = m_parentEnc->m_param;
2011
+        int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
2012
+        if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
2013
+        {
2014
+            void **srcPlane = NULL, **dstPlane = NULL;
2015
+            int srcStride3, dstStride3;
2016
+            destination->bitDepth = source->bitDepth;
2017
+            destination->colorSpace = source->colorSpace;
2018
+            destination->pts = source->pts;
2019
+            destination->dts = source->dts;
2020
+            destination->reorderedPts = source->reorderedPts;
2021
+            destination->poc = source->poc;
2022
+            destination->userSEI = source->userSEI;
2023
+            srcPlane = source->planes;
2024
+            dstPlane = destination->planes;
2025
+            srcStride0 = source->stride0;
2026
+            destination->stride0 = m_dstFormat->m_width * pixelBytes;
2027
+            dstStride0 = destination->stride0;
2028
+            if (param->internalCsp != X265_CSP_I400)
2029
+            {
2030
+                srcStride1 = source->stride1;
2031
+                srcStride2 = source->stride2;
2032
+                destination->stride1 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width1;
2033
+                destination->stride2 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width2;
2034
+                dstStride1 = destination->stride1;
2035
+                dstStride2 = destination->stride2;
2036
+            }
2037
+            if (m_scaleFrameSize)
2038
+            {
2039
+                m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
2040
+                return true;
2041
+            }
2042
+            else
2043
+                x265_log(param, X265_LOG_INFO, "Empty frame received\n");
2044
+        }
2045
+        return false;
2046
+    }
2047
+
2048
+    void Scaler::threadMain()
2049
+    {
2050
+        THREAD_NAME("Scaler", m_id);
2051
+
2052
+        /* unscaled picture is stored in the last index */
2053
+        uint32_t srcId = m_id - 1;
2054
+        int QDepth = m_parentEnc->m_parent->m_queueSize;
2055
+        while (!m_parentEnc->m_inputOver)
2056
+        {
2057
+
2058
+            uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
2059
+
2060
+            if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
2061
+                break;
2062
+
2063
+            if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
2064
+            {
2065
+                continue;
2066
+            }
2067
+            uint32_t written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
2068
+
2069
+            /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
2070
+            while (m_threadActive && (scaledWritten == written)) {
2071
+                written = m_parentEnc->m_parent->m_picWriteCntsrcId.waitForChange(written);
2072
+            }
2073
+
2074
+            if (m_threadActive && scaledWritten < written)
2075
+            {
2076
+
2077
+                int scaledWriteIdx = scaledWritten % QDepth;
2078
+                int overWritePicBuffer = scaledWritten / QDepth;
2079
+                int read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.get();
2080
+
2081
+                while (overWritePicBuffer && read < overWritePicBuffer)
2082
+                {
2083
+                    read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.waitForChange(read);
2084
+                }
2085
+
2086
+                if (!m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx)
2087
+                {
2088
+                    int framesize = 0;
2089
+                    int planesize3;
2090
+                    int csp = m_dstFormat->m_csp;
2091
+                    int stride3;
2092
+                    stride0 = m_dstFormat->m_width;
2093
+                    stride1 = stride0 >> x265_cli_cspscsp.width1;
2094
+                    stride2 = stride0 >> x265_cli_cspscsp.width2;
2095
+                    for (int i = 0; i < x265_cli_cspscsp.planes; i++)
2096
+                    {
2097
+                        uint32_t h = m_dstFormat->m_height >> x265_cli_cspscsp.heighti;
2098
+                        planesizei = h * stridei;
2099
+                        framesize += planesizei;
2100
+                    }
2101
+
2102
+                    m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx = x265_picture_alloc();
2103
+                    x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx);
2104
+
2105
+                    ((x265_picture*)m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth)->framesize = framesize;
2106
+                    for (int32_t j = 0; j < x265_cli_cspscsp.planes; j++)
2107
+                    {
2108
+                        m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth->planesj = X265_MALLOC(char, planesizej);
2109
+                    }
2110
+                }
2111
+
2112
+                x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffersrcIdscaledWritten % QDepth;
2113
+                x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx;
2114
+
2115
+                // Enqueue this picture up with the current encoder so that it will asynchronously encode
2116
+                if (!scalePic(destPic, srcPic))
2117
+                    x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
2118
+                else
2119
+                    m_parentEnc->m_parent->m_picWriteCntm_id.incr();
2120
+                m_scaledWriteCnt.incr();
2121
+                m_parentEnc->m_parent->m_picIdxReadCntsrcIdscaledWriteIdx.incr();
2122
+            }
2123
+            if (m_threadTotal > 1)
2124
+            {
2125
+                written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
2126
+                int totalWrite = written / m_threadTotal;
2127
+                if (written % m_threadTotal > m_threadId)
2128
+                    totalWrite++;
2129
+                if (totalWrite == m_scaledWriteCnt.get())
2130
+                {
2131
+                    m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
2132
+                    m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2133
+                    break;
2134
+                }
2135
+            }
2136
+            else
2137
+            {
2138
+                /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
2139
+                scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
2140
+                written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
2141
+                if (written == scaledWritten)
2142
+                {
2143
+                    m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
2144
+                    m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2145
+                    break;
2146
+                }
2147
+            }
2148
+
2149
+        }
2150
+        m_threadActive = false;
2151
+        destroy();
2152
+    }
2153
+
2154
+    Reader::Reader(int id, PassEncoder *parentEnc)
2155
+    {
2156
+        m_parentEnc = parentEnc;
2157
+        m_id = id;
2158
+        m_input = parentEnc->m_input;
2159
+    }
2160
+
2161
+    void Reader::threadMain()
2162
+    {
2163
+        THREAD_NAME("Reader", m_id);
2164
+
2165
+        int QDepth = m_parentEnc->m_parent->m_queueSize;
2166
+        x265_picture* src = x265_picture_alloc();
2167
+        x265_picture_init(m_parentEnc->m_param, src);
2168
+
2169
+        while (m_threadActive)
2170
+        {
2171
+            uint32_t written = m_parentEnc->m_parent->m_picWriteCntm_id.get();
2172
+            uint32_t writeIdx = written % QDepth;
2173
+            uint32_t read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.get();
2174
+            uint32_t overWritePicBuffer = written / QDepth;
2175
+
2176
+            if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
2177
+                break;
2178
+
2179
+            while (overWritePicBuffer && read < overWritePicBuffer)
2180
+            {
2181
+                read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.waitForChange(read);
2182
+            }
2183
+
2184
+            x265_picture* dest = m_parentEnc->m_parent->m_inputPicBufferm_idwriteIdx;
2185
+            if (m_input->readPicture(*src))
2186
+            {
2187
+                dest->poc = src->poc;
2188
+                dest->pts = src->pts;
2189
+                dest->userSEI = src->userSEI;
2190
+                dest->bitDepth = src->bitDepth;
2191
+                dest->framesize = src->framesize;
2192
+                dest->height = src->height;
2193
+                dest->width = src->width;
2194
+                dest->colorSpace = src->colorSpace;
2195
+                dest->userSEI = src->userSEI;
2196
+                dest->rpu.payload = src->rpu.payload;
2197
+                dest->picStruct = src->picStruct;
2198
+                dest->stride0 = src->stride0;
2199
+                dest->stride1 = src->stride1;
2200
+                dest->stride2 = src->stride2;
2201
+
2202
+                if (!dest->planes0)
2203
+                    dest->planes0 = X265_MALLOC(char, dest->framesize);
2204
+
2205
+                memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
2206
+                dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
2207
+                dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
2208
+                m_parentEnc->m_parent->m_picWriteCntm_id.incr();
2209
+            }
2210
+            else
2211
+            {
2212
+                m_threadActive = false;
2213
+                m_parentEnc->m_inputOver = true;
2214
+                m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2215
+            }
2216
+        }
2217
+        x265_picture_free(src);
2218
+    }
2219
+}
2220
x265_3.5.tar.gz/source/abrEncApp.h -> x265_3.6.tar.gz/source/abrEncApp.h Changed

@@ -91,6 +91,7 @@
         FILE*    m_qpfile;
         FILE*    m_zoneFile;
         FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
+        FILE*    m_scenecutAwareQpConfig;

         int m_ret;

x265_3.5.tar.gz/source/cmake/FindNeon.cmake -> x265_3.6.tar.gz/source/cmake/FindNeon.cmake Changed

@@ -1,10 +1,21 @@
 include(FindPackageHandleStandardArgs)

 # Check the version of neon supported by the ARM CPU
-execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
-                OUTPUT_VARIABLE neon_version
-                ERROR_QUIET
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(APPLE)
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.neon: 1"
+                    OUTPUT_VARIABLE neon_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep neon
+                    OUTPUT_VARIABLE neon_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
 if(neon_version)
     set(CPU_HAS_NEON 1)
 endif()
x265_3.6.tar.gz/source/cmake/FindSVE.cmake Added

@@ -0,0 +1,21 @@
+include(FindPackageHandleStandardArgs)
+
+# Check the version of SVE supported by the ARM CPU
+if(APPLE)
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.sve: 1"
+                    OUTPUT_VARIABLE sve_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep -e "sve$" -e "sve[[:space:]]"
+                    OUTPUT_VARIABLE sve_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(sve_version)
+    set(CPU_HAS_SVE 1)
+endif()
x265_3.6.tar.gz/source/cmake/FindSVE2.cmake Added

@@ -0,0 +1,22 @@
+include(FindPackageHandleStandardArgs)
+
+# Check the version of SVE2 supported by the ARM CPU
+if(APPLE)
+    execute_process(COMMAND sysctl -a
+                    COMMAND grep "hw.optional.sve2: 1"
+                    OUTPUT_VARIABLE sve2_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+else()
+    execute_process(COMMAND cat /proc/cpuinfo
+                    COMMAND grep Features
+                    COMMAND grep sve2
+                    OUTPUT_VARIABLE sve2_version
+                    ERROR_QUIET
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+
+if(sve2_version)
+    set(CPU_HAS_SVE 1)
+    set(CPU_HAS_SVE2 1)
+endif()
x265_3.5.tar.gz/source/common/CMakeLists.txt -> x265_3.6.tar.gz/source/common/CMakeLists.txt Changed

@@ -84,35 +84,42 @@
 endif(ENABLE_ASSEMBLY AND X86)

 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    if(ARM64)
-        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
-            message(STATUS "Detected CXX compiler using -O3 optimization level")
-            add_definitions(-DAUTO_VECTORIZE=1)
-        endif()
-        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
-
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
-        set(VEC_PRIMITIVES)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)

-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
-        endforeach()
-    else()
-        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+    set(VEC_PRIMITIVES)

-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-        set(VEC_PRIMITIVES)
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+    endforeach()
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))

-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-        endforeach()
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
+    if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+        message(STATUS "Detected CXX compiler using -O3 optimization level")
+        add_definitions(-DAUTO_VECTORIZE=1)
     endif()
+
+    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
+    enable_language(ASM)
+
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
+    set(VEC_PRIMITIVES)
+
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+    endforeach()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))

 if(POWER)
     set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
@@ -169,4 +176,6 @@
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
     deblock.cpp deblock.h
-    scaler.cpp scaler.h)
+    scaler.cpp scaler.h
+    ringmem.cpp ringmem.h
+    temporalfilter.cpp temporalfilter.h)
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp Added

@@ -0,0 +1,300 @@
+#include "common.h"
+#include "x265.h"
+#include "arm64-utils.h"
+#include <arm_neon.h>
+
+#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
+namespace X265_NS
+{
+
+
+
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    a0 = *(uint8x8_t *)(src + 0 * sstride);
+    a1 = *(uint8x8_t *)(src + 1 * sstride);
+    a2 = *(uint8x8_t *)(src + 2 * sstride);
+    a3 = *(uint8x8_t *)(src + 3 * sstride);
+    a4 = *(uint8x8_t *)(src + 4 * sstride);
+    a5 = *(uint8x8_t *)(src + 5 * sstride);
+    a6 = *(uint8x8_t *)(src + 6 * sstride);
+    a7 = *(uint8x8_t *)(src + 7 * sstride);
+
+    b0 = vtrn1_u32(a0, a4);
+    b1 = vtrn1_u32(a1, a5);
+    b2 = vtrn1_u32(a2, a6);
+    b3 = vtrn1_u32(a3, a7);
+    b4 = vtrn2_u32(a0, a4);
+    b5 = vtrn2_u32(a1, a5);
+    b6 = vtrn2_u32(a2, a6);
+    b7 = vtrn2_u32(a3, a7);
+
+    a0 = vtrn1_u16(b0, b2);
+    a1 = vtrn1_u16(b1, b3);
+    a2 = vtrn2_u16(b0, b2);
+    a3 = vtrn2_u16(b1, b3);
+    a4 = vtrn1_u16(b4, b6);
+    a5 = vtrn1_u16(b5, b7);
+    a6 = vtrn2_u16(b4, b6);
+    a7 = vtrn2_u16(b5, b7);
+
+    b0 = vtrn1_u8(a0, a1);
+    b1 = vtrn2_u8(a0, a1);
+    b2 = vtrn1_u8(a2, a3);
+    b3 = vtrn2_u8(a2, a3);
+    b4 = vtrn1_u8(a4, a5);
+    b5 = vtrn2_u8(a4, a5);
+    b6 = vtrn1_u8(a6, a7);
+    b7 = vtrn2_u8(a6, a7);
+
+    *(uint8x8_t *)(dst + 0 * dstride) = b0;
+    *(uint8x8_t *)(dst + 1 * dstride) = b1;
+    *(uint8x8_t *)(dst + 2 * dstride) = b2;
+    *(uint8x8_t *)(dst + 3 * dstride) = b3;
+    *(uint8x8_t *)(dst + 4 * dstride) = b4;
+    *(uint8x8_t *)(dst + 5 * dstride) = b5;
+    *(uint8x8_t *)(dst + 6 * dstride) = b6;
+    *(uint8x8_t *)(dst + 7 * dstride) = b7;
+}
+
+
+
+
+
+
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
+    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
+    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
+
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
+    a8 = *(uint16x8_t *)(src + 8 * sstride);
+    a9 = *(uint16x8_t *)(src + 9 * sstride);
+    aA = *(uint16x8_t *)(src + 10 * sstride);
+    aB = *(uint16x8_t *)(src + 11 * sstride);
+    aC = *(uint16x8_t *)(src + 12 * sstride);
+    aD = *(uint16x8_t *)(src + 13 * sstride);
+    aE = *(uint16x8_t *)(src + 14 * sstride);
+    aF = *(uint16x8_t *)(src + 15 * sstride);
+
+    b0 = vtrn1q_u64(a0, a8);
+    b1 = vtrn1q_u64(a1, a9);
+    b2 = vtrn1q_u64(a2, aA);
+    b3 = vtrn1q_u64(a3, aB);
+    b4 = vtrn1q_u64(a4, aC);
+    b5 = vtrn1q_u64(a5, aD);
+    b6 = vtrn1q_u64(a6, aE);
+    b7 = vtrn1q_u64(a7, aF);
+    b8 = vtrn2q_u64(a0, a8);
+    b9 = vtrn2q_u64(a1, a9);
+    bA = vtrn2q_u64(a2, aA);
+    bB = vtrn2q_u64(a3, aB);
+    bC = vtrn2q_u64(a4, aC);
+    bD = vtrn2q_u64(a5, aD);
+    bE = vtrn2q_u64(a6, aE);
+    bF = vtrn2q_u64(a7, aF);
+
+    c0 = vtrn1q_u32(b0, b4);
+    c1 = vtrn1q_u32(b1, b5);
+    c2 = vtrn1q_u32(b2, b6);
+    c3 = vtrn1q_u32(b3, b7);
+    c4 = vtrn2q_u32(b0, b4);
+    c5 = vtrn2q_u32(b1, b5);
+    c6 = vtrn2q_u32(b2, b6);
+    c7 = vtrn2q_u32(b3, b7);
+    c8 = vtrn1q_u32(b8, bC);
+    c9 = vtrn1q_u32(b9, bD);
+    cA = vtrn1q_u32(bA, bE);
+    cB = vtrn1q_u32(bB, bF);
+    cC = vtrn2q_u32(b8, bC);
+    cD = vtrn2q_u32(b9, bD);
+    cE = vtrn2q_u32(bA, bE);
+    cF = vtrn2q_u32(bB, bF);
+
+    d0 = vtrn1q_u16(c0, c2);
+    d1 = vtrn1q_u16(c1, c3);
+    d2 = vtrn2q_u16(c0, c2);
+    d3 = vtrn2q_u16(c1, c3);
+    d4 = vtrn1q_u16(c4, c6);
+    d5 = vtrn1q_u16(c5, c7);
+    d6 = vtrn2q_u16(c4, c6);
+    d7 = vtrn2q_u16(c5, c7);
+    d8 = vtrn1q_u16(c8, cA);
+    d9 = vtrn1q_u16(c9, cB);
+    dA = vtrn2q_u16(c8, cA);
+    dB = vtrn2q_u16(c9, cB);
+    dC = vtrn1q_u16(cC, cE);
+    dD = vtrn1q_u16(cD, cF);
+    dE = vtrn2q_u16(cC, cE);
+    dF = vtrn2q_u16(cD, cF);
+
+    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);
+    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
+    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
+    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
+    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
+    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
+    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
+    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
+    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
+    *(uint16x8_t *)(dst + 9 * dstride)  = vtrn2q_u8(d8, d9);
+    *(uint16x8_t *)(dst + 10 * dstride)  = vtrn1q_u8(dA, dB);
+    *(uint16x8_t *)(dst + 11 * dstride)  = vtrn2q_u8(dA, dB);
+    *(uint16x8_t *)(dst + 12 * dstride)  = vtrn1q_u8(dC, dD);
+    *(uint16x8_t *)(dst + 13 * dstride)  = vtrn2q_u8(dC, dD);
+    *(uint16x8_t *)(dst + 14 * dstride)  = vtrn1q_u8(dE, dF);
+    *(uint16x8_t *)(dst + 15 * dstride)  = vtrn2q_u8(dE, dF);
+
+
+}
+
+
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
+{
+    //assumption: there is no partial overlap
+    transpose16x16(dst, src, dstride, sstride);
+    transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
+    if (dst == src)
+    {
+        uint8_t tmp[16 * 16] __attribute__((aligned(64)));
+        transpose16x16(tmp, src + 16, 16, sstride);
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
+        for (int i = 0; i < 16; i++)
+        {
+            COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
+        }
+    }
+    else
+    {
+        transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
+    }
+
+}
+
+
+
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
+
+    b0 = vtrn1q_u64(a0, a4);
+    b1 = vtrn1q_u64(a1, a5);
+    b2 = vtrn1q_u64(a2, a6);
+    b3 = vtrn1q_u64(a3, a7);
+    b4 = vtrn2q_u64(a0, a4);
+    b5 = vtrn2q_u64(a1, a5);
+    b6 = vtrn2q_u64(a2, a6);
+    b7 = vtrn2q_u64(a3, a7);
+
+    a0 = vtrn1q_u32(b0, b2);
+    a1 = vtrn1q_u32(b1, b3);
+    a2 = vtrn2q_u32(b0, b2);
+    a3 = vtrn2q_u32(b1, b3);
+    a4 = vtrn1q_u32(b4, b6);
+    a5 = vtrn1q_u32(b5, b7);
+    a6 = vtrn2q_u32(b4, b6);
+    a7 = vtrn2q_u32(b5, b7);
+
+    b0 = vtrn1q_u16(a0, a1);
+    b1 = vtrn2q_u16(a0, a1);
+    b2 = vtrn1q_u16(a2, a3);
+    b3 = vtrn2q_u16(a2, a3);
+    b4 = vtrn1q_u16(a4, a5);
+    b5 = vtrn2q_u16(a4, a5);
+    b6 = vtrn1q_u16(a6, a7);
+    b7 = vtrn2q_u16(a6, a7);
+
+    *(uint16x8_t *)(dst + 0 * dstride) = b0;
+    *(uint16x8_t *)(dst + 1 * dstride) = b1;
+    *(uint16x8_t *)(dst + 2 * dstride) = b2;
+    *(uint16x8_t *)(dst + 3 * dstride) = b3;
+    *(uint16x8_t *)(dst + 4 * dstride) = b4;
+    *(uint16x8_t *)(dst + 5 * dstride) = b5;
+    *(uint16x8_t *)(dst + 6 * dstride) = b6;
+    *(uint16x8_t *)(dst + 7 * dstride) = b7;
+}
+
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    //assumption: there is no partial overlap
+    transpose8x8(dst, src, dstride, sstride);
+    transpose8x8(dst + 8 * dstride + 8, src + 8 * sstride + 8, dstride, sstride);
+
+    if (dst == src)
+    {
+        uint16_t tmp[8 * 8];
+        transpose8x8(tmp, src + 8, 8, sstride);
+        transpose8x8(dst + 8, src + 8 * sstride, dstride, sstride);
+        for (int i = 0; i < 8; i++)
+        {
+            COPY_16(dst + (8 + i)*dstride, tmp + 8 * i);
+        }
+    }
+    else
+    {
+        transpose8x8(dst + 8 * dstride, src + 8, dstride, sstride);
+        transpose8x8(dst + 8, src + 8 * sstride, dstride, sstride);
+    }
+
+}
+
+
+
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
+{
+    //assumption: there is no partial overlap
+    for (int i = 0; i < 4; i++)
+    {
+        transpose8x8(dst + i * 8 * (1 + dstride), src + i * 8 * (1 + sstride), dstride, sstride);
+        for (int j = i + 1; j < 4; j++)
+        {
+            if (dst == src)
+            {
+                uint16_t tmp[8 * 8] __attribute__((aligned(64)));
+                transpose8x8(tmp, src + 8 * i + 8 * j * sstride, 8, sstride);
+                transpose8x8(dst + 8 * i + 8 * j * dstride, src + 8 * j + 8 * i * sstride, dstride, sstride);
+                for (int k = 0; k < 8; k++)
+                {
+                    COPY_16(dst + 8 * j + (8 * i + k)*dstride, tmp + 8 * k);
+                }
+            }
+            else
+            {
+                transpose8x8(dst + 8 * (j + i * dstride), src + 8 * (i + j * sstride), dstride, sstride);
+                transpose8x8(dst + 8 * (i + j * dstride), src + 8 * (j + i * sstride), dstride, sstride);
+            }
+
+        }
+    }
+}
+
+
+
+
+}
+
+
+
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h Added

@@ -0,0 +1,15 @@
+#ifndef __ARM64_UTILS_H__
+#define __ARM64_UTILS_H__
+
+
+namespace X265_NS
+{
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
+}
+
+#endif
x265_3.5.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp Changed
2102
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
4
  *          Yimeng Su <yimeng.su@huawei.com>
5
+ *          Sebastian Pop <spop@amazon.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -22,11 +23,659 @@
10
  * For more information, contact us at license @ x265.com.
11
  *****************************************************************************/
12
 
13
+
14
 #include "common.h"
15
 #include "primitives.h"
16
 #include "x265.h"
17
 #include "cpu.h"
18
 
19
+extern "C" {
20
+#include "fun-decls.h"
21
+}
22
+
23
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
24
+    p.cuBLOCK_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
25
+    p.cuBLOCK_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
26
+    p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
27
+    p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
28
+    p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
29
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
30
+    p.cuBLOCK_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
31
+    p.cuBLOCK_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
32
+    p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
33
+    p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
34
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
35
+    p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
36
+#define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
37
+#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)
38
+#define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
39
+
40
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
41
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
42
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
43
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
44
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
45
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
46
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
47
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
48
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
49
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
50
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
51
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
52
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
53
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
54
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
55
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
56
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
57
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
58
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
59
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
60
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
61
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
62
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
63
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
64
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
65
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
66
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
67
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
68
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
69
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu)
70
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
71
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
72
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
73
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
74
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
75
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
76
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
77
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
78
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
79
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
80
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
81
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
82
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
83
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
84
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
85
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
86
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
87
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
88
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
89
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
90
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
91
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
92
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
93
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
94
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
95
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
96
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
97
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
98
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
99
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
100
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
101
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
102
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
103
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
104
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
105
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
106
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
107
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
108
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
109
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
110
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
111
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
112
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
113
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve); \
114
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve); \
115
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
116
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve); \
117
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve); \
118
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
119
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
120
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
121
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
122
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
123
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
124
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
125
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
126
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
127
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
128
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
129
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
130
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
131
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
132
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
133
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
134
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
135
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
136
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
137
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
138
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
139
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
140
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
141
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
142
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
143
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
144
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
145
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
146
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
147
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
148
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
149
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
150
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
151
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
152
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
153
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
154
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
155
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve2); \
156
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
157
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
158
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
159
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
160
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
161
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve2); \
162
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
163
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## sve2); \
164
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## sve2); \
165
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## sve2); \
166
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve2); \
167
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## sve2); \
168
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve2); \
169
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
170
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve2); \
171
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
172
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
173
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
174
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
175
+    p.puLUMA_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
176
+    p.puLUMA_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
177
+    p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
178
+    p.puLUMA_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
179
+    p.puLUMA_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
180
+    p.puLUMA_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
181
+    p.puLUMA_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
182
+    p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
183
+    p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
184
+    p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
185
+    p.puLUMA_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
186
+    p.puLUMA_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
187
+    p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
188
+    p.puLUMA_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
189
+    p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
190
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
191
+    p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
192
+    p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
193
+    p.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
194
+    p.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
195
+    p.puLUMA_32x8.prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
196
+    p.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
197
+    p.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
198
+    p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
199
+    p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
200
+    p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
201
+#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
202
+#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
203
+#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
204
+#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
205
+#define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
206
+#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
207
+#define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
208
+#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
209
+#define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
210
+#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
211
+#define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
212
+
213
+
214
+#define ALL_LUMA_PU_T(prim, fname) \
+    p.pu[LUMA_4x4].prim   = fname<LUMA_4x4>; \
+    p.pu[LUMA_8x8].prim   = fname<LUMA_8x8>; \
+    p.pu[LUMA_16x16].prim = fname<LUMA_16x16>; \
+    p.pu[LUMA_32x32].prim = fname<LUMA_32x32>; \
+    p.pu[LUMA_64x64].prim = fname<LUMA_64x64>; \
+    p.pu[LUMA_8x4].prim   = fname<LUMA_8x4>; \
+    p.pu[LUMA_4x8].prim   = fname<LUMA_4x8>; \
+    p.pu[LUMA_16x8].prim  = fname<LUMA_16x8>; \
+    p.pu[LUMA_8x16].prim  = fname<LUMA_8x16>; \
+    p.pu[LUMA_16x32].prim = fname<LUMA_16x32>; \
+    p.pu[LUMA_32x16].prim = fname<LUMA_32x16>; \
+    p.pu[LUMA_64x32].prim = fname<LUMA_64x32>; \
+    p.pu[LUMA_32x64].prim = fname<LUMA_32x64>; \
+    p.pu[LUMA_16x12].prim = fname<LUMA_16x12>; \
+    p.pu[LUMA_12x16].prim = fname<LUMA_12x16>; \
+    p.pu[LUMA_16x4].prim  = fname<LUMA_16x4>; \
+    p.pu[LUMA_4x16].prim  = fname<LUMA_4x16>; \
+    p.pu[LUMA_32x24].prim = fname<LUMA_32x24>; \
+    p.pu[LUMA_24x32].prim = fname<LUMA_24x32>; \
+    p.pu[LUMA_32x8].prim  = fname<LUMA_32x8>; \
+    p.pu[LUMA_8x32].prim  = fname<LUMA_8x32>; \
+    p.pu[LUMA_64x48].prim = fname<LUMA_64x48>; \
+    p.pu[LUMA_48x64].prim = fname<LUMA_48x64>; \
+    p.pu[LUMA_64x16].prim = fname<LUMA_64x16>; \
+    p.pu[LUMA_16x64].prim = fname<LUMA_16x64>
+
+#define ALL_CHROMA_420_PU_TYPED(prim, fncdef, fname, cpu)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim   = fncdef PFX(fname ## _4x2_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim   = fncdef PFX(fname ## _2x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim   = fncdef PFX(fname ## _6x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu)
+#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim   = fncdef PFX(fname ## _4x2_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim   = fncdef PFX(fname ## _6x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim   = fncdef PFX(fname ## _2x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(fname ## _8x6_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(fname ## _8x2_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim   = fncdef PFX(fname ## _2x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## neon)
+#define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve)
+#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim   = fncdef PFX(fname ## _4x2_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon)
+#define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim   = fncdef PFX(fname ## _2x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim   = fncdef PFX(fname ## _6x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu)
+#define CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, fncdef)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(filterPixelToShort ## _8x6_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(filterPixelToShort ## _8x2_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon)
+#define CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef)               \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim   = fncdef PFX(filterPixelToShort ## _2x4_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim   = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim   = fncdef PFX(filterPixelToShort ## _6x8_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim   = fncdef PFX(filterPixelToShort ## _4x2_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve)
+#define ALL_CHROMA_420_PU(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
+#define CHROMA_420_PU_NEON_1(prim, fname) CHROMA_420_PU_TYPED_NEON_1(prim, , fname)
+#define CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
+#define CHROMA_420_PU_NEON_2(prim, fname) CHROMA_420_PU_TYPED_NEON_2(prim, , fname)
+#define CHROMA_420_PU_MULTIPLE_ARCHS(prim, fname, cpu) CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, , fname, cpu)
+#define CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(prim) CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, )
+#define CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
+
+
+#define ALL_CHROMA_420_4x4_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu)
+#define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
+
+#define ALL_CHROMA_422_PU_TYPED(prim, fncdef, fname, cpu)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim  = fncdef PFX(fname ## _8x12_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim  = fncdef PFX(fname ## _6x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim  = fncdef PFX(fname ## _2x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim  = fncdef PFX(fname ## _4x32_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim  = fncdef PFX(fname ## _8x64_ ## cpu)
+#define CHROMA_422_PU_TYPED_NEON_1(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim  = fncdef PFX(fname ## _6x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim  = fncdef PFX(fname ## _4x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim   = fncdef PFX(fname ## _2x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim   = fncdef PFX(fname ## _8x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim  = fncdef PFX(fname ## _8x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim  = fncdef PFX(fname ## _8x12_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim   = fncdef PFX(fname ## _8x4_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim  = fncdef PFX(fname ## _2x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim  = fncdef PFX(fname ## _16x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim  = fncdef PFX(fname ## _8x64_ ## neon)
+#define CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve)
+#define CHROMA_422_PU_TYPED_NEON_2(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim   = fncdef PFX(fname ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim  = fncdef PFX(fname ## _4x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim  = fncdef PFX(fname ## _4x32_ ## neon)
+#define CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim   = fncdef PFX(fname ## _2x8_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim  = fncdef PFX(fname ## _8x12_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim  = fncdef PFX(fname ## _6x16_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim  = fncdef PFX(fname ## _2x16_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve2); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim  = fncdef PFX(fname ## _8x64_ ## sve2)
+#define CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim  = fncdef PFX(filterPixelToShort ## _8x12_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(filterPixelToShort ## _16x24_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(filterPixelToShort ## _12x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim  = fncdef PFX(filterPixelToShort ## _4x32_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(filterPixelToShort ## _24x64_ ## neon); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim  = fncdef PFX(filterPixelToShort ## _8x64_ ## neon)
+#define CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef)               \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim   = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim  = fncdef PFX(filterPixelToShort ## _2x16_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim  = fncdef PFX(filterPixelToShort ## _6x16_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(filterPixelToShort ## _32x48_ ## sve); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve)
+#define ALL_CHROMA_422_PU(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
+#define CHROMA_422_PU_NEON_1(prim, fname) CHROMA_422_PU_TYPED_NEON_1(prim, , fname)
+#define CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
+#define CHROMA_422_PU_NEON_2(prim, fname) CHROMA_422_PU_TYPED_NEON_2(prim, , fname)
+#define CHROMA_422_PU_CAN_USE_SVE2(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, , fname)
+#define CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
+#define CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
+
+#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
+#define CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x4].prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x8].prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x4].prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x8].prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x8].prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x16].prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x4].prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_4x16].prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_8x32].prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
+    p.chroma[X265_CSP_I444].pu[LUMA_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
+#define CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x64].prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x32].prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x48].prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
+    p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
+#define ALL_CHROMA_444_PU(prim, fname, cpu) ALL_CHROMA_444_PU_TYPED(prim, , fname, cpu)
+#define CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
+#define CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
+
+#define ALL_CHROMA_420_VERT_FILTERS(cpu)                             \
+    ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+    ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, cpu); \
+    ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
+    ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, cpu)
+
+#define CHROMA_420_VERT_FILTERS_NEON()                             \
+    ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, neon)
+
+#define CHROMA_420_VERT_FILTERS_CAN_USE_SVE2()                             \
+    ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
+    ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, sve2); \
+    ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, sve2)
+
+#define SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(W, H) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = PFX(interp_4tap_vert_sp_ ## W ## x ## H ## _ ## neon)
+
+#define SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(W, H, cpu) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = PFX(interp_4tap_vert_pp_ ## W ## x ## H ## _ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = PFX(interp_4tap_vert_ps_ ## W ## x ## H ## _ ## cpu); \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = PFX(interp_4tap_vert_ss_ ## W ## x ## H ## _ ## cpu)
+
+#define CHROMA_422_VERT_FILTERS_NEON() \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 8); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 16); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 8); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 16); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 12); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 4); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 32); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 16); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 32); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 24); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(12, 32); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 8); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 32); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 64); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 32); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 64); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 48); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(24, 64); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 16); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 64)
+
+#define CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(cpu) \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 8, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 16, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 8, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 16, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 12, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 4, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 32, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 16, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 32, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 24, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(12, 32, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 8, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 32, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 64, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 32, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 64, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 48, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(24, 64, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 16, cpu); \
+    SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 64, cpu)
+
+#define ALL_CHROMA_444_VERT_FILTERS(cpu) \
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu); \
+    ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
+    ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, cpu)
+
+#define CHROMA_444_VERT_FILTERS_NEON() \
+    ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, neon)
+
+#define CHROMA_444_VERT_FILTERS_CAN_USE_SVE2() \
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sve2); \
+    ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, sve2)
+
+#define ALL_CHROMA_420_FILTERS(cpu)                               \
+    ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+    ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
+    ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, cpu);  \
+    ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, cpu)
+
+#define CHROMA_420_FILTERS_NEON()                               \
+    ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
+    ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, neon)
+
+#define CHROMA_420_FILTERS_CAN_USE_SVE2()                               \
+    ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, sve2);  \
+    ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, sve2)
+
+#define ALL_CHROMA_422_FILTERS(cpu) \
+    ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+    ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
+    ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, cpu);  \
+    ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, cpu)
+
+#define CHROMA_422_FILTERS_NEON() \
+    ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
+    ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, neon)
+
+#define CHROMA_422_FILTERS_CAN_USE_SVE2() \
+    ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, sve2);  \
+    ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, sve2)
+
+#define ALL_CHROMA_444_FILTERS(cpu) \
+    ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
+    ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu);  \
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu)
+
+#define CHROMA_444_FILTERS_NEON() \
+    ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
+    ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, neon)
+
+#define CHROMA_444_FILTERS_CAN_USE_SVE2() \
+    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sve2);  \
+    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sve2)
+
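[Annotation, not part of the patch: the macro groups are deliberately split by ISA (plain NEON, SVE-capable, SVE2-capable) so setup code can populate the same table entries in layers, each layer overwriting only the sizes it has kernels for. A sketch of that layered-override pattern, with hypothetical names (`CpuFlags`, `setup_neon`, `sad_c`/`sad_neon`/`sad_sve2`):]

    typedef int (*sad_fn)(const unsigned char *, const unsigned char *, int);

    static int sad_c(const unsigned char *, const unsigned char *, int)    { return 0; }
    static int sad_neon(const unsigned char *, const unsigned char *, int) { return 1; }
    static int sad_sve2(const unsigned char *, const unsigned char *, int) { return 2; }

    struct Primitives { sad_fn sad; };

    enum CpuFlags { CPU_NEON = 1 << 0, CPU_SVE2 = 1 << 1 };

    static void setup_c(Primitives &p)    { p.sad = sad_c; }
    static void setup_neon(Primitives &p) { p.sad = sad_neon; }
    static void setup_sve2(Primitives &p) { p.sad = sad_sve2; }

    void setupPrimitives(Primitives &p, int cpuMask)
    {
        setup_c(p);                            // portable baseline first
        if (cpuMask & CPU_NEON) setup_neon(p); // NEON layer overrides
        if (cpuMask & CPU_SVE2) setup_sve2(p); // SVE2 refines where available
    }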
 
 #if defined(__GNUC__)
 #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
@@ -35,18 +684,19 @@
 #define GCC_4_9_0 40900
 #define GCC_5_1_0 50100
 
-extern "C" {
-#include "pixel.h"
-#include "pixel-util.h"
-#include "ipfilter8.h"
-}
+#include "pixel-prim.h"
+#include "filter-prim.h"
+#include "dct-prim.h"
+#include "loopfilter-prim.h"
+#include "intrapred-prim.h"
 
-namespace X265_NS {
+namespace X265_NS
+{
 // private x265 namespace
 
 
 template<int size>
-void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
+void interp_8tap_hv_pp_cpu(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
 {
     ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
     const int halfFilterSize = NTAPS_LUMA >> 1;
@@ -56,164 +706,1259 @@
     primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
 }
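[Annotation, not part of the patch: interp_8tap_hv_pp_cpu above implements separable 2-D interpolation: a horizontal pass (pixel to short) fills a 16-bit intermediate that includes the extra rows the vertical filter needs, then a vertical pass (short to pixel) produces the output. A simplified sketch with identity taps; the real kernels use the 8-tap HEVC filters, and the caller is assumed to supply a source padded by at least halfFilterSize - 1 rows, as the encoder's reference planes are:]

    #include <cstdint>

    enum { NTAPS = 8, MAX_W = 64, MAX_H = 64 };

    void hv_8tap_sketch(const uint8_t *src, intptr_t srcStride,
                        uint8_t *dst, intptr_t dstStride, int w, int h)
    {
        // h output rows plus NTAPS - 1 extra rows for the vertical stage,
        // mirroring immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)].
        int16_t immed[MAX_W * (MAX_H + NTAPS - 1)];
        const int half = NTAPS / 2;

        // Horizontal pass (pixel -> short), starting half - 1 rows early.
        for (int y = 0; y < h + NTAPS - 1; y++)
            for (int x = 0; x < w; x++)
                immed[y * w + x] =
                    (int16_t)(src[(y - (half - 1)) * srcStride + x] << 6);

        // Vertical pass (short -> pixel) over the intermediate rows.
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
            {
                int v = immed[(y + half - 1) * w + x] >> 6;
                dst[y * dstStride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
    }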
 
-
-/* Temporary workaround because luma_vsp assembly primitive has not been completed
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
- * Otherwise, segment fault occurs. */
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)
+void setupNeonPrimitives(EncoderPrimitives &p)
 {
-    if (cpuMask & X265_CPU_NEON)
-    {
-        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp;
-        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;
-        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;
-        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;
-        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
-        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;
-        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;
-        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
-        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
-        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
-        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
-        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
-        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
-        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
-        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
-        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
-        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
-        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
-        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
-        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
-        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;
-        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;
-        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;
-        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
-        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;
+    setupPixelPrimitives_neon(p);
+    setupFilterPrimitives_neon(p);
+    setupDCTPrimitives_neon(p);
+    setupLoopFilterPrimitives_neon(p);
+    setupIntraPrimitives_neon(p);
+
+    ALL_CHROMA_420_PU(p2s[NONALIGNED], filterPixelToShort, neon);
+    ALL_CHROMA_422_PU(p2s[ALIGNED], filterPixelToShort, neon);
+    ALL_CHROMA_444_PU(p2s[ALIGNED], filterPixelToShort, neon);
+    ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, neon);
+    ALL_CHROMA_420_PU(p2s[ALIGNED], filterPixelToShort, neon);
+    ALL_CHROMA_422_PU(p2s[NONALIGNED], filterPixelToShort, neon);
+    ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
+    ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
+
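[Annotation, not part of the patch: x265 keeps paired ALIGNED/NONALIGNED slots for several primitives; this NEON setup points both slots at the same kernels, since the NEON loads used here carry no alignment requirement. A sketch of how a caller-side pick between the two slots could look, with hypothetical names:]

    #include <cstdint>

    enum Aligned { NONALIGNED = 0, ALIGNED = 1 };
    typedef void (*p2s_fn)(const uint8_t *src, intptr_t stride, int16_t *dst);

    struct PU { p2s_fn p2s[2]; };

    static inline p2s_fn pick_p2s(const PU &pu, const uint8_t *src)
    {
        // Take the ALIGNED variant only when the buffer meets the 32-byte
        // guarantee an alignment-dependent kernel would assume.
        bool ok = (reinterpret_cast<uintptr_t>(src) % 32) == 0;
        return pu.p2s[ok ? ALIGNED : NONALIGNED];
    }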
+#if !HIGH_BIT_DEPTH
+    ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
+    ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
+    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
+    ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
+    ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
+    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
+    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
+    ALL_CHROMA_420_VERT_FILTERS(neon);
+    CHROMA_422_VERT_FILTERS_NEON();
+    CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
+    ALL_CHROMA_444_VERT_FILTERS(neon);
+    ALL_CHROMA_420_FILTERS(neon);
+    ALL_CHROMA_422_FILTERS(neon);
+    ALL_CHROMA_444_FILTERS(neon);
+
+    // Blockcopy_pp
+    ALL_LUMA_PU(copy_pp, blockcopy_pp, neon);
+    ALL_CHROMA_420_PU(copy_pp, blockcopy_pp, neon);
+    ALL_CHROMA_422_PU(copy_pp, blockcopy_pp, neon);
+    p.cu[BLOCK_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
+    p.cu[BLOCK_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
+    p.cu[BLOCK_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+    p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+    p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
+
+#endif // !HIGH_BIT_DEPTH
+
+    // Blockcopy_ss
+    p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
+    p.cu[BLOCK_8x8].copy_ss   = PFX(blockcopy_ss_8x8_neon);
+    p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
+    p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
+    p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
+
+    // Blockcopy_ps
+    p.cu[BLOCK_4x4].copy_ps   = PFX(blockcopy_ps_4x4_neon);
+    p.cu[BLOCK_8x8].copy_ps   = PFX(blockcopy_ps_8x8_neon);
+    p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
+    p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
+    p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
+
+    // Blockcopy_sp
+    p.cu[BLOCK_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
+    p.cu[BLOCK_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
+    p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+    p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+    p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+
+    // chroma blockcopy_ss
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss   = PFX(blockcopy_ss_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss   = PFX(blockcopy_ss_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss  = PFX(blockcopy_ss_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_neon);
+
+    // chroma blockcopy_ps
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ps   = PFX(blockcopy_ps_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ps   = PFX(blockcopy_ps_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps   = PFX(blockcopy_ps_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ps  = PFX(blockcopy_ps_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_neon);
+
+    // chroma blockcopy_sp
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp   = PFX(blockcopy_sp_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp  = PFX(blockcopy_sp_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
+
+    // Block_fill
+    ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, neon);
+    ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, neon);
+
+    // copy_count
+    p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
+    p.cu[BLOCK_8x8].copy_cnt     = PFX(copy_cnt_8_neon);
+    p.cu[BLOCK_16x16].copy_cnt   = PFX(copy_cnt_16_neon);
+    p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
+
+    // count nonzero
+    p.cu[BLOCK_4x4].count_nonzero     = PFX(count_nonzero_4_neon);
+    p.cu[BLOCK_8x8].count_nonzero     = PFX(count_nonzero_8_neon);
+    p.cu[BLOCK_16x16].count_nonzero   = PFX(count_nonzero_16_neon);
+    p.cu[BLOCK_32x32].count_nonzero   = PFX(count_nonzero_32_neon);
+
+    // cpy2Dto1D_shl
+    p.cu[BLOCK_4x4].cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_4x4_neon);
+    p.cu[BLOCK_8x8].cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_8x8_neon);
+    p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon);
+    p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon);
+    p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon);
+
+    // cpy2Dto1D_shr
+    p.cu[BLOCK_4x4].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
+    p.cu[BLOCK_8x8].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
+    p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
+    p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
+
+    // cpy1Dto2D_shl
+    p.cu[BLOCK_4x4].cpy1Dto2D_shl[ALIGNED]      = PFX(cpy1Dto2D_shl_4x4_neon);
+    p.cu[BLOCK_8x8].cpy1Dto2D_shl[ALIGNED]      = PFX(cpy1Dto2D_shl_8x8_neon);
+    p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED]    = PFX(cpy1Dto2D_shl_16x16_neon);
+    p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED]    = PFX(cpy1Dto2D_shl_32x32_neon);
+    p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED]    = PFX(cpy1Dto2D_shl_64x64_neon);
+
+    p.cu[BLOCK_4x4].cpy1Dto2D_shl[NONALIGNED]   = PFX(cpy1Dto2D_shl_4x4_neon);
+    p.cu[BLOCK_8x8].cpy1Dto2D_shl[NONALIGNED]   = PFX(cpy1Dto2D_shl_8x8_neon);
+    p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon);
+    p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon);
+    p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon);
+
+    // cpy1Dto2D_shr
+    p.cu[BLOCK_4x4].cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
+    p.cu[BLOCK_8x8].cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
+    p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon);
+    p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
+    p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
+
+#if !HIGH_BIT_DEPTH
+    // pixel_avg_pp
+    ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
+    ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon);
+
+    // addAvg
+    ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon);
+    ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon);
+    ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon);
+    ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon);
+    ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
+    ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
+
+    // sad
+    ALL_LUMA_PU(sad, pixel_sad, neon);
+    ALL_LUMA_PU(sad_x3, sad_x3, neon);
+    ALL_LUMA_PU(sad_x4, sad_x4, neon);
+
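[Annotation, not part of the patch: scalar reference for the sad/sad_x3/sad_x4 entries above. The x3/x4 variants score one source block against three or four candidate references per call so the source rows are loaded once; the real table stores one fixed-size kernel per PU, whereas this sketch takes the block dimensions as parameters and simplifies the encoder's fixed fenc stride:]

    #include <cstdint>
    #include <cstdlib>

    static int sad_ref(const uint8_t *pix1, intptr_t stride1,
                       const uint8_t *pix2, intptr_t stride2, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2)
            for (int x = 0; x < w; x++)
                sum += std::abs(pix1[x] - pix2[x]);
        return sum;
    }

    static void sad_x3_ref(const uint8_t *fenc, const uint8_t *ref0,
                           const uint8_t *ref1, const uint8_t *ref2,
                           intptr_t frefStride, int32_t res[3], int w, int h)
    {
        res[0] = sad_ref(fenc, w, ref0, frefStride, w, h);
        res[1] = sad_ref(fenc, w, ref1, frefStride, w, h);
        res[2] = sad_ref(fenc, w, ref2, frefStride, w, h);
    }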
+    // sse_pp
+    p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
+    p.cu[BLOCK_8x8].sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+    p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
+    p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
+    p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
+
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp   = PFX(pixel_sse_pp_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp  = PFX(pixel_sse_pp_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_neon);
+
+    // sse_ss
+    p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
+    p.cu[BLOCK_8x8].sse_ss   = PFX(pixel_sse_ss_8x8_neon);
+    p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon);
+    p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
+    p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
+
+    // ssd_s
+    p.cu[BLOCK_4x4].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_4x4_neon);
+    p.cu[BLOCK_8x8].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_8x8_neon);
+    p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+    p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+
+    p.cu[BLOCK_4x4].ssd_s[ALIGNED]   = PFX(pixel_ssd_s_4x4_neon);
+    p.cu[BLOCK_8x8].ssd_s[ALIGNED]   = PFX(pixel_ssd_s_8x8_neon);
+    p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+    p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+
+    // pixel_var
+    p.cu[BLOCK_8x8].var   = PFX(pixel_var_8x8_neon);
+    p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
+    p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon);
+    p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon);
+
+    // calc_Residual
+    p.cu[BLOCK_4x4].calcresidual[NONALIGNED]   = PFX(getResidual4_neon);
+    p.cu[BLOCK_8x8].calcresidual[NONALIGNED]   = PFX(getResidual8_neon);
+    p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_neon);
+    p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_neon);
+
+    p.cu[BLOCK_4x4].calcresidual[ALIGNED]   = PFX(getResidual4_neon);
+    p.cu[BLOCK_8x8].calcresidual[ALIGNED]   = PFX(getResidual8_neon);
+    p.cu[BLOCK_16x16].calcresidual[ALIGNED] = PFX(getResidual16_neon);
+    p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual32_neon);
+
+    // pixel_sub_ps
+    p.cu[BLOCK_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+    p.cu[BLOCK_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+    p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
+    p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
+    p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);
+
+    // chroma sub_ps
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps   = PFX(pixel_sub_ps_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps  = PFX(pixel_sub_ps_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
+
+    // pixel_add_ps
+    p.cu[BLOCK_4x4].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x4_neon);
+    p.cu[BLOCK_8x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_8x8_neon);
+    p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
+    p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
+    p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
+
+    p.cu[BLOCK_4x4].add_ps[ALIGNED]   = PFX(pixel_add_ps_4x4_neon);
+    p.cu[BLOCK_8x8].add_ps[ALIGNED]   = PFX(pixel_add_ps_8x8_neon);
+    p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
+    p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
+    p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_neon);
+
+    // chroma add_ps
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED]  = PFX(pixel_add_ps_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
+
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED]   = PFX(pixel_add_ps_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED]   = PFX(pixel_add_ps_8x8_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED]   = PFX(pixel_add_ps_4x8_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED]  = PFX(pixel_add_ps_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_neon);
+
+    //scale2D_64to32
+    p.scale2D_64to32  = PFX(scale2D_64to32_neon);
+
+    // scale1D_128to64
+    p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
+    p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
+
+    // planecopy
+    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
+    // satd
+    ALL_LUMA_PU(satd, pixel_satd, neon);
+
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = PFX(pixel_satd_4x4_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = PFX(pixel_satd_8x8_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = PFX(pixel_satd_8x4_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = PFX(pixel_satd_4x8_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = PFX(pixel_satd_16x8_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = PFX(pixel_satd_8x16_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = PFX(pixel_satd_16x4_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = PFX(pixel_satd_4x16_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = PFX(pixel_satd_32x8_neon);
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = PFX(pixel_satd_8x32_neon);
+
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = PFX(pixel_satd_4x8_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = PFX(pixel_satd_8x16_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = PFX(pixel_satd_4x4_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = PFX(pixel_satd_8x8_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = PFX(pixel_satd_4x16_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = PFX(pixel_satd_8x32_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = PFX(pixel_satd_8x12_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = PFX(pixel_satd_8x4_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = PFX(pixel_satd_16x8_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = PFX(pixel_satd_4x32_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = PFX(pixel_satd_8x64_neon);
+
+    // sa8d
+    p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_neon);
+    p.cu[BLOCK_8x8].sa8d   = PFX(pixel_sa8d_8x8_neon);
+    p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
+    p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
+    p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
+    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
+
+    // dequant_scaling
+    p.dequant_scaling = PFX(dequant_scaling_neon);
+    p.dequant_normal  = PFX(dequant_normal_neon);
+
+    // ssim_4x4x2_core
+    p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
+
+    // ssimDist
+    p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_neon);
+    p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_neon);
+    p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_neon);
+    p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_neon);
+    p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_neon);
+
+    // normFact
+    p.cu[BLOCK_8x8].normFact = PFX(normFact8_neon);
1089
+    p.cuBLOCK_16x16.normFact = PFX(normFact16_neon);
1090
+    p.cuBLOCK_32x32.normFact = PFX(normFact32_neon);
1091
+    p.cuBLOCK_64x64.normFact = PFX(normFact64_neon);
1092
+
1093
+    // psy_cost_pp
1094
+    p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1095
+
1096
+    p.weight_pp = PFX(weight_pp_neon);
1097
+#if !defined(__APPLE__)
1098
+    p.scanPosLast = PFX(scanPosLast_neon);
1099
 #endif
1100
+    p.costCoeffNxN = PFX(costCoeffNxN_neon);
1101
 #endif
1102
-    }
1103
-}
1104
 
1105
+    // quant
1106
+    p.quant = PFX(quant_neon);
1107
+    p.nquant = PFX(nquant_neon);
1108
+}
1109
 
1110
-void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) 
1111
+#if defined(HAVE_SVE2) || defined(HAVE_SVE)
1112
+void setupSvePrimitives(EncoderPrimitives &p)
1113
 {
1114
-    if (cpuMask & X265_CPU_NEON)
1115
-    {
1116
-        p.puLUMA_4x4.satd   = PFX(pixel_satd_4x4_neon);
1117
-        p.puLUMA_4x8.satd   = PFX(pixel_satd_4x8_neon);
1118
-        p.puLUMA_4x16.satd  = PFX(pixel_satd_4x16_neon);
1119
-        p.puLUMA_8x4.satd   = PFX(pixel_satd_8x4_neon);
1120
-        p.puLUMA_8x8.satd   = PFX(pixel_satd_8x8_neon);
1121
-        p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1122
-        
1123
-        p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd    = PFX(pixel_satd_4x4_neon);
1124
-        p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd    = PFX(pixel_satd_4x8_neon);
1125
-        p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd   = PFX(pixel_satd_4x16_neon);
1126
-        p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd    = PFX(pixel_satd_8x4_neon);
1127
-        p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd    = PFX(pixel_satd_8x8_neon);
1128
-        p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd  = PFX(pixel_satd_12x16_neon);
1129
-        
1130
-        p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd    = PFX(pixel_satd_4x4_neon);
1131
-        p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd    = PFX(pixel_satd_4x8_neon);
1132
-        p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd   = PFX(pixel_satd_4x16_neon);
1133
-        p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd   = PFX(pixel_satd_4x32_neon);
1134
-        p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd    = PFX(pixel_satd_8x4_neon);
1135
-        p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd    = PFX(pixel_satd_8x8_neon);
1136
-        p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd  = PFX(pixel_satd_12x32_neon);
1137
-
1138
-        p.puLUMA_4x4.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_4x4_neon);
1139
-        p.puLUMA_4x8.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_4x8_neon);
1140
-        p.puLUMA_4x16.pixelavg_ppNONALIGNED  = PFX(pixel_avg_pp_4x16_neon);
1141
-        p.puLUMA_8x4.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_8x4_neon);
1142
-        p.puLUMA_8x8.pixelavg_ppNONALIGNED   = PFX(pixel_avg_pp_8x8_neon);
1143
-        p.puLUMA_8x16.pixelavg_ppNONALIGNED  = PFX(pixel_avg_pp_8x16_neon);
1144
-        p.puLUMA_8x32.pixelavg_ppNONALIGNED  = PFX(pixel_avg_pp_8x32_neon);
1145
-
1146
-        p.puLUMA_4x4.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_4x4_neon);
1147
-        p.puLUMA_4x8.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_4x8_neon);
1148
-        p.puLUMA_4x16.pixelavg_ppALIGNED  = PFX(pixel_avg_pp_4x16_neon);
1149
-        p.puLUMA_8x4.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_8x4_neon);
1150
-        p.puLUMA_8x8.pixelavg_ppALIGNED   = PFX(pixel_avg_pp_8x8_neon);
1151
-        p.puLUMA_8x16.pixelavg_ppALIGNED  = PFX(pixel_avg_pp_8x16_neon);
1152
-        p.puLUMA_8x32.pixelavg_ppALIGNED  = PFX(pixel_avg_pp_8x32_neon);
1153
-
1154
-        p.puLUMA_8x4.sad_x3   = PFX(sad_x3_8x4_neon);
1155
-        p.puLUMA_8x8.sad_x3   = PFX(sad_x3_8x8_neon);
1156
-        p.puLUMA_8x16.sad_x3  = PFX(sad_x3_8x16_neon);
1157
-        p.puLUMA_8x32.sad_x3  = PFX(sad_x3_8x32_neon);
1158
-
1159
-        p.puLUMA_8x4.sad_x4   = PFX(sad_x4_8x4_neon);
1160
-        p.puLUMA_8x8.sad_x4   = PFX(sad_x4_8x8_neon);
1161
-        p.puLUMA_8x16.sad_x4  = PFX(sad_x4_8x16_neon);
1162
-        p.puLUMA_8x32.sad_x4  = PFX(sad_x4_8x32_neon);
1163
-
1164
-        // quant
1165
-        p.quant = PFX(quant_neon);
1166
-        // luma_hps
1167
-        p.puLUMA_4x4.luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
1168
-        p.puLUMA_4x8.luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
1169
-        p.puLUMA_4x16.luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
1170
-        p.puLUMA_8x4.luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
1171
-        p.puLUMA_8x8.luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
1172
-        p.puLUMA_8x16.luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
1173
-        p.puLUMA_8x32.luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
1174
-        p.puLUMA_12x16.luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
1175
-        p.puLUMA_24x32.luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
1176
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
1177
-        p.puLUMA_16x4.luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
1178
-        p.puLUMA_16x8.luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
1179
-        p.puLUMA_16x12.luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
1180
-        p.puLUMA_16x16.luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
1181
-        p.puLUMA_16x32.luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
1182
-        p.puLUMA_16x64.luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
1183
-        p.puLUMA_32x8.luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
1184
-        p.puLUMA_32x16.luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
1185
-        p.puLUMA_32x24.luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
1186
-        p.puLUMA_32x32.luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
1187
-        p.puLUMA_32x64.luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
1188
-        p.puLUMA_48x64.luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
1189
-        p.puLUMA_64x16.luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
1190
-        p.puLUMA_64x32.luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
1191
-        p.puLUMA_64x48.luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
1192
-        p.puLUMA_64x64.luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
1193
-#endif
1194
-
1195
-        p.puLUMA_8x4.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;
1196
-        p.puLUMA_8x8.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
1197
-        p.puLUMA_8x16.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
1198
-        p.puLUMA_8x32.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
1199
-        p.puLUMA_12x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
1200
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
1201
-        p.puLUMA_16x4.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
1202
-        p.puLUMA_16x8.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
1203
-        p.puLUMA_16x12.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
1204
-        p.puLUMA_16x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
1205
-        p.puLUMA_16x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
1206
-        p.puLUMA_16x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
1207
-        p.puLUMA_32x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
1208
-        p.puLUMA_32x24.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
1209
-        p.puLUMA_32x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
1210
-        p.puLUMA_32x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
1211
-        p.puLUMA_48x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;
1212
-        p.puLUMA_64x16.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x16>;
1213
-        p.puLUMA_64x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x32>;
1214
-        p.puLUMA_64x48.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x48>;
1215
-        p.puLUMA_64x64.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x64>;
1216
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
1217
-        p.puLUMA_4x4.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x4>;
1218
-        p.puLUMA_4x8.luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x8>;
1219
-        p.puLUMA_4x16.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_4x16>;
1220
-        p.puLUMA_24x32.luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_24x32>;
1221
-        p.puLUMA_32x8.luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_32x8>;
1222
+    // Once SVE/SVE2 implementations of these primitives exist, change the
+    // following definitions to point to them.
+    setupPixelPrimitives_neon(p);
+    setupFilterPrimitives_neon(p);
+    setupDCTPrimitives_neon(p);
+    setupLoopFilterPrimitives_neon(p);
+    setupIntraPrimitives_neon(p);
+
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sNONALIGNED);
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sALIGNED);
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
+
+#if !HIGH_BIT_DEPTH
+    ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
+    ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
+    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
+    ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
+    ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
+    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
+    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
+    ALL_CHROMA_420_VERT_FILTERS(neon);
+    CHROMA_422_VERT_FILTERS_NEON();
+    CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
+    ALL_CHROMA_444_VERT_FILTERS(neon);
+    ALL_CHROMA_420_FILTERS(neon);
+    ALL_CHROMA_422_FILTERS(neon);
+    ALL_CHROMA_444_FILTERS(neon);
+
+
+    // Blockcopy_pp
+    LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
+    LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
+    CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
+    CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
+    CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
+    CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
+    p.cuBLOCK_4x4.copy_pp   = PFX(blockcopy_pp_4x4_neon);
+    p.cuBLOCK_8x8.copy_pp   = PFX(blockcopy_pp_8x8_neon);
+    p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
+    p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
+    p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_sve);
+
+#endif // !HIGH_BIT_DEPTH
+
+    // Blockcopy_ss
+    p.cuBLOCK_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
+    p.cuBLOCK_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
+    p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
+    p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
+    p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_sve);
+
+    // Blockcopy_ps
+    p.cuBLOCK_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
+    p.cuBLOCK_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
+    p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
+    p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
+    p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_sve);
+
+    // Blockcopy_sp
+    p.cuBLOCK_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
+    p.cuBLOCK_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
+    p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
+    p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
+    p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
+
+    // chroma blockcopy_ss
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss   = PFX(blockcopy_ss_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss  = PFX(blockcopy_ss_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_sve);
+
+    // chroma blockcopy_ps
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps   = PFX(blockcopy_ps_4x8_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps  = PFX(blockcopy_ps_8x16_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_sve);
+
+    // chroma blockcopy_sp
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp   = PFX(blockcopy_sp_4x8_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp  = PFX(blockcopy_sp_8x16_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_sve);
+
+    // Block_fill
+    LUMA_TU_NEON(blockfill_sALIGNED, blockfill_s);
+    LUMA_TU_CAN_USE_SVE(blockfill_sALIGNED, blockfill_s);
+    LUMA_TU_NEON(blockfill_sNONALIGNED, blockfill_s);
+    LUMA_TU_CAN_USE_SVE(blockfill_sNONALIGNED, blockfill_s);
+
+    // copy_count
+    p.cuBLOCK_4x4.copy_cnt     = PFX(copy_cnt_4_neon);
+    p.cuBLOCK_8x8.copy_cnt     = PFX(copy_cnt_8_neon);
+    p.cuBLOCK_16x16.copy_cnt   = PFX(copy_cnt_16_neon);
+    p.cuBLOCK_32x32.copy_cnt   = PFX(copy_cnt_32_neon);
+
+    // count nonzero
+    p.cuBLOCK_4x4.count_nonzero     = PFX(count_nonzero_4_neon);
+    p.cuBLOCK_8x8.count_nonzero     = PFX(count_nonzero_8_neon);
+    p.cuBLOCK_16x16.count_nonzero   = PFX(count_nonzero_16_neon);
+    p.cuBLOCK_32x32.count_nonzero   = PFX(count_nonzero_32_neon);
+
+    // cpy2Dto1D_shl
+    p.cuBLOCK_4x4.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_4x4_neon);
+    p.cuBLOCK_8x8.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_8x8_neon);
+    p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
+    p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
+    p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
+
+    // cpy2Dto1D_shr
+    p.cuBLOCK_4x4.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
+    p.cuBLOCK_8x8.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
+    p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
+    p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
+
+    // cpy1Dto2D_shl
+    p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_4x4_neon);
+    p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_8x8_neon);
+    p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_16x16_sve);
+    p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_32x32_sve);
+    p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_64x64_sve);
+
+    p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_4x4_neon);
+    p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_8x8_neon);
+    p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
+    p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
+    p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
+
+    // cpy1Dto2D_shr
+    p.cuBLOCK_4x4.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
+    p.cuBLOCK_8x8.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
+    p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
+    p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
+    p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
+
+#if !HIGH_BIT_DEPTH
+    // pixel_avg_pp
+    ALL_LUMA_PU(pixelavg_ppNONALIGNED, pixel_avg_pp, neon);
+    ALL_LUMA_PU(pixelavg_ppALIGNED, pixel_avg_pp, neon);
+
+    // addAvg
+    ALL_LUMA_PU(addAvgNONALIGNED, addAvg, neon);
+    ALL_LUMA_PU(addAvgALIGNED, addAvg, neon);
+    ALL_CHROMA_420_PU(addAvgNONALIGNED, addAvg, neon);
+    ALL_CHROMA_422_PU(addAvgNONALIGNED, addAvg, neon);
+    ALL_CHROMA_420_PU(addAvgALIGNED, addAvg, neon);
+    ALL_CHROMA_422_PU(addAvgALIGNED, addAvg, neon);
+
+    // sad
+    ALL_LUMA_PU(sad, pixel_sad, neon);
+    ALL_LUMA_PU(sad_x3, sad_x3, neon);
+    ALL_LUMA_PU(sad_x4, sad_x4, neon);
+
+    // sse_pp
+    p.cuBLOCK_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
+    p.cuBLOCK_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+    p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
+    p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
+    p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_neon);
+
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp   = PFX(pixel_sse_pp_4x8_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp  = PFX(pixel_sse_pp_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_neon);
+
+    // sse_ss
+    p.cuBLOCK_4x4.sse_ss   = PFX(pixel_sse_ss_4x4_neon);
+    p.cuBLOCK_8x8.sse_ss   = PFX(pixel_sse_ss_8x8_neon);
+    p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_neon);
+    p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_neon);
+    p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_neon);
+
+    // ssd_s
+    p.cuBLOCK_4x4.ssd_sNONALIGNED   = PFX(pixel_ssd_s_4x4_neon);
+    p.cuBLOCK_8x8.ssd_sNONALIGNED   = PFX(pixel_ssd_s_8x8_neon);
+    p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_neon);
+    p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_neon);
+
+    p.cuBLOCK_4x4.ssd_sALIGNED   = PFX(pixel_ssd_s_4x4_neon);
+    p.cuBLOCK_8x8.ssd_sALIGNED   = PFX(pixel_ssd_s_8x8_neon);
+    p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_neon);
+    p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_neon);
+
+    // pixel_var
+    p.cuBLOCK_8x8.var   = PFX(pixel_var_8x8_neon);
+    p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_neon);
+    p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_neon);
+    p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_neon);
+
+    // calc_Residual
+    p.cuBLOCK_4x4.calcresidualNONALIGNED   = PFX(getResidual4_neon);
+    p.cuBLOCK_8x8.calcresidualNONALIGNED   = PFX(getResidual8_neon);
+    p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_neon);
+    p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_neon);
+
+    p.cuBLOCK_4x4.calcresidualALIGNED   = PFX(getResidual4_neon);
+    p.cuBLOCK_8x8.calcresidualALIGNED   = PFX(getResidual8_neon);
+    p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_neon);
+    p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_neon);
+
+    // pixel_sub_ps
+    p.cuBLOCK_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+    p.cuBLOCK_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+    p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
+    p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
+    p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_neon);
+
+    // chroma sub_ps
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps   = PFX(pixel_sub_ps_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps  = PFX(pixel_sub_ps_8x16_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_neon);
+
+    // pixel_add_ps
+    p.cuBLOCK_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_neon);
+    p.cuBLOCK_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_neon);
+    p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
+    p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
+    p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_neon);
+
+    p.cuBLOCK_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_neon);
+    p.cuBLOCK_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_neon);
+    p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
+    p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
+    p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_neon);
+
+    // chroma add_ps
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED   = PFX(pixel_add_ps_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED  = PFX(pixel_add_ps_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_neon);
+
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED   = PFX(pixel_add_ps_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED  = PFX(pixel_add_ps_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_neon);
+
+    //scale2D_64to32
+    p.scale2D_64to32  = PFX(scale2D_64to32_neon);
+
+    // scale1D_128to64
+    p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_neon);
+    p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_neon);
+
+    // planecopy
+    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
+    // satd
+    p.puLUMA_4x4.satd   = PFX(pixel_satd_4x4_sve);
+    p.puLUMA_8x8.satd   = PFX(pixel_satd_8x8_neon);
+    p.puLUMA_16x16.satd = PFX(pixel_satd_16x16_neon);
+    p.puLUMA_32x32.satd = PFX(pixel_satd_32x32_sve);
+    p.puLUMA_64x64.satd = PFX(pixel_satd_64x64_neon);
+    p.puLUMA_8x4.satd   = PFX(pixel_satd_8x4_sve);
+    p.puLUMA_4x8.satd   = PFX(pixel_satd_4x8_neon);
+    p.puLUMA_16x8.satd  = PFX(pixel_satd_16x8_neon);
+    p.puLUMA_8x16.satd  = PFX(pixel_satd_8x16_neon);
+    p.puLUMA_16x32.satd = PFX(pixel_satd_16x32_neon);
+    p.puLUMA_32x16.satd = PFX(pixel_satd_32x16_sve);
+    p.puLUMA_64x32.satd = PFX(pixel_satd_64x32_neon);
+    p.puLUMA_32x64.satd = PFX(pixel_satd_32x64_neon);
+    p.puLUMA_16x12.satd = PFX(pixel_satd_16x12_neon);
+    p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
+    p.puLUMA_16x4.satd  = PFX(pixel_satd_16x4_neon);
+    p.puLUMA_4x16.satd  = PFX(pixel_satd_4x16_neon);
+    p.puLUMA_32x24.satd = PFX(pixel_satd_32x24_neon);
+    p.puLUMA_24x32.satd = PFX(pixel_satd_24x32_neon);
+    p.puLUMA_32x8.satd  = PFX(pixel_satd_32x8_neon);
+    p.puLUMA_8x32.satd  = PFX(pixel_satd_8x32_neon);
+    p.puLUMA_64x48.satd = PFX(pixel_satd_64x48_sve);
+    p.puLUMA_48x64.satd = PFX(pixel_satd_48x64_neon);
+    p.puLUMA_64x16.satd = PFX(pixel_satd_64x16_neon);
+    p.puLUMA_16x64.satd = PFX(pixel_satd_16x64_neon);
+
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd   = PFX(pixel_satd_4x4_sve);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd   = PFX(pixel_satd_8x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd   = PFX(pixel_satd_8x4_sve);
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd   = PFX(pixel_satd_4x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd  = PFX(pixel_satd_16x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd  = PFX(pixel_satd_8x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd  = PFX(pixel_satd_16x4_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd  = PFX(pixel_satd_4x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd  = PFX(pixel_satd_32x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd  = PFX(pixel_satd_8x32_neon);
+
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd   = PFX(pixel_satd_4x8_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd  = PFX(pixel_satd_8x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd   = PFX(pixel_satd_4x4_sve);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd   = PFX(pixel_satd_8x8_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd  = PFX(pixel_satd_4x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd  = PFX(pixel_satd_8x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd  = PFX(pixel_satd_8x12_sve);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd   = PFX(pixel_satd_8x4_sve);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd  = PFX(pixel_satd_16x8_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd  = PFX(pixel_satd_4x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd  = PFX(pixel_satd_8x64_neon);
+
+    // sa8d
+    p.cuBLOCK_4x4.sa8d   = PFX(pixel_satd_4x4_sve);
+    p.cuBLOCK_8x8.sa8d   = PFX(pixel_sa8d_8x8_neon);
+    p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
+    p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
+    p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
+
+    // dequant_scaling
+    p.dequant_scaling = PFX(dequant_scaling_neon);
+    p.dequant_normal  = PFX(dequant_normal_neon);
+
+    // ssim_4x4x2_core
+    p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
+
+    // ssimDist
+    p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_neon);
+    p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_neon);
+    p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_neon);
+    p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_neon);
+    p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_neon);
+
+    // normFact
+    p.cuBLOCK_8x8.normFact = PFX(normFact8_neon);
+    p.cuBLOCK_16x16.normFact = PFX(normFact16_neon);
+    p.cuBLOCK_32x32.normFact = PFX(normFact32_neon);
+    p.cuBLOCK_64x64.normFact = PFX(normFact64_neon);
+
+    // psy_cost_pp
+    p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
+
+    p.weight_pp = PFX(weight_pp_neon);
+#if !defined(__APPLE__)
+    p.scanPosLast = PFX(scanPosLast_neon);
+#endif
+    p.costCoeffNxN = PFX(costCoeffNxN_neon);
 #endif
+
+    // quant
+    p.quant = PFX(quant_sve);
+    p.nquant = PFX(nquant_neon);
+}
 #endif
 
+#if defined(HAVE_SVE2)
+void setupSve2Primitives(EncoderPrimitives &p)
+{
+    // Once SVE/SVE2 implementations of these primitives exist, change the
+    // following definitions to point to them.
+    setupPixelPrimitives_neon(p);
+    setupFilterPrimitives_neon(p);
+    setupDCTPrimitives_neon(p);
+    setupLoopFilterPrimitives_neon(p);
+    setupIntraPrimitives_neon(p);
+
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sNONALIGNED);
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
+    CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sALIGNED);
+    CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
+    CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
+    LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
+    LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
+
 #if !HIGH_BIT_DEPTH
-        p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
+    LUMA_PU_MULTIPLE_ARCHS_1(luma_vpp, interp_8tap_vert_pp, neon);
+    LUMA_PU_MULTIPLE_ARCHS_2(luma_vpp, interp_8tap_vert_pp, sve2);
+    LUMA_PU_MULTIPLE_ARCHS_1(luma_vsp, interp_8tap_vert_sp, sve2);
+    LUMA_PU_MULTIPLE_ARCHS_2(luma_vsp, interp_8tap_vert_sp, neon);
+    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sve2);
+    ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
+    ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
+    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, sve2);
+    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
+    CHROMA_420_VERT_FILTERS_NEON();
+    CHROMA_420_VERT_FILTERS_CAN_USE_SVE2();
+    CHROMA_422_VERT_FILTERS_NEON();
+    CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(sve2);
+    CHROMA_444_VERT_FILTERS_NEON();
+    CHROMA_444_VERT_FILTERS_CAN_USE_SVE2();
+    CHROMA_420_FILTERS_NEON();
+    CHROMA_420_FILTERS_CAN_USE_SVE2();
+    CHROMA_422_FILTERS_NEON();
+    CHROMA_422_FILTERS_CAN_USE_SVE2();
+    CHROMA_444_FILTERS_NEON();
+    CHROMA_444_FILTERS_CAN_USE_SVE2();
+
+    // Blockcopy_pp
+    LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
+    LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
+    CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
+    CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
+    CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
+    CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
+    p.cuBLOCK_4x4.copy_pp   = PFX(blockcopy_pp_4x4_neon);
+    p.cuBLOCK_8x8.copy_pp   = PFX(blockcopy_pp_8x8_neon);
+    p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
+    p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
+    p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_sve);
+
 #endif // !HIGH_BIT_DEPTH
 
+    // Blockcopy_ss
+    p.cuBLOCK_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
+    p.cuBLOCK_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
+    p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
+    p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
+    p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_sve);
+
+    // Blockcopy_ps
+    p.cuBLOCK_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
+    p.cuBLOCK_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
+    p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
+    p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
+    p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_sve);
+
+    // Blockcopy_sp
+    p.cuBLOCK_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
+    p.cuBLOCK_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
+    p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
+    p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
+    p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
+
+    // chroma blockcopy_ss
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss   = PFX(blockcopy_ss_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss   = PFX(blockcopy_ss_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss   = PFX(blockcopy_ss_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss  = PFX(blockcopy_ss_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_sve);
+
+    // chroma blockcopy_ps
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps   = PFX(blockcopy_ps_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps   = PFX(blockcopy_ps_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps   = PFX(blockcopy_ps_4x8_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps  = PFX(blockcopy_ps_8x16_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_sve);
+
+    // chroma blockcopy_sp
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp   = PFX(blockcopy_sp_4x4_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp   = PFX(blockcopy_sp_8x8_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp   = PFX(blockcopy_sp_4x8_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp  = PFX(blockcopy_sp_8x16_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_sve);
+
+    // Block_fill
+    LUMA_TU_NEON(blockfill_sALIGNED, blockfill_s);
+    LUMA_TU_CAN_USE_SVE(blockfill_sALIGNED, blockfill_s);
+    LUMA_TU_NEON(blockfill_sNONALIGNED, blockfill_s);
+    LUMA_TU_CAN_USE_SVE(blockfill_sNONALIGNED, blockfill_s);
+
+    // copy_count
+    p.cuBLOCK_4x4.copy_cnt     = PFX(copy_cnt_4_neon);
+    p.cuBLOCK_8x8.copy_cnt     = PFX(copy_cnt_8_neon);
+    p.cuBLOCK_16x16.copy_cnt   = PFX(copy_cnt_16_neon);
+    p.cuBLOCK_32x32.copy_cnt   = PFX(copy_cnt_32_neon);
+
+    // count nonzero
+    p.cuBLOCK_4x4.count_nonzero     = PFX(count_nonzero_4_neon);
+    p.cuBLOCK_8x8.count_nonzero     = PFX(count_nonzero_8_neon);
+    p.cuBLOCK_16x16.count_nonzero   = PFX(count_nonzero_16_neon);
+    p.cuBLOCK_32x32.count_nonzero   = PFX(count_nonzero_32_neon);
+
+    // cpy2Dto1D_shl
+    p.cuBLOCK_4x4.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_4x4_neon);
+    p.cuBLOCK_8x8.cpy2Dto1D_shl   = PFX(cpy2Dto1D_shl_8x8_neon);
+    p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
+    p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
+    p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
+
+    // cpy2Dto1D_shr
+    p.cuBLOCK_4x4.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
+    p.cuBLOCK_8x8.cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
+    p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
+    p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
+
+    // cpy1Dto2D_shl
+    p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_4x4_neon);
+    p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED      = PFX(cpy1Dto2D_shl_8x8_neon);
+    p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_16x16_sve);
+    p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_32x32_sve);
+    p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED    = PFX(cpy1Dto2D_shl_64x64_sve);
+
+    p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_4x4_neon);
+    p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED   = PFX(cpy1Dto2D_shl_8x8_neon);
+    p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
+    p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
+    p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
+
+    // cpy1Dto2D_shr
+    p.cuBLOCK_4x4.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
+    p.cuBLOCK_8x8.cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
+    p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
+    p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
+    p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
+
+#if !HIGH_BIT_DEPTH
+    // pixel_avg_pp
+    LUMA_PU_NEON_2(pixelavg_ppNONALIGNED, pixel_avg_pp);
+    LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_ppNONALIGNED, pixel_avg_pp, sve2);
+    LUMA_PU_NEON_2(pixelavg_ppALIGNED, pixel_avg_pp);
+    LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_ppALIGNED, pixel_avg_pp, sve2);
+
+    // addAvg
+    LUMA_PU_NEON_3(addAvgNONALIGNED, addAvg);
+    LUMA_PU_CAN_USE_SVE2(addAvgNONALIGNED, addAvg);
+    LUMA_PU_NEON_3(addAvgALIGNED, addAvg);
+    LUMA_PU_CAN_USE_SVE2(addAvgALIGNED, addAvg);
+    CHROMA_420_PU_NEON_2(addAvgNONALIGNED, addAvg);
+    CHROMA_420_PU_MULTIPLE_ARCHS(addAvgNONALIGNED, addAvg, sve2);
+    CHROMA_420_PU_NEON_2(addAvgALIGNED, addAvg);
+    CHROMA_420_PU_MULTIPLE_ARCHS(addAvgALIGNED, addAvg, sve2);
+    CHROMA_422_PU_NEON_2(addAvgNONALIGNED, addAvg);
+    CHROMA_422_PU_CAN_USE_SVE2(addAvgNONALIGNED, addAvg);
+    CHROMA_422_PU_NEON_2(addAvgALIGNED, addAvg);
+    CHROMA_422_PU_CAN_USE_SVE2(addAvgALIGNED, addAvg);
+
+    // sad
+    ALL_LUMA_PU(sad, pixel_sad, sve2);
+    ALL_LUMA_PU(sad_x3, sad_x3, sve2);
+    ALL_LUMA_PU(sad_x4, sad_x4, sve2);
+
+    // sse_pp
+    p.cuBLOCK_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
+    p.cuBLOCK_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+    p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
+    p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_sve2);
+    p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_sve2);
+
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp   = PFX(pixel_sse_pp_4x4_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp   = PFX(pixel_sse_pp_4x8_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp  = PFX(pixel_sse_pp_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_sve2);
+
+    // sse_ss
+    p.cuBLOCK_4x4.sse_ss   = PFX(pixel_sse_ss_4x4_sve2);
+    p.cuBLOCK_8x8.sse_ss   = PFX(pixel_sse_ss_8x8_sve2);
+    p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_sve2);
+    p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_sve2);
+    p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_sve2);
+
+    // ssd_s
+    p.cuBLOCK_4x4.ssd_sNONALIGNED   = PFX(pixel_ssd_s_4x4_sve2);
+    p.cuBLOCK_8x8.ssd_sNONALIGNED   = PFX(pixel_ssd_s_8x8_sve2);
+    p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_sve2);
+    p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_sve2);
+
+    p.cuBLOCK_4x4.ssd_sALIGNED   = PFX(pixel_ssd_s_4x4_sve2);
+    p.cuBLOCK_8x8.ssd_sALIGNED   = PFX(pixel_ssd_s_8x8_sve2);
+    p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_sve2);
+    p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_sve2);
+
+    // pixel_var
+    p.cuBLOCK_8x8.var   = PFX(pixel_var_8x8_sve2);
+    p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_sve2);
+    p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_sve2);
+    p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_sve2);
+
+    // calc_Residual
+    p.cuBLOCK_4x4.calcresidualNONALIGNED   = PFX(getResidual4_neon);
+    p.cuBLOCK_8x8.calcresidualNONALIGNED   = PFX(getResidual8_neon);
+    p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_sve2);
+    p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_sve2);
+
+    p.cuBLOCK_4x4.calcresidualALIGNED   = PFX(getResidual4_neon);
+    p.cuBLOCK_8x8.calcresidualALIGNED   = PFX(getResidual8_neon);
+    p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_sve2);
+    p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_sve2);
+
+    // pixel_sub_ps
+    p.cuBLOCK_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+    p.cuBLOCK_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+    p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
+    p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_sve2);
+    p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_sve2);
+
+    // chroma sub_ps
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps   = PFX(pixel_sub_ps_4x8_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps  = PFX(pixel_sub_ps_8x16_sve);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_sve2);
+
+    // pixel_add_ps
+    p.cuBLOCK_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_sve2);
+    p.cuBLOCK_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_sve2);
+    p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_sve2);
+    p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_sve2);
+    p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_sve2);
+
+    p.cuBLOCK_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_sve2);
+    p.cuBLOCK_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_sve2);
+    p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_sve2);
+    p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_sve2);
+    p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_sve2);
+
+    // chroma add_ps
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED   = PFX(pixel_add_ps_4x4_sve2);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED   = PFX(pixel_add_ps_8x8_sve2);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_sve2);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED   = PFX(pixel_add_ps_4x8_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED  = PFX(pixel_add_ps_8x16_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_sve2);
+
+    p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED   = PFX(pixel_add_ps_4x4_sve2);
+    p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED   = PFX(pixel_add_ps_8x8_sve2);
+    p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_sve2);
+    p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED   = PFX(pixel_add_ps_4x8_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED  = PFX(pixel_add_ps_8x16_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_sve2);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_sve2);
+
+    //scale2D_64to32
+    p.scale2D_64to32  = PFX(scale2D_64to32_neon);
+
+    // scale1D_128to64
+    p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_sve2);
+    p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_sve2);
+
+    // planecopy
+    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
+    // satd
+    p.puLUMA_4x4.satd   = PFX(pixel_satd_4x4_sve);
+    p.puLUMA_8x8.satd   = PFX(pixel_satd_8x8_neon);
+    p.puLUMA_16x16.satd = PFX(pixel_satd_16x16_neon);
+    p.puLUMA_32x32.satd = PFX(pixel_satd_32x32_sve);
+    p.puLUMA_64x64.satd = PFX(pixel_satd_64x64_neon);
+    p.puLUMA_8x4.satd   = PFX(pixel_satd_8x4_sve);
+    p.puLUMA_4x8.satd   = PFX(pixel_satd_4x8_neon);
+    p.puLUMA_16x8.satd  = PFX(pixel_satd_16x8_neon);
+    p.puLUMA_8x16.satd  = PFX(pixel_satd_8x16_neon);
+    p.puLUMA_16x32.satd = PFX(pixel_satd_16x32_neon);
+    p.puLUMA_32x16.satd = PFX(pixel_satd_32x16_sve);
+    p.puLUMA_64x32.satd = PFX(pixel_satd_64x32_neon);
+    p.puLUMA_32x64.satd = PFX(pixel_satd_32x64_neon);
+    p.puLUMA_16x12.satd = PFX(pixel_satd_16x12_neon);
+    p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
+    p.puLUMA_16x4.satd  = PFX(pixel_satd_16x4_neon);
+    p.puLUMA_4x16.satd  = PFX(pixel_satd_4x16_neon);
+    p.puLUMA_32x24.satd = PFX(pixel_satd_32x24_neon);
+    p.puLUMA_24x32.satd = PFX(pixel_satd_24x32_neon);
+    p.puLUMA_32x8.satd  = PFX(pixel_satd_32x8_neon);
+    p.puLUMA_8x32.satd  = PFX(pixel_satd_8x32_neon);
+    p.puLUMA_64x48.satd = PFX(pixel_satd_64x48_sve);
+    p.puLUMA_48x64.satd = PFX(pixel_satd_48x64_neon);
+    p.puLUMA_64x16.satd = PFX(pixel_satd_64x16_neon);
+    p.puLUMA_16x64.satd = PFX(pixel_satd_16x64_neon);
+
+    p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd   = PFX(pixel_satd_4x4_sve);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd   = PFX(pixel_satd_8x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd   = PFX(pixel_satd_8x4_sve);
+    p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd   = PFX(pixel_satd_4x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd  = PFX(pixel_satd_16x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd  = PFX(pixel_satd_8x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd  = PFX(pixel_satd_16x4_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd  = PFX(pixel_satd_4x16_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd  = PFX(pixel_satd_32x8_neon);
+    p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd  = PFX(pixel_satd_8x32_neon);
+
+    p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd   = PFX(pixel_satd_4x8_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd  = PFX(pixel_satd_8x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd   = PFX(pixel_satd_4x4_sve);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd   = PFX(pixel_satd_8x8_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd  = PFX(pixel_satd_4x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd  = PFX(pixel_satd_8x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd  = PFX(pixel_satd_8x12_sve);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd   = PFX(pixel_satd_8x4_sve);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd  = PFX(pixel_satd_16x8_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd  = PFX(pixel_satd_4x32_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
+    p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd  = PFX(pixel_satd_8x64_neon);
+
+    // sa8d
+    p.cuBLOCK_4x4.sa8d   = PFX(pixel_satd_4x4_sve);
+    p.cuBLOCK_8x8.sa8d   = PFX(pixel_sa8d_8x8_neon);
+    p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
+    p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
+    p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_sve);
+    p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
+    p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
+    p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
+
+    // dequant_scaling
+    p.dequant_scaling = PFX(dequant_scaling_sve2);
+    p.dequant_normal  = PFX(dequant_normal_sve2);
+
+    // ssim_4x4x2_core
+    p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_sve2);
+
+    // ssimDist
+    p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_sve2);
+    p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_sve2);
2040
+    p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_sve2);
2041
+    p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_sve2);
2042
+    p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_sve2);
2043
+
2044
+    // normFact
2045
+    p.cuBLOCK_8x8.normFact = PFX(normFact8_sve2);
2046
+    p.cuBLOCK_16x16.normFact = PFX(normFact16_sve2);
2047
+    p.cuBLOCK_32x32.normFact = PFX(normFact32_sve2);
2048
+    p.cuBLOCK_64x64.normFact = PFX(normFact64_sve2);
2049
+
2050
+    // psy_cost_pp
2051
+    p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
2052
+
2053
+    p.weight_pp = PFX(weight_pp_neon);
2054
+#if !defined(__APPLE__)
2055
+    p.scanPosLast = PFX(scanPosLast_neon);
2056
+#endif
2057
+    p.costCoeffNxN = PFX(costCoeffNxN_neon);
2058
+#endif
2059
+
2060
+    // quant
2061
+    p.quant = PFX(quant_sve);
2062
+    p.nquant = PFX(nquant_neon);
2063
+}
2064
+#endif
2065
+
2066
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
2067
+{
2068
+
2069
+#ifdef HAVE_SVE2
2070
+    if (cpuMask & X265_CPU_SVE2)
2071
+    {
2072
+        setupSve2Primitives(p);
2073
     }
2074
+    else if (cpuMask & X265_CPU_SVE)
2075
+    {
2076
+        setupSvePrimitives(p);
2077
+    }
2078
+    else if (cpuMask & X265_CPU_NEON)
2079
+    {
2080
+        setupNeonPrimitives(p);
2081
+    }
2082
+
2083
+#elif defined(HAVE_SVE)
2084
+    if (cpuMask & X265_CPU_SVE)
2085
+    {
2086
+        setupSvePrimitives(p);
2087
+    }
2088
+    else if (cpuMask & X265_CPU_NEON)
2089
+    {
2090
+        setupNeonPrimitives(p);
2091
+    }
2092
+
2093
+#else
2094
+    if (cpuMask & X265_CPU_NEON)
2095
+    {
2096
+        setupNeonPrimitives(p);
2097
+    }
2098
+#endif
2099
+
2100
 }
2101
 } // namespace X265_NS
2102
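
Each setupXXXPrimitives() call above overwrites function-pointer slots of the EncoderPrimitives table, and setupAssemblyPrimitives() then picks the widest extension that both the build flags (HAVE_SVE2/HAVE_SVE) and the runtime cpuMask allow, so an SVE2-enabled binary still runs correctly on NEON-only cores. A minimal C sketch of the same dispatch shape; the type and function names here are illustrative stand-ins, not the real x265 declarations:

    #include <stdint.h>

    /* Hypothetical miniature of x265's EncoderPrimitives table. */
    typedef int (*satd_fn)(const uint8_t* pix1, intptr_t stride1,
                           const uint8_t* pix2, intptr_t stride2);
    typedef struct { satd_fn satd_4x4; } Primitives;

    enum { CPU_NEON = 1 << 0, CPU_SVE = 1 << 1, CPU_SVE2 = 1 << 2 };

    /* Portable stub standing in for the assembly kernels. */
    static int satd_4x4_c(const uint8_t* a, intptr_t sa,
                          const uint8_t* b, intptr_t sb)
    { (void)a; (void)sa; (void)b; (void)sb; return 0; }

    static void setupNeonPrims(Primitives* p) { p->satd_4x4 = satd_4x4_c; }
    static void setupSvePrims(Primitives* p)  { p->satd_4x4 = satd_4x4_c; }
    static void setupSve2Prims(Primitives* p) { p->satd_4x4 = satd_4x4_c; }

    static void setupAsmPrims(Primitives* p, uint32_t cpuMask)
    {
        /* Widest reported extension wins; each setup fills the whole table. */
        if      (cpuMask & CPU_SVE2) setupSve2Prims(p);
        else if (cpuMask & CPU_SVE)  setupSvePrims(p);
        else if (cpuMask & CPU_NEON) setupNeonPrims(p);
    }
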
x265_3.6.tar.gz/source/common/aarch64/asm-sve.S Added
41
 
1
@@ -0,0 +1,39 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+.macro ABS2_SVE a b c
30
+    abs             \a, \c\()/m, \a
31
+    abs             \b, \c\()/m, \b
32
+.endm
33
+
34
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
35
+    ABS2_SVE        \z0, \z1, p0
36
+    ABS2_SVE        \z2, \z3, p0
37
+    ABS2_SVE        \z4, \z5, p0
38
+    ABS2_SVE        \z6, \z7, p0
39
+.endm
40
+
41
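
The only difference from the NEON ABS2/ABS8 macros in asm.S is the governing predicate: SVE's merging form ("/m") leaves lanes whose predicate bit is clear untouched. A scalar model of one predicated abs over int16 lanes, for orientation only:

    #include <stdint.h>
    #include <stdlib.h>

    /* Model of "abs z, p/m, z": |x| where the predicate is set,
     * previous contents kept where it is clear (merging form). */
    static void abs_merge_i16(int16_t* lanes, const uint8_t* pred, int n)
    {
        for (int i = 0; i < n; i++)
            if (pred[i])
                lanes[i] = (int16_t)abs(lanes[i]);
    }
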
x265_3.5.tar.gz/source/common/aarch64/asm.S -> x265_3.6.tar.gz/source/common/aarch64/asm.S Changed
173
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -21,34 +22,74 @@
12
  * For more information, contact us at license @ x265.com.
13
  *****************************************************************************/
14
 
15
+#ifndef ASM_S_  // #include guards
16
+#define ASM_S_
17
+
18
 .arch           armv8-a
19
 
20
+#define PFX3(prefix, name) prefix ## _ ## name
21
+#define PFX2(prefix, name) PFX3(prefix, name)
22
+#define PFX(name)          PFX2(X265_NS, name)
23
+
24
+#ifdef __APPLE__
25
+#define PREFIX 1
26
+#endif
27
+
28
 #ifdef PREFIX
29
 #define EXTERN_ASM _
30
+#define HAVE_AS_FUNC 0
31
+#elif defined __clang__
32
+#define EXTERN_ASM
33
+#define HAVE_AS_FUNC 0
34
+#define PREFIX 1
35
 #else
36
 #define EXTERN_ASM
37
+#define HAVE_AS_FUNC 1
38
 #endif
39
 
40
 #ifdef __ELF__
41
 #define ELF
42
 #else
43
+#ifdef PREFIX
44
+#define ELF #
45
+#else
46
 #define ELF @
47
 #endif
48
-
49
-#define HAVE_AS_FUNC 1
50
+#endif
51
 
52
 #if HAVE_AS_FUNC
53
 #define FUNC
54
 #else
55
+#ifdef PREFIX
56
+#define FUNC #
57
+#else
58
 #define FUNC @
59
 #endif
60
+#endif
61
+
62
+#define GLUE(a, b) a ## b
63
+#define JOIN(a, b) GLUE(a, b)
64
+
65
+#define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
66
+
67
+#ifdef __APPLE__
68
+.macro endfunc
69
+ELF .size \name, . - \name
70
+FUNC .endfunc
71
+.endm
72
+#endif
73
 
74
 .macro function name, export=1
75
+#ifdef __APPLE__
76
+    .global \name
77
+    endfunc
78
+#else
79
     .macro endfunc
80
 ELF     .size   \name, . - \name
81
 FUNC    .endfunc
82
         .purgem endfunc
83
     .endm
84
+#endif
85
         .align  2
86
 .if \export == 1
87
         .global EXTERN_ASM\name
88
@@ -64,6 +105,83 @@
89
 .endif
90
 .endm
91
 
92
+.macro  const   name, align=2
93
+    .macro endconst
94
+ELF     .size   \name, . - \name
95
+        .purgem endconst
96
+    .endm
97
+#ifdef __MACH__
98
+    .const_data
99
+#else
100
+    .section .rodata
101
+#endif
102
+    .align          \align
103
+\name:
104
+.endm
105
+
106
+.macro  movrel rd, val, offset=0
107
+#if defined(__APPLE__)
108
+  .if \offset < 0
109
+        adrp            \rd, \val@PAGE
110
+        add             \rd, \rd, \val@PAGEOFF
111
+        sub             \rd, \rd, -(\offset)
112
+  .else
113
+        adrp            \rd, \val+(\offset)@PAGE
114
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
115
+  .endif
116
+#elif defined(PIC) && defined(_WIN32)
117
+  .if \offset < 0
118
+        adrp            \rd, \val
119
+        add             \rd, \rd, :lo12:\val
120
+        sub             \rd, \rd, -(\offset)
121
+  .else
122
+        adrp            \rd, \val+(\offset)
123
+        add             \rd, \rd, :lo12:\val+(\offset)
124
+  .endif
125
+#else
126
+        adrp            \rd, \val+(\offset)
127
+        add             \rd, \rd, :lo12:\val+(\offset)
128
+#endif
129
+.endm
130
 
131
 #define FENC_STRIDE 64
132
 #define FDEC_STRIDE 32
133
+
134
+.macro SUMSUB_AB sum, diff, a, b
135
+    add             \sum,  \a, \b
136
+    sub             \diff, \a, \b
137
+.endm
138
+
139
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
140
+    SUMSUB_AB       \s1, \d1, \a, \b
141
+    SUMSUB_AB       \s2, \d2, \c, \d
142
+.endm
143
+
144
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
145
+    SUMSUB_ABCD     \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
146
+    SUMSUB_ABCD     \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
147
+.endm
148
+
149
+.macro ABS2 a b
150
+    abs             \a, \a
151
+    abs             \b, \b
152
+.endm
153
+
154
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
155
+    ABS2            \v0, \v1
156
+    ABS2            \v2, \v3
157
+    ABS2            \v4, \v5
158
+    ABS2            \v6, \v7
159
+.endm
160
+
161
+.macro vtrn t1, t2, s1, s2
162
+    trn1            \t1, \s1, \s2
163
+    trn2            \t2, \s1, \s2
164
+.endm
165
+
166
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
167
+    vtrn            \t1, \t2, \s1, \s2
168
+    vtrn            \t3, \t4, \s3, \s4
169
+.endm
170
+
171
+#endif
172
\ No newline at end of file
173
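
The new SUMSUB_AB macro is the add/subtract butterfly, and HADAMARD4_V chains two butterfly stages into a 4-point Hadamard transform, the building block of the satd kernels registered earlier. A scalar rendering of exactly the macro's data flow, with register names mapped to array slots:

    /* r[0..3] play the roles of \r1..\r4; t1..t4 are the temporaries. */
    static void hadamard4_v(int r[4])
    {
        int t1 = r[0] + r[1], t2 = r[0] - r[1];  /* SUMSUB_AB t1,t2 <- r1,r2 */
        int t3 = r[2] + r[3], t4 = r[2] - r[3];  /* SUMSUB_AB t3,t4 <- r3,r4 */
        r[0] = t1 + t3; r[2] = t1 - t3;          /* SUMSUB_AB r1,r3 <- t1,t3 */
        r[1] = t2 + t4; r[3] = t2 - t4;          /* SUMSUB_AB r2,r4 <- t2,t4 */
    }
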
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-common.S Added
56
 
1
@@ -0,0 +1,54 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains macros written using the NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
33
+.macro cpy1Dto2D_shr_start
34
+    add             x2, x2, x2
35
+    dup             v0.8h, w3
36
+    cmeq            v1.8h, v1.8h, v1.8h
37
+    sshl            v1.8h, v1.8h, v0.8h
38
+    sri             v1.8h, v1.8h, #1
39
+    neg             v0.8h, v0.8h
40
+.endm
41
+
42
+.macro cpy2Dto1D_shr_start
43
+    add             x2, x2, x2
44
+    dup             v0.8h, w3
45
+    cmeq            v1.8h, v1.8h, v1.8h
46
+    sshl            v1.8h, v1.8h, v0.8h
47
+    sri             v1.8h, v1.8h, #1
48
+    neg             v0.8h, v0.8h
49
+.endm
50
+
51
+const xtn_xtn2_table, align=4
52
+.byte    0, 2, 4, 6, 8, 10, 12, 14
53
+.byte    16, 18, 20, 22, 24, 26, 28, 30
54
+endconst
55
+
56
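
Both macros build their rounding constant without a literal load: cmeq sets all bits (-1), sshl by the shift count gives -(1 << shift), and the shift-right-insert by one halves it to -(1 << (shift - 1)), which the callers then subtract, i.e. they add the usual round-to-nearest bias before shifting. In C, the whole cpy1Dto2D_shr operation reduces to the following reference (block size passed explicitly in this sketch):

    #include <stdint.h>

    /* Copy a flat coefficient array into a strided block, rounding each
     * value before the arithmetic right shift. */
    static void cpy1Dto2D_shr_c(int16_t* dst, const int16_t* src,
                                intptr_t dstStride, int shift, int size)
    {
        const int16_t round = (int16_t)(1 << (shift - 1));
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
                dst[x] = (int16_t)((src[x] + round) >> shift);
            src += size;
            dst += dstStride;
        }
    }
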
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S Added
1418
 
1
@@ -0,0 +1,1416 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "blockcopy8-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
41
+ *
42
+ * r0   - a
43
+ * r1   - stridea
44
+ * r2   - b
45
+ * r3   - strideb */
46
+
47
+function PFX(blockcopy_sp_4x4_sve)
48
+    ptrue           p0.h, vl4
49
+.rept 2
50
+    ld1h            {z0.h}, p0/z, x2
51
+    add             x2, x2, x3, lsl #1
52
+    st1b            {z0.h}, p0, x0
53
+    add             x0, x0, x1
54
+    ld1h            {z1.h}, p0/z, x2
55
+    add             x2, x2, x3, lsl #1
56
+    st1b            {z1.h}, p0, x0
57
+    add             x0, x0, x1
58
+.endr
59
+    ret
60
+endfunc
61
+
62
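
Per the signature comment above, blockcopy_sp narrows int16 coefficients back to pixels: the predicated ld1h/st1b pair loads halfword lanes and stores only their low bytes, so the truncation costs nothing extra. The equivalent portable C, with pixel taken as uint8_t (8-bit build) and the block size made explicit for this sketch:

    #include <stdint.h>

    static void blockcopy_sp_c(uint8_t* a, intptr_t stridea,
                               const int16_t* b, intptr_t strideb,
                               int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                a[x] = (uint8_t)b[x];  /* st1b of an .h lane: keep low byte */
            a += stridea;
            b += strideb;
        }
    }
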
+function PFX(blockcopy_sp_8x8_sve)
63
+    ptrue           p0.h, vl8
64
+.rept 4
65
+    ld1h            {z0.h}, p0/z, x2
66
+    add             x2, x2, x3, lsl #1
67
+    st1b            {z0.h}, p0, x0
68
+    add            x0, x0, x1
69
+    ld1h            {z1.h}, p0/z, x2
70
+    add             x2, x2, x3, lsl #1
71
+    st1b            {z1.h}, p0, x0
72
+    add            x0, x0, x1
73
+.endr
74
+    ret
75
+endfunc
76
+
77
+function PFX(blockcopy_sp_16x16_sve)
78
+    rdvl            x9, #1
79
+    cmp             x9, #16
80
+    bgt             .vl_gt_16_blockcopy_sp_16_16
81
+    lsl             x3, x3, #1
82
+    movrel          x11, xtn_xtn2_table
83
+    ld1             {v31.16b}, x11
84
+.rept 8
85
+    ld1             {v0.8h-v1.8h}, x2, x3
86
+    ld1             {v2.8h-v3.8h}, x2, x3
87
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
88
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
89
+    st1             {v0.16b}, x0, x1
90
+    st1             {v1.16b}, x0, x1
91
+.endr
92
+    ret
93
+.vl_gt_16_blockcopy_sp_16_16:
94
+    ptrue           p0.h, vl16
95
+.rept 8
96
+    ld1h            {z0.h}, p0/z, x2
97
+    st1b            {z0.h}, p0, x0
98
+    add             x2, x2, x3, lsl #1
99
+    add             x0, x0, x1
100
+    ld1h            {z1.h}, p0/z, x2
101
+    st1b            {z1.h}, p0, x0
102
+    add             x2, x2, x3, lsl #1
103
+    add             x0, x0, x1
104
+.endr
105
+    ret
106
+endfunc
107
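
The rdvl/cmp prologue seen here recurs through the rest of this file: RDVL Xd, #1 reads the SVE register width in bytes, and when it is 16 (a 128-bit implementation) the function keeps the NEON-style body, since SVE offers no extra width there. The same probe from C, assuming a GCC or Clang toolchain built with SVE enabled (e.g. -march=armv8-a+sve):

    #include <stdint.h>

    static inline uint64_t sve_vl_bytes(void)
    {
        uint64_t vl;
        __asm__("rdvl %0, #1" : "=r"(vl));  /* SVE vector length in bytes */
        return vl;
    }
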
+
108
+function PFX(blockcopy_sp_32x32_sve)
109
+    mov             w12, #4
110
+    rdvl            x9, #1
111
+    cmp             x9, #16
112
+    bgt             .vl_gt_16_blockcopy_sp_32_32
113
+    lsl             x3, x3, #1
114
+    movrel          x11, xtn_xtn2_table
115
+    ld1             {v31.16b}, x11
116
+.loop_csp32_sve:
117
+    sub             w12, w12, #1
118
+.rept 4
119
+    ld1             {v0.8h-v3.8h}, x2, x3
120
+    ld1             {v4.8h-v7.8h}, x2, x3
121
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
122
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
123
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
124
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
125
+    st1             {v0.16b-v1.16b}, x0, x1
126
+    st1             {v2.16b-v3.16b}, x0, x1
127
+.endr
128
+    cbnz            w12, .loop_csp32_sve
129
+    ret
130
+.vl_gt_16_blockcopy_sp_32_32:
131
+    cmp             x9, #48
132
+    bgt             .vl_gt_48_blockcopy_sp_32_32
133
+    ptrue           p0.h, vl16
134
+.vl_gt_16_loop_csp32_sve:
135
+    sub             w12, w12, #1
136
+.rept 4
137
+    ld1h            {z0.h}, p0/z, x2
138
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
139
+    st1b            {z0.h}, p0, x0
140
+    st1b            {z1.h}, p0, x0, #1, mul vl
141
+    add             x2, x2, x3, lsl #1
142
+    add             x0, x0, x1
143
+    ld1h            {z2.h}, p0/z, x2
144
+    ld1h            {z3.h}, p0/z, x2, #1, mul vl
145
+    st1b            {z2.h}, p0, x0
146
+    st1b            {z3.h}, p0, x0, #1, mul vl
147
+    add             x2, x2, x3, lsl #1
148
+    add             x0, x0, x1
149
+.endr
150
+    cbnz            w12, .vl_gt_16_loop_csp32_sve
151
+    ret
152
+.vl_gt_48_blockcopy_sp_32_32:
153
+    ptrue           p0.h, vl32
154
+.vl_gt_48_loop_csp32_sve:
155
+    sub             w12, w12, #1
156
+.rept 4
157
+    ld1h            {z0.h}, p0/z, x2
158
+    st1b            {z0.h}, p0, x0
159
+    add             x2, x2, x3, lsl #1
160
+    add             x0, x0, x1
161
+    ld1h            {z1.h}, p0/z, x2
162
+    st1b            {z1.h}, p0, x0
163
+    add             x2, x2, x3, lsl #1
164
+    add             x0, x0, x1
165
+.endr
166
+    cbnz            w12, .vl_gt_48_loop_csp32_sve
167
+    ret
168
+endfunc
169
+
170
+function PFX(blockcopy_ps_16x16_sve)
171
+    rdvl            x9, #1
172
+    cmp             x9, #16
173
+    bgt             .vl_gt_16_blockcopy_ps_16_16
174
+    lsl             x1, x1, #1
175
+.rept 8
176
+    ld1             {v4.16b}, x2, x3
177
+    ld1             {v5.16b}, x2, x3
178
+    uxtl            v0.8h, v4.8b
179
+    uxtl2           v1.8h, v4.16b
180
+    uxtl            v2.8h, v5.8b
181
+    uxtl2           v3.8h, v5.16b
182
+    st1             {v0.8h-v1.8h}, x0, x1
183
+    st1             {v2.8h-v3.8h}, x0, x1
184
+.endr
185
+    ret
186
+.vl_gt_16_blockcopy_ps_16_16:
187
+    ptrue           p0.b, vl32
188
+.rept 16
189
+    ld1b            {z1.h}, p0/z, x2
190
+    st1h            {z1.h}, p0, x0
191
+    add             x0, x0, x1, lsl #1
192
+    add             x2, x2, x3
193
+.endr
194
+    ret
195
+endfunc
196
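
blockcopy_ps is the widening converse of blockcopy_sp: ld1b into .h lanes zero-extends each pixel to a halfword before the double-width store. Reference C under the same assumptions as the blockcopy_sp sketch above:

    #include <stdint.h>

    static void blockcopy_ps_c(int16_t* a, intptr_t stridea,
                               const uint8_t* b, intptr_t strideb,
                               int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                a[x] = (int16_t)b[x];  /* ld1b into .h lanes: zero-extend */
            a += stridea;
            b += strideb;
        }
    }
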
+
197
+function PFX(blockcopy_ps_32x32_sve)
198
+    rdvl            x9, #1
199
+    cmp             x9, #16
200
+    bgt             .vl_gt_16_blockcopy_ps_32_32
201
+    lsl             x1, x1, #1
202
+    mov             w12, #4
203
+.loop_cps32_sve:
204
+    sub             w12, w12, #1
205
+.rept 4
206
+    ld1             {v16.16b-v17.16b}, x2, x3
207
+    ld1             {v18.16b-v19.16b}, x2, x3
208
+    uxtl            v0.8h, v16.8b
209
+    uxtl2           v1.8h, v16.16b
210
+    uxtl            v2.8h, v17.8b
211
+    uxtl2           v3.8h, v17.16b
212
+    uxtl            v4.8h, v18.8b
213
+    uxtl2           v5.8h, v18.16b
214
+    uxtl            v6.8h, v19.8b
215
+    uxtl2           v7.8h, v19.16b
216
+    st1             {v0.8h-v3.8h}, x0, x1
217
+    st1             {v4.8h-v7.8h}, x0, x1
218
+.endr
219
+    cbnz            w12, .loop_cps32_sve
220
+    ret
221
+.vl_gt_16_blockcopy_ps_32_32:
222
+    cmp             x9, #48
223
+    bgt             .vl_gt_48_blockcopy_ps_32_32
224
+    ptrue           p0.b, vl32
225
+.rept 32
226
+    ld1b            {z2.h}, p0/z, x2
227
+    ld1b            {z3.h}, p0/z, x2, #1, mul vl
228
+    st1h            {z2.h}, p0, x0
229
+    st1h            {z3.h}, p0, x0, #1, mul vl
230
+    add             x0, x0, x1, lsl #1
231
+    add             x2, x2, x3
232
+.endr
233
+    ret
234
+.vl_gt_48_blockcopy_ps_32_32:
235
+    ptrue           p0.b, vl64
236
+.rept 32
237
+    ld1b            {z2.h}, p0/z, x2
238
+    st1h            {z2.h}, p0, x0
239
+    add             x0, x0, x1, lsl #1
240
+    add             x2, x2, x3
241
+.endr
242
+    ret
243
+endfunc
244
+
245
+function PFX(blockcopy_ps_64x64_sve)
246
+    rdvl            x9, #1
247
+    cmp             x9, #16
248
+    bgt             .vl_gt_16_blockcopy_ps_64_64
249
+    lsl             x1, x1, #1
250
+    sub             x1, x1, #64
251
+    mov             w12, #16
252
+.loop_cps64_sve:
253
+    sub             w12, w12, #1
254
+.rept 4
255
+    ld1             {v16.16b-v19.16b}, x2, x3
256
+    uxtl            v0.8h, v16.8b
257
+    uxtl2           v1.8h, v16.16b
258
+    uxtl            v2.8h, v17.8b
259
+    uxtl2           v3.8h, v17.16b
260
+    uxtl            v4.8h, v18.8b
261
+    uxtl2           v5.8h, v18.16b
262
+    uxtl            v6.8h, v19.8b
263
+    uxtl2           v7.8h, v19.16b
264
+    st1             {v0.8h-v3.8h}, x0, #64
265
+    st1             {v4.8h-v7.8h}, x0, x1
266
+.endr
267
+    cbnz            w12, .loop_cps64_sve
268
+    ret
269
+.vl_gt_16_blockcopy_ps_64_64:
270
+    cmp             x9, #48
271
+    bgt             .vl_gt_48_blockcopy_ps_64_64
272
+    ptrue           p0.b, vl32
273
+.rept 64
274
+    ld1b            {z4.h}, p0/z, x2
275
+    ld1b            {z5.h}, p0/z, x2, #1, mul vl
276
+    ld1b            {z6.h}, p0/z, x2, #2, mul vl
277
+    ld1b            {z7.h}, p0/z, x2, #3, mul vl
278
+    st1h            {z4.h}, p0, x0
279
+    st1h            {z5.h}, p0, x0, #1, mul vl
280
+    st1h            {z6.h}, p0, x0, #2, mul vl
281
+    st1h            {z7.h}, p0, x0, #3, mul vl
282
+    add             x0, x0, x1, lsl #1
283
+    add             x2, x2, x3
284
+.endr
285
+    ret
286
+.vl_gt_48_blockcopy_ps_64_64:
287
+    cmp             x9, #112
288
+    bgt             .vl_gt_112_blockcopy_ps_64_64
289
+    ptrue           p0.b, vl64
290
+.rept 64
291
+    ld1b            {z4.h}, p0/z, x2
292
+    ld1b            {z5.h}, p0/z, x2, #1, mul vl
293
+    st1h            {z4.h}, p0, x0
294
+    st1h            {z5.h}, p0, x0, #1, mul vl
295
+    add             x0, x0, x1, lsl #1
296
+    add             x2, x2, x3
297
+.endr
298
+    ret
299
+.vl_gt_112_blockcopy_ps_64_64:
300
+    ptrue           p0.b, vl128
301
+.rept 64
302
+    ld1b            {z4.h}, p0/z, x2
303
+    st1h            {z4.h}, p0, x0
304
+    add             x0, x0, x1, lsl #1
305
+    add             x2, x2, x3
306
+.endr
307
+    ret
308
+
309
+endfunc
310
+
311
+function PFX(blockcopy_ss_16x16_sve)
312
+    rdvl            x9, #1
313
+    cmp             x9, #16
314
+    bgt             .vl_gt_16_blockcopy_ss_16_16
315
+    lsl             x1, x1, #1
316
+    lsl             x3, x3, #1
317
+.rept 8
318
+    ld1             {v0.8h-v1.8h}, x2, x3
319
+    ld1             {v2.8h-v3.8h}, x2, x3
320
+    st1             {v0.8h-v1.8h}, x0, x1
321
+    st1             {v2.8h-v3.8h}, x0, x1
322
+.endr
323
+    ret
324
+.vl_gt_16_blockcopy_ss_16_16:
325
+    ptrue           p0.h, vl16
326
+.rept 16
327
+    ld1h            {z0.h}, p0/z, x2
328
+    st1h            {z0.h}, p0, x0
329
+    add             x2, x2, x3, lsl #1
330
+    add             x0, x0, x1, lsl #1
331
+.endr
332
+    ret
333
+endfunc
334
+
335
+function PFX(blockcopy_ss_32x32_sve)
336
+    rdvl            x9, #1
337
+    cmp             x9, #16
338
+    bgt             .vl_gt_16_blockcopy_ss_32_32
339
+    lsl             x1, x1, #1
340
+    lsl             x3, x3, #1
341
+    mov             w12, #4
342
+.loop_css32_sve:
343
+    sub             w12, w12, #1
344
+.rept 8
345
+    ld1             {v0.8h-v3.8h}, x2, x3
346
+    st1             {v0.8h-v3.8h}, x0, x1
347
+.endr
348
+    cbnz            w12, .loop_css32_sve
349
+    ret
350
+.vl_gt_16_blockcopy_ss_32_32:
351
+    cmp             x9, #48
352
+    bgt             .vl_gt_48_blockcopy_ss_32_32
353
+    ptrue           p0.h, vl16
354
+.rept 32
355
+    ld1h            {z0.h}, p0/z, x2
356
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
357
+    st1h            {z0.h}, p0, x0
358
+    st1h            {z1.h}, p0, x0, #1, mul vl
359
+    add             x2, x2, x3, lsl #1
360
+    add             x0, x0, x1, lsl #1
361
+.endr
362
+    ret
363
+.vl_gt_48_blockcopy_ss_32_32:
364
+    ptrue           p0.h, vl32
365
+.rept 32
366
+    ld1h            {z0.h}, p0/z, x2
367
+    st1h            {z0.h}, p0, x0
368
+    add             x2, x2, x3, lsl #1
369
+    add             x0, x0, x1, lsl #1
370
+.endr
371
+    ret
372
+endfunc
373
+
374
+function PFX(blockcopy_ss_64x64_sve)
375
+    rdvl            x9, #1
376
+    cmp             x9, #16
377
+    bgt             .vl_gt_16_blockcopy_ss_64_64
378
+    lsl             x1, x1, #1
379
+    sub             x1, x1, #64
380
+    lsl             x3, x3, #1
381
+    sub             x3, x3, #64
382
+    mov             w12, #8
383
+.loop_css64_sve:
384
+    sub             w12, w12, #1
385
+.rept 8
386
+    ld1             {v0.8h-v3.8h}, x2, #64
387
+    ld1             {v4.8h-v7.8h}, x2, x3
388
+    st1             {v0.8h-v3.8h}, x0, #64
389
+    st1             {v4.8h-v7.8h}, x0, x1
390
+.endr
391
+    cbnz            w12, .loop_css64_sve
392
+    ret
393
+.vl_gt_16_blockcopy_ss_64_64:
394
+    cmp             x9, #48
395
+    bgt             .vl_gt_48_blockcopy_ss_64_64
396
+    mov             w12, #8
397
+    ptrue           p0.b, vl32
398
+.vl_gt_16_loop_css64_sve:
399
+    sub             w12, w12, #1
400
+.rept 8
401
+    ld1b            {z0.b}, p0/z, x2
402
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
403
+    ld1b            {z2.b}, p0/z, x2, #2, mul vl
404
+    ld1b            {z3.b}, p0/z, x2, #3, mul vl
405
+    st1b            {z0.b}, p0, x0
406
+    st1b            {z1.b}, p0, x0, #1, mul vl
407
+    st1b            {z2.b}, p0, x0, #2, mul vl
408
+    st1b            {z3.b}, p0, x0, #3, mul vl
409
+    add             x2, x2, x3, lsl #1
410
+    add             x0, x0, x1, lsl #1
411
+.endr
412
+    cbnz            w12, .vl_gt_16_loop_css64_sve
413
+    ret
414
+.vl_gt_48_blockcopy_ss_64_64:
415
+    cmp             x9, #112
416
+    bgt             .vl_gt_112_blockcopy_ss_64_64
417
+    mov             w12, #8
418
+    ptrue           p0.b, vl64
419
+.vl_gt_48_loop_css64_sve:
420
+    sub             w12, w12, #1
421
+.rept 8
422
+    ld1b            {z0.b}, p0/z, x2
423
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
424
+    st1b            {z0.b}, p0, x0
425
+    st1b            {z1.b}, p0, x0, #1, mul vl
426
+    add             x2, x2, x3, lsl #1
427
+    add             x0, x0, x1, lsl #1
428
+.endr
429
+    cbnz            w12, .vl_gt_48_loop_css64_sve
430
+    ret
431
+.vl_gt_112_blockcopy_ss_64_64:
432
+    mov             w12, #8
433
+    ptrue           p0.b, vl128
434
+.vl_gt_112_loop_css64_sve:
435
+    sub             w12, w12, #1
436
+.rept 8
437
+    ld1b            {z0.b}, p0/z, x2
438
+    st1b            {z0.b}, p0, x0
439
+    add             x2, x2, x3, lsl #1
440
+    add             x0, x0, x1, lsl #1
441
+.endr
442
+    cbnz            w12, .vl_gt_112_loop_css64_sve
443
+    ret
444
+endfunc
445
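
The 64-wide copies tier their fixed-width predicates (ptrue ... vl32/vl64/vl128) by the probed vector length, so one 128-byte row of 64 int16 values is covered by the fewest loads the hardware allows, while the fixed predicate keeps behaviour identical on even wider machines. The tier choice as scalar arithmetic, for illustration only:

    #include <stdint.h>

    /* One row of blockcopy_ss_64x64: 64 x int16 = 128 bytes. */
    static int loads_per_row(uint64_t vl_bytes)
    {
        if (vl_bytes <= 16)  return 8;  /* 128-bit: NEON-style body       */
        if (vl_bytes <= 48)  return 4;  /* ptrue vl32: 32-byte operations */
        if (vl_bytes <= 112) return 2;  /* ptrue vl64: 64-byte operations */
        return 1;                       /* ptrue vl128: one op per row    */
    }
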
+
446
+/******** Chroma blockcopy ********/
447
+function PFX(blockcopy_ss_16x32_sve)
448
+    rdvl            x9, #1
449
+    cmp             x9, #16
450
+    bgt             .vl_gt_16_blockcopy_ss_16_32
451
+    lsl             x1, x1, #1
452
+    lsl             x3, x3, #1
453
+.rept 16
454
+    ld1             {v0.8h-v1.8h}, x2, x3
455
+    ld1             {v2.8h-v3.8h}, x2, x3
456
+    st1             {v0.8h-v1.8h}, x0, x1
457
+    st1             {v2.8h-v3.8h}, x0, x1
458
+.endr
459
+    ret
460
+.vl_gt_16_blockcopy_ss_16_32:
461
+    ptrue           p0.h, vl16
462
+.rept 32
463
+    ld1h            {z0.h}, p0/z, x2
464
+    st1h            {z0.h}, p0, x0
465
+    add             x2, x2, x3, lsl #1
466
+    add             x0, x0, x1, lsl #1
467
+.endr
468
+    ret
469
+endfunc
470
+
471
+function PFX(blockcopy_ss_32x64_sve)
472
+    rdvl            x9, #1
473
+    cmp             x9, #16
474
+    bgt             .vl_gt_16_blockcopy_ss_32_64
475
+    lsl             x1, x1, #1
476
+    lsl             x3, x3, #1
477
+    mov             w12, #8
478
+.loop_css32x64_sve:
479
+    sub             w12, w12, #1
480
+.rept 8
481
+    ld1             {v0.8h-v3.8h}, x2, x3
482
+    st1             {v0.8h-v3.8h}, x0, x1
483
+.endr
484
+    cbnz            w12, .loop_css32x64_sve
485
+    ret
486
+.vl_gt_16_blockcopy_ss_32_64:
487
+    cmp             x9, #48
488
+    bgt             .vl_gt_48_blockcopy_ss_32_64
489
+    mov             w12, #8
490
+    ptrue           p0.b, vl32
491
+.vl_gt_32_loop_css32x64_sve:
492
+    sub             w12, w12, #1
493
+.rept 8
494
+    ld1b            {z0.b}, p0/z, x2
495
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
496
+    st1b            {z0.b}, p0, x0
497
+    st1b            {z1.b}, p0, x0, #1, mul vl
498
+    add             x2, x2, x3, lsl #1
499
+    add             x0, x0, x1, lsl #1
500
+.endr
501
+    cbnz            w12, .vl_gt_32_loop_css32x64_sve
502
+    ret
503
+.vl_gt_48_blockcopy_ss_32_64:
504
+    mov             w12, #8
505
+    ptrue           p0.b, vl64
506
+.vl_gt_48_loop_css32x64_sve:
507
+    sub             w12, w12, #1
508
+.rept 8
509
+    ld1b            {z0.b}, p0/z, x2
510
+    st1b            {z0.b}, p0, x0
511
+    add             x2, x2, x3, lsl #1
512
+    add             x0, x0, x1, lsl #1
513
+.endr
514
+    cbnz            w12, .vl_gt_48_loop_css32x64_sve
515
+    ret
516
+endfunc
517
+
518
+// chroma blockcopy_ps
519
+function PFX(blockcopy_ps_4x8_sve)
520
+    ptrue           p0.h, vl4
521
+.rept 8
522
+    ld1b            {z0.h}, p0/z, x2
523
+    st1h            {z0.h}, p0, x0
524
+    add             x0, x0, x1, lsl #1
525
+    add             x2, x2, x3
526
+.endr
527
+    ret
528
+endfunc
529
+
530
+function PFX(blockcopy_ps_8x16_sve)
531
+    ptrue           p0.h, vl8
532
+.rept 16
533
+    ld1b            {z0.h}, p0/z, x2
534
+    st1h            {z0.h}, p0, x0
535
+    add             x0, x0, x1, lsl #1
536
+    add             x2, x2, x3
537
+.endr
538
+    ret
539
+endfunc
540
+
541
+function PFX(blockcopy_ps_16x32_sve)
542
+    rdvl            x9, #1
543
+    cmp             x9, #16
544
+    bgt             .vl_gt_16_blockcopy_ps_16_32
545
+    lsl             x1, x1, #1
546
+.rept 16
547
+    ld1             {v4.16b}, x2, x3
548
+    ld1             {v5.16b}, x2, x3
549
+    uxtl            v0.8h, v4.8b
550
+    uxtl2           v1.8h, v4.16b
551
+    uxtl            v2.8h, v5.8b
552
+    uxtl2           v3.8h, v5.16b
553
+    st1             {v0.8h-v1.8h}, x0, x1
554
+    st1             {v2.8h-v3.8h}, x0, x1
555
+.endr
556
+    ret
557
+.vl_gt_16_blockcopy_ps_16_32:
558
+    ptrue           p0.b, vl32
559
+.rept 32
560
+    ld1b            {z1.h}, p0/z, x2
561
+    st1h            {z1.h}, p0, x0
562
+    add             x0, x0, x1, lsl #1
563
+    add             x2, x2, x3
564
+.endr
565
+    ret
566
+endfunc
567
+
568
+function PFX(blockcopy_ps_32x64_sve)
569
+    rdvl            x9, #1
570
+    cmp             x9, #16
571
+    bgt             .vl_gt_16_blockcopy_ps_32_64
572
+    lsl             x1, x1, #1
573
+    mov             w12, #8
574
+.loop_cps32x64_sve:
575
+    sub             w12, w12, #1
576
+.rept 4
577
+    ld1             {v16.16b-v17.16b}, x2, x3
578
+    ld1             {v18.16b-v19.16b}, x2, x3
579
+    uxtl            v0.8h, v16.8b
580
+    uxtl2           v1.8h, v16.16b
581
+    uxtl            v2.8h, v17.8b
582
+    uxtl2           v3.8h, v17.16b
583
+    uxtl            v4.8h, v18.8b
584
+    uxtl2           v5.8h, v18.16b
585
+    uxtl            v6.8h, v19.8b
586
+    uxtl2           v7.8h, v19.16b
587
+    st1             {v0.8h-v3.8h}, x0, x1
588
+    st1             {v4.8h-v7.8h}, x0, x1
589
+.endr
590
+    cbnz            w12, .loop_cps32x64_sve
591
+    ret
592
+.vl_gt_16_blockcopy_ps_32_64:
593
+    cmp             x9, #48
594
+    bgt             .vl_gt_48_blockcopy_ps_32_64
595
+    ptrue           p0.b, vl32
596
+.rept 64
597
+    ld1b            {z2.h}, p0/z, x2
598
+    ld1b            {z3.h}, p0/z, x2, #1, mul vl
599
+    st1h            {z2.h}, p0, x0
600
+    st1h            {z3.h}, p0, x0, #1, mul vl
601
+    add             x0, x0, x1, lsl #1
602
+    add             x2, x2, x3
603
+.endr
604
+    ret
605
+.vl_gt_48_blockcopy_ps_32_64:
606
+    ptrue           p0.b, vl64
607
+.rept 64
608
+    ld1b            {z2.h}, p0/z, x2
609
+    st1h            {z2.h}, p0, x0
610
+    add             x0, x0, x1, lsl #1
611
+    add             x2, x2, x3
612
+.endr
613
+    ret
614
+endfunc
615
+
616
+// chroma blockcopy_sp
617
+function PFX(blockcopy_sp_4x8_sve)
618
+    ptrue           p0.h, vl4
619
+.rept 8
620
+    ld1h            {z0.h}, p0/z, x2
621
+    st1b            {z0.h}, p0, x0
622
+    add             x2, x2, x3, lsl #1
623
+    add             x0, x0, x1
624
+.endr
625
+    ret
626
+endfunc
627
+
628
+function PFX(blockcopy_sp_8x16_sve)
629
+    ptrue           p0.h, vl8
630
+.rept 16
631
+    ld1h            {z0.h}, p0/z, x2
632
+    st1b            {z0.h}, p0, x0
633
+    add             x2, x2, x3, lsl #1
634
+    add             x0, x0, x1
635
+.endr
636
+    ret
637
+endfunc
638
+
639
+function PFX(blockcopy_sp_16x32_sve)
640
+    rdvl            x9, #1
641
+    cmp             x9, #16
642
+    bgt             .vl_gt_16_blockcopy_sp_16_32
643
+    ptrue           p0.h, vl8
644
+.rept 32
645
+    ld1h            {z0.h}, p0/z, x2
646
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
647
+    st1b            {z0.h}, p0, x0
648
+    st1b            {z1.h}, p0, x0, #1, mul vl
649
+    add             x2, x2, x3, lsl #1
650
+    add             x0, x0, x1
651
+.endr
652
+    ret
653
+.vl_gt_16_blockcopy_sp_16_32:
654
+    ptrue           p0.h, vl16
655
+.rept 32
656
+    ld1h            {z0.h}, p0/z, x2
657
+    st1b            {z0.h}, p0, x0
658
+    add             x2, x2, x3, lsl #1
659
+    add             x0, x0, x1
660
+.endr
661
+    ret
662
+endfunc
663
+
664
+function PFX(blockcopy_sp_32x64_sve)
665
+    rdvl            x9, #1
666
+    cmp             x9, #16
667
+    bgt             .vl_gt_16_blockcopy_sp_32_64
668
+    ptrue           p0.h, vl8
669
+.rept 64
670
+    ld1h            {z0.h}, p0/z, x2
671
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
672
+    ld1h            {z2.h}, p0/z, x2, #2, mul vl
673
+    ld1h            {z3.h}, p0/z, x2, #3, mul vl
674
+    st1b            {z0.h}, p0, x0
675
+    st1b            {z1.h}, p0, x0, #1, mul vl
676
+    st1b            {z2.h}, p0, x0, #2, mul vl
677
+    st1b            {z3.h}, p0, x0, #3, mul vl
678
+    add             x2, x2, x3, lsl #1
679
+    add             x0, x0, x1
680
+.endr
681
+    ret
682
+.vl_gt_16_blockcopy_sp_32_64:
683
+    cmp             x9, #48
684
+    bgt             .vl_gt_48_blockcopy_sp_32_64
685
+    ptrue           p0.h, vl16
686
+.rept 64
687
+    ld1h            {z0.h}, p0/z, x2
688
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
689
+    st1b            {z0.h}, p0, x0
690
+    st1b            {z1.h}, p0, x0, #1, mul vl
691
+    add             x2, x2, x3, lsl #1
692
+    add             x0, x0, x1
693
+.endr
694
+    ret
695
+.vl_gt_48_blockcopy_sp_32_64:
696
+    ptrue           p0.h, vl32
697
+.rept 64
698
+    ld1h            {z0.h}, p0/z, x2
699
+    st1b            {z0.h}, p0, x0
700
+    add             x2, x2, x3, lsl #1
701
+    add             x0, x0, x1
702
+.endr
703
+    ret
704
+endfunc
705
+
706
+/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
707
+
708
+function PFX(blockcopy_pp_32x8_sve)
709
+    rdvl            x9, #1
710
+    cmp             x9, #16
711
+    bgt             .vl_gt_16_blockcopy_pp_32_8
712
+.rept 8
713
+    ld1             {v0.16b-v1.16b}, x2, x3
714
+    st1             {v0.16b-v1.16b}, x0, x1
715
+.endr
716
+    ret
717
+.vl_gt_16_blockcopy_pp_32_8:
718
+    ptrue           p0.b, vl32
719
+.rept 8
720
+    ld1b            {z0.b}, p0/z, x2
721
+    st1b            {z0.b}, p0, x0
722
+    add             x2, x2, x3
723
+    add             x0, x0, x1
724
+.endr
725
+    ret
726
+endfunc
727
+
728
+.macro blockcopy_pp_32xN_sve h
729
+function PFX(blockcopy_pp_32x\h\()_sve)
730
+    mov             w12, #\h / 8
731
+    rdvl            x9, #1
732
+    cmp             x9, #16
733
+    bgt             .vl_gt_16_blockcopy_pp_32xN_\h
734
+.loop_sve_32x\h\():
735
+    sub             w12, w12, #1
736
+.rept 8
737
+    ld1             {v0.16b-v1.16b}, x2, x3
738
+    st1             {v0.16b-v1.16b}, x0, x1
739
+.endr
740
+    cbnz            w12, .loop_sve_32x\h
741
+    ret
742
+.vl_gt_16_blockcopy_pp_32xN_\h:
743
+    ptrue           p0.b, vl32
744
+.L_gt_16_blockcopy_pp_32xN_\h:
745
+    sub             w12, w12, #1
746
+.rept 8
747
+    ld1b            {z0.b}, p0/z, x2
748
+    st1b            {z0.b}, p0, x0
749
+    add             x2, x2, x3
750
+    add             x0, x0, x1
751
+.endr
752
+    cbnz            w12, .L_gt_16_blockcopy_pp_32xN_\h
753
+    ret
754
+endfunc
755
+.endm
756
+
757
+blockcopy_pp_32xN_sve 16
758
+blockcopy_pp_32xN_sve 24
759
+blockcopy_pp_32xN_sve 32
760
+blockcopy_pp_32xN_sve 64
761
+blockcopy_pp_32xN_sve 48
762
+
763
+.macro blockcopy_pp_64xN_sve h
764
+function PFX(blockcopy_pp_64x\h\()_sve)
765
+    mov             w12, #\h / 4
766
+    rdvl            x9, #1
767
+    cmp             x9, #16
768
+    bgt             .vl_gt_16_blockcopy_pp_64xN_\h
769
+.loop_sve_64x\h\():
770
+    sub             w12, w12, #1
771
+.rept 4
772
+    ld1             {v0.16b-v3.16b}, x2, x3
773
+    st1             {v0.16b-v3.16b}, x0, x1
774
+.endr
775
+    cbnz            w12, .loop_sve_64x\h
776
+    ret
777
+.vl_gt_16_blockcopy_pp_64xN_\h:
778
+    cmp             x9, #48
779
+    bgt             .vl_gt_48_blockcopy_pp_64xN_\h
780
+    ptrue           p0.b, vl32
781
+.L_le_32_blockcopy_pp_64xN_\h:
782
+    sub             w12, w12, #1
783
+.rept 4
784
+    ld1b            {z0.b}, p0/z, x2
785
+    ld1b            {z1.b}, p0/z, x2, #1, mul vl
786
+    st1b            {z0.b}, p0, x0
787
+    st1b            {z1.b}, p0, x0, #1, mul vl
788
+    add             x2, x2, x3
789
+    add             x0, x0, x1
790
+.endr
791
+    cbnz            w12, .L_le_32_blockcopy_pp_64xN_\h
792
+    ret
793
+.vl_gt_48_blockcopy_pp_64xN_\h:
794
+    ptrue           p0.b, vl64
795
+.L_blockcopy_pp_64xN_\h:
796
+    sub             w12, w12, #1
797
+.rept 4
798
+    ld1b            {z0.b}, p0/z, x2
799
+    st1b            {z0.b}, p0, x0
800
+    add             x2, x2, x3
801
+    add             x0, x0, x1
802
+.endr
803
+    cbnz            w12, .L_blockcopy_pp_64xN_\h
804
+    ret
805
+endfunc
806
+.endm
807
+
808
+blockcopy_pp_64xN_sve 16
809
+blockcopy_pp_64xN_sve 32
810
+blockcopy_pp_64xN_sve 48
811
+blockcopy_pp_64xN_sve 64
812
+
813
+function PFX(blockfill_s_32x32_sve)
814
+    rdvl            x9, #1
815
+    cmp             x9, #16
816
+    bgt             .vl_gt_16_blockfill_s_32_32
817
+    dup             v0.8h, w2
818
+    mov             v1.16b, v0.16b
819
+    mov             v2.16b, v0.16b
820
+    mov             v3.16b, v0.16b
821
+    lsl             x1, x1, #1
822
+.rept 32
823
+    st1             {v0.8h-v3.8h}, x0, x1
824
+.endr
825
+    ret
826
+.vl_gt_16_blockfill_s_32_32:
827
+    cmp             x9, #48
828
+    bgt             .vl_gt_48_blockfill_s_32_32
829
+    dup             z0.h, w2
830
+    ptrue           p0.h, vl16
831
+.rept 32
832
+    st1h            {z0.h}, p0, x0
833
+    st1h            {z0.h}, p0, x0, #1, mul vl
834
+    add             x0, x0, x1, lsl #1
835
+.endr
836
+    ret
837
+.vl_gt_48_blockfill_s_32_32:
838
+    dup             z0.h, w2
839
+    ptrue           p0.h, vl32
840
+.rept 32
841
+    st1h            {z0.h}, p0, x0
842
+    add             x0, x0, x1, lsl #1
843
+.endr
844
+    ret
845
+endfunc
846
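
blockfill_s broadcasts a single coefficient: dup replicates w2 across all lanes and the row store is repeated down the block. Reference C (x265 templates the block size; it is a plain parameter in this sketch):

    #include <stdint.h>

    static void blockfill_s_c(int16_t* dst, intptr_t dstride,
                              int16_t val, int size)
    {
        for (int y = 0; y < size; y++, dst += dstride)
            for (int x = 0; x < size; x++)
                dst[x] = val;
    }
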
+
847
+// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
848
+.macro cpy2Dto1D_shl_start_sve
849
+    add             x2, x2, x2
850
+    mov             z0.h, w3
851
+.endm
852
+
853
+function PFX(cpy2Dto1D_shl_16x16_sve)
854
+    dup             z0.h, w3
855
+    rdvl            x9, #1
856
+    cmp             x9, #16
857
+    bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
858
+    cpy2Dto1D_shl_start_sve
859
+    mov             w12, #4
860
+.loop_cpy2Dto1D_shl_16_sve:
861
+    sub             w12, w12, #1
862
+.rept 4
863
+    ld1             {v2.16b-v3.16b}, x1, x2
864
+    sshl            v2.8h, v2.8h, v0.8h
865
+    sshl            v3.8h, v3.8h, v0.8h
866
+    st1             {v2.16b-v3.16b}, x0, #32
867
+.endr
868
+    cbnz            w12, .loop_cpy2Dto1D_shl_16_sve
869
+    ret
870
+.vl_gt_16_cpy2Dto1D_shl_16x16:
871
+    ptrue           p0.h, vl16
872
+.rept 16
873
+    ld1h            {z1.h}, p0/z, x1
874
+    lsl             z1.h, p0/m, z1.h, z0.h
875
+    st1h            {z1.h}, p0, x0
876
+    add             x1, x1, x2, lsl #1
877
+    add             x0, x0, #32
878
+.endr
879
+    ret
880
+endfunc
881
+
882
+function PFX(cpy2Dto1D_shl_32x32_sve)
883
+    dup             z0.h, w3
884
+    rdvl            x9, #1
885
+    cmp             x9, #16
886
+    bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
887
+    cpy2Dto1D_shl_start_sve
888
+    mov             w12, #16
889
+.loop_cpy2Dto1D_shl_32_sve:
890
+    sub             w12, w12, #1
891
+.rept 2
892
+    ld1             {v2.16b-v5.16b}, x1, x2
893
+    sshl            v2.8h, v2.8h, v0.8h
894
+    sshl            v3.8h, v3.8h, v0.8h
895
+    sshl            v4.8h, v4.8h, v0.8h
896
+    sshl            v5.8h, v5.8h, v0.8h
897
+    st1             {v2.16b-v5.16b}, x0, #64
898
+.endr
899
+    cbnz            w12, .loop_cpy2Dto1D_shl_32_sve
900
+    ret
901
+.vl_gt_16_cpy2Dto1D_shl_32x32:
902
+    cmp             x9, #48
903
+    bgt             .vl_gt_48_cpy2Dto1D_shl_32x32
904
+    ptrue           p0.h, vl16
905
+.rept 32
906
+    ld1h            {z1.h}, p0/z, x1
907
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
908
+    lsl             z1.h, p0/m, z1.h, z0.h
909
+    lsl             z2.h, p0/m, z2.h, z0.h
910
+    st1h            {z1.h}, p0, x0
911
+    st1h            {z2.h}, p0, x0, #1, mul vl
912
+    add             x1, x1, x2, lsl #1
913
+    add             x0, x0, #64
914
+.endr
915
+    ret
916
+.vl_gt_48_cpy2Dto1D_shl_32x32:
917
+    ptrue           p0.h, vl32
918
+.rept 32
919
+    ld1h            {z1.h}, p0/z, x1
920
+    lsl             z1.h, p0/m, z1.h, z0.h
921
+    st1h            {z1.h}, p0, x0
922
+    add             x1, x1, x2, lsl #1
923
+    add             x0, x0, #64
924
+.endr
925
+    ret
926
+endfunc
927
+
928
+function PFX(cpy2Dto1D_shl_64x64_sve)
929
+    rdvl            x9, #1
930
+    cmp             x9, #16
931
+    bgt             .vl_gt_16_cpy2Dto1D_shl_64x64
932
+    cpy2Dto1D_shl_start_sve
933
+    mov             w12, #32
934
+    sub             x2, x2, #64
935
+.loop_cpy2Dto1D_shl_64_sve:
936
+    sub             w12, w12, #1
937
+.rept 2
938
+    ld1             {v2.16b-v5.16b}, x1, #64
939
+    ld1             {v16.16b-v19.16b}, x1, x2
940
+    sshl            v2.8h, v2.8h, v0.8h
941
+    sshl            v3.8h, v3.8h, v0.8h
942
+    sshl            v4.8h, v4.8h, v0.8h
943
+    sshl            v5.8h, v5.8h, v0.8h
944
+    sshl            v16.8h, v16.8h, v0.8h
945
+    sshl            v17.8h, v17.8h, v0.8h
946
+    sshl            v18.8h, v18.8h, v0.8h
947
+    sshl            v19.8h, v19.8h, v0.8h
948
+    st1             {v2.16b-v5.16b}, x0, #64
949
+    st1             {v16.16b-v19.16b}, x0, #64
950
+.endr
951
+    cbnz            w12, .loop_cpy2Dto1D_shl_64_sve
952
+    ret
953
+.vl_gt_16_cpy2Dto1D_shl_64x64:
954
+    dup             z0.h, w3
955
+    mov             x8, #64
956
+    mov             w12, #64
957
+.L_init_cpy2Dto1D_shl_64x64:
958
+    sub             w12, w12, #1
959
+    mov             x9, #0
960
+    whilelt         p0.h, x9, x8
961
+.L_cpy2Dto1D_shl_64x64:
962
+    ld1h            {z1.h}, p0/z, x1, x9, lsl #1
963
+    lsl             z1.h, p0/m, z1.h, z0.h
964
+    st1h            {z1.h}, p0, x0, x9, lsl #1
965
+    inch            x9
966
+    whilelt         p0.h, x9, x8
967
+    b.first         .L_cpy2Dto1D_shl_64x64
968
+    add             x1, x1, x2, lsl #1
969
+    addvl           x0, x0, #1
970
+    cbnz            w12, .L_init_cpy2Dto1D_shl_64x64
971
+    ret
972
+endfunc
973
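
Unlike the fixed-vl paths above, the 64x64 variant is fully vector-length agnostic: whilelt builds a predicate for the halfwords still left in the 64-element row, inch advances the element index by one vector's worth, and b.first iterates while any lane remains, so no scalar tail loop is needed at any vector length. The scalar equivalent of one row:

    #include <stdint.h>

    /* One row: 64 halfwords shifted left; SVE strides i by lanes-per-vector. */
    static void shl_row_64(int16_t* dst, const int16_t* src, int shift)
    {
        for (int i = 0; i < 64; i++)
            dst[i] = (int16_t)(src[i] << shift);
    }
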
+
974
+// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
975
+
976
+function PFX(cpy2Dto1D_shr_4x4_sve)
977
+    dup             z0.h, w3
978
+    sub             w4, w3, #1
979
+    dup             z1.h, w4
980
+    ptrue           p0.h, vl8
981
+    mov             z2.h, #1
982
+    lsl             z2.h, p0/m, z2.h, z1.h
983
+    lsl             x2, x2, #1
984
+    index           z3.d, #0, x2
985
+    index           z4.d, #0, #8
986
+.rept 2
987
+    ld1d            {z5.d}, p0/z, x1, z3.d
988
+    add             x1, x1, x2, lsl #1
989
+    add             z5.h, p0/m, z5.h, z2.h
990
+    asr             z5.h, p0/m, z5.h, z0.h
991
+    st1d            {z5.d}, p0, x0, z4.d
992
+    add             x0, x0, #16
993
+.endr
994
+    ret
995
+endfunc
996
+
997
+function PFX(cpy2Dto1D_shr_8x8_sve)
998
+    dup             z0.h, w3
999
+    sub             w4, w3, #1
1000
+    dup             z1.h, w4
1001
+    ptrue           p0.h, vl8
1002
+    mov             z2.h, #1
1003
+    lsl             z2.h, p0/m, z2.h, z1.h
1004
+.rept 8
1005
+    ld1d            {z5.d}, p0/z, x1
1006
+    add             x1, x1, x2, lsl #1
1007
+    add             z5.h, p0/m, z5.h, z2.h
1008
+    asr             z5.h, p0/m, z5.h, z0.h
1009
+    st1d            {z5.d}, p0, x0
1010
+    add             x0, x0, #16
1011
+.endr
1012
+    ret
1013
+endfunc
1014
+
1015
+function PFX(cpy2Dto1D_shr_16x16_sve)
1016
+    dup             z0.h, w3
1017
+    sub             w4, w3, #1
1018
+    dup             z1.h, w4
1019
+    rdvl            x9, #1
1020
+    cmp             x9, #16
1021
+    bgt             .vl_gt_16_cpy2Dto1D_shr_16x16
1022
+    ptrue           p0.h, vl8
1023
+    mov             z2.h, #1
1024
+    lsl             z2.h, p0/m, z2.h, z1.h
1025
+.rept 16
1026
+    ld1d            {z5.d}, p0/z, x1
1027
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1028
+    add             x1, x1, x2, lsl #1
1029
+    add             z5.h, p0/m, z5.h, z2.h
1030
+    add             z6.h, p0/m, z6.h, z2.h
1031
+    asr             z5.h, p0/m, z5.h, z0.h
1032
+    asr             z6.h, p0/m, z6.h, z0.h
1033
+    st1d            {z5.d}, p0, x0
1034
+    st1d            {z6.d}, p0, x0, #1, mul vl
1035
+    add             x0, x0, #32
1036
+.endr
1037
+    ret
1038
+.vl_gt_16_cpy2Dto1D_shr_16x16:
1039
+    ptrue           p0.h, vl16
1040
+    mov             z2.h, #1
1041
+    lsl             z2.h, p0/m, z2.h, z1.h
1042
+.rept 16
1043
+    ld1d            {z5.d}, p0/z, x1
1044
+    add             x1, x1, x2, lsl #1
1045
+    add             z5.h, p0/m, z5.h, z2.h
1046
+    asr             z5.h, p0/m, z5.h, z0.h
1047
+    st1d            {z5.d}, p0, x0
1048
+    add             x0, x0, #32
1049
+.endr
1050
+    ret
1051
+endfunc
1052
+
1053
+function PFX(cpy2Dto1D_shr_32x32_sve)
1054
+    rdvl            x9, #1
1055
+    cmp             x9, #16
1056
+    bgt             .vl_gt_16_cpy2Dto1D_shr_32x32
1057
+    cpy2Dto1D_shr_start
1058
+    mov             w12, #16
1059
+.loop_cpy2Dto1D_shr_32_sve:
1060
+    sub             w12, w12, #1
1061
+.rept 2
1062
+    ld1             {v2.8h-v5.8h}, x1, x2
1063
+    sub             v2.8h, v2.8h, v1.8h
1064
+    sub             v3.8h, v3.8h, v1.8h
1065
+    sub             v4.8h, v4.8h, v1.8h
1066
+    sub             v5.8h, v5.8h, v1.8h
1067
+    sshl            v2.8h, v2.8h, v0.8h
1068
+    sshl            v3.8h, v3.8h, v0.8h
1069
+    sshl            v4.8h, v4.8h, v0.8h
1070
+    sshl            v5.8h, v5.8h, v0.8h
1071
+    st1             {v2.8h-v5.8h}, x0, #64
1072
+.endr
1073
+    cbnz            w12, .loop_cpy2Dto1D_shr_32_sve
1074
+    ret
1075
+.vl_gt_16_cpy2Dto1D_shr_32x32:
1076
+    dup             z0.h, w3
1077
+    sub             w4, w3, #1
1078
+    dup             z1.h, w4
1079
+    cmp             x9, #48
1080
+    bgt             .vl_gt_48_cpy2Dto1D_shr_32x32
1081
+    ptrue           p0.h, vl16
1082
+    mov             z2.h, #1
1083
+    lsl             z2.h, p0/m, z2.h, z1.h
1084
+.rept 32
1085
+    ld1d            {z5.d}, p0/z, x1
1086
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1087
+    add             x1, x1, x2, lsl #1
1088
+    add             z5.h, p0/m, z5.h, z2.h
1089
+    add             z6.h, p0/m, z6.h, z2.h
1090
+    asr             z5.h, p0/m, z5.h, z0.h
1091
+    asr             z6.h, p0/m, z6.h, z0.h
1092
+    st1d            {z5.d}, p0, x0
1093
+    st1d            {z6.d}, p0, x0, #1, mul vl
1094
+    add             x0, x0, #64
1095
+.endr
1096
+    ret
1097
+.vl_gt_48_cpy2Dto1D_shr_32x32:
1098
+    ptrue           p0.h, vl32
1099
+    mov             z2.h, #1
1100
+    lsl             z2.h, p0/m, z2.h, z1.h
1101
+.rept 32
1102
+    ld1d            {z5.d}, p0/z, x1
1103
+    add             x1, x1, x2, lsl #1
1104
+    add             z5.h, p0/m, z5.h, z2.h
1105
+    asr             z5.h, p0/m, z5.h, z0.h
1106
+    st1d            {z5.d}, p0, x0
1107
+    add             x0, x0, #64
1108
+.endr
1109
+    ret
1110
+endfunc
1111
+
1112
+// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1113
+
1114
+function PFX(cpy1Dto2D_shl_16x16_sve)
1115
+    dup             z0.h, w3
1116
+    rdvl            x9, #1
1117
+    cmp             x9, #16
1118
+    bgt             .vl_gt_16_cpy1Dto2D_shl_16x16
1119
+    ptrue           p0.h, vl8
1120
+.rept 16
1121
+    ld1h            {z1.h}, p0/z, x1
1122
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
1123
+    lsl             z1.h, p0/m, z1.h, z0.h
1124
+    lsl             z2.h, p0/m, z2.h, z0.h
1125
+    st1h            {z1.h}, p0, x0
1126
+    st1h            {z2.h}, p0, x0, #1, mul vl
1127
+    add             x1, x1, #32
1128
+    add             x0, x0, x2, lsl #1
1129
+.endr
1130
+    ret
1131
+.vl_gt_16_cpy1Dto2D_shl_16x16:
1132
+    ptrue           p0.h, vl16
1133
+.rept 16
1134
+    ld1h            {z1.h}, p0/z, x1
1135
+    lsl             z1.h, p0/m, z1.h, z0.h
1136
+    st1h            {z1.h}, p0, x0
1137
+    add             x1, x1, #32
1138
+    add             x0, x0, x2, lsl #1
1139
+.endr
1140
+    ret
1141
+endfunc
1142
+
1143
+function PFX(cpy1Dto2D_shl_32x32_sve)
1144
+    dup             z0.h, w3
1145
+    rdvl            x9, #1
1146
+    cmp             x9, #16
1147
+    bgt             .vl_gt_16_cpy1Dto2D_shl_32x32
1148
+    ptrue           p0.h, vl8
1149
+.rept 32
1150
+    ld1h            {z1.h}, p0/z, x1
1151
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
1152
+    ld1h            {z3.h}, p0/z, x1, #2, mul vl
1153
+    ld1h            {z4.h}, p0/z, x1, #3, mul vl
1154
+    lsl             z1.h, p0/m, z1.h, z0.h
1155
+    lsl             z2.h, p0/m, z2.h, z0.h
1156
+    lsl             z3.h, p0/m, z3.h, z0.h
1157
+    lsl             z4.h, p0/m, z4.h, z0.h
1158
+    st1h            {z1.h}, p0, x0
1159
+    st1h            {z2.h}, p0, x0, #1, mul vl
1160
+    st1h            {z3.h}, p0, x0, #2, mul vl
1161
+    st1h            {z4.h}, p0, x0, #3, mul vl
1162
+    add             x1, x1, #64
1163
+    add             x0, x0, x2, lsl #1
1164
+.endr
1165
+    ret
1166
+.vl_gt_16_cpy1Dto2D_shl_32x32:
1167
+    cmp             x9, #48
1168
+    bgt             .vl_gt_48_cpy1Dto2D_shl_32x32
1169
+    ptrue           p0.h, vl16
1170
+.rept 32
1171
+    ld1h            {z1.h}, p0/z, x1
1172
+    ld1h            {z2.h}, p0/z, x1, #1, mul vl
1173
+    lsl             z1.h, p0/m, z1.h, z0.h
1174
+    lsl             z2.h, p0/m, z2.h, z0.h
1175
+    st1h            {z1.h}, p0, x0
1176
+    st1h            {z2.h}, p0, x0, #1, mul vl
1177
+    add             x1, x1, #64
1178
+    add             x0, x0, x2, lsl #1
1179
+.endr
1180
+    ret
1181
+.vl_gt_48_cpy1Dto2D_shl_32x32:
1182
+    ptrue           p0.h, vl32
1183
+.rept 32
1184
+    ld1h            {z1.h}, p0/z, x1
1185
+    lsl             z1.h, p0/m, z1.h, z0.h
1186
+    st1h            {z1.h}, p0, x0
1187
+    add             x1, x1, #64
1188
+    add             x0, x0, x2, lsl #1
1189
+.endr
1190
+    ret
1191
+endfunc
1192
+
1193
+function PFX(cpy1Dto2D_shl_64x64_sve)
1194
+    dup             z0.h, w3
1195
+    mov             x8, #64
1196
+    mov             w12, #64
1197
+.L_init_cpy1Dto2D_shl_64x64:
1198
+    sub             w12, w12, #1
1199
+    mov             x9, #0
1200
+    whilelt         p0.h, x9, x8
1201
+.L_cpy1Dto2D_shl_64x64:
1202
+    ld1h            {z1.h}, p0/z, x1, x9, lsl #1
1203
+    lsl             z1.h, p0/m, z1.h, z0.h
1204
+    st1h            {z1.h}, p0, x0, x9, lsl #1
1205
+    inch            x9
1206
+    whilelt         p0.h, x9, x8
1207
+    b.first         .L_cpy1Dto2D_shl_64x64
1208
+    addvl           x1, x1, #1
1209
+    add             x0, x0, x2, lsl #1
1210
+    cbnz            w12, .L_init_cpy1Dto2D_shl_64x64
1211
+    ret
1212
+endfunc
1213
+
1214
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1215
+
1216
+function PFX(cpy1Dto2D_shr_16x16_sve)
1217
+    rdvl            x9, #1
1218
+    cmp             x9, #16
1219
+    bgt             .vl_gt_16_cpy1Dto2D_shr_16x16
1220
+    cpy1Dto2D_shr_start
1221
+    mov             w12, #4
1222
+.loop_cpy1Dto2D_shr_16:
1223
+    sub             w12, w12, #1
1224
+.rept 4
1225
+    ld1             {v2.8h-v3.8h}, x1, #32
1226
+    sub             v2.8h, v2.8h, v1.8h
1227
+    sub             v3.8h, v3.8h, v1.8h
1228
+    sshl            v2.8h, v2.8h, v0.8h
1229
+    sshl            v3.8h, v3.8h, v0.8h
1230
+    st1             {v2.8h-v3.8h}, x0, x2
1231
+.endr
1232
+    cbnz            w12, .loop_cpy1Dto2D_shr_16
1233
+    ret
1234
+.vl_gt_16_cpy1Dto2D_shr_16x16:
1235
+    dup             z0.h, w3
1236
+    sub             w4, w3, #1
1237
+    dup             z1.h, w4
1238
+    ptrue           p0.h, vl16
1239
+    mov             z2.h, #1
1240
+    lsl             z2.h, p0/m, z2.h, z1.h
1241
+.rept 16
1242
+    ld1d            {z5.d}, p0/z, x1
1243
+    add             x1, x1, #32
1244
+    add             z5.h, p0/m, z5.h, z2.h
1245
+    asr             z5.h, p0/m, z5.h, z0.h
1246
+    st1d            {z5.d}, p0, x0
1247
+    add             x0, x0, x2, lsl #1
1248
+.endr
1249
+    ret
1250
+endfunc
1251
+
1252
+function PFX(cpy1Dto2D_shr_32x32_sve)
1253
+    rdvl            x9, #1
1254
+    cmp             x9, #16
1255
+    bgt             .vl_gt_16_cpy1Dto2D_shr_32x32
1256
+    cpy1Dto2D_shr_start
1257
+    mov             w12, #16
1258
+.loop_cpy1Dto2D_shr_32_sve:
1259
+    sub             w12, w12, #1
1260
+.rept 2
1261
+    ld1             {v2.16b-v5.16b}, x1, #64
1262
+    sub             v2.8h, v2.8h, v1.8h
1263
+    sub             v3.8h, v3.8h, v1.8h
1264
+    sub             v4.8h, v4.8h, v1.8h
1265
+    sub             v5.8h, v5.8h, v1.8h
1266
+    sshl            v2.8h, v2.8h, v0.8h
1267
+    sshl            v3.8h, v3.8h, v0.8h
1268
+    sshl            v4.8h, v4.8h, v0.8h
1269
+    sshl            v5.8h, v5.8h, v0.8h
1270
+    st1             {v2.16b-v5.16b}, x0, x2
1271
+.endr
1272
+    cbnz            w12, .loop_cpy1Dto2D_shr_32_sve
1273
+    ret
1274
+.vl_gt_16_cpy1Dto2D_shr_32x32:
1275
+    dup             z0.h, w3
1276
+    sub             w4, w3, #1
1277
+    dup             z1.h, w4
1278
+    cmp             x9, #48
1279
+    bgt             .vl_gt_48_cpy1Dto2D_shr_32x32
1280
+    ptrue           p0.h, vl16
1281
+    mov             z2.h, #1
1282
+    lsl             z2.h, p0/m, z2.h, z1.h
1283
+.rept 32
1284
+    ld1d            {z5.d}, p0/z, x1
1285
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1286
+    add             x1, x1, #64
1287
+    add             z5.h, p0/m, z5.h, z2.h
1288
+    add             z6.h, p0/m, z6.h, z2.h
1289
+    asr             z5.h, p0/m, z5.h, z0.h
1290
+    asr             z6.h, p0/m, z6.h, z0.h
1291
+    st1d            {z5.d}, p0, x0
1292
+    st1d            {z6.d}, p0, x0, #1, mul vl
1293
+    add             x0, x0, x2, lsl #1
1294
+.endr
1295
+    ret
1296
+.vl_gt_48_cpy1Dto2D_shr_32x32:
1297
+    ptrue           p0.h, vl32
1298
+    mov             z2.h, #1
1299
+    lsl             z2.h, p0/m, z2.h, z1.h
1300
+.rept 32
1301
+    ld1d            {z5.d}, p0/z, x1
1302
+    add             x1, x1, #64
1303
+    add             z5.h, p0/m, z5.h, z2.h
1304
+    asr             z5.h, p0/m, z5.h, z0.h
1305
+    st1d            {z5.d}, p0, x0
1306
+    add             x0, x0, x2, lsl #1
1307
+.endr
1308
+    ret
1309
+endfunc
1310
+
1311
+function PFX(cpy1Dto2D_shr_64x64_sve)
1312
+    dup             z0.h, w3
1313
+    sub             w4, w3, #1
1314
+    dup             z1.h, w4
1315
+    rdvl            x9, #1
1316
+    cmp             x9, #16
1317
+    bgt             .vl_gt_16_cpy1Dto2D_shr_64x64
1318
+    ptrue           p0.h, vl8
1319
+    mov             z2.h, #1
1320
+    lsl             z2.h, p0/m, z2.h, z1.h
1321
+.rept 128
1322
+    ld1d            {z5.d}, p0/z, x1
1323
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1324
+    ld1d            {z7.d}, p0/z, x1, #2, mul vl
1325
+    ld1d            {z8.d}, p0/z, x1, #3, mul vl
1326
+    ld1d            {z9.d}, p0/z, x1, #4, mul vl
1327
+    ld1d            {z10.d}, p0/z, x1, #5, mul vl
1328
+    ld1d            {z11.d}, p0/z, x1, #6, mul vl
1329
+    ld1d            {z12.d}, p0/z, x1, #7, mul vl
1330
+    add             x1, x1, #128
1331
+    add             z5.h, p0/m, z5.h, z2.h
1332
+    add             z6.h, p0/m, z6.h, z2.h
1333
+    add             z7.h, p0/m, z7.h, z2.h
1334
+    add             z8.h, p0/m, z8.h, z2.h
1335
+    add             z9.h, p0/m, z9.h, z2.h
1336
+    add             z10.h, p0/m, z10.h, z2.h
1337
+    add             z11.h, p0/m, z11.h, z2.h
1338
+    add             z12.h, p0/m, z12.h, z2.h
1339
+    asr             z5.h, p0/m, z5.h, z0.h
1340
+    asr             z6.h, p0/m, z6.h, z0.h
1341
+    asr             z7.h, p0/m, z7.h, z0.h
1342
+    asr             z8.h, p0/m, z8.h, z0.h
1343
+    asr             z9.h, p0/m, z9.h, z0.h
1344
+    asr             z10.h, p0/m, z10.h, z0.h
1345
+    asr             z11.h, p0/m, z11.h, z0.h
1346
+    asr             z12.h, p0/m, z12.h, z0.h
1347
+    st1d            {z5.d}, p0, x0
1348
+    st1d            {z6.d}, p0, x0, #1, mul vl
1349
+    st1d            {z7.d}, p0, x0, #2, mul vl
1350
+    st1d            {z8.d}, p0, x0, #3, mul vl
1351
+    st1d            {z9.d}, p0, x0, #4, mul vl
1352
+    st1d            {z10.d}, p0, x0, #5, mul vl
1353
+    st1d            {z11.d}, p0, x0, #6, mul vl
1354
+    st1d            {z12.d}, p0, x0, #7, mul vl
1355
+    add             x0, x0, x2, lsl #1
1356
+.endr
1357
+    ret
1358
+.vl_gt_16_cpy1Dto2D_shr_64x64:
1359
+    cmp             x9, #48
1360
+    bgt             .vl_gt_48_cpy1Dto2D_shr_64x64
1361
+    ptrue           p0.h, vl16
1362
+    mov             z2.h, #1
1363
+    lsl             z2.h, p0/m, z2.h, z1.h
1364
+.rept 128
1365
+    ld1d            {z5.d}, p0/z, x1
1366
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1367
+    ld1d            {z7.d}, p0/z, x1, #2, mul vl
1368
+    ld1d            {z8.d}, p0/z, x1, #3, mul vl
1369
+    add             x1, x1, #128
1370
+    add             z5.h, p0/m, z5.h, z2.h
1371
+    add             z6.h, p0/m, z6.h, z2.h
1372
+    add             z7.h, p0/m, z7.h, z2.h
1373
+    add             z8.h, p0/m, z8.h, z2.h
1374
+    asr             z5.h, p0/m, z5.h, z0.h
1375
+    asr             z6.h, p0/m, z6.h, z0.h
1376
+    asr             z7.h, p0/m, z7.h, z0.h
1377
+    asr             z8.h, p0/m, z8.h, z0.h
1378
+    st1d            {z5.d}, p0, x0
1379
+    st1d            {z6.d}, p0, x0, #1, mul vl
1380
+    st1d            {z7.d}, p0, x0, #2, mul vl
1381
+    st1d            {z8.d}, p0, x0, #3, mul vl
1382
+    add             x0, x0, x2, lsl #1
1383
+.endr
1384
+    ret
1385
+.vl_gt_48_cpy1Dto2D_shr_64x64:
1386
+    cmp             x9, #112
1387
+    bgt             .vl_gt_112_cpy1Dto2D_shr_64x64
1388
+    ptrue           p0.h, vl32
1389
+    mov             z2.h, #1
1390
+    lsl             z2.h, p0/m, z2.h, z1.h
1391
+.rept 128
1392
+    ld1d            {z5.d}, p0/z, x1
1393
+    ld1d            {z6.d}, p0/z, x1, #1, mul vl
1394
+    add             x1, x1, #128
1395
+    add             z5.h, p0/m, z5.h, z2.h
1396
+    add             z6.h, p0/m, z6.h, z2.h
1397
+    asr             z5.h, p0/m, z5.h, z0.h
1398
+    asr             z6.h, p0/m, z6.h, z0.h
1399
+    st1d            {z5.d}, p0, x0
1400
+    st1d            {z6.d}, p0, x0, #1, mul vl
1401
+    add             x0, x0, x2, lsl #1
1402
+.endr
1403
+    ret
1404
+.vl_gt_112_cpy1Dto2D_shr_64x64:
1405
+    ptrue           p0.h, vl64
1406
+    mov             z2.h, #1
1407
+    lsl             z2.h, p0/m, z2.h, z1.h
1408
+.rept 128
1409
+    ld1d            {z5.d}, p0/z, x1
1410
+    add             x1, x1, #128
1411
+    add             z5.h, p0/m, z5.h, z2.h
1412
+    asr             z5.h, p0/m, z5.h, z0.h
1413
+    st1d            {z5.d}, p0, x0
1414
+    add             x0, x0, x2, lsl #1
1415
+.endr
1416
+    ret
1417
+endfunc
1418
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S Added
1301
 
1
@@ -0,0 +1,1299 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "blockcopy8-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
39
+ *
40
+ * r0   - a
41
+ * r1   - stridea
42
+ * r2   - b
43
+ * r3   - strideb */
44
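+/* Editor's note (sketch, not from the original source): the scalar
+ * behaviour implemented by the blockcopy_sp kernels below is, per WxH block,
+ *
+ *     for (int y = 0; y < H; y++)
+ *         for (int x = 0; x < W; x++)
+ *             a[y * stridea + x] = (pixel)b[y * strideb + x];
+ *
+ * a narrowing copy from int16_t coefficients to pixels, which is why each
+ * row is loaded as halfwords (strideb is doubled to bytes) and narrowed
+ * with xtn or a tbl shuffle before the store. */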
+function PFX(blockcopy_sp_4x4_neon)
45
+    lsl             x3, x3, #1
46
+.rept 2
47
+    ld1             {v0.8h}, x2, x3
48
+    ld1             {v1.8h}, x2, x3
49
+    xtn             v0.8b, v0.8h
50
+    xtn             v1.8b, v1.8h
51
+    st1             {v0.s}0, x0, x1
52
+    st1             {v1.s}0, x0, x1
53
+.endr
54
+    ret
55
+endfunc
56
+
57
+function PFX(blockcopy_sp_8x8_neon)
58
+    lsl             x3, x3, #1
59
+.rept 4
60
+    ld1             {v0.8h}, x2, x3
61
+    ld1             {v1.8h}, x2, x3
62
+    xtn             v0.8b, v0.8h
63
+    xtn             v1.8b, v1.8h
64
+    st1             {v0.d}0, x0, x1
65
+    st1             {v1.d}0, x0, x1
66
+.endr
67
+    ret
68
+endfunc
69
+
70
+function PFX(blockcopy_sp_16x16_neon)
71
+    lsl             x3, x3, #1
72
+    movrel          x11, xtn_xtn2_table
73
+    ld1             {v31.16b}, x11
74
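+    // v31 holds the xtn_xtn2_table shuffle: one tbl over a pair of .8h
+    // registers keeps the low byte of every halfword, doing the work of
+    // xtn + xtn2 in a single instruction.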
+.rept 8
75
+    ld1             {v0.8h-v1.8h}, x2, x3
76
+    ld1             {v2.8h-v3.8h}, x2, x3
77
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
78
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
79
+    st1             {v0.16b}, x0, x1
80
+    st1             {v1.16b}, x0, x1
81
+.endr
82
+    ret
83
+endfunc
84
+
85
+function PFX(blockcopy_sp_32x32_neon)
86
+    mov             w12, #4
87
+    lsl             x3, x3, #1
88
+    movrel          x11, xtn_xtn2_table
89
+    ld1             {v31.16b}, x11
90
+.loop_csp32:
91
+    sub             w12, w12, #1
92
+.rept 4
93
+    ld1             {v0.8h-v3.8h}, x2, x3
94
+    ld1             {v4.8h-v7.8h}, x2, x3
95
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
96
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
97
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
98
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
99
+    st1             {v0.16b-v1.16b}, x0, x1
100
+    st1             {v2.16b-v3.16b}, x0, x1
101
+.endr
102
+    cbnz            w12, .loop_csp32
103
+    ret
104
+endfunc
105
+
106
+function PFX(blockcopy_sp_64x64_neon)
107
+    mov             w12, #16
108
+    lsl             x3, x3, #1
109
+    sub             x3, x3, #64
110
+    movrel          x11, xtn_xtn2_table
111
+    ld1             {v31.16b}, x11
112
+.loop_csp64:
113
+    sub             w12, w12, #1
114
+.rept 4
115
+    ld1             {v0.8h-v3.8h}, x2, #64
116
+    ld1             {v4.8h-v7.8h}, x2, x3
117
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
118
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
119
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
120
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
121
+    st1             {v0.16b-v3.16b}, x0, x1
122
+.endr
123
+    cbnz            w12, .loop_csp64
124
+    ret
125
+endfunc
126
+
127
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
128
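+// Widening copy (pixel -> int16_t): rows are loaded as bytes and
+// zero-extended with uxtl/uxtl2, so only the destination stride needs the
+// halfword scaling (lsl #1).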
+function PFX(blockcopy_ps_4x4_neon)
129
+    lsl             x1, x1, #1
130
+.rept 2
131
+    ld1             {v0.8b}, x2, x3
132
+    ld1             {v1.8b}, x2, x3
133
+    uxtl            v0.8h, v0.8b
134
+    uxtl            v1.8h, v1.8b
135
+    st1             {v0.4h}, x0, x1
136
+    st1             {v1.4h}, x0, x1
137
+.endr
138
+    ret
139
+endfunc
140
+
141
+function PFX(blockcopy_ps_8x8_neon)
142
+    lsl             x1, x1, #1
143
+.rept 4
144
+    ld1             {v0.8b}, x2, x3
145
+    ld1             {v1.8b}, x2, x3
146
+    uxtl            v0.8h, v0.8b
147
+    uxtl            v1.8h, v1.8b
148
+    st1             {v0.8h}, x0, x1
149
+    st1             {v1.8h}, x0, x1
150
+.endr
151
+    ret
152
+endfunc
153
+
154
+function PFX(blockcopy_ps_16x16_neon)
155
+    lsl             x1, x1, #1
156
+.rept 8
157
+    ld1             {v4.16b}, x2, x3
158
+    ld1             {v5.16b}, x2, x3
159
+    uxtl            v0.8h, v4.8b
160
+    uxtl2           v1.8h, v4.16b
161
+    uxtl            v2.8h, v5.8b
162
+    uxtl2           v3.8h, v5.16b
163
+    st1             {v0.8h-v1.8h}, x0, x1
164
+    st1             {v2.8h-v3.8h}, x0, x1
165
+.endr
166
+    ret
167
+endfunc
168
+
169
+function PFX(blockcopy_ps_32x32_neon)
170
+    lsl             x1, x1, #1
171
+    mov             w12, #4
172
+.loop_cps32:
173
+    sub             w12, w12, #1
174
+.rept 4
175
+    ld1             {v16.16b-v17.16b}, x2, x3
176
+    ld1             {v18.16b-v19.16b}, x2, x3
177
+    uxtl            v0.8h, v16.8b
178
+    uxtl2           v1.8h, v16.16b
179
+    uxtl            v2.8h, v17.8b
180
+    uxtl2           v3.8h, v17.16b
181
+    uxtl            v4.8h, v18.8b
182
+    uxtl2           v5.8h, v18.16b
183
+    uxtl            v6.8h, v19.8b
184
+    uxtl2           v7.8h, v19.16b
185
+    st1             {v0.8h-v3.8h}, x0, x1
186
+    st1             {v4.8h-v7.8h}, x0, x1
187
+.endr
188
+    cbnz            w12, .loop_cps32
189
+    ret
190
+endfunc
191
+
192
+function PFX(blockcopy_ps_64x64_neon)
193
+    lsl             x1, x1, #1
194
+    sub             x1, x1, #64
195
+    mov             w12, #16
196
+.loop_cps64:
197
+    sub             w12, w12, #1
198
+.rept 4
199
+    ld1             {v16.16b-v19.16b}, x2, x3
200
+    uxtl            v0.8h, v16.8b
201
+    uxtl2           v1.8h, v16.16b
202
+    uxtl            v2.8h, v17.8b
203
+    uxtl2           v3.8h, v17.16b
204
+    uxtl            v4.8h, v18.8b
205
+    uxtl2           v5.8h, v18.16b
206
+    uxtl            v6.8h, v19.8b
207
+    uxtl2           v7.8h, v19.16b
208
+    st1             {v0.8h-v3.8h}, x0, #64
209
+    st1             {v4.8h-v7.8h}, x0, x1
210
+.endr
211
+    cbnz            w12, .loop_cps64
212
+    ret
213
+endfunc
214
+
215
+// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
216
+function PFX(blockcopy_ss_4x4_neon)
217
+    lsl             x1, x1, #1
218
+    lsl             x3, x3, #1
219
+.rept 2
220
+    ld1             {v0.8b}, x2, x3
221
+    ld1             {v1.8b}, x2, x3
222
+    st1             {v0.8b}, x0, x1
223
+    st1             {v1.8b}, x0, x1
224
+.endr
225
+    ret
226
+endfunc
227
+
228
+function PFX(blockcopy_ss_8x8_neon)
229
+    lsl             x1, x1, #1
230
+    lsl             x3, x3, #1
231
+.rept 4
232
+    ld1             {v0.8h}, x2, x3
233
+    ld1             {v1.8h}, x2, x3
234
+    st1             {v0.8h}, x0, x1
235
+    st1             {v1.8h}, x0, x1
236
+.endr
237
+    ret
238
+endfunc
239
+
240
+function PFX(blockcopy_ss_16x16_neon)
241
+    lsl             x1, x1, #1
242
+    lsl             x3, x3, #1
243
+.rept 8
244
+    ld1             {v0.8h-v1.8h}, x2, x3
245
+    ld1             {v2.8h-v3.8h}, x2, x3
246
+    st1             {v0.8h-v1.8h}, x0, x1
247
+    st1             {v2.8h-v3.8h}, x0, x1
248
+.endr
249
+    ret
250
+endfunc
251
+
252
+function PFX(blockcopy_ss_32x32_neon)
253
+    lsl             x1, x1, #1
254
+    lsl             x3, x3, #1
255
+    mov             w12, #4
256
+.loop_css32:
257
+    sub             w12, w12, #1
258
+.rept 8
259
+    ld1             {v0.8h-v3.8h}, x2, x3
260
+    st1             {v0.8h-v3.8h}, x0, x1
261
+.endr
262
+    cbnz            w12, .loop_css32
263
+    ret
264
+endfunc
265
+
266
+function PFX(blockcopy_ss_64x64_neon)
267
+    lsl             x1, x1, #1
268
+    sub             x1, x1, #64
269
+    lsl             x3, x3, #1
270
+    sub             x3, x3, #64
271
+    mov             w12, #8
272
+.loop_css64:
273
+    sub             w12, w12, #1
274
+.rept 8
275
+    ld1             {v0.8h-v3.8h}, x2, #64
276
+    ld1             {v4.8h-v7.8h}, x2, x3
277
+    st1             {v0.8h-v3.8h}, x0, #64
278
+    st1             {v4.8h-v7.8h}, x0, x1
279
+.endr
280
+    cbnz            w12, .loop_css64
281
+    ret
282
+endfunc
283
+
284
+/******** Chroma blockcopy ********/
285
+function PFX(blockcopy_ss_4x8_neon)
286
+    lsl             x1, x1, #1
287
+    lsl             x3, x3, #1
288
+.rept 4
289
+    ld1             {v0.8b}, x2, x3
290
+    ld1             {v1.8b}, x2, x3
291
+    st1             {v0.8b}, x0, x1
292
+    st1             {v1.8b}, x0, x1
293
+.endr
294
+    ret
295
+endfunc
296
+
297
+function PFX(blockcopy_ss_8x16_neon)
298
+    lsl             x1, x1, #1
299
+    lsl             x3, x3, #1
300
+.rept 8
301
+    ld1             {v0.8h}, x2, x3
302
+    ld1             {v1.8h}, x2, x3
303
+    st1             {v0.8h}, x0, x1
304
+    st1             {v1.8h}, x0, x1
305
+.endr
306
+    ret
307
+endfunc
308
+
309
+function PFX(blockcopy_ss_16x32_neon)
310
+    lsl             x1, x1, #1
311
+    lsl             x3, x3, #1
312
+.rept 16
313
+    ld1             {v0.8h-v1.8h}, x2, x3
314
+    ld1             {v2.8h-v3.8h}, x2, x3
315
+    st1             {v0.8h-v1.8h}, x0, x1
316
+    st1             {v2.8h-v3.8h}, x0, x1
317
+.endr
318
+    ret
319
+endfunc
320
+
321
+function PFX(blockcopy_ss_32x64_neon)
322
+    lsl             x1, x1, #1
323
+    lsl             x3, x3, #1
324
+    mov             w12, #8
325
+.loop_css32x64:
326
+    sub             w12, w12, #1
327
+.rept 8
328
+    ld1             {v0.8h-v3.8h}, x2, x3
329
+    st1             {v0.8h-v3.8h}, x0, x1
330
+.endr
331
+    cbnz            w12, .loop_css32x64
332
+    ret
333
+endfunc
334
+
335
+// chroma blockcopy_ps
336
+function PFX(blockcopy_ps_4x8_neon)
337
+    lsl             x1, x1, #1
338
+.rept 4
339
+    ld1             {v0.8b}, x2, x3
340
+    ld1             {v1.8b}, x2, x3
341
+    uxtl            v0.8h, v0.8b
342
+    uxtl            v1.8h, v1.8b
343
+    st1             {v0.4h}, x0, x1
344
+    st1             {v1.4h}, x0, x1
345
+.endr
346
+    ret
347
+endfunc
348
+
349
+function PFX(blockcopy_ps_8x16_neon)
350
+    lsl             x1, x1, #1
351
+.rept 8
352
+    ld1             {v0.8b}, x2, x3
353
+    ld1             {v1.8b}, x2, x3
354
+    uxtl            v0.8h, v0.8b
355
+    uxtl            v1.8h, v1.8b
356
+    st1             {v0.8h}, x0, x1
357
+    st1             {v1.8h}, x0, x1
358
+.endr
359
+    ret
360
+endfunc
361
+
362
+function PFX(blockcopy_ps_16x32_neon)
363
+    lsl             x1, x1, #1
364
+.rept 16
365
+    ld1             {v4.16b}, x2, x3
366
+    ld1             {v5.16b}, x2, x3
367
+    uxtl            v0.8h, v4.8b
368
+    uxtl2           v1.8h, v4.16b
369
+    uxtl            v2.8h, v5.8b
370
+    uxtl2           v3.8h, v5.16b
371
+    st1             {v0.8h-v1.8h}, x0, x1
372
+    st1             {v2.8h-v3.8h}, x0, x1
373
+.endr
374
+    ret
375
+endfunc
376
+
377
+function PFX(blockcopy_ps_32x64_neon)
378
+    lsl             x1, x1, #1
379
+    mov             w12, #8
380
+.loop_cps32x64:
381
+    sub             w12, w12, #1
382
+.rept 4
383
+    ld1             {v16.16b-v17.16b}, x2, x3
384
+    ld1             {v18.16b-v19.16b}, x2, x3
385
+    uxtl            v0.8h, v16.8b
386
+    uxtl2           v1.8h, v16.16b
387
+    uxtl            v2.8h, v17.8b
388
+    uxtl2           v3.8h, v17.16b
389
+    uxtl            v4.8h, v18.8b
390
+    uxtl2           v5.8h, v18.16b
391
+    uxtl            v6.8h, v19.8b
392
+    uxtl2           v7.8h, v19.16b
393
+    st1             {v0.8h-v3.8h}, x0, x1
394
+    st1             {v4.8h-v7.8h}, x0, x1
395
+.endr
396
+    cbnz            w12, .loop_cps32x64
397
+    ret
398
+endfunc
399
+
400
+// chroma blockcopy_sp
401
+function PFX(blockcopy_sp_4x8_neon)
402
+    lsl             x3, x3, #1
403
+.rept 4
404
+    ld1             {v0.8h}, x2, x3
405
+    ld1             {v1.8h}, x2, x3
406
+    xtn             v0.8b, v0.8h
407
+    xtn             v1.8b, v1.8h
408
+    st1             {v0.s}0, x0, x1
409
+    st1             {v1.s}0, x0, x1
410
+.endr
411
+    ret
412
+endfunc
413
+
414
+function PFX(blockcopy_sp_8x16_neon)
415
+    lsl             x3, x3, #1
416
+.rept 8
417
+    ld1             {v0.8h}, x2, x3
418
+    ld1             {v1.8h}, x2, x3
419
+    xtn             v0.8b, v0.8h
420
+    xtn             v1.8b, v1.8h
421
+    st1             {v0.d}0, x0, x1
422
+    st1             {v1.d}0, x0, x1
423
+.endr
424
+    ret
425
+endfunc
426
+
427
+function PFX(blockcopy_sp_16x32_neon)
428
+    lsl             x3, x3, #1
429
+    movrel          x11, xtn_xtn2_table
430
+    ld1             {v31.16b}, x11
431
+.rept 16
432
+    ld1             {v0.8h-v1.8h}, x2, x3
433
+    ld1             {v2.8h-v3.8h}, x2, x3
434
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
435
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
436
+    st1             {v0.16b}, x0, x1
437
+    st1             {v1.16b}, x0, x1
438
+.endr
439
+    ret
440
+endfunc
441
+
442
+function PFX(blockcopy_sp_32x64_neon)
443
+    mov             w12, #8
444
+    lsl             x3, x3, #1
445
+    movrel          x11, xtn_xtn2_table
446
+    ld1             {v31.16b}, x11
447
+.loop_csp32x64:
448
+    sub             w12, w12, #1
449
+.rept 4
450
+    ld1             {v0.8h-v3.8h}, x2, x3
451
+    ld1             {v4.8h-v7.8h}, x2, x3
452
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
453
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
454
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
455
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
456
+    st1             {v0.16b-v1.16b}, x0, x1
457
+    st1             {v2.16b-v3.16b}, x0, x1
458
+.endr
459
+    cbnz            w12, .loop_csp32x64
460
+    ret
461
+endfunc
462
+
463
+/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
464
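+// Plain pixel copy. The 2- and 4-pixel-wide blocks below use scalar
+// ldrh/ldr pairs since a row is narrower than a NEON register; wider blocks
+// move one to four q registers per row.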
+
465
+function PFX(blockcopy_pp_2x4_neon)
466
+    ldrh            w9, x2
467
+    add             x4, x1, x1
468
+    add             x14, x3, x3
469
+    strh            w9, x0
470
+    ldrh            w10, x2, x3
471
+    add             x5, x4, x1
472
+    add             x15, x14, x3
473
+    strh            w10, x0, x1
474
+    ldrh            w11, x2, x14
475
+    strh            w11, x0, x4
476
+    ldrh            w12, x2, x15
477
+    strh            w12, x0, x5
478
+    ret
479
+endfunc
480
+
481
+.macro blockcopy_pp_2xN_neon h
482
+function PFX(blockcopy_pp_2x\h\()_neon)
483
+    add             x4, x1, x1
484
+    add             x5, x4, x1
485
+    add             x6, x5, x1
486
+
487
+    add             x14, x3, x3
488
+    add             x15, x14, x3
489
+    add             x16, x15, x3
490
+
491
+.rept \h / 4
492
+    ldrh            w9, x2
493
+    strh            w9, x0
494
+    ldrh            w10, x2, x3
495
+    strh            w10, x0, x1
496
+    ldrh            w11, x2, x14
497
+    strh            w11, x0, x4
498
+    ldrh            w12, x2, x15
499
+    strh            w12, x0, x5
500
+    add             x2, x2, x16
501
+    add             x0, x0, x6
502
+.endr
503
+    ret
504
+endfunc
505
+.endm
506
+
507
+blockcopy_pp_2xN_neon 8
508
+blockcopy_pp_2xN_neon 16
509
+
510
+function PFX(blockcopy_pp_4x2_neon)
511
+    ldr             w9, x2
512
+    str             w9, x0
513
+    ldr             w10, x2, x3
514
+    str             w10, x0, x1
515
+    ret
516
+endfunc
517
+
518
+function PFX(blockcopy_pp_4x4_neon)
519
+    ldr             w9, x2
520
+    add             x4, x1, x1
521
+    add             x14, x3, x3
522
+    str             w9, x0
523
+    ldr             w10, x2, x3
524
+    add             x5, x4, x1
525
+    add             x15, x14, x3
526
+    str             w10, x0, x1
527
+    ldr             w11, x2, x14
528
+    str             w11, x0, x4
529
+    ldr             w12, x2, x15
530
+    str             w12, x0, x5
531
+    ret
532
+endfunc
533
+
534
+.macro blockcopy_pp_4xN_neon h
535
+function PFX(blockcopy_pp_4x\h\()_neon)
536
+    add             x4, x1, x1
537
+    add             x5, x4, x1
538
+    add             x6, x5, x1
539
+
540
+    add             x14, x3, x3
541
+    add             x15, x14, x3
542
+    add             x16, x15, x3
543
+
544
+.rept \h / 4
545
+    ldr             w9, x2
546
+    str             w9, x0
547
+    ldr             w10, x2, x3
548
+    str             w10, x0, x1
549
+    ldr             w11, x2, x14
550
+    str             w11, x0, x4
551
+    ldr             w12, x2, x15
552
+    str             w12, x0, x5
553
+    add             x2, x2, x16
554
+    add             x0, x0, x6
555
+.endr
556
+    ret
557
+endfunc
558
+.endm
559
+
560
+blockcopy_pp_4xN_neon 8
561
+blockcopy_pp_4xN_neon 16
562
+blockcopy_pp_4xN_neon 32
563
+
564
+.macro blockcopy_pp_6xN_neon h
565
+function PFX(blockcopy_pp_6x\h\()_neon)
566
+    sub             x1, x1, #4
567
+.rept \h
568
+    ld1             {v0.8b}, x2, x3
569
+    st1             {v0.s}0, x0, #4
570
+    st1             {v0.h}2, x0, x1
571
+.endr
572
+    ret
573
+endfunc
574
+.endm
575
+
576
+blockcopy_pp_6xN_neon 8
577
+blockcopy_pp_6xN_neon 16
578
+
579
+.macro blockcopy_pp_8xN_neon h
580
+function PFX(blockcopy_pp_8x\h\()_neon)
581
+.rept \h
582
+    ld1             {v0.4h}, x2, x3
583
+    st1             {v0.4h}, x0, x1
584
+.endr
585
+    ret
586
+endfunc
587
+.endm
588
+
589
+blockcopy_pp_8xN_neon 2
590
+blockcopy_pp_8xN_neon 4
591
+blockcopy_pp_8xN_neon 6
592
+blockcopy_pp_8xN_neon 8
593
+blockcopy_pp_8xN_neon 12
594
+blockcopy_pp_8xN_neon 16
595
+blockcopy_pp_8xN_neon 32
596
+
597
+function PFX(blockcopy_pp_8x64_neon)
598
+    mov             w12, #4
599
+.loop_pp_8x64:
600
+    sub             w12, w12, #1
601
+.rept 16
602
+    ld1             {v0.4h}, x2, x3
603
+    st1             {v0.4h}, x0, x1
604
+.endr
605
+    cbnz            w12, .loop_pp_8x64
606
+    ret
607
+endfunc
608
+
609
+.macro blockcopy_pp_16xN_neon h
610
+function PFX(blockcopy_pp_16x\h\()_neon)
611
+.rept \h
612
+    ld1             {v0.8h}, x2, x3
613
+    st1             {v0.8h}, x0, x1
614
+.endr
615
+    ret
616
+endfunc
617
+.endm
618
+
619
+blockcopy_pp_16xN_neon 4
620
+blockcopy_pp_16xN_neon 8
621
+blockcopy_pp_16xN_neon 12
622
+blockcopy_pp_16xN_neon 16
623
+
624
+.macro blockcopy_pp_16xN1_neon h
625
+function PFX(blockcopy_pp_16x\h\()_neon)
626
+    mov             w12, #\h / 8
627
+.loop_16x\h\():
628
+.rept 8
629
+    ld1             {v0.8h}, x2, x3
630
+    st1             {v0.8h}, x0, x1
631
+.endr
632
+    sub             w12, w12, #1
633
+    cbnz            w12, .loop_16x\h
634
+    ret
635
+endfunc
636
+.endm
637
+
638
+blockcopy_pp_16xN1_neon 24
639
+blockcopy_pp_16xN1_neon 32
640
+blockcopy_pp_16xN1_neon 64
641
+
642
+function PFX(blockcopy_pp_12x16_neon)
643
+    sub             x1, x1, #8
644
+.rept 16
645
+    ld1             {v0.16b}, x2, x3
646
+    str             d0, x0, #8
647
+    st1             {v0.s}2, x0, x1
648
+.endr
649
+    ret
650
+endfunc
651
+
652
+function PFX(blockcopy_pp_12x32_neon)
653
+    sub             x1, x1, #8
654
+    mov             w12, #4
655
+.loop_pp_12x32:
656
+    sub             w12, w12, #1
657
+.rept 8
658
+    ld1             {v0.16b}, x2, x3
659
+    str             d0, x0, #8
660
+    st1             {v0.s}2, x0, x1
661
+.endr
662
+    cbnz            w12, .loop_pp_12x32
663
+    ret
664
+endfunc
665
+
666
+function PFX(blockcopy_pp_24x32_neon)
667
+    mov             w12, #4
668
+.loop_24x32:
669
+    sub             w12, w12, #1
670
+.rept 8
671
+    ld1             {v0.8b-v2.8b}, x2, x3
672
+    st1             {v0.8b-v2.8b}, x0, x1
673
+.endr
674
+    cbnz            w12, .loop_24x32
675
+    ret
676
+endfunc
677
+
678
+function PFX(blockcopy_pp_24x64_neon)
679
+    mov             w12, #4
680
+.loop_24x64:
681
+    sub             w12, w12, #1
682
+.rept 16
683
+    ld1             {v0.8b-v2.8b}, x2, x3
684
+    st1             {v0.8b-v2.8b}, x0, x1
685
+.endr
686
+    cbnz            w12, .loop_24x64
687
+    ret
688
+endfunc
689
+
690
+function PFX(blockcopy_pp_32x8_neon)
691
+.rept 8
692
+    ld1             {v0.16b-v1.16b}, x2, x3
693
+    st1             {v0.16b-v1.16b}, x0, x1
694
+.endr
695
+    ret
696
+endfunc
697
+
698
+.macro blockcopy_pp_32xN_neon h
699
+function PFX(blockcopy_pp_32x\h\()_neon)
700
+    mov             w12, #\h / 8
701
+.loop_32x\h\():
702
+    sub             w12, w12, #1
703
+.rept 8
704
+    ld1             {v0.16b-v1.16b}, x2, x3
705
+    st1             {v0.16b-v1.16b}, x0, x1
706
+.endr
707
+    cbnz            w12, .loop_32x\h
708
+    ret
709
+endfunc
710
+.endm
711
+
712
+blockcopy_pp_32xN_neon 16
713
+blockcopy_pp_32xN_neon 24
714
+blockcopy_pp_32xN_neon 32
715
+blockcopy_pp_32xN_neon 64
716
+blockcopy_pp_32xN_neon 48
717
+
718
+function PFX(blockcopy_pp_48x64_neon)
719
+    mov             w12, #8
720
+.loop_48x64:
721
+    sub             w12, w12, #1
722
+.rept 8
723
+    ld1             {v0.16b-v2.16b}, x2, x3
724
+    st1             {v0.16b-v2.16b}, x0, x1
725
+.endr
726
+    cbnz            w12, .loop_48x64
727
+    ret
728
+endfunc
729
+
730
+.macro blockcopy_pp_64xN_neon h
731
+function PFX(blockcopy_pp_64x\h\()_neon)
732
+    mov             w12, #\h / 4
733
+.loop_64x\h\():
734
+    sub             w12, w12, #1
735
+.rept 4
736
+    ld1             {v0.16b-v3.16b}, x2, x3
737
+    st1             {v0.16b-v3.16b}, x0, x1
738
+.endr
739
+    cbnz            w12, .loop_64x\h
740
+    ret
741
+endfunc
742
+.endm
743
+
744
+blockcopy_pp_64xN_neon 16
745
+blockcopy_pp_64xN_neon 32
746
+blockcopy_pp_64xN_neon 48
747
+blockcopy_pp_64xN_neon 64
748
+
749
+// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
750
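+// Broadcasts val across a vector and stores it over every row; dstride is
+// given in int16_t units, hence the lsl #1 to convert it to a byte stride.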
+function PFX(blockfill_s_4x4_neon)
751
+    dup             v0.4h, w2
752
+    lsl             x1, x1, #1
753
+.rept 4
754
+    st1             {v0.4h}, x0, x1
755
+.endr
756
+    ret
757
+endfunc
758
+
759
+function PFX(blockfill_s_8x8_neon)
760
+    dup             v0.8h, w2
761
+    lsl             x1, x1, #1
762
+.rept 8
763
+    st1             {v0.8h}, x0, x1
764
+.endr
765
+    ret
766
+endfunc
767
+
768
+function PFX(blockfill_s_16x16_neon)
769
+    dup             v0.8h, w2
770
+    mov             v1.16b, v0.16b
771
+    lsl             x1, x1, #1
772
+.rept 16
773
+    stp             q0, q1, x0
774
+    add             x0, x0, x1
775
+.endr
776
+    ret
777
+endfunc
778
+
779
+function PFX(blockfill_s_32x32_neon)
780
+    dup             v0.8h, w2
781
+    mov             v1.16b, v0.16b
782
+    mov             v2.16b, v0.16b
783
+    mov             v3.16b, v0.16b
784
+    lsl             x1, x1, #1
785
+.rept 32
786
+    st1             {v0.8h-v3.8h}, x0, x1
787
+.endr
788
+    ret
789
+endfunc
790
+
791
+function PFX(blockfill_s_64x64_neon)
792
+    dup             v0.8h, w2
793
+    mov             v1.16b, v0.16b
794
+    mov             v2.16b, v0.16b
795
+    mov             v3.16b, v0.16b
796
+    lsl             x1, x1, #1
797
+    sub             x1, x1, #64
798
+.rept 64
799
+    st1             {v0.8h-v3.8h}, x0, #64
800
+    st1             {v0.8h-v3.8h}, x0, x1
801
+.endr
802
+    ret
803
+endfunc
804
+
805
+// uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
806
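+// Editor's note: these kernels copy the residual block into coeff and
+// return the nonzero count. cmeq writes -1 into each lane that equals zero,
+// so the accumulated lane sum is minus the number of zeros; adding the
+// block's element count (16/64/256/1024 below) yields the nonzero count.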
+function PFX(copy_cnt_4_neon)
807
+    lsl             x2, x2, #1
808
+    movi            v4.8b, #0
809
+.rept 2
810
+    ld1             {v0.8b}, x1, x2
811
+    ld1             {v1.8b}, x1, x2
812
+    stp             d0, d1, x0, #16
813
+    cmeq            v0.4h, v0.4h, #0
814
+    cmeq            v1.4h, v1.4h, #0
815
+    add             v4.4h, v4.4h, v0.4h
816
+    add             v4.4h, v4.4h, v1.4h
817
+.endr
818
+    saddlv          s4, v4.4h
819
+    fmov            w12, s4
820
+    add             w0, w12, #16
821
+    ret
822
+endfunc
823
+
824
+function PFX(copy_cnt_8_neon)
825
+    lsl             x2, x2, #1
826
+    movi            v4.8b, #0
827
+.rept 4
828
+    ld1             {v0.16b}, x1, x2
829
+    ld1             {v1.16b}, x1, x2
830
+    stp             q0, q1, x0, #32
831
+    cmeq            v0.8h, v0.8h, #0
832
+    cmeq            v1.8h, v1.8h, #0
833
+    add             v4.8h, v4.8h, v0.8h
834
+    add             v4.8h, v4.8h, v1.8h
835
+.endr
836
+    saddlv          s4, v4.8h
837
+    fmov            w12, s4
838
+    add             w0, w12, #64
839
+    ret
840
+endfunc
841
+
842
+function PFX(copy_cnt_16_neon)
843
+    lsl             x2, x2, #1
844
+    movi            v4.8b, #0
845
+.rept 16
846
+    ld1             {v0.16b-v1.16b}, x1, x2
847
+    st1             {v0.16b-v1.16b}, x0, #32
848
+    cmeq            v0.8h, v0.8h, #0
849
+    cmeq            v1.8h, v1.8h, #0
850
+    add             v4.8h, v4.8h, v0.8h
851
+    add             v4.8h, v4.8h, v1.8h
852
+.endr
853
+    saddlv          s4, v4.8h
854
+    fmov            w12, s4
855
+    add             w0, w12, #256
856
+    ret
857
+endfunc
858
+
859
+function PFX(copy_cnt_32_neon)
860
+    lsl             x2, x2, #1
861
+    movi            v4.8b, #0
862
+.rept 32
863
+    ld1             {v0.16b-v3.16b}, x1, x2
864
+    st1             {v0.16b-v3.16b}, x0, #64
865
+    cmeq            v0.8h, v0.8h, #0
866
+    cmeq            v1.8h, v1.8h, #0
867
+    cmeq            v2.8h, v2.8h, #0
868
+    cmeq            v3.8h, v3.8h, #0
869
+    add             v0.8h, v0.8h, v1.8h
870
+    add             v2.8h, v2.8h, v3.8h
871
+    add             v4.8h, v4.8h, v0.8h
872
+    add             v4.8h, v4.8h, v2.8h
873
+.endr
874
+    saddlv          s4, v4.8h
875
+    fmov            w12, s4
876
+    add             w0, w12, #1024
877
+    ret
878
+endfunc
879
+
880
+// int  count_nonzero_c(const int16_t* quantCoeff)
881
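+// The trn1 of the #1 and #0 byte vectors leaves the halfword constant
+// 0x0001 in v16; cmhi against zero turns every nonzero coefficient into
+// 0xffff, and the mask reduces that to 1 per nonzero halfword before the
+// lane sum.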
+function PFX(count_nonzero_4_neon)
882
+    movi            v16.16b, #1
883
+    movi            v17.16b, #0
884
+    trn1            v16.16b, v16.16b, v17.16b
885
+    ldp             q0, q1, x0
886
+    cmhi            v0.8h, v0.8h, v17.8h
887
+    cmhi            v1.8h, v1.8h, v17.8h
888
+    and             v0.16b, v0.16b, v16.16b
889
+    and             v1.16b, v1.16b, v16.16b
890
+    add             v0.8h, v0.8h, v1.8h
891
+    uaddlv          s0, v0.8h
892
+    fmov            w0, s0
893
+    ret
894
+endfunc
895
+
896
+.macro COUNT_NONZERO_8
897
+    ld1             {v0.16b-v3.16b}, x0, #64
898
+    ld1             {v4.16b-v7.16b}, x0, #64
899
+    cmhi            v0.8h, v0.8h, v17.8h
900
+    cmhi            v1.8h, v1.8h, v17.8h
901
+    cmhi            v2.8h, v2.8h, v17.8h
902
+    cmhi            v3.8h, v3.8h, v17.8h
903
+    cmhi            v4.8h, v4.8h, v17.8h
904
+    cmhi            v5.8h, v5.8h, v17.8h
905
+    cmhi            v6.8h, v6.8h, v17.8h
906
+    cmhi            v7.8h, v7.8h, v17.8h
907
+    and             v0.16b, v0.16b, v16.16b
908
+    and             v1.16b, v1.16b, v16.16b
909
+    and             v2.16b, v2.16b, v16.16b
910
+    and             v3.16b, v3.16b, v16.16b
911
+    and             v4.16b, v4.16b, v16.16b
912
+    and             v5.16b, v5.16b, v16.16b
913
+    and             v6.16b, v6.16b, v16.16b
914
+    and             v7.16b, v7.16b, v16.16b
915
+    add             v0.8h, v0.8h, v1.8h
916
+    add             v2.8h, v2.8h, v3.8h
917
+    add             v4.8h, v4.8h, v5.8h
918
+    add             v6.8h, v6.8h, v7.8h
919
+    add             v0.8h, v0.8h, v2.8h
920
+    add             v4.8h, v4.8h, v6.8h
921
+    add             v0.8h, v0.8h, v4.8h
922
+.endm
923
+
924
+function PFX(count_nonzero_8_neon)
925
+    movi            v16.16b, #1
926
+    movi            v17.16b, #0
927
+    trn1            v16.16b, v16.16b, v17.16b
928
+    COUNT_NONZERO_8
929
+    uaddlv          s0, v0.8h
930
+    fmov            w0, s0
931
+    ret
932
+endfunc
933
+
934
+function PFX(count_nonzero_16_neon)
935
+    movi            v16.16b, #1
936
+    movi            v17.16b, #0
937
+    trn1            v16.16b, v16.16b, v17.16b
938
+    movi            v18.16b, #0
939
+.rept 4
940
+    COUNT_NONZERO_8
941
+    add             v18.16b, v18.16b, v0.16b
942
+.endr
943
+    uaddlv          s0, v18.8h
944
+    fmov            w0, s0
945
+    ret
946
+endfunc
947
+
948
+function PFX(count_nonzero_32_neon)
949
+    movi            v16.16b, #1
950
+    movi            v17.16b, #0
951
+    trn1            v16.16b, v16.16b, v17.16b
952
+    movi            v18.16b, #0
953
+    mov             w12, #16
954
+.loop_count_nonzero_32:
955
+    sub             w12, w12, #1
956
+    COUNT_NONZERO_8
957
+    add             v18.16b, v18.16b, v0.16b
958
+    cbnz            w12, .loop_count_nonzero_32
959
+
960
+    uaddlv          s0, v18.8h
961
+    fmov            w0, s0
962
+    ret
963
+endfunc
964
+
965
+// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
966
+.macro cpy2Dto1D_shl_start
967
+    add             x2, x2, x2
968
+    dup             v0.8h, w3
969
+.endm
970
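+// Common prologue: srcStride arrives in int16_t units and is doubled to a
+// byte stride; the shift count is broadcast to v0 for use with sshl.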
+
971
+function PFX(cpy2Dto1D_shl_4x4_neon)
972
+    cpy2Dto1D_shl_start
973
+    ld1             {v2.d}0, x1, x2
974
+    ld1             {v2.d}1, x1, x2
975
+    ld1             {v3.d}0, x1, x2
976
+    ld1             {v3.d}1, x1, x2
977
+    sshl            v2.8h, v2.8h, v0.8h
978
+    sshl            v3.8h, v3.8h, v0.8h
979
+    st1             {v2.16b-v3.16b}, x0
980
+    ret
981
+endfunc
982
+
983
+function PFX(cpy2Dto1D_shl_8x8_neon)
984
+    cpy2Dto1D_shl_start
985
+.rept 4
986
+    ld1             {v2.16b}, x1, x2
987
+    ld1             {v3.16b}, x1, x2
988
+    sshl            v2.8h, v2.8h, v0.8h
989
+    sshl            v3.8h, v3.8h, v0.8h
990
+    st1             {v2.16b-v3.16b}, x0, #32
991
+.endr
992
+    ret
993
+endfunc
994
+
995
+function PFX(cpy2Dto1D_shl_16x16_neon)
996
+    cpy2Dto1D_shl_start
997
+    mov             w12, #4
998
+.loop_cpy2Dto1D_shl_16:
999
+    sub             w12, w12, #1
1000
+.rept 4
1001
+    ld1             {v2.16b-v3.16b}, x1, x2
1002
+    sshl            v2.8h, v2.8h, v0.8h
1003
+    sshl            v3.8h, v3.8h, v0.8h
1004
+    st1             {v2.16b-v3.16b}, x0, #32
1005
+.endr
1006
+    cbnz            w12, .loop_cpy2Dto1D_shl_16
1007
+    ret
1008
+endfunc
1009
+
1010
+function PFX(cpy2Dto1D_shl_32x32_neon)
1011
+    cpy2Dto1D_shl_start
1012
+    mov             w12, #16
1013
+.loop_cpy2Dto1D_shl_32:
1014
+    sub             w12, w12, #1
1015
+.rept 2
1016
+    ld1             {v2.16b-v5.16b}, x1, x2
1017
+    sshl            v2.8h, v2.8h, v0.8h
1018
+    sshl            v3.8h, v3.8h, v0.8h
1019
+    sshl            v4.8h, v4.8h, v0.8h
1020
+    sshl            v5.8h, v5.8h, v0.8h
1021
+    st1             {v2.16b-v5.16b}, x0, #64
1022
+.endr
1023
+    cbnz            w12, .loop_cpy2Dto1D_shl_32
1024
+    ret
1025
+endfunc
1026
+
1027
+function PFX(cpy2Dto1D_shl_64x64_neon)
1028
+    cpy2Dto1D_shl_start
1029
+    mov             w12, #32
1030
+    sub             x2, x2, #64
1031
+.loop_cpy2Dto1D_shl_64:
1032
+    sub             w12, w12, #1
1033
+.rept 2
1034
+    ld1             {v2.16b-v5.16b}, x1, #64
1035
+    ld1             {v16.16b-v19.16b}, x1, x2
1036
+    sshl            v2.8h, v2.8h, v0.8h
1037
+    sshl            v3.8h, v3.8h, v0.8h
1038
+    sshl            v4.8h, v4.8h, v0.8h
1039
+    sshl            v5.8h, v5.8h, v0.8h
1040
+    sshl            v16.8h, v16.8h, v0.8h
1041
+    sshl            v17.8h, v17.8h, v0.8h
1042
+    sshl            v18.8h, v18.8h, v0.8h
1043
+    sshl            v19.8h, v19.8h, v0.8h
1044
+    st1             {v2.16b-v5.16b}, x0, #64
1045
+    st1             {v16.16b-v19.16b}, x0, #64
1046
+.endr
1047
+    cbnz            w12, .loop_cpy2Dto1D_shl_64
1048
+    ret
1049
+endfunc
1050
+
1051
+// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
1052
+function PFX(cpy2Dto1D_shr_4x4_neon)
1053
+    cpy2Dto1D_shr_start
1054
+    ld1             {v2.d}0, x1, x2
1055
+    ld1             {v2.d}1, x1, x2
1056
+    ld1             {v3.d}0, x1, x2
1057
+    ld1             {v3.d}1, x1, x2
1058
+    sub             v2.8h, v2.8h, v1.8h
1059
+    sub             v3.8h, v3.8h, v1.8h
1060
+    sshl            v2.8h, v2.8h, v0.8h
1061
+    sshl            v3.8h, v3.8h, v0.8h
1062
+    stp             q2, q3, x0
1063
+    ret
1064
+endfunc
1065
+
1066
+function PFX(cpy2Dto1D_shr_8x8_neon)
1067
+    cpy2Dto1D_shr_start
1068
+.rept 4
1069
+    ld1             {v2.16b}, x1, x2
1070
+    ld1             {v3.16b}, x1, x2
1071
+    sub             v2.8h, v2.8h, v1.8h
1072
+    sub             v3.8h, v3.8h, v1.8h
1073
+    sshl            v2.8h, v2.8h, v0.8h
1074
+    sshl            v3.8h, v3.8h, v0.8h
1075
+    stp             q2, q3, x0, #32
1076
+.endr
1077
+    ret
1078
+endfunc
1079
+
1080
+function PFX(cpy2Dto1D_shr_16x16_neon)
1081
+    cpy2Dto1D_shr_start
1082
+    mov             w12, #4
1083
+.loop_cpy2Dto1D_shr_16:
1084
+    sub             w12, w12, #1
1085
+.rept 4
1086
+    ld1             {v2.8h-v3.8h}, x1, x2
1087
+    sub             v2.8h, v2.8h, v1.8h
1088
+    sub             v3.8h, v3.8h, v1.8h
1089
+    sshl            v2.8h, v2.8h, v0.8h
1090
+    sshl            v3.8h, v3.8h, v0.8h
1091
+    st1             {v2.8h-v3.8h}, x0, #32
1092
+.endr
1093
+    cbnz            w12, .loop_cpy2Dto1D_shr_16
1094
+    ret
1095
+endfunc
1096
+
1097
+function PFX(cpy2Dto1D_shr_32x32_neon)
1098
+    cpy2Dto1D_shr_start
1099
+    mov             w12, #16
1100
+.loop_cpy2Dto1D_shr_32:
1101
+    sub             w12, w12, #1
1102
+.rept 2
1103
+    ld1             {v2.8h-v5.8h}, x1, x2
1104
+    sub             v2.8h, v2.8h, v1.8h
1105
+    sub             v3.8h, v3.8h, v1.8h
1106
+    sub             v4.8h, v4.8h, v1.8h
1107
+    sub             v5.8h, v5.8h, v1.8h
1108
+    sshl            v2.8h, v2.8h, v0.8h
1109
+    sshl            v3.8h, v3.8h, v0.8h
1110
+    sshl            v4.8h, v4.8h, v0.8h
1111
+    sshl            v5.8h, v5.8h, v0.8h
1112
+    st1             {v2.8h-v5.8h}, x0, #64
1113
+.endr
1114
+    cbnz            w12, .loop_cpy2Dto1D_shr_32
1115
+    ret
1116
+endfunc
1117
+
1118
+// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1119
+.macro cpy1Dto2D_shl_start
1120
+    add             x2, x2, x2
1121
+    dup             v0.8h, w3
1122
+.endm
1123
+
1124
+function PFX(cpy1Dto2D_shl_4x4_neon)
1125
+    cpy1Dto2D_shl_start
1126
+    ld1             {v2.16b-v3.16b}, x1
1127
+    sshl            v2.8h, v2.8h, v0.8h
1128
+    sshl            v3.8h, v3.8h, v0.8h
1129
+    st1             {v2.d}0, x0, x2
1130
+    st1             {v2.d}1, x0, x2
1131
+    st1             {v3.d}0, x0, x2
1132
+    st1             {v3.d}1, x0, x2
1133
+    ret
1134
+endfunc
1135
+
1136
+function PFX(cpy1Dto2D_shl_8x8_neon)
1137
+    cpy1Dto2D_shl_start
1138
+.rept 4
1139
+    ld1             {v2.16b-v3.16b}, x1, #32
1140
+    sshl            v2.8h, v2.8h, v0.8h
1141
+    sshl            v3.8h, v3.8h, v0.8h
1142
+    st1             {v2.16b}, x0, x2
1143
+    st1             {v3.16b}, x0, x2
1144
+.endr
1145
+    ret
1146
+endfunc
1147
+
1148
+function PFX(cpy1Dto2D_shl_16x16_neon)
1149
+    cpy1Dto2D_shl_start
1150
+    mov             w12, #4
1151
+.loop_cpy1Dto2D_shl_16:
1152
+    sub             w12, w12, #1
1153
+.rept 4
1154
+    ld1             {v2.16b-v3.16b}, x1, #32
1155
+    sshl            v2.8h, v2.8h, v0.8h
1156
+    sshl            v3.8h, v3.8h, v0.8h
1157
+    st1             {v2.16b-v3.16b}, x0, x2
1158
+.endr
1159
+    cbnz            w12, .loop_cpy1Dto2D_shl_16
1160
+    ret
1161
+endfunc
1162
+
1163
+function PFX(cpy1Dto2D_shl_32x32_neon)
1164
+    cpy1Dto2D_shl_start
1165
+    mov             w12, #16
1166
+.loop_cpy1Dto2D_shl_32:
1167
+    sub             w12, w12, #1
1168
+.rept 2
1169
+    ld1             {v2.16b-v5.16b}, x1, #64
1170
+    sshl            v2.8h, v2.8h, v0.8h
1171
+    sshl            v3.8h, v3.8h, v0.8h
1172
+    sshl            v4.8h, v4.8h, v0.8h
1173
+    sshl            v5.8h, v5.8h, v0.8h
1174
+    st1             {v2.16b-v5.16b}, x0, x2
1175
+.endr
1176
+    cbnz            w12, .loop_cpy1Dto2D_shl_32
1177
+    ret
1178
+endfunc
1179
+
1180
+function PFX(cpy1Dto2D_shl_64x64_neon)
1181
+    cpy1Dto2D_shl_start
1182
+    mov             w12, #32
1183
+    sub             x2, x2, #64
1184
+.loop_cpy1Dto2D_shl_64:
1185
+    sub             w12, w12, #1
1186
+.rept 2
1187
+    ld1             {v2.16b-v5.16b}, x1, #64
1188
+    ld1             {v16.16b-v19.16b}, x1, #64
1189
+    sshl            v2.8h, v2.8h, v0.8h
1190
+    sshl            v3.8h, v3.8h, v0.8h
1191
+    sshl            v4.8h, v4.8h, v0.8h
1192
+    sshl            v5.8h, v5.8h, v0.8h
1193
+    sshl            v16.8h, v16.8h, v0.8h
1194
+    sshl            v17.8h, v17.8h, v0.8h
1195
+    sshl            v18.8h, v18.8h, v0.8h
1196
+    sshl            v19.8h, v19.8h, v0.8h
1197
+    st1             {v2.16b-v5.16b}, x0, #64
1198
+    st1             {v16.16b-v19.16b}, x0, x2
1199
+.endr
1200
+    cbnz            w12, .loop_cpy1Dto2D_shl_64
1201
+    ret
1202
+endfunc
1203
+
1204
+function PFX(cpy1Dto2D_shr_4x4_neon)
1205
+    cpy1Dto2D_shr_start
1206
+    ld1             {v2.16b-v3.16b}, x1
1207
+    sub             v2.8h, v2.8h, v1.8h
1208
+    sub             v3.8h, v3.8h, v1.8h
1209
+    sshl            v2.8h, v2.8h, v0.8h
1210
+    sshl            v3.8h, v3.8h, v0.8h
1211
+    st1             {v2.d}0, x0, x2
1212
+    st1             {v2.d}1, x0, x2
1213
+    st1             {v3.d}0, x0, x2
1214
+    st1             {v3.d}1, x0, x2
1215
+    ret
1216
+endfunc
1217
+
1218
+function PFX(cpy1Dto2D_shr_8x8_neon)
1219
+    cpy1Dto2D_shr_start
1220
+.rept 4
1221
+    ld1             {v2.16b-v3.16b}, x1, #32
1222
+    sub             v2.8h, v2.8h, v1.8h
1223
+    sub             v3.8h, v3.8h, v1.8h
1224
+    sshl            v2.8h, v2.8h, v0.8h
1225
+    sshl            v3.8h, v3.8h, v0.8h
1226
+    st1             {v2.16b}, x0, x2
1227
+    st1             {v3.16b}, x0, x2
1228
+.endr
1229
+    ret
1230
+endfunc
1231
+
1232
+function PFX(cpy1Dto2D_shr_16x16_neon)
1233
+    cpy1Dto2D_shr_start
1234
+    mov             w12, #4
1235
+.loop_cpy1Dto2D_shr_16:
1236
+    sub             w12, w12, #1
1237
+.rept 4
1238
+    ld1             {v2.8h-v3.8h}, x1, #32
1239
+    sub             v2.8h, v2.8h, v1.8h
1240
+    sub             v3.8h, v3.8h, v1.8h
1241
+    sshl            v2.8h, v2.8h, v0.8h
1242
+    sshl            v3.8h, v3.8h, v0.8h
1243
+    st1             {v2.8h-v3.8h}, x0, x2
1244
+.endr
1245
+    cbnz            w12, .loop_cpy1Dto2D_shr_16
1246
+    ret
1247
+endfunc
1248
+
1249
+function PFX(cpy1Dto2D_shr_32x32_neon)
1250
+    cpy1Dto2D_shr_start
1251
+    mov             w12, #16
1252
+.loop_cpy1Dto2D_shr_32:
1253
+    sub             w12, w12, #1
1254
+.rept 2
1255
+    ld1             {v2.16b-v5.16b}, x1, #64
1256
+    sub             v2.8h, v2.8h, v1.8h
1257
+    sub             v3.8h, v3.8h, v1.8h
1258
+    sub             v4.8h, v4.8h, v1.8h
1259
+    sub             v5.8h, v5.8h, v1.8h
1260
+    sshl            v2.8h, v2.8h, v0.8h
1261
+    sshl            v3.8h, v3.8h, v0.8h
1262
+    sshl            v4.8h, v4.8h, v0.8h
1263
+    sshl            v5.8h, v5.8h, v0.8h
1264
+    st1             {v2.16b-v5.16b}, x0, x2
1265
+.endr
1266
+    cbnz            w12, .loop_cpy1Dto2D_shr_32
1267
+    ret
1268
+endfunc
1269
+
1270
+function PFX(cpy1Dto2D_shr_64x64_neon)
1271
+    cpy1Dto2D_shr_start
1272
+    mov             w12, #32
1273
+    sub             x2, x2, #64
1274
+.loop_cpy1Dto2D_shr_64:
1275
+    sub             w12, w12, #1
1276
+.rept 2
1277
+    ld1             {v2.16b-v5.16b}, x1, #64
1278
+    ld1             {v16.16b-v19.16b}, x1, #64
1279
+    sub             v2.8h, v2.8h, v1.8h
1280
+    sub             v3.8h, v3.8h, v1.8h
1281
+    sub             v4.8h, v4.8h, v1.8h
1282
+    sub             v5.8h, v5.8h, v1.8h
1283
+    sub             v16.8h, v16.8h, v1.8h
1284
+    sub             v17.8h, v17.8h, v1.8h
1285
+    sub             v18.8h, v18.8h, v1.8h
1286
+    sub             v19.8h, v19.8h, v1.8h
1287
+    sshl            v2.8h, v2.8h, v0.8h
1288
+    sshl            v3.8h, v3.8h, v0.8h
1289
+    sshl            v4.8h, v4.8h, v0.8h
1290
+    sshl            v5.8h, v5.8h, v0.8h
1291
+    sshl            v16.8h, v16.8h, v0.8h
1292
+    sshl            v17.8h, v17.8h, v0.8h
1293
+    sshl            v18.8h, v18.8h, v0.8h
1294
+    sshl            v19.8h, v19.8h, v0.8h
1295
+    st1             {v2.16b-v5.16b}, x0, #64
1296
+    st1             {v16.16b-v19.16b}, x0, x2
1297
+.endr
1298
+    cbnz            w12, .loop_cpy1Dto2D_shr_64
1299
+    ret
1300
+endfunc
1301
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp Added
950
 
1
@@ -0,0 +1,948 @@
2
+#include "dct-prim.h"
3
+
4
+
5
+#if HAVE_NEON
6
+
7
+#include <arm_neon.h>
8
+
9
+
10
+namespace
11
+{
12
+using namespace X265_NS;
13
+
14
+
15
+static int16x8_t rev16(const int16x8_t a)
16
+{
17
+    static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
18
+    return vqtbx1q_u8(a, a, tbl);
19
+}
20
+
21
+static int32x4_t rev32(const int32x4_t a)
22
+{
23
+    static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
24
+    return vqtbx1q_u8(a, a, tbl);
25
+}
26
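+// rev16/rev32 reverse the element order of a vector through a byte-table
+// lookup; the butterflies below use them to pair src[i] with its mirrored
+// partner src[N-1-i] without any scalar shuffling.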
+
27
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
28
+{
29
+    int16x4_t s0, s1, s2, s3;
30
+    s0 = vtrn1_s32(x0, x2);
31
+    s1 = vtrn1_s32(x1, x3);
32
+    s2 = vtrn2_s32(x0, x2);
33
+    s3 = vtrn2_s32(x1, x3);
34
+
35
+    x0 = vtrn1_s16(s0, s1);
36
+    x1 = vtrn2_s16(s0, s1);
37
+    x2 = vtrn1_s16(s2, s3);
38
+    x3 = vtrn2_s16(s2, s3);
39
+}
40
+
41
+
42
+
43
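+// Walks the coefficients in scan order until all numSig nonzero values have
+// been seen; for each 4x4 coding group it packs the sign bits (coeffSign), a
+// significance bitmap (coeffFlag) and a nonzero count (coeffNum), and
+// returns the position of the last significant coefficient.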
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
44
+                           uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
45
+{
46
+
47
+    // This is an optimized version of scanPosLast that removes the read-modify-write dependency; once integrated into mainline x265 it should replace the reference implementation.
48
+    // For clarity, the original reference code is kept in the comments below.
49
+    int scanPosLast = 0;
50
+
51
+    uint16_t cSign = 0;
52
+    uint16_t cFlag = 0;
53
+    uint8_t cNum = 0;
54
+
55
+    uint32_t prevcgIdx = 0;
56
+    do
57
+    {
58
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
59
+
60
+        const uint32_t posLast = scan[scanPosLast];
61
+
62
+        const int curCoeff = coeff[posLast];
63
+        const uint32_t isNZCoeff = (curCoeff != 0);
64
+        /*
65
+        NOTE: the new algorithm is complicated, so I keep reference code here
66
+        uint32_t posy   = posLast >> log2TrSize;
67
+        uint32_t posx   = posLast - (posy << log2TrSize);
68
+        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
69
+        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
70
+        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
71
+        */
72
+
73
+        // get L1 sig map
74
+        numSig -= isNZCoeff;
75
+
76
+        if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
77
+        {
78
+            coeffSign[prevcgIdx] = cSign;
79
+            coeffFlag[prevcgIdx] = cFlag;
80
+            coeffNum[prevcgIdx] = cNum;
81
+            cSign = 0;
82
+            cFlag = 0;
83
+            cNum = 0;
84
+        }
85
+        // TODO: optimize by instruction BTS
86
+        cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
87
+        cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
88
+        cNum += (uint8_t)isNZCoeff;
89
+        prevcgIdx = cgIdx;
90
+        scanPosLast++;
91
+    }
92
+    while (numSig > 0);
93
+
94
+    coeffSign[prevcgIdx] = cSign;
95
+    coeffFlag[prevcgIdx] = cFlag;
96
+    coeffNum[prevcgIdx] = cNum;
97
+    return scanPosLast - 1;
98
+}
99
+
100
+
101
+#if (MLS_CG_SIZE == 4)
102
+template<int log2TrSize>
103
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
104
+                                int64_t *totalRdCost, uint32_t blkPos)
105
+{
106
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
107
+                               log2TrSize; /* Represents scaling through forward transform */
108
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
109
+    const uint32_t trSize = 1 << log2TrSize;
110
+
111
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
112
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
113
+    for (int y = 0; y < MLS_CG_SIZE; y++)
114
+    {
115
+        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
116
+        int32x4_t mul = vmull_s16(in, in);
117
+        int64x2_t cost0, cost1;
118
+        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
119
+        cost1 = vshll_high_n_s32(mul, scaleBits);
120
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
121
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
122
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
123
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
124
+        blkPos += trSize;
125
+    }
126
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
127
+    *totalUncodedCost += sum;
128
+    *totalRdCost += sum;
129
+}
130
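+// Editor's note: scalar equivalent of the loop above (sketch), for each of
+// the 16 coefficients of the coding group:
+//     int64_t signCoef = m_resiDctCoeff[blkPos + x];
+//     costUncoded[blkPos + x] = (signCoef * signCoef) << scaleBits;
+// with every cost also accumulated into *totalUncodedCost and *totalRdCost.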
+
131
+template<int log2TrSize>
132
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
133
+                             int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
134
+{
135
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
136
+                               log2TrSize; /* Represents scaling through forward transform */
137
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
138
+    const uint32_t trSize = 1 << log2TrSize;
139
+    // using the preprocessor (X265_MAX) to work around a clang bug
140
+    const int max = X265_MAX(0, (2 * transformShift + 1));
141
+
142
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
143
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
144
+    int32x4_t vpsy = vdupq_n_s32(*psyScale);
145
+    for (int y = 0; y < MLS_CG_SIZE; y++)
146
+    {
147
+        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
148
+        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
149
+        int64x2_t cost0, cost1;
150
+        cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
151
+        cost1 = vmull_high_s32(signCoef, signCoef);
152
+        cost0 = vshlq_n_s64(cost0, scaleBits);
153
+        cost1 = vshlq_n_s64(cost1, scaleBits);
154
+        int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
155
+        int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
156
+        if (max > 0)
157
+        {
158
+            int64x2_t shift = vdupq_n_s64(-max);
159
+            neg0 = vshlq_s64(neg0, shift);
160
+            neg1 = vshlq_s64(neg1, shift);
161
+        }
162
+        cost0 = vsubq_s64(cost0, neg0);
163
+        cost1 = vsubq_s64(cost1, neg1);
164
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
165
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
166
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
167
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
168
+
169
+        blkPos += trSize;
170
+    }
171
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
172
+    *totalUncodedCost += sum;
173
+    *totalRdCost += sum;
174
+}
175
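+// Editor's note: scalar equivalent of the psy-rd path (sketch):
+//     int64_t signCoef      = m_resiDctCoeff[blkPos + x];
+//     int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef;
+//     int64_t cost = ((signCoef * signCoef) << scaleBits)
+//                  - ((*psyScale * predictedCoef) >> max);  // shift only if max > 0
+//     costUncoded[blkPos + x] = cost;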
+
176
+#else
177
+#error "MLS_CG_SIZE must be 4 for neon version"
178
+#endif
179
+
180
+
181
+
182
+template<int trSize>
183
+int  count_nonzero_neon(const int16_t *quantCoeff)
184
+{
185
+    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
186
+    int count = 0;
187
+    int16x8_t vcount = vdupq_n_s16(0);
188
+    const int numCoeff = trSize * trSize;
189
+    int i = 0;
190
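+    // vtstq_s16 yields 0xffff (-1) in each lane whose element is nonzero,
+    // so vcount accumulates minus the SIMD-lane nonzero count; it is folded
+    // back in by the subtraction at the return.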
+    for (; (i + 8) <= numCoeff; i += 8)
191
+    {
192
+        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
193
+        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
194
+    }
195
+    for (; i < numCoeff; i++)
196
+    {
197
+        count += quantCoeff[i] != 0;
198
+    }
199
+
200
+    return count - vaddvq_s16(vcount);
201
+}
202
+
203
+template<int trSize>
204
+uint32_t copy_count_neon(int16_t *coeff, const int16_t *residual, intptr_t resiStride)
205
+{
206
+    uint32_t numSig = 0;
207
+    int16x8_t vcount = vdupq_n_s16(0);
208
+    for (int k = 0; k < trSize; k++)
209
+    {
210
+        int j = 0;
211
+        for (; (j + 8) <= trSize; j += 8)
212
+        {
213
+            int16x8_t in = *(int16x8_t *)&residual[j];
214
+            *(int16x8_t *)&coeff[j] = in;
215
+            vcount = vaddq_s16(vcount, vtstq_s16(in, in));
216
+        }
217
+        for (; j < trSize; j++)
218
+        {
219
+            coeff[j] = residual[j];
220
+            numSig += (residual[j] != 0);
221
+        }
222
+        residual += resiStride;
223
+        coeff += trSize;
224
+    }
225
+
226
+    return numSig - vaddvq_s16(vcount);
227
+}
228
+
229
+
230
+static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line)
231
+{
232
+    int j, k;
233
+    int32x4_t E[2], O[2];
234
+    int32x4_t EE, EO;
235
+    int32x2_t EEE, EEO;
236
+    const int add = 1 << (shift - 1);
237
+    const int32x4_t _vadd = {add, 0};
238
+
239
+    for (j = 0; j < line; j++)
240
+    {
241
+        int16x8_t in0 = *(int16x8_t *)src;
242
+        int16x8_t in1 = rev16(*(int16x8_t *)&src8);
243
+
244
+        E0 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
245
+        O0 = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1));
246
+        E1 = vaddl_high_s16(in0, in1);
247
+        O1 = vsubl_high_s16(in0, in1);
248
+
249
+        for (k = 1; k < 16; k += 2)
250
+        {
251
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
252
+            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16k4);
253
+
254
+            int32x4_t res = _vadd;
255
+            res = vmlaq_s32(res, c0, O0);
256
+            res = vmlaq_s32(res, c1, O1);
257
+            dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
258
+        }
259
+
260
+        /* EE and EO */
261
+        EE = vaddq_s32(E0, rev32(E1));
262
+        EO = vsubq_s32(E0, rev32(E1));
263
+
264
+        for (k = 2; k < 16; k += 4)
265
+        {
266
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
267
+            int32x4_t res = _vadd;
268
+            res = vmlaq_s32(res, c0, EO);
269
+            dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
270
+        }
271
+
272
+        /* EEE and EEO */
273
+        EEE0 = EE0 + EE3;
274
+        EEO0 = EE0 - EE3;
275
+        EEE1 = EE1 + EE2;
276
+        EEO1 = EE1 - EE2;
277
+
278
+        dst0 = (int16_t)((g_t1600 * EEE0 + g_t1601 * EEE1 + add) >> shift);
279
+        dst8 * line = (int16_t)((g_t1680 * EEE0 + g_t1681 * EEE1 + add) >> shift);
280
+        dst4 * line = (int16_t)((g_t1640 * EEO0 + g_t1641 * EEO1 + add) >> shift);
281
+        dst12 * line = (int16_t)((g_t16120 * EEO0 + g_t16121 * EEO1 + add) >> shift);
282
+
283
+
284
+        src += 16;
285
+        dst++;
286
+    }
287
+}
288
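+
+/* The partial butterflies above implement the standard HEVC even/odd
+ * decomposition: with one input row x[0..15],
+ *   E[i] = x[i] + x[15 - i],  O[i] = x[i] - x[15 - i]   (i = 0..7)
+ * the odd-index output rows are dot products of O[] with rows of g_t16,
+ * and the even half recurses into EE/EO and the 2-point EEE/EEO stage,
+ * halving the number of multiplications at every level. */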
+
+
+static void partialButterfly32(const int16_t *src, int16_t *dst, int shift, int line)
+{
+    int j, k;
+    const int add = 1 << (shift - 1);
+
+
+    for (j = 0; j < line; j++)
+    {
+        int32x4_t VE[4], VO0, VO1, VO2, VO3;
+        int32x4_t VEE[2], VEO[2];
+        int32x4_t VEEE, VEEO;
+        int EEEE[2], EEEO[2];
+
+        int16x8x4_t inputs;
+        inputs = *(int16x8x4_t *)&src[0];
+        int16x8x4_t in_rev;
+
+        in_rev.val[1] = rev16(inputs.val[2]);
+        in_rev.val[0] = rev16(inputs.val[3]);
+
+        VE[0] = vaddl_s16(vget_low_s16(inputs.val[0]), vget_low_s16(in_rev.val[0]));
+        VE[1] = vaddl_high_s16(inputs.val[0], in_rev.val[0]);
+        VO0 = vsubl_s16(vget_low_s16(inputs.val[0]), vget_low_s16(in_rev.val[0]));
+        VO1 = vsubl_high_s16(inputs.val[0], in_rev.val[0]);
+        VE[2] = vaddl_s16(vget_low_s16(inputs.val[1]), vget_low_s16(in_rev.val[1]));
+        VE[3] = vaddl_high_s16(inputs.val[1], in_rev.val[1]);
+        VO2 = vsubl_s16(vget_low_s16(inputs.val[1]), vget_low_s16(in_rev.val[1]));
+        VO3 = vsubl_high_s16(inputs.val[1], in_rev.val[1]);
+
+        for (k = 1; k < 32; k += 2)
+        {
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
+            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32[k][4]);
+            int32x4_t c2 = vmovl_s16(*(int16x4_t *)&g_t32[k][8]);
+            int32x4_t c3 = vmovl_s16(*(int16x4_t *)&g_t32[k][12]);
+            int32x4_t s = vmulq_s32(c0, VO0);
+            s = vmlaq_s32(s, c1, VO1);
+            s = vmlaq_s32(s, c2, VO2);
+            s = vmlaq_s32(s, c3, VO3);
+
+            dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
+
+        }
+
+        int32x4_t rev_VE[2];
+
+
+        rev_VE[0] = rev32(VE[3]);
+        rev_VE[1] = rev32(VE[2]);
+
+        /* EE and EO */
+        for (k = 0; k < 2; k++)
+        {
+            VEE[k] = vaddq_s32(VE[k], rev_VE[k]);
+            VEO[k] = vsubq_s32(VE[k], rev_VE[k]);
+        }
+        for (k = 2; k < 32; k += 4)
+        {
+            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
+            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32[k][4]);
+            int32x4_t s = vmulq_s32(c0, VEO[0]);
+            s = vmlaq_s32(s, c1, VEO[1]);
+
+            dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
+
+        }
+
+        int32x4_t tmp = rev32(VEE[1]);
+        VEEE = vaddq_s32(VEE[0], tmp);
+        VEEO = vsubq_s32(VEE[0], tmp);
+        for (k = 4; k < 32; k += 8)
+        {
+            int32x4_t c = vmovl_s16(*(int16x4_t *)&g_t32[k][0]);
+            int32x4_t s = vmulq_s32(c, VEEO);
+
+            dst[k * line] = (int16_t)((vaddvq_s32(s) + add) >> shift);
+        }
+
+        /* EEEE and EEEO */
+        EEEE[0] = VEEE[0] + VEEE[3];
+        EEEO[0] = VEEE[0] - VEEE[3];
+        EEEE[1] = VEEE[1] + VEEE[2];
+        EEEO[1] = VEEE[1] - VEEE[2];
+
+        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
+        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
+        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
+        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
+
+
+
+        src += 32;
+        dst++;
+    }
+}
+
+static void partialButterfly8(const int16_t *src, int16_t *dst, int shift, int line)
+{
+    int j, k;
+    int E[4], O[4];
+    int EE[2], EO[2];
+    int add = 1 << (shift - 1);
+
+    for (j = 0; j < line; j++)
+    {
+        /* E and O*/
+        for (k = 0; k < 4; k++)
+        {
+            E[k] = src[k] + src[7 - k];
+            O[k] = src[k] - src[7 - k];
+        }
+
+        /* EE and EO */
+        EE[0] = E[0] + E[3];
+        EO[0] = E[0] - E[3];
+        EE[1] = E[1] + E[2];
+        EO[1] = E[1] - E[2];
+
+        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
+        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
+        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
+        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
+
+        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
+        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >>
+                                  shift);
+        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >>
+                                  shift);
+        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >>
+                                  shift);
+
+        src += 8;
+        dst++;
+    }
+}
+
+static void partialButterflyInverse4(const int16_t *src, int16_t *dst, int shift, int line)
+{
+    int j;
+    int E[2], O[2];
+    int add = 1 << (shift - 1);
+
+    for (j = 0; j < line; j++)
+    {
+        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
+        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
+        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
+        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
+
+        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
+        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
+        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
+        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
+        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
+
+        src++;
+        dst += 4;
+    }
+}
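+
+/* The symmetry comment is concrete here: a direct 4-point inverse transform
+ * of one column takes 16 multiplications, while building E[] from the even
+ * rows (0, 2) and O[] from the odd rows (1, 3) and combining them as the
+ * sums and differences above needs only 8. */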
+
+
+
+static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
+{
+#define FMAK(x,l) s[l] = vmlal_lane_s16(s[l],*(int16x4_t*)&src[(x)*line],*(int16x4_t *)&g_t16[x][k],l)
+#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&src[x*line],*(int16x4_t *)&g_t16[x][k],l);
+#define ODD3_15(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);
+#define EVEN6_14_STEP4(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);
+
+
+    int j, k;
+    int32x4_t E[8], O[8];
+    int32x4_t EE[4], EO[4];
+    int32x4_t EEE[2], EEO[2];
+    const int add = 1 << (shift - 1);
+
+
+#pragma unroll(4)
+    for (j = 0; j < line; j += 4)
+    {
+        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+
+#pragma unroll(2)
+        for (k = 0; k < 2; k++)
+        {
+            int32x4_t s;
+            s = vmull_s16(vdup_n_s16(g_t16[4][k]), *(int16x4_t *)&src[4 * line]);;
+            EEO[k] = vmlal_s16(s, vdup_n_s16(g_t16[12][k]), *(int16x4_t *)&src[(12) * line]);
+            s = vmull_s16(vdup_n_s16(g_t16[0][k]), *(int16x4_t *)&src[0 * line]);;
+            EEE[k] = vmlal_s16(s, vdup_n_s16(g_t16[8][k]), *(int16x4_t *)&src[(8) * line]);
+        }
+
+        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
+        EE[0] = vaddq_s32(EEE[0] , EEO[0]);
+        EE[2] = vsubq_s32(EEE[1] , EEO[1]);
+        EE[1] = vaddq_s32(EEE[1] , EEO[1]);
+        EE[3] = vsubq_s32(EEE[0] , EEO[0]);
+
+
+#pragma unroll(1)
+        for (k = 0; k < 4; k += 4)
+        {
+            int32x4_t s[4];
+            s[0] = MULK(2, 0);
+            s[1] = MULK(2, 1);
+            s[2] = MULK(2, 2);
+            s[3] = MULK(2, 3);
+
+            EVEN6_14_STEP4(0);
+            EVEN6_14_STEP4(1);
+            EVEN6_14_STEP4(2);
+            EVEN6_14_STEP4(3);
+
+            EO[k] = s[0];
+            EO[k + 1] = s[1];
+            EO[k + 2] = s[2];
+            EO[k + 3] = s[3];
+        }
+
+
+
+        static const int32x4_t min = vdupq_n_s32(-32768);
+        static const int32x4_t max = vdupq_n_s32(32767);
+        const int32x4_t minus_shift = vdupq_n_s32(-shift);
+
+#pragma unroll(4)
+        for (k = 0; k < 4; k++)
+        {
+            E[k] = vaddq_s32(EE[k] , EO[k]);
+            E[k + 4] = vsubq_s32(EE[3 - k] , EO[3 - k]);
+        }
+
+#pragma unroll(2)
+        for (k = 0; k < 8; k += 4)
+        {
+            int32x4_t s[4];
+            s[0] = MULK(1, 0);
+            s[1] = MULK(1, 1);
+            s[2] = MULK(1, 2);
+            s[3] = MULK(1, 3);
+            ODD3_15(0);
+            ODD3_15(1);
+            ODD3_15(2);
+            ODD3_15(3);
+            O[k] = s[0];
+            O[k + 1] = s[1];
+            O[k + 2] = s[2];
+            O[k + 3] = s[3];
+            int32x4_t t;
+            int16x4_t x0, x1, x2, x3;
+
+            E[k] = vaddq_s32(vdupq_n_s32(add), E[k]);
+            t = vaddq_s32(E[k], O[k]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x0 = vmovn_s32(t);
+
+            E[k + 1] = vaddq_s32(vdupq_n_s32(add), E[k + 1]);
+            t = vaddq_s32(E[k + 1], O[k + 1]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x1 = vmovn_s32(t);
+
+            E[k + 2] = vaddq_s32(vdupq_n_s32(add), E[k + 2]);
+            t = vaddq_s32(E[k + 2], O[k + 2]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x2 = vmovn_s32(t);
+
+            E[k + 3] = vaddq_s32(vdupq_n_s32(add), E[k + 3]);
+            t = vaddq_s32(E[k + 3], O[k + 3]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x3 = vmovn_s32(t);
+
+            transpose_4x4x16(x0, x1, x2, x3);
+            *(int16x4_t *)&orig_dst[0 * 16 + k] = x0;
+            *(int16x4_t *)&orig_dst[1 * 16 + k] = x1;
+            *(int16x4_t *)&orig_dst[2 * 16 + k] = x2;
+            *(int16x4_t *)&orig_dst[3 * 16 + k] = x3;
+        }
+
+
+#pragma unroll(2)
+        for (k = 0; k < 8; k += 4)
+        {
+            int32x4_t t;
+            int16x4_t x0, x1, x2, x3;
+
+            t = vsubq_s32(E[7 - k], O[7 - k]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x0 = vmovn_s32(t);
+
+            t = vsubq_s32(E[6 - k], O[6 - k]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x1 = vmovn_s32(t);
+
+            t = vsubq_s32(E[5 - k], O[5 - k]);
+
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x2 = vmovn_s32(t);
+
+            t = vsubq_s32(E[4 - k], O[4 - k]);
+            t = vshlq_s32(t, minus_shift);
+            t = vmaxq_s32(t, min);
+            t = vminq_s32(t, max);
+            x3 = vmovn_s32(t);
+
+            transpose_4x4x16(x0, x1, x2, x3);
+            *(int16x4_t *)&orig_dst[0 * 16 + k + 8] = x0;
+            *(int16x4_t *)&orig_dst[1 * 16 + k + 8] = x1;
+            *(int16x4_t *)&orig_dst[2 * 16 + k + 8] = x2;
+            *(int16x4_t *)&orig_dst[3 * 16 + k + 8] = x3;
+        }
+        orig_dst += 4 * 16;
+        src += 4;
+    }
+
+#undef MUL
+#undef FMA
+#undef FMAK
+#undef MULK
+#undef ODD3_15
+#undef EVEN6_14_STEP4
+
+
+}
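+
+/* Layout note: the loop above consumes four transform columns per iteration
+ * (j += 4), clips each 4x4 tile of results to the int16 range, and relies on
+ * transpose_4x4x16 so the column-wise results land row-major in orig_dst
+ * (row r of a tile at orig_dst[r * 16 + k]). */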
+
+
+
+static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
+{
+#define MUL(x) vmull_s16(vdup_n_s16(g_t32[x][k]),*(int16x4_t*)&src[x*line]);
+#define FMA(x) s = vmlal_s16(s,vdup_n_s16(g_t32[x][k]),*(int16x4_t*)&src[(x)*line])
+#define FMAK(x,l) s[l] = vmlal_lane_s16(s[l],*(int16x4_t*)&src[(x)*line],*(int16x4_t *)&g_t32[x][k],l)
+#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&src[x*line],*(int16x4_t *)&g_t32[x][k],l);
+#define ODD31(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);FMAK(17,k);FMAK(19,k);FMAK(21,k);FMAK(23,k);FMAK(25,k);FMAK(27,k);FMAK(29,k);FMAK(31,k);
+
+#define ODD15(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);FMAK(18,k);FMAK(22,k);FMAK(26,k);FMAK(30,k);
+#define ODD7(k) FMAK(12,k);FMAK(20,k);FMAK(28,k);
+
+
+    int j, k;
+    int32x4_t E[16], O[16];
+    int32x4_t EE[8], EO[8];
+    int32x4_t EEE[4], EEO[4];
+    int32x4_t EEEE[2], EEEO[2];
+    int16x4_t dst[32];
+    int add = 1 << (shift - 1);
+
+#pragma unroll (8)
+    for (j = 0; j < line; j += 4)
+    {
+#pragma unroll (4)
+        for (k = 0; k < 16; k += 4)
+        {
+            int32x4_t s[4];
+            s[0] = MULK(1, 0);
+            s[1] = MULK(1, 1);
+            s[2] = MULK(1, 2);
+            s[3] = MULK(1, 3);
+            ODD31(0);
+            ODD31(1);
+            ODD31(2);
+            ODD31(3);
+            O[k] = s[0];
+            O[k + 1] = s[1];
+            O[k + 2] = s[2];
+            O[k + 3] = s[3];
+
+
+        }
+
+
+#pragma unroll (2)
+        for (k = 0; k < 8; k += 4)
+        {
+            int32x4_t s[4];
+            s[0] = MULK(2, 0);
+            s[1] = MULK(2, 1);
+            s[2] = MULK(2, 2);
+            s[3] = MULK(2, 3);
+
+            ODD15(0);
+            ODD15(1);
+            ODD15(2);
+            ODD15(3);
+
+            EO[k] = s[0];
+            EO[k + 1] = s[1];
+            EO[k + 2] = s[2];
+            EO[k + 3] = s[3];
+        }
+
+
+        for (k = 0; k < 4; k += 4)
+        {
+            int32x4_t s[4];
+            s[0] = MULK(4, 0);
+            s[1] = MULK(4, 1);
+            s[2] = MULK(4, 2);
+            s[3] = MULK(4, 3);
+
+            ODD7(0);
+            ODD7(1);
+            ODD7(2);
+            ODD7(3);
+
+            EEO[k] = s[0];
+            EEO[k + 1] = s[1];
+            EEO[k + 2] = s[2];
+            EEO[k + 3] = s[3];
+        }
+
+#pragma unroll (2)
+        for (k = 0; k < 2; k++)
+        {
+            int32x4_t s;
+            s = MUL(8);
+            EEEO[k] = FMA(24);
+            s = MUL(0);
+            EEEE[k] = FMA(16);
+        }
+        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
+        EEE[0] = vaddq_s32(EEEE[0], EEEO[0]);
+        EEE[3] = vsubq_s32(EEEE[0], EEEO[0]);
+        EEE[1] = vaddq_s32(EEEE[1], EEEO[1]);
+        EEE[2] = vsubq_s32(EEEE[1], EEEO[1]);
+
+#pragma unroll (4)
+        for (k = 0; k < 4; k++)
+        {
+            EE[k] = vaddq_s32(EEE[k], EEO[k]);
+            EE[k + 4] = vsubq_s32((EEE[3 - k]), (EEO[3 - k]));
+        }
+
+#pragma unroll (8)
+        for (k = 0; k < 8; k++)
+        {
+            E[k] = vaddq_s32(EE[k], EO[k]);
+            E[k + 8] = vsubq_s32((EE[7 - k]), (EO[7 - k]));
+        }
+
+        static const int32x4_t min = vdupq_n_s32(-32768);
+        static const int32x4_t max = vdupq_n_s32(32767);
+
+
+
+#pragma unroll (16)
+        for (k = 0; k < 16; k++)
+        {
+            int32x4_t adde = vaddq_s32(vdupq_n_s32(add), E[k]);
+            int32x4_t s = vaddq_s32(adde, O[k]);
+            s = vshlq_s32(s, vdupq_n_s32(-shift));
+            s = vmaxq_s32(s, min);
+            s = vminq_s32(s, max);
+
+
+
+            dst[k] = vmovn_s32(s);
+            adde = vaddq_s32(vdupq_n_s32(add), (E[15 - k]));
+            s  = vsubq_s32(adde, (O[15 - k]));
+            s = vshlq_s32(s, vdupq_n_s32(-shift));
+            s = vmaxq_s32(s, min);
+            s = vminq_s32(s, max);
+
+            dst[k + 16] = vmovn_s32(s);
+        }
+
+
+#pragma unroll (8)
+        for (k = 0; k < 32; k += 4)
+        {
+            int16x4_t x0 = dst[k + 0];
+            int16x4_t x1 = dst[k + 1];
+            int16x4_t x2 = dst[k + 2];
+            int16x4_t x3 = dst[k + 3];
+            transpose_4x4x16(x0, x1, x2, x3);
+            *(int16x4_t *)&orig_dst[0 * 32 + k] = x0;
+            *(int16x4_t *)&orig_dst[1 * 32 + k] = x1;
+            *(int16x4_t *)&orig_dst[2 * 32 + k] = x2;
+            *(int16x4_t *)&orig_dst[3 * 32 + k] = x3;
+        }
+        orig_dst += 4 * 32;
+        src += 4;
+    }
+#undef MUL
+#undef FMA
+#undef FMAK
+#undef MULK
+#undef ODD31
+#undef ODD15
+#undef ODD7
+
+}
+
+
+static void dct8_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
+{
+    const int shift_1st = 2 + X265_DEPTH - 8;
+    const int shift_2nd = 9;
+
+    ALIGN_VAR_32(int16_t, coef[8 * 8]);
+    ALIGN_VAR_32(int16_t, block[8 * 8]);
+
+    for (int i = 0; i < 8; i++)
+    {
+        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
+    }
+
+    partialButterfly8(block, coef, shift_1st, 8);
+    partialButterfly8(coef, dst, shift_2nd, 8);
+}
+
+static void dct16_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
+{
+    const int shift_1st = 3 + X265_DEPTH - 8;
+    const int shift_2nd = 10;
+
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
+    ALIGN_VAR_32(int16_t, block[16 * 16]);
+
+    for (int i = 0; i < 16; i++)
+    {
+        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
+    }
+
+    partialButterfly16(block, coef, shift_1st, 16);
+    partialButterfly16(coef, dst, shift_2nd, 16);
+}
+
+static void dct32_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
+{
+    const int shift_1st = 4 + X265_DEPTH - 8;
+    const int shift_2nd = 11;
+
+    ALIGN_VAR_32(int16_t, coef[32 * 32]);
+    ALIGN_VAR_32(int16_t, block[32 * 32]);
+
+    for (int i = 0; i < 32; i++)
+    {
+        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
+    }
+
+    partialButterfly32(block, coef, shift_1st, 32);
+    partialButterfly32(coef, dst, shift_2nd, 32);
+}
+
+static void idct4_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
+{
+    const int shift_1st = 7;
+    const int shift_2nd = 12 - (X265_DEPTH - 8);
+
+    ALIGN_VAR_32(int16_t, coef[4 * 4]);
+    ALIGN_VAR_32(int16_t, block[4 * 4]);
+
+    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
+    partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
+
+    for (int i = 0; i < 4; i++)
+    {
+        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
+    }
+}
+
+static void idct16_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
+{
+    const int shift_1st = 7;
+    const int shift_2nd = 12 - (X265_DEPTH - 8);
+
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
+    ALIGN_VAR_32(int16_t, block[16 * 16]);
+
+    partialButterflyInverse16_neon(src, coef, shift_1st, 16);
+    partialButterflyInverse16_neon(coef, block, shift_2nd, 16);
+
+    for (int i = 0; i < 16; i++)
+    {
+        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
+    }
+}
+
+static void idct32_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
+{
+    const int shift_1st = 7;
+    const int shift_2nd = 12 - (X265_DEPTH - 8);
+
+    ALIGN_VAR_32(int16_t, coef[32 * 32]);
+    ALIGN_VAR_32(int16_t, block[32 * 32]);
+
+    partialButterflyInverse32_neon(src, coef, shift_1st, 32);
+    partialButterflyInverse32_neon(coef, block, shift_2nd, 32);
+
+    for (int i = 0; i < 32; i++)
+    {
+        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
+    }
+}
+
+
+
+}
+
+namespace X265_NS
+{
+// x265 private namespace
+void setupDCTPrimitives_neon(EncoderPrimitives &p)
+{
+    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_neon<2>;
+    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_neon<3>;
+    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_neon<4>;
+    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_neon<5>;
+    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_neon<2>;
+    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_neon<3>;
+    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;
+    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;
+    p.cu[BLOCK_8x8].dct   = dct8_neon;
+    p.cu[BLOCK_16x16].dct = dct16_neon;
+    p.cu[BLOCK_32x32].dct = dct32_neon;
+    p.cu[BLOCK_4x4].idct   = idct4_neon;
+    p.cu[BLOCK_16x16].idct = idct16_neon;
+    p.cu[BLOCK_32x32].idct = idct32_neon;
+    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_neon<4>;
+    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_neon<8>;
+    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_neon<16>;
+    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_neon<32>;
+
+    p.cu[BLOCK_4x4].copy_cnt   = copy_count_neon<4>;
+    p.cu[BLOCK_8x8].copy_cnt   = copy_count_neon<8>;
+    p.cu[BLOCK_16x16].copy_cnt = copy_count_neon<16>;
+    p.cu[BLOCK_32x32].copy_cnt = copy_count_neon<32>;
+    p.cu[BLOCK_4x4].psyRdoQuant_1p = nonPsyRdoQuant_neon<2>;
+    p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_neon<2>;
+    p.cu[BLOCK_8x8].psyRdoQuant_1p = nonPsyRdoQuant_neon<3>;
+    p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_neon<3>;
+    p.cu[BLOCK_16x16].psyRdoQuant_1p = nonPsyRdoQuant_neon<4>;
+    p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_neon<4>;
+    p.cu[BLOCK_32x32].psyRdoQuant_1p = nonPsyRdoQuant_neon<5>;
+    p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
+
+    p.scanPosLast  = scanPosLast_opt;
+
+}
+
+};
+
+
+
+#endif
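(setupDCTPrimitives_neon only fills entries of the EncoderPrimitives dispatch table; callers always go through that table, so e.g. p.cu[BLOCK_8x8].dct(block, coeffs, srcStride) resolves to dct8_neon once this setup has run.)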
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h Added
 
@@ -0,0 +1,19 @@
+#ifndef __DCT_PRIM_NEON_H__
+#define __DCT_PRIM_NEON_H__
+
+
+#include "common.h"
+#include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
+#include "threading.h"  // CLZ
+
+namespace X265_NS
+{
+// x265 private namespace
+void setupDCTPrimitives_neon(EncoderPrimitives &p);
+};
+
+
+
+#endif
+
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp Added
 
@@ -0,0 +1,995 @@
+#if HAVE_NEON
+
+#include "filter-prim.h"
+#include <arm_neon.h>
+
+namespace
+{
+
+using namespace X265_NS;
+
+
+template<int width, int height>
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+{
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+    int row, col;
+    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
+    for (row = 0; row < height; row++)
+    {
+
+        for (col = 0; col < width; col += 8)
+        {
+            int16x8_t in;
+
+#if HIGH_BIT_DEPTH
+            in = *(int16x8_t *)&src[col];
+#else
+            in = vmovl_u8(*(uint8x8_t *)&src[col]);
+#endif
+
+            int16x8_t tmp = vshlq_n_s16(in, shift);
+            tmp = vsubq_s16(tmp, off);
+            *(int16x8_t *)&dst[col] = tmp;
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
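+
+/* Per sample this is the plain pixel-to-short conversion used by all the
+ * interpolation filters below (scalar sketch):
+ *   dst[col] = (int16_t)((src[col] << shift) - IF_INTERNAL_OFFS);
+ * with shift = IF_INTERNAL_PREC - X265_DEPTH. */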
+
+
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int headRoom = IF_FILTER_PREC;
+    int offset = (1 << (headRoom - 1));
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
+    int cStride = 1;
+
+    src -= (N / 2 - 1) * cStride;
+    int16x8_t vc;
+    vc = *(int16x8_t *)coeff;
+    int16x4_t low_vc = vget_low_s16(vc);
+    int16x4_t high_vc = vget_high_s16(vc);
+
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-headRoom);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum1, vsum2;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+#if HIGH_BIT_DEPTH
+                input[i] = *(int16x8_t *)&src[col + i];
+#else
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+#endif
+            }
+            vsum1 = voffset;
+            vsum2 = voffset;
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+
+            }
+
+            vsum1 = vshlq_s32(vsum1, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
+#if HIGH_BIT_DEPTH
+            *(int16x8_t *)&dst[col] = vsum;
+#else
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+#endif
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
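+
+/* This function shows the pattern every filter in this file follows: the N
+ * taps stay in one vector (vc) and are applied with lane-indexed multiply-
+ * accumulates, so no per-tap coefficient reloads are needed. Scalar sketch
+ * of one output pixel:
+ *   sum = offset;
+ *   for (i = 0; i < N; i++)
+ *       sum += coeff[i] * src[col + i];
+ *   dst[col] = clip(sum >> IF_FILTER_PREC, 0, (1 << X265_DEPTH) - 1);
+ */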
+
+#if HIGH_BIT_DEPTH
+
+template<int N, int width, int height>
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
+                          int isRowExt)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    const int shift = IF_FILTER_PREC - headRoom;
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
+
+    int blkheight = height;
+    src -= N / 2 - 1;
+
+    if (isRowExt)
+    {
+        src -= (N / 2 - 1) * srcStride;
+        blkheight += N - 1;
+    }
+    int16x8_t vc3 = vld1q_s16(coeff);
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < blkheight; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum, vsum2;
+
+            int16x8_t input[N];
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vld1q_s16((int16_t *)&src[col + i]);
+            }
+
+            vsum = voffset;
+            vsum2 = voffset;
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[0]), vget_low_s16(vc3), 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[1]), vget_low_s16(vc3), 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[2]), vget_low_s16(vc3), 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[3]), vget_low_s16(vc3), 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
+
+            if (N == 8)
+            {
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[4]), vget_high_s16(vc3), 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], vget_high_s16(vc3), 0);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[5]), vget_high_s16(vc3), 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], vget_high_s16(vc3), 1);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[6]), vget_high_s16(vc3), 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], vget_high_s16(vc3), 2);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[7]), vget_high_s16(vc3), 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], vget_high_s16(vc3), 3);
+            }
+
+            vsum = vshlq_s32(vsum, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+            *(int16x4_t *)&dst[col] = vmovn_u32(vsum);
+            *(int16x4_t *)&dst[col + 4] = vmovn_u32(vsum2);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+
+#else
+
+template<int N, int width, int height>
+void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
+                          int isRowExt)
+{
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    const int shift = IF_FILTER_PREC - headRoom;
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
+
+    int blkheight = height;
+    src -= N / 2 - 1;
+
+    if (isRowExt)
+    {
+        src -= (N / 2 - 1) * srcStride;
+        blkheight += N - 1;
+    }
+    int16x8_t vc;
+    vc = *(int16x8_t *)coeff;
+
+    const int16x8_t voffset = vdupq_n_s16(offset);
+    const int16x8_t vhr = vdupq_n_s16(-shift);
+
+    int row, col;
+    for (row = 0; row < blkheight; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int16x8_t vsum;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+            }
+            vsum = voffset;
+            vsum = vmlaq_laneq_s16(vsum, (input[0]), vc, 0);
+            vsum = vmlaq_laneq_s16(vsum, (input[1]), vc, 1);
+            vsum = vmlaq_laneq_s16(vsum, (input[2]), vc, 2);
+            vsum = vmlaq_laneq_s16(vsum, (input[3]), vc, 3);
+
+
+            if (N == 8)
+            {
+                vsum = vmlaq_laneq_s16(vsum, (input[4]), vc, 4);
+                vsum = vmlaq_laneq_s16(vsum, (input[5]), vc, 5);
+                vsum = vmlaq_laneq_s16(vsum, (input[6]), vc, 6);
+                vsum = vmlaq_laneq_s16(vsum, (input[7]), vc, 7);
+
+            }
+
+            vsum = vshlq_s16(vsum, vhr);
+            *(int16x8_t *)&dst[col] = vsum;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+#endif
+
+
+
+template<int N, int width, int height>
+void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int16_t *c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+    int shift = IF_FILTER_PREC;
+    src -= (N / 2 - 1) * srcStride;
+    int16x8_t vc;
+    vc = *(int16x8_t *)c;
+    int16x4_t low_vc = vget_low_s16(vc);
+    int16x4_t high_vc = vget_high_s16(vc);
+
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum1, vsum2;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = *(int16x8_t *)&src[col + i * srcStride];
+            }
+
+            vsum1 = vmull_lane_s16(vget_low_s16(input[0]), low_vc, 0);
+            vsum2 = vmull_high_lane_s16(input[0], low_vc, 0);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+
+            }
+
+            vsum1 = vshlq_s32(vsum1, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+            *(int16x8_t *)&dst[col] = vsum;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+
+}
+
+
+#if HIGH_BIT_DEPTH
+
+template<int N, int width, int height>
+void interp_vert_pp_neon(const uint16_t *src, intptr_t srcStride, uint16_t *dst, intptr_t dstStride, int coeffIdx)
+{
+
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int shift = IF_FILTER_PREC;
+    int offset = 1 << (shift - 1);
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
+
+    src -= (N / 2 - 1) * srcStride;
+    int16x8_t vc;
+    vc = *(int16x8_t *)c;
+    int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
+    int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
+
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 4)
+        {
+            int32x4_t vsum;
+
+            int32x4_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vmovl_u16(*(uint16x4_t *)&src[col + i * srcStride]);
+            }
+            vsum = voffset;
+
+            vsum = vmlaq_laneq_s32(vsum, (input[0]), low_vc, 0);
+            vsum = vmlaq_laneq_s32(vsum, (input[1]), low_vc, 1);
+            vsum = vmlaq_laneq_s32(vsum, (input[2]), low_vc, 2);
+            vsum = vmlaq_laneq_s32(vsum, (input[3]), low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum = vmlaq_laneq_s32(vsum, (input[4]), high_vc, 0);
+                vsum = vmlaq_laneq_s32(vsum, (input[5]), high_vc, 1);
+                vsum = vmlaq_laneq_s32(vsum, (input[6]), high_vc, 2);
+                vsum = vmlaq_laneq_s32(vsum, (input[7]), high_vc, 3);
+            }
+
+            vsum = vshlq_s32(vsum, vhr);
+            vsum = vminq_s32(vsum, vdupq_n_s32(maxVal));
+            vsum = vmaxq_s32(vsum, vdupq_n_s32(0));
+            *(uint16x4_t *)&dst[col] = vmovn_u32(vsum);
+        }
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+
+
+
+#else
+
+template<int N, int width, int height>
+void interp_vert_pp_neon(const uint8_t *src, intptr_t srcStride, uint8_t *dst, intptr_t dstStride, int coeffIdx)
+{
+
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int shift = IF_FILTER_PREC;
+    int offset = 1 << (shift - 1);
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
+
+    src -= (N / 2 - 1) * srcStride;
+    int16x8_t vc;
+    vc = *(int16x8_t *)c;
+
+    const int16x8_t voffset = vdupq_n_s16(offset);
+    const int16x8_t vhr = vdupq_n_s16(-shift);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int16x8_t vsum;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i * srcStride]);
+            }
+            vsum = voffset;
+
+            vsum = vmlaq_laneq_s16(vsum, (input[0]), vc, 0);
+            vsum = vmlaq_laneq_s16(vsum, (input[1]), vc, 1);
+            vsum = vmlaq_laneq_s16(vsum, (input[2]), vc, 2);
+            vsum = vmlaq_laneq_s16(vsum, (input[3]), vc, 3);
+
+            if (N == 8)
+            {
+                vsum = vmlaq_laneq_s16(vsum, (input[4]), vc, 4);
+                vsum = vmlaq_laneq_s16(vsum, (input[5]), vc, 5);
+                vsum = vmlaq_laneq_s16(vsum, (input[6]), vc, 6);
+                vsum = vmlaq_laneq_s16(vsum, (input[7]), vc, 7);
+
+            }
+
+            vsum = vshlq_s16(vsum, vhr);
+
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+
+#endif
+
+
+#if HIGH_BIT_DEPTH
+
+template<int N, int width, int height>
+void interp_vert_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    int shift = IF_FILTER_PREC - headRoom;
+    int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
+    src -= (N / 2 - 1) * srcStride;
+
+    int16x8_t vc;
+    vc = *(int16x8_t *)c;
+    int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
+    int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
+
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 4)
+        {
+            int16x8_t vsum;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vmovl_u16(*(uint16x4_t *)&src[col + i * srcStride]);
+            }
+            vsum = voffset;
+
+            vsum = vmlaq_laneq_s32(vsum, (input[0]), low_vc, 0);
+            vsum = vmlaq_laneq_s32(vsum, (input[1]), low_vc, 1);
+            vsum = vmlaq_laneq_s32(vsum, (input[2]), low_vc, 2);
+            vsum = vmlaq_laneq_s32(vsum, (input[3]), low_vc, 3);
+
+            if (N == 8)
+            {
+                int16x8_t  vsum1 = vmulq_laneq_s32((input[4]), high_vc, 0);
+                vsum1 = vmlaq_laneq_s32(vsum1, (input[5]), high_vc, 1);
+                vsum1 = vmlaq_laneq_s32(vsum1, (input[6]), high_vc, 2);
+                vsum1 = vmlaq_laneq_s32(vsum1, (input[7]), high_vc, 3);
+                vsum = vaddq_s32(vsum, vsum1);
+            }
+
+            vsum = vshlq_s32(vsum, vhr);
+
+            *(uint16x4_t *)&dst[col] = vmovn_s32(vsum);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+#else
+
+template<int N, int width, int height>
+void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int16_t *c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    int shift = IF_FILTER_PREC - headRoom;
+    int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
+    src -= (N / 2 - 1) * srcStride;
+
+    int16x8_t vc;
+    vc = *(int16x8_t *)c;
+
+    const int16x8_t voffset = vdupq_n_s16(offset);
+    const int16x8_t vhr = vdupq_n_s16(-shift);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int16x8_t vsum;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i * srcStride]);
+            }
+            vsum = voffset;
+
+            vsum = vmlaq_laneq_s16(vsum, (input[0]), vc, 0);
+            vsum = vmlaq_laneq_s16(vsum, (input[1]), vc, 1);
+            vsum = vmlaq_laneq_s16(vsum, (input[2]), vc, 2);
+            vsum = vmlaq_laneq_s16(vsum, (input[3]), vc, 3);
+
+            if (N == 8)
+            {
+                int16x8_t  vsum1 = vmulq_laneq_s16((input[4]), vc, 4);
+                vsum1 = vmlaq_laneq_s16(vsum1, (input[5]), vc, 5);
+                vsum1 = vmlaq_laneq_s16(vsum1, (input[6]), vc, 6);
+                vsum1 = vmlaq_laneq_s16(vsum1, (input[7]), vc, 7);
+                vsum = vaddq_s16(vsum, vsum1);
+            }
+
+            vsum = vshlq_s32(vsum, vhr);
+            *(int16x8_t *)&dst[col] = vsum;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+#endif
+
+
+
+template<int N, int width, int height>
+void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+{
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    int shift = IF_FILTER_PREC + headRoom;
+    int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
+    src -= (N / 2 - 1) * srcStride;
+
+    int16x8_t vc;
+    vc = *(int16x8_t *)coeff;
+    int16x4_t low_vc = vget_low_s16(vc);
+    int16x4_t high_vc = vget_high_s16(vc);
+
+    const int32x4_t voffset = vdupq_n_s32(offset);
+    const int32x4_t vhr = vdupq_n_s32(-shift);
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col += 8)
+        {
+            int32x4_t vsum1, vsum2;
+
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = *(int16x8_t *)&src[col + i * srcStride];
+            }
+            vsum1 = voffset;
+            vsum2 = voffset;
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
+
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
+
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
+
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+            }
+
+            vsum1 = vshlq_s32(vsum1, vhr);
+            vsum2 = vshlq_s32(vsum2, vhr);
+
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
+#if HIGH_BIT_DEPTH
+            *(int16x8_t *)&dst[col] = vsum;
+#else
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+#endif
+
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+
+
+
+
+
+template<int N, int width, int height>
+void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+{
+    ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
+
+    interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
+    interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
+}
+
+
+
+}
+
+
+
+
+namespace X265_NS
+{
+#if defined(__APPLE__)
+#define CHROMA_420(W, H) \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define CHROMA_FILTER_420(W, H) \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>;
+
+#else // defined(__APPLE__)
+#define CHROMA_420(W, H) \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define CHROMA_FILTER_420(W, H) \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;
+#endif // defined(__APPLE__)
+
+#if defined(__APPLE__)
+#define CHROMA_422(W, H) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define CHROMA_FILTER_422(W, H) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>;
+
+#else // defined(__APPLE__)
+#define CHROMA_422(W, H) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define CHROMA_FILTER_422(W, H) \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;
+#endif // defined(__APPLE__)
+
+#if defined(__APPLE__)
+#define CHROMA_444(W, H) \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define CHROMA_FILTER_444(W, H) \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>;
+
+#else // defined(__APPLE__)
+#define CHROMA_444(W, H) \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define CHROMA_FILTER_444(W, H) \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_neon<4, W, H>; \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_neon<4, W, H>;  \
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_neon<4, W, H>;
+#endif // defined(__APPLE__)
+
+#if defined(__APPLE__)
+#define LUMA(W, H) \
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_neon<8, W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vpp     = interp_vert_pp_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vps     = interp_vert_ps_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_neon<8, W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#else // defined(__APPLE__)
+#define LUMA(W, H) \
+    p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_neon<W, H>;\
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_neon<W, H>;
+
+#define LUMA_FILTER(W, H) \
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_neon<8, W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_vpp     = interp_vert_pp_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vps     = interp_vert_ps_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_neon<8, W, H>;  \
+    p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_neon<8, W, H>;
+#endif // defined(__APPLE__)
+
791
+void setupFilterPrimitives_neon(EncoderPrimitives &p)
792
+{
793
+
794
+    // All NEON functions assume the width is a multiple of 8 (the 2-, 4- and 12-wide variants are not optimized)
795
+
796
+    LUMA(8, 8);
797
+    LUMA(8, 4);
798
+    LUMA(16, 16);
799
+    CHROMA_420(8,  8);
800
+    LUMA(16,  8);
801
+    CHROMA_420(8,  4);
802
+    LUMA(8, 16);
803
+    LUMA(16, 12);
804
+    CHROMA_420(8,  6);
805
+    LUMA(16,  4);
806
+    CHROMA_420(8,  2);
807
+    LUMA(32, 32);
808
+    CHROMA_420(16, 16);
809
+    LUMA(32, 16);
810
+    CHROMA_420(16, 8);
811
+    LUMA(16, 32);
812
+    CHROMA_420(8,  16);
813
+    LUMA(32, 24);
814
+    CHROMA_420(16, 12);
815
+    LUMA(24, 32);
816
+    LUMA(32,  8);
817
+    CHROMA_420(16, 4);
818
+    LUMA(8, 32);
819
+    LUMA(64, 64);
820
+    CHROMA_420(32, 32);
821
+    LUMA(64, 32);
822
+    CHROMA_420(32, 16);
823
+    LUMA(32, 64);
824
+    CHROMA_420(16, 32);
825
+    LUMA(64, 48);
826
+    CHROMA_420(32, 24);
827
+    LUMA(48, 64);
828
+    CHROMA_420(24, 32);
829
+    LUMA(64, 16);
830
+    CHROMA_420(32, 8);
831
+    LUMA(16, 64);
832
+    CHROMA_420(8,  32);
833
+    CHROMA_422(8,  16);
834
+    CHROMA_422(8,  8);
835
+    CHROMA_422(8,  12);
836
+    CHROMA_422(8,  4);
837
+    CHROMA_422(16, 32);
838
+    CHROMA_422(16, 16);
839
+    CHROMA_422(8,  32);
840
+    CHROMA_422(16, 24);
841
+    CHROMA_422(16, 8);
842
+    CHROMA_422(32, 64);
843
+    CHROMA_422(32, 32);
844
+    CHROMA_422(16, 64);
845
+    CHROMA_422(32, 48);
846
+    CHROMA_422(24, 64);
847
+    CHROMA_422(32, 16);
848
+    CHROMA_422(8,  64);
849
+    CHROMA_444(8,  8);
850
+    CHROMA_444(8,  4);
851
+    CHROMA_444(16, 16);
852
+    CHROMA_444(16, 8);
853
+    CHROMA_444(8,  16);
854
+    CHROMA_444(16, 12);
855
+    CHROMA_444(16, 4);
856
+    CHROMA_444(32, 32);
857
+    CHROMA_444(32, 16);
858
+    CHROMA_444(16, 32);
859
+    CHROMA_444(32, 24);
860
+    CHROMA_444(24, 32);
861
+    CHROMA_444(32, 8);
862
+    CHROMA_444(8,  32);
863
+    CHROMA_444(64, 64);
864
+    CHROMA_444(64, 32);
865
+    CHROMA_444(32, 64);
866
+    CHROMA_444(64, 48);
867
+    CHROMA_444(48, 64);
868
+    CHROMA_444(64, 16);
869
+    CHROMA_444(16, 64);
870
+
871
+#if defined(__APPLE__) || HIGH_BIT_DEPTH
872
+    p.pu[LUMA_8x4].luma_hps     = interp_horiz_ps_neon<8, 8, 4>;
873
+    p.pu[LUMA_8x8].luma_hps     = interp_horiz_ps_neon<8, 8, 8>;
874
+    p.pu[LUMA_8x16].luma_hps     = interp_horiz_ps_neon<8, 8, 16>;
875
+    p.pu[LUMA_8x32].luma_hps     = interp_horiz_ps_neon<8, 8, 32>;
876
+#endif // defined(__APPLE__) || HIGH_BIT_DEPTH
877
+
878
+#if !defined(__APPLE__) && HIGH_BIT_DEPTH
879
+    p.pu[LUMA_24x32].luma_hps     = interp_horiz_ps_neon<8, 24, 32>;
880
+#endif // !defined(__APPLE__)
881
+
882
+#if !defined(__APPLE__)
883
+    p.pu[LUMA_32x8].luma_hpp      = interp_horiz_pp_neon<8, 32, 8>;
884
+    p.pu[LUMA_32x16].luma_hpp     = interp_horiz_pp_neon<8, 32, 16>;
885
+    p.pu[LUMA_32x24].luma_hpp     = interp_horiz_pp_neon<8, 32, 24>;
886
+    p.pu[LUMA_32x32].luma_hpp     = interp_horiz_pp_neon<8, 32, 32>;
887
+    p.pu[LUMA_32x64].luma_hpp     = interp_horiz_pp_neon<8, 32, 64>;
888
+    p.pu[LUMA_48x64].luma_hpp     = interp_horiz_pp_neon<8, 48, 64>;
889
+    p.pu[LUMA_64x16].luma_hpp     = interp_horiz_pp_neon<8, 64, 16>;
890
+    p.pu[LUMA_64x32].luma_hpp     = interp_horiz_pp_neon<8, 64, 32>;
891
+    p.pu[LUMA_64x48].luma_hpp     = interp_horiz_pp_neon<8, 64, 48>;
892
+    p.pu[LUMA_64x64].luma_hpp     = interp_horiz_pp_neon<8, 64, 64>;
893
+
894
+    LUMA_FILTER(8, 4);
895
+    LUMA_FILTER(8, 8);
896
+    LUMA_FILTER(8, 16);
897
+    LUMA_FILTER(8, 32);
898
+    LUMA_FILTER(24, 32);
899
+
900
+    LUMA_FILTER(16, 32);
901
+    LUMA_FILTER(32, 16);
902
+    LUMA_FILTER(32, 24);
903
+    LUMA_FILTER(32, 32);
904
+    LUMA_FILTER(32, 64);
905
+    LUMA_FILTER(48, 64);
906
+    LUMA_FILTER(64, 32);
907
+    LUMA_FILTER(64, 48);
908
+    LUMA_FILTER(64, 64);
909
+    
910
+    CHROMA_FILTER_420(24, 32);
911
+    
912
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = interp_horiz_pp_neon<4, 32, 8>;
913
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
914
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = interp_horiz_pp_neon<4, 32, 24>;
915
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
916
+    
917
+    CHROMA_FILTER_422(24, 64);
918
+    
919
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
920
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
921
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = interp_horiz_pp_neon<4, 32, 48>;
922
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = interp_horiz_pp_neon<4, 32, 64>;
923
+    
924
+    CHROMA_FILTER_444(24, 32);
925
+    
926
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp  = interp_horiz_pp_neon<4, 32, 8>;
927
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
928
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = interp_horiz_pp_neon<4, 32, 24>;
929
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
930
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = interp_horiz_pp_neon<4, 32, 64>;
931
+    p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = interp_horiz_pp_neon<4, 48, 64>;
932
+    p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = interp_horiz_pp_neon<4, 64, 16>;
933
+    p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = interp_horiz_pp_neon<4, 64, 32>;
934
+    p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = interp_horiz_pp_neon<4, 64, 48>;
935
+    p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = interp_horiz_pp_neon<4, 64, 64>;
936
+    
937
+    p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss  = interp_vert_ss_neon<4, 16, 4>;
938
+    p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss  = interp_vert_ss_neon<4, 16, 8>;
939
+    p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = interp_vert_ss_neon<4, 16, 12>;
940
+    p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = interp_vert_ss_neon<4, 16, 16>;
941
+    p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = interp_vert_ss_neon<4, 16, 32>;
942
+    p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = interp_vert_ss_neon<4, 16, 64>;
943
+    p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss  = interp_vert_ss_neon<4, 32, 8>;
944
+    p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = interp_vert_ss_neon<4, 32, 16>;
945
+    p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = interp_vert_ss_neon<4, 32, 24>;
946
+    p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = interp_vert_ss_neon<4, 32, 32>;
947
+    p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = interp_vert_ss_neon<4, 32, 64>;
948
+#endif // !defined(__APPLE__)
949
+
950
+    CHROMA_FILTER_420(8, 2);
951
+    CHROMA_FILTER_420(8, 4);
952
+    CHROMA_FILTER_420(8, 6);
953
+    CHROMA_FILTER_420(8, 8);
954
+    CHROMA_FILTER_420(8, 16);
955
+    CHROMA_FILTER_420(8, 32);
956
+    
957
+    CHROMA_FILTER_422(8, 4);
958
+    CHROMA_FILTER_422(8, 8);
959
+    CHROMA_FILTER_422(8, 12);
960
+    CHROMA_FILTER_422(8, 16);
961
+    CHROMA_FILTER_422(8, 32);
962
+    CHROMA_FILTER_422(8, 64);
963
+    
964
+    CHROMA_FILTER_444(8, 4);
965
+    CHROMA_FILTER_444(8, 8);
966
+    CHROMA_FILTER_444(8, 16);
967
+    CHROMA_FILTER_444(8, 32);
968
+    
969
+#if defined(__APPLE__)
970
+    CHROMA_FILTER_420(16, 4);
971
+    CHROMA_FILTER_420(16, 8);
972
+    CHROMA_FILTER_420(16, 12);
973
+    CHROMA_FILTER_420(16, 16);
974
+    CHROMA_FILTER_420(16, 32);
975
+
976
+    CHROMA_FILTER_422(16, 8);
977
+    CHROMA_FILTER_422(16, 16);
978
+    CHROMA_FILTER_422(16, 24);
979
+    CHROMA_FILTER_422(16, 32);
980
+    CHROMA_FILTER_422(16, 64);
981
+    
982
+    CHROMA_FILTER_444(16, 4);
983
+    CHROMA_FILTER_444(16, 8);
984
+    CHROMA_FILTER_444(16, 12);
985
+    CHROMA_FILTER_444(16, 16);
986
+    CHROMA_FILTER_444(16, 32);
987
+    CHROMA_FILTER_444(16, 64);
988
+#endif // defined(__APPLE__)
989
+}
990
+
991
+};
992
+
993
+
994
+#endif
995
+
996
+
997
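For reviewers: the LUMA()/CHROMA_*() macros in this file populate the EncoderPrimitives dispatch table by token pasting. As a rough sketch (not part of the patch), the non-Apple LUMA(16, 16) invocation expands to assignments of this shape:

    // Sketch: what LUMA(16, 16) produces on the !__APPLE__ path
    p.pu[LUMA_16x16].luma_vss = interp_vert_ss_neon<8, 16, 16>;
    p.pu[LUMA_16x16].convert_p2s[NONALIGNED] = filterPixelToShort_neon<16, 16>;
    p.pu[LUMA_16x16].convert_p2s[ALIGNED] = filterPixelToShort_neon<16, 16>;

On that path the remaining luma_hpp/vpp/vps/vsp/hvpp entries come from a separate LUMA_FILTER() invocation, while the Apple path folds them all into LUMA().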
x265_3.6.tar.gz/source/common/aarch64/filter-prim.h Added
23
 
1
@@ -0,0 +1,21 @@
2
+#ifndef _FILTER_PRIM_ARM64_H__
3
+#define _FILTER_PRIM_ARM64_H__
4
+
5
+
6
+#include "common.h"
7
+#include "slicetype.h"      // LOWRES_COST_MASK
8
+#include "primitives.h"
9
+#include "x265.h"
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+void setupFilterPrimitives_neon(EncoderPrimitives &p);
17
+
18
+};
19
+
20
+
21
+#endif
22
+
23
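For reviewers: this header only exports the registration hook defined in filter-prim.cpp. A hypothetical caller would look roughly like the following (sketch only; the actual aarch64 call site is not part of this hunk):

    #include "filter-prim.h"

    // Register the NEON filter kernels into the dispatch table
    X265_NS::EncoderPrimitives prims;
    X265_NS::setupFilterPrimitives_neon(prims);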
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h Added
258
 
1
@@ -0,0 +1,256 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#define FUNCDEF_TU(ret, name, cpu, ...) \
26
+    ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
27
+    ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
28
+    ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
29
+    ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
30
+    ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
31
+
32
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
33
+    ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
34
+    ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
35
+    ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
36
+    ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
37
+    ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
38
+
39
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
40
+    ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
41
+    ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
42
+    ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
43
+    ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
44
+    ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
45
+
46
+#define FUNCDEF_PU(ret, name, cpu, ...) \
47
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
48
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
49
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
50
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
51
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
52
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
53
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
54
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
55
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
56
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
57
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
58
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
59
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
60
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
61
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
62
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
63
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
64
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
65
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
66
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
67
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
68
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
69
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
70
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
71
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
72
+
73
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
74
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
75
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
76
+    ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
77
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
78
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
79
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
80
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
81
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
82
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
83
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
84
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
85
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
86
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
87
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
88
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
89
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
90
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
91
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
92
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
93
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
94
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
95
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
96
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
97
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
98
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
99
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
100
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
101
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
102
+
103
+#define DECLS(cpu) \
104
+    FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
105
+    FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
106
+    FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
107
+    FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
108
+    FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
109
+    FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
110
+    FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
111
+    FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
112
+    FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
113
+    FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
114
+    FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
115
+    FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
116
+    FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
117
+    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
118
+    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
119
+    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
120
+    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
121
+    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
122
+    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
123
+    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
124
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
125
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
126
+    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
127
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
128
+    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
129
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
130
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
131
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
132
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
133
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
134
+    FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
135
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
136
+    FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
137
+    FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
138
+    FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
139
+    FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
140
+    FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
141
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
142
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
143
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
144
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
145
+    FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
146
+    FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
147
+    FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
148
+    FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
149
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
150
+    FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
151
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
152
+    FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
153
+
154
+DECLS(neon);
155
+DECLS(sve);
156
+DECLS(sve2);
157
+
158
+
159
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
160
+
161
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
162
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
163
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
164
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
165
+
166
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
167
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
168
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
169
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
170
+
171
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
172
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
173
+
174
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
175
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
176
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
177
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
178
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
179
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
180
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
181
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
182
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
183
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
184
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
185
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
186
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
187
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
188
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
189
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
190
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
191
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
192
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
193
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
194
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
195
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
196
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
197
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
198
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
199
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
202
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
203
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
204
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
205
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
206
+
207
+int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
208
+int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
209
+int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
210
+int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
211
+int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
212
+int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
213
+int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
214
+
215
+uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
216
+uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
217
+
218
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
219
+void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
220
+
221
+void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
222
+
223
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
224
+int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
225
+void PFX(weight_pp_neon)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
226
+void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
227
+int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
228
+uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
229
+
230
+uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
231
+uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
232
+uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
233
+uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
234
+
235
+void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
236
+void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
237
+
238
+void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
239
+void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
240
+
241
+int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
242
+int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
243
+int x265_pixel_satd_8x12_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
244
+int x265_pixel_satd_32x16_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
245
+int x265_pixel_satd_32x32_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
246
+int x265_pixel_satd_64x48_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
247
+
248
+uint32_t PFX(quant_sve)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
249
+
250
+void x265_dequant_scaling_sve2(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
251
+void x265_dequant_normal_sve2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
252
+
253
+void x265_ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
254
+
255
+int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
256
+void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
257
+int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
258
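For reviewers: PFX() adds the exported x265_ symbol prefix, so each line inside DECLS(cpu) fans out into one prototype per block size. A sketch of a single expansion (illustrative, assuming the usual PFX behaviour):

    // One of the 25 prototypes produced by FUNCDEF_PU(void, interp_8tap_horiz_pp, neon, ...)
    void x265_interp_8tap_horiz_pp_16x16_neon(const pixel* src, intptr_t srcStride,
                                              pixel* dst, intptr_t dstStride, int coeffIdx);

DECLS(sve) and DECLS(sve2) emit the same set with _sve and _sve2 suffixes.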
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp Added
267
 
1
@@ -0,0 +1,265 @@
2
+#include "common.h"
3
+#include "primitives.h"
4
+
5
+
6
+#if 1
7
+#include "arm64-utils.h"
8
+#include <arm_neon.h>
9
+
10
+using namespace X265_NS;
11
+
12
+namespace
13
+{
14
+
15
+
16
+
17
+template<int width>
18
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
19
+{
20
+    int width2 = width << 1;
21
+    // Flip the neighbours in the horizontal case.
22
+    int horMode = dirMode < 18;
23
+    pixel neighbourBuf[129];
24
+    const pixel *srcPix = srcPix0;
25
+
26
+    if (horMode)
27
+    {
28
+        neighbourBuf[0] = srcPix[0];
29
+        //for (int i = 0; i < width << 1; i++)
30
+        //{
31
+        //    neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
32
+        //    neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
33
+        //}
34
+        memcpy(&neighbourBuf[1], &srcPix[width2 + 1], sizeof(pixel) * (width << 1));
35
+        memcpy(&neighbourBuf[width2 + 1], &srcPix[1], sizeof(pixel) * (width << 1));
36
+        srcPix = neighbourBuf;
37
+    }
38
+
39
+    // Intra prediction angle and inverse angle tables.
40
+    const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
41
+    const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
42
+
43
+    // Get the prediction angle.
44
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
45
+    int angle = angleTable[8 + angleOffset];
46
+
47
+    // Vertical Prediction.
48
+    if (!angle)
49
+    {
50
+        for (int y = 0; y < width; y++)
51
+        {
52
+            memcpy(&dst[y * dstStride], srcPix + 1, sizeof(pixel)*width);
53
+        }
54
+        if (bFilter)
55
+        {
56
+            int topLeft = srcPix[0], top = srcPix[1];
57
+            for (int y = 0; y < width; y++)
58
+            {
59
+                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
60
+            }
61
+        }
62
+    }
63
+    else // Angular prediction.
64
+    {
65
+        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
66
+        pixel refBuf[64];
67
+        const pixel *ref;
68
+
69
+        // Use the projected left neighbours and the top neighbours.
70
+        if (angle < 0)
71
+        {
72
+            // Number of neighbours projected.
73
+            int nbProjected = -((width * angle) >> 5) - 1;
74
+            pixel *ref_pix = refBuf + nbProjected + 1;
75
+
76
+            // Project the neighbours.
77
+            int invAngle = invAngleTable[-angleOffset - 1];
78
+            int invAngleSum = 128;
79
+            for (int i = 0; i < nbProjected; i++)
80
+            {
81
+                invAngleSum += invAngle;
82
+                ref_pix[-2 - i] = srcPix[width2 + (invAngleSum >> 8)];
83
+            }
84
+
85
+            // Copy the top-left and top pixels.
86
+            //for (int i = 0; i < width + 1; i++)
87
+            //ref_pix[-1 + i] = srcPix[i];
88
+
89
+            memcpy(&ref_pix[-1], srcPix, (width + 1)*sizeof(pixel));
90
+            ref = ref_pix;
91
+        }
92
+        else // Use the top and top-right neighbours.
93
+        {
94
+            ref = srcPix + 1;
95
+        }
96
+
97
+        // Pass every row.
98
+        int angleSum = 0;
99
+        for (int y = 0; y < width; y++)
100
+        {
101
+            angleSum += angle;
102
+            int offset = angleSum >> 5;
103
+            int fraction = angleSum & 31;
104
+
105
+            if (fraction) // Interpolate
106
+            {
107
+                if (width >= 8 && sizeof(pixel) == 1)
108
+                {
109
+                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
110
+                    const int16x8_t f1 = vdupq_n_s16(fraction);
111
+                    for (int x = 0; x < width; x += 8)
112
+                    {
113
+                        uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
114
+                        uint8x8_t in1 = *(uint8x8_t *)&ref[offset + x + 1];
115
+                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
116
+                        lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
117
+                        lo = vshrq_n_s16(lo, 5);
118
+                        *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
119
+                    }
120
+                }
121
+                else if (width >= 4 && sizeof(pixel) == 2)
122
+                {
123
+                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
124
+                    const int32x4_t f1 = vdupq_n_s32(fraction);
125
+                    for (int x = 0; x < width; x += 4)
126
+                    {
127
+                        uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
128
+                        uint16x4_t in1 = *(uint16x4_t *)&ref[offset + x + 1];
129
+                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
130
+                        lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
131
+                        lo = vshrq_n_s32(lo, 5);
132
+                        *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
133
+                    }
134
+                }
135
+                else
136
+                {
137
+                    for (int x = 0; x < width; x++)
138
+                    {
139
+                        dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
140
+                    }
141
+                }
142
+            }
143
+            else // Copy.
144
+            {
145
+                memcpy(&dst[y * dstStride], &ref[offset], sizeof(pixel)*width);
146
+            }
147
+        }
148
+    }
149
+
150
+    // Flip for horizontal.
151
+    if (horMode)
152
+    {
153
+        if (width == 8)
154
+        {
155
+            transpose8x8(dst, dst, dstStride, dstStride);
156
+        }
157
+        else if (width == 16)
158
+        {
159
+            transpose16x16(dst, dst, dstStride, dstStride);
160
+        }
161
+        else if (width == 32)
162
+        {
163
+            transpose32x32(dst, dst, dstStride, dstStride);
164
+        }
165
+        else
166
+        {
167
+            for (int y = 0; y < width - 1; y++)
168
+            {
169
+                for (int x = y + 1; x < width; x++)
170
+                {
171
+                    pixel tmp              = dst[y * dstStride + x];
172
+                    dst[y * dstStride + x] = dst[x * dstStride + y];
173
+                    dst[x * dstStride + y] = tmp;
174
+                }
175
+            }
176
+        }
177
+    }
178
+}
179
+
180
+template<int log2Size>
181
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
182
+{
183
+    const int size = 1 << log2Size;
184
+    for (int mode = 2; mode <= 34; mode++)
185
+    {
186
+        pixel *srcPix  = (g_intraFilterFlags[mode] & size ? filtPix  : refPix);
187
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
188
+
189
+        intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
190
+
191
+        // Optimized code: don't flip the buffer; transpose the block below instead
192
+        bool modeHor = (mode < 18);
193
+
194
+        // transpose the block if this is a horizontal mode
195
+        if (modeHor)
196
+        {
197
+            if (size == 8)
198
+            {
199
+                transpose8x8(out, out, size, size);
200
+            }
201
+            else if (size == 16)
202
+            {
203
+                transpose16x16(out, out, size, size);
204
+            }
205
+            else if (size == 32)
206
+            {
207
+                transpose32x32(out, out, size, size);
208
+            }
209
+            else
210
+            {
211
+                for (int k = 0; k < size - 1; k++)
212
+                {
213
+                    for (int l = k + 1; l < size; l++)
214
+                    {
215
+                        pixel tmp         = out[k * size + l];
216
+                        out[k * size + l] = out[l * size + k];
217
+                        out[l * size + k] = tmp;
218
+                    }
219
+                }
220
+            }
221
+        }
222
+    }
223
+}
224
+}
225
+
226
+namespace X265_NS
227
+{
228
+// x265 private namespace
229
+
230
+void setupIntraPrimitives_neon(EncoderPrimitives &p)
231
+{
232
+    for (int i = 2; i < NUM_INTRA_MODE; i++)
233
+    {
234
+        p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_neon<8>;
235
+        p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_neon<16>;
236
+        p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_neon<32>;
237
+    }
238
+    p.cu[BLOCK_4x4].intra_pred[2] = intra_pred_ang_neon<4>;
239
+    p.cu[BLOCK_4x4].intra_pred[10] = intra_pred_ang_neon<4>;
240
+    p.cu[BLOCK_4x4].intra_pred[18] = intra_pred_ang_neon<4>;
241
+    p.cu[BLOCK_4x4].intra_pred[26] = intra_pred_ang_neon<4>;
242
+    p.cu[BLOCK_4x4].intra_pred[34] = intra_pred_ang_neon<4>;
243
+
244
+    p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_neon<2>;
245
+    p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_neon<3>;
246
+    p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_neon<4>;
247
+    p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
248
+}
249
+
250
+}
251
+
252
+
253
+
254
+#else
255
+
256
+namespace X265_NS
257
+{
258
+// x265 private namespace
259
+void setupIntraPrimitives_neon(EncoderPrimitives &p)
260
+{}
261
+}
262
+
263
+#endif
264
+
265
+
266
+
267
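For reviewers: both vector paths above implement the same two-tap angular interpolation as the scalar fallback: each output pixel blends the two nearest reference samples with 5-bit weights that sum to 32, and the +16 rounds to nearest before the shift. In scalar form (same names as in the function):

    dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x]
                                      + fraction * ref[offset + x + 1] + 16) >> 5);

For example, fraction = 8 gives a 24/32 : 8/32 blend of the two neighbours; the vmlaq_s16/vmlaq_s32 loops compute this for 8 (or 4) pixels per iteration.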
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.h Added
17
 
1
@@ -0,0 +1,15 @@
2
+#ifndef INTRAPRED_PRIM_H__
+#define INTRAPRED_PRIM_H__
3
+
4
+#if defined(__aarch64__)
5
+
6
+namespace X265_NS
7
+{
8
+// x265 private namespace
9
+
10
+void setupIntraPrimitives_neon(EncoderPrimitives &p);
11
+}
12
+
13
+#endif
14
+
15
+#endif
16
+
17
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S Added
1438
 
1
@@ -0,0 +1,1436 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+// Macros below follow these conventions:
29
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
+// - constants in registers: v24, v25, v26, v27, v31
31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
+// - _32b macros output a result in v17.4s
33
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
+
35
+#include "asm.S"
36
+
37
+.arch           armv8-a
38
+
39
+#ifdef __APPLE__
40
+.section __RODATA,__rodata
41
+#else
42
+.section .rodata
43
+#endif
44
+
45
+.align 4
46
+
47
+.macro vextin8 v
48
+    ldp             d6, d7, [x11], #16
49
+.if \v == 0
50
+    // qpel_filter_0 only uses values in v3
51
+    ext             v3.8b, v6.8b, v7.8b, #4
52
+.else
53
+.if \v != 3
54
+    ext             v0.8b, v6.8b, v7.8b, #1
55
+.endif
56
+    ext             v1.8b, v6.8b, v7.8b, #2
57
+    ext             v2.8b, v6.8b, v7.8b, #3
58
+    ext             v3.8b, v6.8b, v7.8b, #4
59
+    ext             v4.8b, v6.8b, v7.8b, #5
60
+    ext             v5.8b, v6.8b, v7.8b, #6
61
+    ext             v6.8b, v6.8b, v7.8b, #7
62
+.endif
63
+.endm
64
+
65
+.macro vextin8_64 v
66
+    ldp             q6, q7, [x11], #32
67
+.if \v == 0
68
+    // qpel_filter_0 only uses values in v3
69
+    ext             v3.16b, v6.16b, v7.16b, #4
70
+.else
71
+.if \v != 3
72
+    // qpel_filter_3 does not use values in v0
73
+    ext             v0.16b, v6.16b, v7.16b, #1
74
+.endif
75
+    ext             v1.16b, v6.16b, v7.16b, #2
76
+    ext             v2.16b, v6.16b, v7.16b, #3
77
+    ext             v3.16b, v6.16b, v7.16b, #4
78
+    ext             v4.16b, v6.16b, v7.16b, #5
79
+    ext             v5.16b, v6.16b, v7.16b, #6
80
+.if \v == 1
81
+    ext             v6.16b, v6.16b, v7.16b, #7
82
+    // qpel_filter_1 does not use v7
83
+.else
84
+    ext             v16.16b, v6.16b, v7.16b, #7
85
+    ext             v7.16b, v6.16b, v7.16b, #8
86
+    mov             v6.16b, v16.16b
87
+.endif
88
+.endif
89
+.endm
90
+
91
+.macro vextin8_chroma v
92
+    ldp             d6, d7, [x11], #16
93
+.if \v == 0
94
+    // qpel_filter_chroma_0 only uses values in v1
95
+    ext             v1.8b, v6.8b, v7.8b, #2
96
+.else
97
+    ext             v0.8b, v6.8b, v7.8b, #1
98
+    ext             v1.8b, v6.8b, v7.8b, #2
99
+    ext             v2.8b, v6.8b, v7.8b, #3
100
+    ext             v3.8b, v6.8b, v7.8b, #4
101
+.endif
102
+.endm
103
+
104
+.macro vextin8_chroma_64 v
105
+    ldp             q16, q17, [x11], #32
106
+.if \v == 0
107
+    // qpel_filter_chroma_0 only uses values in v1
108
+    ext             v1.16b, v16.16b, v17.16b, #2
109
+.else
110
+    ext             v0.16b, v16.16b, v17.16b, #1
111
+    ext             v1.16b, v16.16b, v17.16b, #2
112
+    ext             v2.16b, v16.16b, v17.16b, #3
113
+    ext             v3.16b, v16.16b, v17.16b, #4
114
+.endif
115
+.endm
116
+
117
+.macro qpel_load_32b v
118
+.if \v == 0
119
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
120
+    ld1             {v3.8b}, [x6], x1
121
+.elseif \v == 1 || \v == 2 || \v == 3
122
+.if \v != 3                           // not used in qpel_filter_3
123
+    ld1             {v0.8b}, [x6], x1
124
+.else
125
+    add             x6, x6, x1
126
+.endif
127
+    ld1             {v1.8b}, [x6], x1
128
+    ld1             {v2.8b}, [x6], x1
129
+    ld1             {v3.8b}, [x6], x1
130
+    ld1             {v4.8b}, [x6], x1
131
+    ld1             {v5.8b}, [x6], x1
132
+.if \v != 1                           // not used in qpel_filter_1
133
+    ld1             {v6.8b}, [x6], x1
134
+    ld1             {v7.8b}, [x6]
135
+.else
136
+    ld1             {v6.8b}, [x6]
137
+.endif
138
+.endif
139
+.endm
140
+
141
+.macro qpel_load_64b v
142
+.if \v == 0
143
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
144
+    ld1             {v3.16b}, [x6], x1
145
+.elseif \v == 1 || \v == 2 || \v == 3
146
+.if \v != 3                           // not used in qpel_filter_3
147
+    ld1             {v0.16b}, [x6], x1
148
+.else
149
+    add             x6, x6, x1
150
+.endif
151
+    ld1             {v1.16b}, [x6], x1
152
+    ld1             {v2.16b}, [x6], x1
153
+    ld1             {v3.16b}, [x6], x1
154
+    ld1             {v4.16b}, [x6], x1
155
+    ld1             {v5.16b}, [x6], x1
156
+.if \v != 1                           // not used in qpel_filter_1
157
+    ld1             {v6.16b}, [x6], x1
158
+    ld1             {v7.16b}, [x6]
159
+.else
160
+    ld1             {v6.16b}, [x6]
161
+.endif
162
+.endif
163
+.endm
164
+
165
+.macro qpel_chroma_load_32b v
166
+.if \v == 0
167
+    // qpel_filter_chroma_0 only uses values in v1
168
+    add             x6, x6, x1
169
+    ldr             d1, [x6]
170
+.else
171
+    ld1             {v0.8b}, [x6], x1
172
+    ld1             {v1.8b}, [x6], x1
173
+    ld1             {v2.8b}, [x6], x1
174
+    ld1             {v3.8b}, [x6]
175
+.endif
176
+.endm
177
+
178
+.macro qpel_chroma_load_64b v
179
+.if \v == 0
180
+    // qpel_filter_chroma_0 only uses values in v1
181
+    add             x6, x6, x1
182
+    ldr             q1, [x6]
183
+.else
184
+    ld1             {v0.16b}, [x6], x1
185
+    ld1             {v1.16b}, [x6], x1
186
+    ld1             {v2.16b}, [x6], x1
187
+    ld1             {v3.16b}, [x6]
188
+.endif
189
+.endm
190
+
191
+//          a, b,   c,  d,  e,   f, g,  h
192
+// .hword   0, 0,   0, 64,  0,   0, 0,  0
193
+.macro qpel_start_0
194
+    movi            v24.16b, #64
195
+.endm
196
+
197
+.macro qpel_filter_0_32b
198
+    umull           v17.8h, v3.8b, v24.8b    // 64*d
199
+.endm
200
+
201
+.macro qpel_filter_0_64b
202
+    qpel_filter_0_32b
203
+    umull2          v18.8h, v3.16b, v24.16b  // 64*d
204
+.endm
205
+
206
+.macro qpel_start_0_1
207
+    movi            v24.8h, #64
208
+.endm
209
+
210
+.macro qpel_filter_0_32b_1
211
+    smull           v17.4s, v3.4h, v24.4h    // 64*d0
212
+    smull2          v18.4s, v3.8h, v24.8h    // 64*d1
213
+.endm
214
+
215
+//          a, b,   c,  d,  e,   f, g,  h
216
+// .hword  -1, 4, -10, 58, 17,  -5, 1,  0
217
+.macro qpel_start_1
218
+    movi            v24.16b, #58
219
+    movi            v25.16b, #10
220
+    movi            v26.16b, #17
221
+    movi            v27.16b, #5
222
+.endm
223
+
224
+.macro qpel_filter_1_32b
225
+    umull           v19.8h, v2.8b, v25.8b  // c*10
226
+    umull           v17.8h, v3.8b, v24.8b  // d*58
227
+    umull           v21.8h, v4.8b, v26.8b  // e*17
228
+    umull           v23.8h, v5.8b, v27.8b  // f*5
229
+    sub             v17.8h, v17.8h, v19.8h // d*58 - c*10
230
+    ushll           v18.8h, v1.8b, #2      // b*4
231
+    add             v17.8h, v17.8h, v21.8h // d*58 - c*10 + e*17
232
+    usubl           v21.8h, v6.8b, v0.8b   // g - a
233
+    add             v17.8h, v17.8h, v18.8h // d*58 - c*10 + e*17 + b*4
234
+    sub             v21.8h, v21.8h, v23.8h // g - a - f*5
235
+    add             v17.8h, v17.8h, v21.8h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
236
+.endm
237
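For reviewers: the accumulation in qpel_filter_1_32b matches the 8-tap luma filter taps (-1, 4, -10, 58, 17, -5, 1, 0) given in the .hword comment. A scalar model for cross-checking (sketch; a..g are the seven consecutive input samples named in the macro comments):

    // Same value the macro accumulates into v17:
    static inline int qpel_filter_1_scalar(int a, int b, int c, int d, int e, int f, int g)
    {
        return 58 * d - 10 * c + 17 * e + 4 * b + g - a - 5 * f;
    }

qpel_filter_1_64b repeats the identical arithmetic on the high halves of the input vectors.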
+
238
+.macro qpel_filter_1_64b
239
+    qpel_filter_1_32b
240
+    umull2          v20.8h, v2.16b, v25.16b  // c*10
241
+    umull2          v18.8h, v3.16b, v24.16b  // d*58
242
+    umull2          v21.8h, v4.16b, v26.16b  // e*17
243
+    umull2          v23.8h, v5.16b, v27.16b  // f*5
244
+    sub             v18.8h, v18.8h, v20.8h   // d*58 - c*10
245
+    ushll2          v28.8h, v1.16b, #2       // b*4
246
+    add             v18.8h, v18.8h, v21.8h   // d*58 - c*10 + e*17
247
+    usubl2          v21.8h, v6.16b, v0.16b   // g - a
248
+    add             v18.8h, v18.8h, v28.8h   // d*58 - c*10 + e*17 + b*4
249
+    sub             v21.8h, v21.8h, v23.8h   // g - a - f*5
250
+    add             v18.8h, v18.8h, v21.8h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
251
+.endm
252
+
253
+.macro qpel_start_1_1
254
+    movi            v24.8h, #58
255
+    movi            v25.8h, #10
256
+    movi            v26.8h, #17
257
+    movi            v27.8h, #5
258
+.endm
259
+
260
+.macro qpel_filter_1_32b_1
261
+    smull           v17.4s, v3.4h, v24.4h    // 58 * d0
262
+    smull2          v18.4s, v3.8h, v24.8h    // 58 * d1
263
+    smull           v19.4s, v2.4h, v25.4h    // 10 * c0
264
+    smull2          v20.4s, v2.8h, v25.8h    // 10 * c1
265
+    smull           v21.4s, v4.4h, v26.4h    // 17 * e0
266
+    smull2          v22.4s, v4.8h, v26.8h    // 17 * e1
267
+    smull           v23.4s, v5.4h, v27.4h    //  5 * f0
268
+    smull2          v16.4s, v5.8h, v27.8h    //  5 * f1
269
+    sub             v17.4s, v17.4s, v19.4s   // 58 * d0 - 10 * c0
270
+    sub             v18.4s, v18.4s, v20.4s   // 58 * d1 - 10 * c1
271
+    sshll           v19.4s, v1.4h, #2        // 4 * b0
272
+    sshll2          v20.4s, v1.8h, #2        // 4 * b1
273
+    add             v17.4s, v17.4s, v21.4s   // 58 * d0 - 10 * c0 + 17 * e0
274
+    add             v18.4s, v18.4s, v22.4s   // 58 * d1 - 10 * c1 + 17 * e1
275
+    ssubl           v21.4s, v6.4h, v0.4h     // g0 - a0
276
+    ssubl2          v22.4s, v6.8h, v0.8h     // g1 - a1
277
+    add             v17.4s, v17.4s, v19.4s   // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
278
+    add             v18.4s, v18.4s, v20.4s   // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
279
+    sub             v21.4s, v21.4s, v23.4s   // g0 - a0 - 5 * f0
280
+    sub             v22.4s, v22.4s, v16.4s   // g1 - a1 - 5 * f1
281
+    add             v17.4s, v17.4s, v21.4s   // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
282
+    add             v18.4s, v18.4s, v22.4s   // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
283
+.endm
284
+
285
+//          a, b,   c,  d,  e,   f, g,  h
286
+// .hword  -1, 4, -11, 40, 40, -11, 4, -1
287
+.macro qpel_start_2
288
+    movi            v24.8h, #11
289
+    movi            v25.8h, #40
290
+.endm
291
+
292
+.macro qpel_filter_2_32b
293
+    uaddl           v17.8h, v3.8b, v4.8b     // d + e
294
+    uaddl           v19.8h, v2.8b, v5.8b     // c + f
295
+    uaddl           v23.8h, v1.8b, v6.8b     // b + g
296
+    uaddl           v21.8h, v0.8b, v7.8b     // a + h
297
+    mul             v17.8h, v17.8h, v25.8h   // 40 * (d + e)
298
+    mul             v19.8h, v19.8h, v24.8h   // 11 * (c + f)
299
+    shl             v23.8h, v23.8h, #2       // (b + g) * 4
300
+    add             v19.8h, v19.8h, v21.8h   // 11 * (c + f) + a + h
301
+    add             v17.8h, v17.8h, v23.8h   // 40 * (d + e) + (b + g) * 4
302
+    sub             v17.8h, v17.8h, v19.8h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
303
+.endm
304
+
305
+.macro qpel_filter_2_64b
306
+    qpel_filter_2_32b
307
+    uaddl2          v27.8h, v3.16b, v4.16b   // d + e
308
+    uaddl2          v16.8h, v2.16b, v5.16b   // c + f
309
+    uaddl2          v23.8h, v1.16b, v6.16b   // b + g
310
+    uaddl2          v21.8h, v0.16b, v7.16b   // a + h
311
+    mul             v27.8h, v27.8h, v25.8h   // 40 * (d + e)
312
+    mul             v16.8h, v16.8h, v24.8h   // 11 * (c + f)
313
+    shl             v23.8h, v23.8h, #2       // (b + g) * 4
314
+    add             v16.8h, v16.8h, v21.8h   // 11 * (c + f) + a + h
315
+    add             v27.8h, v27.8h, v23.8h   // 40 * (d + e) + (b + g) * 4
316
+    sub             v18.8h, v27.8h, v16.8h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
317
+.endm
318
+
319
+.macro qpel_start_2_1
320
+    movi            v24.4s, #11
321
+    movi            v25.4s, #40
322
+.endm
323
+
324
+.macro qpel_filter_2_32b_1
325
+    saddl           v17.4s, v3.4h, v4.4h     // d0 + e0
326
+    saddl2          v18.4s, v3.8h, v4.8h     // d1 + e1
327
+    saddl           v19.4s, v2.4h, v5.4h     // c0 + f0
328
+    saddl2          v20.4s, v2.8h, v5.8h     // c1 + f1
329
+    mul             v19.4s, v19.4s, v24.4s   // 11 * (c0 + f0)
330
+    mul             v20.4s, v20.4s, v24.4s   // 11 * (c1 + f1)
331
+    saddl           v23.4s, v1.4h, v6.4h     // b0 + g0
332
+    mul             v17.4s, v17.4s, v25.4s   // 40 * (d0 + e0)
333
+    mul             v18.4s, v18.4s, v25.4s   // 40 * (d1 + e1)
334
+    saddl2          v16.4s, v1.8h, v6.8h     // b1 + g1
335
+    saddl           v21.4s, v0.4h, v7.4h     // a0 + h0
336
+    saddl2          v22.4s, v0.8h, v7.8h     // a1 + h1
337
+    shl             v23.4s, v23.4s, #2       // 4*(b0+g0)
338
+    shl             v16.4s, v16.4s, #2       // 4*(b1+g1)
339
+    add             v19.4s, v19.4s, v21.4s   // 11 * (c0 + f0) + a0 + h0
340
+    add             v20.4s, v20.4s, v22.4s   // 11 * (c1 + f1) + a1 + h1
+    add             v17.4s, v17.4s, v23.4s   // 40 * (d0 + e0) + 4*(b0+g0)
+    add             v18.4s, v18.4s, v16.4s   // 40 * (d1 + e1) + 4*(b1+g1)
+    sub             v17.4s, v17.4s, v19.4s   // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+    sub             v18.4s, v18.4s, v20.4s   // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+.endm
+
+//          a, b,   c,  d,  e,   f, g,  h
+// .hword   0, 1,  -5, 17, 58, -10, 4, -1
+.macro qpel_start_3
+    movi            v24.16b, #17
+    movi            v25.16b, #5
+    movi            v26.16b, #58
+    movi            v27.16b, #10
+.endm
+
+.macro qpel_filter_3_32b
+    umull           v19.8h, v2.8b, v25.8b    // c * 5
+    umull           v17.8h, v3.8b, v24.8b    // d * 17
+    umull           v21.8h, v4.8b, v26.8b    // e * 58
+    umull           v23.8h, v5.8b, v27.8b    // f * 10
+    sub             v17.8h, v17.8h, v19.8h   // d * 17 - c * 5
+    ushll           v19.8h, v6.8b, #2        // g * 4
+    add             v17.8h, v17.8h, v21.8h   // d * 17 - c * 5 + e * 58
+    usubl           v21.8h, v1.8b, v7.8b     // b - h
+    add             v17.8h, v17.8h, v19.8h   // d * 17 - c * 5 + e * 58 + g * 4
+    sub             v21.8h, v21.8h, v23.8h   // b - h - f * 10
+    add             v17.8h, v17.8h, v21.8h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
+.endm
+
+.macro qpel_filter_3_64b
+    qpel_filter_3_32b
+    umull2          v16.8h, v2.16b, v25.16b  // c * 5
+    umull2          v18.8h, v3.16b, v24.16b  // d * 17
+    umull2          v21.8h, v4.16b, v26.16b  // e * 58
+    umull2          v23.8h, v5.16b, v27.16b  // f * 10
+    sub             v18.8h, v18.8h, v16.8h   // d * 17 - c * 5
+    ushll2          v16.8h, v6.16b, #2       // g * 4
+    add             v18.8h, v18.8h, v21.8h   // d * 17 - c * 5 + e * 58
+    usubl2          v21.8h, v1.16b, v7.16b   // b - h
+    add             v18.8h, v18.8h, v16.8h   // d * 17 - c * 5 + e * 58 + g * 4
+    sub             v21.8h, v21.8h, v23.8h   // b - h - f * 10
+    add             v18.8h, v18.8h, v21.8h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
+.endm
+
+.macro qpel_start_3_1
+    movi            v24.8h, #17
+    movi            v25.8h, #5
+    movi            v26.8h, #58
+    movi            v27.8h, #10
+.endm
+
+.macro qpel_filter_3_32b_1
+    smull           v17.4s, v3.4h, v24.4h    // 17 * d0
+    smull2          v18.4s, v3.8h, v24.8h    // 17 * d1
+    smull           v19.4s, v2.4h, v25.4h    //  5 * c0
+    smull2          v20.4s, v2.8h, v25.8h    //  5 * c1
+    smull           v21.4s, v4.4h, v26.4h    // 58 * e0
+    smull2          v22.4s, v4.8h, v26.8h    // 58 * e1
+    smull           v23.4s, v5.4h, v27.4h    // 10 * f0
+    smull2          v16.4s, v5.8h, v27.8h    // 10 * f1
+    sub             v17.4s, v17.4s, v19.4s   // 17 * d0 - 5 * c0
+    sub             v18.4s, v18.4s, v20.4s   // 17 * d1 - 5 * c1
+    sshll           v19.4s, v6.4h, #2        //  4 * g0
+    sshll2          v20.4s, v6.8h, #2        //  4 * g1
+    add             v17.4s, v17.4s, v21.4s   // 17 * d0 - 5 * c0 + 58 * e0
+    add             v18.4s, v18.4s, v22.4s   // 17 * d1 - 5 * c1 + 58 * e1
+    ssubl           v21.4s, v1.4h, v7.4h     // b0 - h0
+    ssubl2          v22.4s, v1.8h, v7.8h     // b1 - h1
+    add             v17.4s, v17.4s, v19.4s   // 17 * d0 - 5 * c0 + 58 * e0 + 4 * g0
+    add             v18.4s, v18.4s, v20.4s   // 17 * d1 - 5 * c1 + 58 * e1 + 4 * g1
+    sub             v21.4s, v21.4s, v23.4s   // b0 - h0 - 10 * f0
+    sub             v22.4s, v22.4s, v16.4s   // b1 - h1 - 10 * f1
+    add             v17.4s, v17.4s, v21.4s   // 17 * d0 - 5 * c0 + 58 * e0 + 4 * g0 + b0 - h0 - 10 * f0
+    add             v18.4s, v18.4s, v22.4s   // 17 * d1 - 5 * c1 + 58 * e1 + 4 * g1 + b1 - h1 - 10 * f1
+.endm
+
+.macro qpel_start_chroma_0
+    movi            v24.16b, #64
+.endm
+
+.macro qpel_filter_chroma_0_32b
+    umull           v17.8h, v1.8b, v24.8b    // 64*b
+.endm
+
+.macro qpel_filter_chroma_0_64b
+    umull           v17.8h, v1.8b, v24.8b    // 64*b
+    umull2          v18.8h, v1.16b, v24.16b  // 64*b
+.endm
+
+.macro qpel_start_chroma_0_1
+    movi            v24.8h, #64
+.endm
+
+.macro qpel_filter_chroma_0_32b_1
+    smull           v17.4s, v1.4h, v24.4h    // 64*b0
+    smull2          v18.4s, v1.8h, v24.8h    // 64*b1
+.endm
+
+.macro qpel_start_chroma_1
+    movi            v24.16b, #58
+    movi            v25.16b, #10
+.endm
+
+.macro qpel_filter_chroma_1_32b
+    umull           v17.8h, v1.8b, v24.8b    // 58 * b
+    umull           v19.8h, v2.8b, v25.8b    // 10 * c
+    uaddl           v22.8h, v0.8b, v3.8b     // a + d
+    shl             v22.8h, v22.8h, #1       // 2 * (a+d)
+    sub             v17.8h, v17.8h, v22.8h   // 58*b - 2*(a+d)
+    add             v17.8h, v17.8h, v19.8h   // 58*b-2*(a+d) + 10*c
+.endm
+
+.macro qpel_filter_chroma_1_64b
+    umull           v17.8h, v1.8b, v24.8b    // 58 * b
+    umull2          v18.8h, v1.16b, v24.16b  // 58 * b
+    umull           v19.8h, v2.8b, v25.8b    // 10 * c
+    umull2          v20.8h, v2.16b, v25.16b  // 10 * c
+    uaddl           v22.8h, v0.8b, v3.8b     // a + d
+    uaddl2          v23.8h, v0.16b, v3.16b   // a + d
+    shl             v22.8h, v22.8h, #1       // 2 * (a+d)
+    shl             v23.8h, v23.8h, #1       // 2 * (a+d)
+    sub             v17.8h, v17.8h, v22.8h   // 58*b - 2*(a+d)
+    sub             v18.8h, v18.8h, v23.8h   // 58*b - 2*(a+d)
+    add             v17.8h, v17.8h, v19.8h   // 58*b-2*(a+d) + 10*c
+    add             v18.8h, v18.8h, v20.8h   // 58*b-2*(a+d) + 10*c
+.endm
+
+.macro qpel_start_chroma_1_1
+    movi            v24.8h, #58
+    movi            v25.8h, #10
+.endm
+
+.macro qpel_filter_chroma_1_32b_1
+    smull           v17.4s, v1.4h, v24.4h    // 58 * b0
+    smull2          v18.4s, v1.8h, v24.8h    // 58 * b1
+    smull           v19.4s, v2.4h, v25.4h    // 10 * c0
+    smull2          v20.4s, v2.8h, v25.8h    // 10 * c1
+    add             v22.8h, v0.8h, v3.8h     // a + d
+    sshll           v21.4s, v22.4h, #1       // 2 * (a0+d0)
+    sshll2          v22.4s, v22.8h, #1       // 2 * (a1+d1)
+    sub             v17.4s, v17.4s, v21.4s   // 58*b0 - 2*(a0+d0)
+    sub             v18.4s, v18.4s, v22.4s   // 58*b1 - 2*(a1+d1)
+    add             v17.4s, v17.4s, v19.4s   // 58*b0-2*(a0+d0) + 10*c0
+    add             v18.4s, v18.4s, v20.4s   // 58*b1-2*(a1+d1) + 10*c1
+.endm
+
+.macro qpel_start_chroma_2
+    movi            v25.16b, #54
+.endm
+
+.macro qpel_filter_chroma_2_32b
+    umull           v17.8h, v1.8b, v25.8b    // 54 * b
+    ushll           v19.8h, v0.8b, #2        // 4 * a
+    ushll           v21.8h, v2.8b, #4        // 16 * c
+    ushll           v23.8h, v3.8b, #1        // 2 * d
+    add             v17.8h, v17.8h, v21.8h   // 54*b + 16*c
+    add             v19.8h, v19.8h, v23.8h   // 4*a + 2*d
+    sub             v17.8h, v17.8h, v19.8h   // 54*b+16*c - (4*a+2*d)
+.endm
+
+.macro qpel_filter_chroma_2_64b
+    umull           v17.8h, v1.8b, v25.8b    // 54 * b
+    umull2          v18.8h, v1.16b, v25.16b  // 54 * b
+    ushll           v19.8h, v0.8b, #2        // 4 * a
+    ushll2          v20.8h, v0.16b, #2       // 4 * a
+    ushll           v21.8h, v2.8b, #4        // 16 * c
+    ushll2          v22.8h, v2.16b, #4       // 16 * c
+    ushll           v23.8h, v3.8b, #1        // 2 * d
+    ushll2          v24.8h, v3.16b, #1       // 2 * d
+    add             v17.8h, v17.8h, v21.8h   // 54*b + 16*c
+    add             v18.8h, v18.8h, v22.8h   // 54*b + 16*c
+    add             v19.8h, v19.8h, v23.8h   // 4*a + 2*d
+    add             v20.8h, v20.8h, v24.8h   // 4*a + 2*d
+    sub             v17.8h, v17.8h, v19.8h   // 54*b+16*c - (4*a+2*d)
+    sub             v18.8h, v18.8h, v20.8h   // 54*b+16*c - (4*a+2*d)
+.endm
+
+.macro qpel_start_chroma_2_1
+    movi            v25.8h, #54
+.endm
+
+.macro qpel_filter_chroma_2_32b_1
+    smull           v17.4s, v1.4h, v25.4h    // 54 * b0
+    smull2          v18.4s, v1.8h, v25.8h    // 54 * b1
+    sshll           v19.4s, v0.4h, #2        // 4 * a0
+    sshll2          v20.4s, v0.8h, #2        // 4 * a1
+    sshll           v21.4s, v2.4h, #4        // 16 * c0
+    sshll2          v22.4s, v2.8h, #4        // 16 * c1
+    sshll           v23.4s, v3.4h, #1        // 2 * d0
+    sshll2          v24.4s, v3.8h, #1        // 2 * d1
+    add             v17.4s, v17.4s, v21.4s   // 54*b0 + 16*c0
+    add             v18.4s, v18.4s, v22.4s   // 54*b1 + 16*c1
+    add             v19.4s, v19.4s, v23.4s   // 4*a0 + 2*d0
+    add             v20.4s, v20.4s, v24.4s   // 4*a1 + 2*d1
+    sub             v17.4s, v17.4s, v19.4s   // 54*b0+16*c0 - (4*a0+2*d0)
+    sub             v18.4s, v18.4s, v20.4s   // 54*b1+16*c1 - (4*a1+2*d1)
+.endm
+
+.macro qpel_start_chroma_3
+    movi            v25.16b, #46
+    movi            v26.16b, #28
+    movi            v27.16b, #6
+.endm
+
+.macro qpel_filter_chroma_3_32b
+    umull           v17.8h, v1.8b, v25.8b    // 46 * b
+    umull           v19.8h, v2.8b, v26.8b    // 28 * c
+    ushll           v21.8h, v3.8b, #2        // 4 * d
+    umull           v23.8h, v0.8b, v27.8b    // 6 * a
+    add             v17.8h, v17.8h, v19.8h   // 46*b + 28*c
+    add             v21.8h, v21.8h, v23.8h   // 4*d + 6*a
+    sub             v17.8h, v17.8h, v21.8h   // 46*b+28*c - (4*d+6*a)
+.endm
+
+.macro qpel_filter_chroma_3_64b
+    umull           v17.8h, v1.8b, v25.8b    // 46 * b
+    umull2          v18.8h, v1.16b, v25.16b  // 46 * b
+    umull           v19.8h, v2.8b, v26.8b    // 28 * c
+    umull2          v20.8h, v2.16b, v26.16b  // 28 * c
+    ushll           v21.8h, v3.8b, #2        // 4 * d
+    ushll2          v22.8h, v3.16b, #2       // 4 * d
+    umull           v23.8h, v0.8b, v27.8b    // 6 * a
+    umull2          v24.8h, v0.16b, v27.16b  // 6 * a
+    add             v17.8h, v17.8h, v19.8h   // 46*b + 28*c
+    add             v18.8h, v18.8h, v20.8h   // 46*b + 28*c
+    add             v21.8h, v21.8h, v23.8h   // 4*d + 6*a
+    add             v22.8h, v22.8h, v24.8h   // 4*d + 6*a
+    sub             v17.8h, v17.8h, v21.8h   // 46*b+28*c - (4*d+6*a)
+    sub             v18.8h, v18.8h, v22.8h   // 46*b+28*c - (4*d+6*a)
+.endm
+
+.macro qpel_start_chroma_3_1
+    movi            v25.8h, #46
+    movi            v26.8h, #28
+    movi            v27.8h, #6
+.endm
+
+.macro qpel_filter_chroma_3_32b_1
+    smull           v17.4s, v1.4h, v25.4h    // 46 * b0
+    smull2          v18.4s, v1.8h, v25.8h    // 46 * b1
+    smull           v19.4s, v2.4h, v26.4h    // 28 * c0
+    smull2          v20.4s, v2.8h, v26.8h    // 28 * c1
+    sshll           v21.4s, v3.4h, #2        // 4 * d0
+    sshll2          v22.4s, v3.8h, #2        // 4 * d1
+    smull           v23.4s, v0.4h, v27.4h    // 6 * a0
+    smull2          v24.4s, v0.8h, v27.8h    // 6 * a1
+    add             v17.4s, v17.4s, v19.4s   // 46*b0 + 28*c0
+    add             v18.4s, v18.4s, v20.4s   // 46*b1 + 28*c1
+    add             v21.4s, v21.4s, v23.4s   // 4*d0 + 6*a0
+    add             v22.4s, v22.4s, v24.4s   // 4*d1 + 6*a1
+    sub             v17.4s, v17.4s, v21.4s   // 46*b0+28*c0 - (4*d0+6*a0)
+    sub             v18.4s, v18.4s, v22.4s   // 46*b1+28*c1 - (4*d1+6*a1)
+.endm
+
+.macro qpel_start_chroma_4
+    movi            v24.8h, #36
+.endm
+
+.macro qpel_filter_chroma_4_32b
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
+    uaddl           v17.8h, v1.8b, v2.8b     // b + c
+    shl             v20.8h, v20.8h, #2       // 4 * (a+d)
+    mul             v17.8h, v17.8h, v24.8h   // 36 * (b+c)
+    sub             v17.8h, v17.8h, v20.8h   // 36*(b+c) - 4*(a+d)
+.endm
+
+.macro qpel_filter_chroma_4_64b
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
+    uaddl2          v21.8h, v0.16b, v3.16b   // a + d
+    uaddl           v17.8h, v1.8b, v2.8b     // b + c
+    uaddl2          v18.8h, v1.16b, v2.16b   // b + c
+    shl             v20.8h, v20.8h, #2       // 4 * (a+d)
+    shl             v21.8h, v21.8h, #2       // 4 * (a+d)
+    mul             v17.8h, v17.8h, v24.8h   // 36 * (b+c)
+    mul             v18.8h, v18.8h, v24.8h   // 36 * (b+c)
+    sub             v17.8h, v17.8h, v20.8h   // 36*(b+c) - 4*(a+d)
+    sub             v18.8h, v18.8h, v21.8h   // 36*(b+c) - 4*(a+d)
+.endm
+
+.macro qpel_start_chroma_4_1
+    movi            v24.8h, #36
+.endm
+
+.macro qpel_filter_chroma_4_32b_1
+    add             v20.8h, v0.8h, v3.8h     // a + d
+    add             v21.8h, v1.8h, v2.8h     // b + c
+    smull           v17.4s, v21.4h, v24.4h   // 36 * (b0+c0)
+    smull2          v18.4s, v21.8h, v24.8h   // 36 * (b1+c1)
+    sshll           v21.4s, v20.4h, #2       // 4 * (a0+d0)
+    sshll2          v22.4s, v20.8h, #2       // 4 * (a1+d1)
+    sub             v17.4s, v17.4s, v21.4s   // 36*(b0+c0) - 4*(a0+d0)
+    sub             v18.4s, v18.4s, v22.4s   // 36*(b1+c1) - 4*(a1+d1)
+.endm
+
+.macro qpel_start_chroma_5
+    movi            v25.16b, #28
+    movi            v26.16b, #46
+    movi            v27.16b, #6
+.endm
+
+.macro qpel_filter_chroma_5_32b
+    umull           v17.8h, v1.8b, v25.8b    // 28 * b
+    umull           v19.8h, v2.8b, v26.8b    // 46 * c
+    ushll           v21.8h, v0.8b, #2        // 4 * a
+    umull           v23.8h, v3.8b, v27.8b    // 6 * d
+    add             v17.8h, v17.8h, v19.8h   // 28*b + 46*c
+    add             v21.8h, v21.8h, v23.8h   // 4*a + 6*d
+    sub             v17.8h, v17.8h, v21.8h   // 28*b+46*c - (4*a+6*d)
+.endm
+
+.macro qpel_filter_chroma_5_64b
+    umull           v17.8h, v1.8b, v25.8b    // 28 * b
+    umull2          v18.8h, v1.16b, v25.16b  // 28 * b
+    umull           v19.8h, v2.8b, v26.8b    // 46 * c
+    umull2          v20.8h, v2.16b, v26.16b  // 46 * c
+    ushll           v21.8h, v0.8b, #2        // 4 * a
+    ushll2          v22.8h, v0.16b, #2       // 4 * a
+    umull           v23.8h, v3.8b, v27.8b    // 6 * d
+    umull2          v24.8h, v3.16b, v27.16b  // 6 * d
+    add             v17.8h, v17.8h, v19.8h   // 28*b + 46*c
+    add             v18.8h, v18.8h, v20.8h   // 28*b + 46*c
+    add             v21.8h, v21.8h, v23.8h   // 4*a + 6*d
+    add             v22.8h, v22.8h, v24.8h   // 4*a + 6*d
+    sub             v17.8h, v17.8h, v21.8h   // 28*b+46*c - (4*a+6*d)
+    sub             v18.8h, v18.8h, v22.8h   // 28*b+46*c - (4*a+6*d)
+.endm
+
+.macro qpel_start_chroma_5_1
+    movi            v25.8h, #28
+    movi            v26.8h, #46
+    movi            v27.8h, #6
+.endm
+
+.macro qpel_filter_chroma_5_32b_1
+    smull           v17.4s, v1.4h, v25.4h    // 28 * b0
+    smull2          v18.4s, v1.8h, v25.8h    // 28 * b1
+    smull           v19.4s, v2.4h, v26.4h    // 46 * c0
+    smull2          v20.4s, v2.8h, v26.8h    // 46 * c1
+    sshll           v21.4s, v0.4h, #2        // 4 * a0
+    sshll2          v22.4s, v0.8h, #2        // 4 * a1
+    smull           v23.4s, v3.4h, v27.4h    // 6 * d0
+    smull2          v24.4s, v3.8h, v27.8h    // 6 * d1
+    add             v17.4s, v17.4s, v19.4s   // 28*b0 + 46*c0
+    add             v18.4s, v18.4s, v20.4s   // 28*b1 + 46*c1
+    add             v21.4s, v21.4s, v23.4s   // 4*a0 + 6*d0
+    add             v22.4s, v22.4s, v24.4s   // 4*a1 + 6*d1
+    sub             v17.4s, v17.4s, v21.4s   // 28*b0+46*c0 - (4*a0+6*d0)
+    sub             v18.4s, v18.4s, v22.4s   // 28*b1+46*c1 - (4*a1+6*d1)
+.endm
+
+.macro qpel_start_chroma_6
+    movi            v25.16b, #54
+.endm
+
+.macro qpel_filter_chroma_6_32b
+    umull           v17.8h, v2.8b, v25.8b    // 54 * c
+    ushll           v19.8h, v0.8b, #1        // 2 * a
+    ushll           v21.8h, v1.8b, #4        // 16 * b
+    ushll           v23.8h, v3.8b, #2        // 4 * d
+    add             v17.8h, v17.8h, v21.8h   // 54*c + 16*b
+    add             v19.8h, v19.8h, v23.8h   // 2*a + 4*d
+    sub             v17.8h, v17.8h, v19.8h   // 54*c+16*b - (2*a+4*d)
+.endm
+
+.macro qpel_filter_chroma_6_64b
+    umull           v17.8h, v2.8b, v25.8b    // 54 * c
+    umull2          v18.8h, v2.16b, v25.16b  // 54 * c
+    ushll           v19.8h, v0.8b, #1        // 2 * a
+    ushll2          v20.8h, v0.16b, #1       // 2 * a
+    ushll           v21.8h, v1.8b, #4        // 16 * b
+    ushll2          v22.8h, v1.16b, #4       // 16 * b
+    ushll           v23.8h, v3.8b, #2        // 4 * d
+    ushll2          v24.8h, v3.16b, #2       // 4 * d
+    add             v17.8h, v17.8h, v21.8h   // 54*c + 16*b
+    add             v18.8h, v18.8h, v22.8h   // 54*c + 16*b
+    add             v19.8h, v19.8h, v23.8h   // 2*a + 4*d
+    add             v20.8h, v20.8h, v24.8h   // 2*a + 4*d
+    sub             v17.8h, v17.8h, v19.8h   // 54*c+16*b - (2*a+4*d)
+    sub             v18.8h, v18.8h, v20.8h   // 54*c+16*b - (2*a+4*d)
+.endm
+
+.macro qpel_start_chroma_6_1
+    movi            v25.8h, #54
+.endm
+
+.macro qpel_filter_chroma_6_32b_1
+    smull           v17.4s, v2.4h, v25.4h    // 54 * c0
+    smull2          v18.4s, v2.8h, v25.8h    // 54 * c1
+    sshll           v19.4s, v0.4h, #1        // 2 * a0
+    sshll2          v20.4s, v0.8h, #1        // 2 * a1
+    sshll           v21.4s, v1.4h, #4        // 16 * b0
+    sshll2          v22.4s, v1.8h, #4        // 16 * b1
+    sshll           v23.4s, v3.4h, #2        // 4 * d0
+    sshll2          v24.4s, v3.8h, #2        // 4 * d1
+    add             v17.4s, v17.4s, v21.4s   // 54*c0 + 16*b0
+    add             v18.4s, v18.4s, v22.4s   // 54*c1 + 16*b1
+    add             v19.4s, v19.4s, v23.4s   // 2*a0 + 4*d0
+    add             v20.4s, v20.4s, v24.4s   // 2*a1 + 4*d1
+    sub             v17.4s, v17.4s, v19.4s   // 54*c0+16*b0 - (2*a0+4*d0)
+    sub             v18.4s, v18.4s, v20.4s   // 54*c1+16*b1 - (2*a1+4*d1)
+.endm
+
+.macro qpel_start_chroma_7
+    movi            v24.16b, #58
+    movi            v25.16b, #10
+.endm
+
+.macro qpel_filter_chroma_7_32b
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
+    umull           v17.8h, v2.8b, v24.8b    // 58 * c
+    shl             v20.8h, v20.8h, #1       // 2 * (a+d)
+    umull           v19.8h, v1.8b, v25.8b    // 10 * b
+    sub             v17.8h, v17.8h, v20.8h   // 58*c - 2*(a+d)
+    add             v17.8h, v17.8h, v19.8h   // 58*c-2*(a+d) + 10*b
+.endm
+
+.macro qpel_filter_chroma_7_64b
+    uaddl           v20.8h, v0.8b, v3.8b     // a + d
+    uaddl2          v21.8h, v0.16b, v3.16b   // a + d
+    umull           v17.8h, v2.8b, v24.8b    // 58 * c
+    umull2          v18.8h, v2.16b, v24.16b  // 58 * c
+    shl             v20.8h, v20.8h, #1       // 2 * (a+d)
+    shl             v21.8h, v21.8h, #1       // 2 * (a+d)
+    umull           v22.8h, v1.8b, v25.8b    // 10 * b
+    umull2          v23.8h, v1.16b, v25.16b  // 10 * b
+    sub             v17.8h, v17.8h, v20.8h   // 58*c - 2*(a+d)
+    sub             v18.8h, v18.8h, v21.8h   // 58*c - 2*(a+d)
+    add             v17.8h, v17.8h, v22.8h   // 58*c-2*(a+d) + 10*b
+    add             v18.8h, v18.8h, v23.8h   // 58*c-2*(a+d) + 10*b
+.endm
+
+.macro qpel_start_chroma_7_1
+    movi            v24.8h, #58
+    movi            v25.8h, #10
+.endm
+
+.macro qpel_filter_chroma_7_32b_1
+    add             v20.8h, v0.8h, v3.8h     // a + d
+    smull           v17.4s, v2.4h, v24.4h    // 58 * c0
+    smull2          v18.4s, v2.8h, v24.8h    // 58 * c1
+    sshll           v21.4s, v20.4h, #1       // 2 * (a0+d0)
+    sshll2          v22.4s, v20.8h, #1       // 2 * (a1+d1)
+    smull           v19.4s, v1.4h, v25.4h    // 10 * b0
+    smull2          v20.4s, v1.8h, v25.8h    // 10 * b1
+    sub             v17.4s, v17.4s, v21.4s   // 58*c0 - 2*(a0+d0)
+    sub             v18.4s, v18.4s, v22.4s   // 58*c1 - 2*(a1+d1)
+    add             v17.4s, v17.4s, v19.4s   // 58*c0-2*(a0+d0) + 10*b0
+    add             v18.4s, v18.4s, v20.4s   // 58*c1-2*(a1+d1) + 10*b1
+.endm
+
+.macro vpp_end
+    add             v17.8h, v17.8h, v31.8h
+    sqshrun         v17.8b, v17.8h, #6
+.endm
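The vpp_* variants all finish through this macro: add a bias of 32, then an unsigned-saturating narrowing shift by 6 (sqshrun). A minimal scalar C sketch of the same step (an editorial illustration, not part of the patch):

    #include <stdint.h>

    /* Round a 16-bit filter accumulator back to an 8-bit pixel:
     * (sum + 32) >> 6, clamped to [0, 255], matching add + sqshrun #6. */
    static inline uint8_t vpp_round(int16_t sum)
    {
        int v = (sum + 32) >> 6;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }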
+
+.macro FILTER_LUMA_VPP w, h, v
+    lsl             x10, x1, #2      // x10 = 4 * x1
+    sub             x11, x10, x1     // x11 = 3 * x1
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
+    mov             x5, #\h
+    mov             w12, #32
+    dup             v31.8h, w12
+    qpel_start_\v
+.loop_luma_vpp_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_luma_vpp_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+.if \w == 8 || \w == 24
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vpp_end
+    str             d17, [x7], #8
+    add             x9, x9, #8
+.elseif \w == 12
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vpp_end
+    str             d17, [x7], #8
+    add             x6, x0, #8
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vpp_end
+    fmov            w6, s17
+    str             w6, [x7], #4
+    add             x9, x9, #12
+.else
+    qpel_load_64b \v
+    qpel_filter_\v\()_64b
+    vpp_end
+    add             v18.8h, v18.8h, v31.8h
+    sqshrun2        v17.16b, v18.8h, #6
+    str             q17, [x7], #16
+    add             x9, x9, #16
+.endif
+    cmp             x9, #\w
+    blt             .loop_luma_vpp_w8_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_luma_vpp_\v\()_\w\()x\h
+    ret
+.endm
+
+.macro vps_end
+    sub             v17.8h, v17.8h, v31.8h
+.endm
+
+.macro FILTER_VPS w, h, v
+    lsl             x3, x3, #1
+    lsl             x10, x1, #2      // x10 = 4 * x1
+    sub             x11, x10, x1     // x11 = 3 * x1
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
+    mov             x5, #\h
+    mov             w12, #8192
+    dup             v31.8h, w12
+    qpel_start_\v
+.loop_ps_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_ps_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+.if \w == 8 || \w == 24
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vps_end
+    str             q17, [x7], #16
+    add             x9, x9, #8
+.elseif \w == 12
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vps_end
+    str             q17, [x7], #16
+    add             x6, x0, #8
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vps_end
+    str             d17, [x7], #8
+    add             x9, x9, #12
+.else
+    qpel_load_64b \v
+    qpel_filter_\v\()_64b
+    vps_end
+    sub             v18.8h, v18.8h, v31.8h
+    stp             q17, q18, [x7], #32
+    add             x9, x9, #16
+.endif
+    cmp             x9, #\w
+    blt             .loop_ps_w8_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_ps_\v\()_\w\()x\h
+    ret
+.endm
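Unlike the pp path, the ps variants keep the result at 16-bit intermediate precision: no rounding shift, only x265's internal offset subtracted before the int16 store (8192 here, which matches 1 << (IF_INTERNAL_PREC - 1) with IF_INTERNAL_PREC = 14 in x265; the constant name is stated for orientation, not quoted from this patch). Scalar sketch:

    #include <stdint.h>

    /* "ps" output: raw filter sum minus the internal offset, kept at
     * 16-bit precision for a later second (vertical or horizontal) pass. */
    static inline int16_t vps_bias(int16_t sum)
    {
        return (int16_t)(sum - 8192);
    }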
+
+.macro vsp_end
+    add             v17.4s, v17.4s, v31.4s
+    add             v18.4s, v18.4s, v31.4s
+    sqshrun         v17.4h, v17.4s, #12
+    sqshrun2        v17.8h, v18.4s, #12
+    sqxtun          v17.8b, v17.8h
+.endm
+
+.macro FILTER_VSP w, h, v
+    lsl             x1, x1, #1
+    lsl             x10, x1, #2      // x10 = 4 * x1
+    sub             x11, x10, x1     // x11 = 3 * x1
+    sub             x0, x0, x11
+    mov             x5, #\h
+    mov             w12, #1
+    lsl             w12, w12, #19
+    add             w12, w12, #2048
+    dup             v31.4s, w12
+    mov             x12, #\w
+    lsl             x12, x12, #1
+    qpel_start_\v\()_1
+.loop_luma_vsp_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_luma_vsp_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+    qpel_load_64b \v
+    qpel_filter_\v\()_32b_1
+    vsp_end
+    str             d17, [x7], #8
+    add             x9, x9, #16
+.if \w == 12
+    add             x6, x0, #16
+    qpel_load_64b \v
+    qpel_filter_\v\()_32b_1
+    vsp_end
+    str             s17, [x7], #4
+    add             x9, x9, #8
+.endif
+    cmp             x9, x12
+    blt             .loop_luma_vsp_w8_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_luma_vsp_\v\()_\w\()x\h
+    ret
+.endm
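The rounding constant built in w12 is (1 << 19) + 2048: the 1 << 19 term re-centers the 8192 offset carried by the 16-bit "ps" input once it has been multiplied through the coefficient sum of 64 (8192 * 64 = 1 << 19), and 2048 is the usual rounding term for the final 12-bit shift. A scalar sketch of vsp_end under that reading:

    #include <stdint.h>

    /* "sp" rounding: pixel = clamp((sum + (1 << 19) + 2048) >> 12). */
    static inline uint8_t vsp_round(int32_t sum)
    {
        int v = (sum + (1 << 19) + 2048) >> 12;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }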
+
+.macro vss_end
+    sshr            v17.4s, v17.4s, #6
+    sshr            v18.4s, v18.4s, #6
+    uzp1            v17.8h, v17.8h, v18.8h
+.endm
+
+.macro FILTER_VSS w, h, v
+    lsl             x1, x1, #1
+    lsl             x10, x1, #2      // x10 = 4 * x1
+    sub             x11, x10, x1     // x11 = 3 * x1
+    sub             x0, x0, x11
+    lsl             x3, x3, #1
+    mov             x5, #\h
+    mov             x12, #\w
+    lsl             x12, x12, #1
+    qpel_start_\v\()_1
+.loop_luma_vss_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_luma_vss_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+    qpel_load_64b \v
+    qpel_filter_\v\()_32b_1
+    vss_end
+.if \w == 4
+    str             s17, [x7], #4
+    add             x9, x9, #4
+.else
+    str             q17, [x7], #16
+    add             x9, x9, #16
+.if \w == 12
+    add             x6, x0, x9
+    qpel_load_64b \v
+    qpel_filter_\v\()_32b_1
+    vss_end
+    str             d17, [x7], #8
+    add             x9, x9, #8
+.endif
+.endif
+    cmp             x9, x12
+    blt             .loop_luma_vss_w8_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_luma_vss_\v\()_\w\()x\h
+    ret
+.endm
+
+.macro hpp_end
+    add             v17.8h, v17.8h, v31.8h
+    sqshrun         v17.8b, v17.8h, #6
+.endm
+
+.macro FILTER_HPP w, h, v
+    mov             w6, #\h
+    sub             x3, x3, #\w
+    mov             w12, #32
+    dup             v31.8h, w12
+    qpel_start_\v
+.if \w == 4
+.rept \h
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8 \v
+    qpel_filter_\v\()_32b
+    hpp_end
+    str             s17, [x2], #4
+    add             x0, x0, x1
+    add             x2, x2, x3
+.endr
+    ret
+.else
+.loop1_hpp_\v\()_\w\()x\h:
+    mov             x7, #\w
+    mov             x11, x0
+    sub             x11, x11, #4
+.loop2_hpp_\v\()_\w\()x\h:
+    vextin8 \v
+    qpel_filter_\v\()_32b
+    hpp_end
+    str             d17, [x2], #8
+    sub             x11, x11, #8
+    sub             x7, x7, #8
+.if \w == 12
+    vextin8 \v
+    qpel_filter_\v\()_32b
+    hpp_end
+    str             s17, [x2], #4
+    sub             x7, x7, #4
+.endif
+    cbnz            x7, .loop2_hpp_\v\()_\w\()x\h
+    sub             x6, x6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    cbnz            x6, .loop1_hpp_\v\()_\w\()x\h
+    ret
+.endif
+.endm
+
+.macro hps_end
+    sub             v17.8h, v17.8h, v31.8h
+.endm
+
+.macro FILTER_HPS w, h, v
+    sub             x3, x3, #\w
+    lsl             x3, x3, #1
+    mov             w12, #8192
+    dup             v31.8h, w12
+    qpel_start_\v
+.if \w == 4
+.loop_hps_\v\()_\w\()x\h\():
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8 \v
+    qpel_filter_\v\()_32b
+    hps_end
+    str             d17, [x2], #8
+    sub             w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    cbnz            w6, .loop_hps_\v\()_\w\()x\h
+    ret
+.else
+.loop1_hps_\v\()_\w\()x\h\():
+    mov             w7, #\w
+    mov             x11, x0
+    sub             x11, x11, #4
+.loop2_hps_\v\()_\w\()x\h\():
+.if \w == 8 || \w == 12 || \w == 24
+    vextin8 \v
+    qpel_filter_\v\()_32b
+    hps_end
+    str             q17, [x2], #16
+    sub             w7, w7, #8
+    sub             x11, x11, #8
+.if \w == 12
+    vextin8 \v
+    qpel_filter_\v\()_32b
+    hps_end
+    str             d17, [x2], #8
+    sub             w7, w7, #4
+.endif
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
+    vextin8_64 \v
+    qpel_filter_\v\()_64b
+    hps_end
+    sub             v18.8h, v18.8h, v31.8h
+    stp             q17, q18, [x2], #32
+    sub             w7, w7, #16
+    sub             x11, x11, #16
+.endif
+    cbnz            w7, .loop2_hps_\v\()_\w\()x\h
+    sub             w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    cbnz            w6, .loop1_hps_\v\()_\w\()x\h
+    ret
+.endif
+.endm
+
+.macro FILTER_CHROMA_VPP w, h, v
+    qpel_start_chroma_\v
+    mov             w12, #32
+    dup             v31.8h, w12
+    sub             x0, x0, x1
+    mov             x5, #\h
+.loop_chroma_vpp_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_chroma_vpp_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+    qpel_chroma_load_32b \v
+    qpel_filter_chroma_\v\()_32b
+    vpp_end
+    add             x9, x9, #8
+.if \w == 2
+    fmov            w12, s17
+    strh            w12, [x7], #2
+.elseif \w == 4
+    str             s17, [x7], #4
+.elseif \w == 6
+    str             s17, [x7], #4
+    umov            w12, v17.h[2]
+    strh            w12, [x7], #2
+.elseif \w == 12
+    str             d17, [x7], #8
+    add             x6, x0, x9
+    qpel_chroma_load_32b \v
+    qpel_filter_chroma_\v\()_32b
+    vpp_end
+    str             s17, [x7], #4
+    add             x9, x9, #8
+.else
+    str             d17, [x7], #8
+.endif
+    cmp             x9, #\w
+    blt             .loop_chroma_vpp_w8_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_chroma_vpp_\v\()_\w\()x\h
+    ret
+.endm
+
+.macro FILTER_CHROMA_VPS w, h, v
+    qpel_start_chroma_\v
+    mov             w12, #8192
+    dup             v31.8h, w12
+    lsl             x3, x3, #1
+    sub             x0, x0, x1
+    mov             x5, #\h
+.loop_vps_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_vps_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+    qpel_chroma_load_32b \v
+    qpel_filter_chroma_\v\()_32b
+    vps_end
+    add             x9, x9, #8
+.if \w == 2
+    str             s17, [x7], #4
+.elseif \w == 4
+    str             d17, [x7], #8
+.elseif \w == 6
+    str             d17, [x7], #8
+    st1             {v17.s}[2], [x7], #4
+.elseif \w == 12
+    str             q17, [x7], #16
+    add             x6, x0, x9
+    qpel_chroma_load_32b \v
+    qpel_filter_chroma_\v\()_32b
+    vps_end
+    str             d17, [x7], #8
+    add             x9, x9, #8
+.else
+    str             q17, [x7], #16
+.endif
+    cmp             x9, #\w
+    blt             .loop_vps_w8_\v\()_\w\()x\h
+
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_vps_\v\()_\w\()x\h
+    ret
+.endm
+
+.macro FILTER_CHROMA_VSP w, h, v
+    lsl             x1, x1, #1
+    sub             x0, x0, x1
+    mov             x5, #\h
+    mov             w12, #1
+    lsl             w12, w12, #19
+    add             w12, w12, #2048
+    dup             v31.4s, w12
+    mov             x12, #\w
+    lsl             x12, x12, #1
+    qpel_start_chroma_\v\()_1
+.loop_vsp_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_vsp_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+    qpel_chroma_load_64b \v
+    qpel_filter_chroma_\v\()_32b_1
+    vsp_end
+    add             x9, x9, #16
+.if \w == 4
+    str             s17, [x7], #4
+.elseif \w == 12
+    str             d17, [x7], #8
+    add             x6, x0, x9
+    qpel_chroma_load_64b \v
+    qpel_filter_chroma_\v\()_32b_1
+    vsp_end
+    str             s17, [x7], #4
+    add             x9, x9, #8
+.else
+    str             d17, [x7], #8
+.endif
+    cmp             x9, x12
+    blt             .loop_vsp_w8_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_vsp_\v\()_\w\()x\h
+    ret
+.endm
+
+.macro FILTER_CHROMA_VSS w, h, v
+    lsl             x1, x1, #1
+    sub             x0, x0, x1
+    lsl             x3, x3, #1
+    mov             x5, #\h
+    mov             x12, #\w
+    lsl             x12, x12, #1
+    qpel_start_chroma_\v\()_1
+.loop_vss_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.if \w == 4
+.rept 2
+    add             x6, x0, x9
+    qpel_chroma_load_64b \v
+    qpel_filter_chroma_\v\()_32b_1
+    vss_end
+    str             s17, [x7], #4
+    add             x9, x9, #4
+.endr
+.else
+.loop_vss_w8_\v\()_\w\()x\h:
+    add             x6, x0, x9
+    qpel_chroma_load_64b \v
+    qpel_filter_chroma_\v\()_32b_1
+    vss_end
+    str             q17, [x7], #16
+    add             x9, x9, #16
+.if \w == 12
+    add             x6, x0, x9
+    qpel_chroma_load_64b \v
+    qpel_filter_chroma_\v\()_32b_1
+    vss_end
+    str             d17, [x7], #8
+    add             x9, x9, #8
+.endif
+    cmp             x9, x12
+    blt             .loop_vss_w8_\v\()_\w\()x\h
+.endif
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_vss_\v\()_\w\()x\h
+    ret
+.endm
+
+.macro FILTER_CHROMA_HPP w, h, v
+    qpel_start_chroma_\v
+    mov             w12, #32
+    dup             v31.8h, w12
+    mov             w6, #\h
+    sub             x3, x3, #\w
+.if \w == 2 || \w == 4 || \w == 6 || \w == 12
+.loop4_chroma_hpp_\v\()_\w\()x\h:
+    mov             x11, x0
+    sub             x11, x11, #2
+    vextin8_chroma \v
+    qpel_filter_chroma_\v\()_32b
+    hpp_end
+.if \w == 2
+    fmov            w12, s17
+    strh            w12, [x2], #2
+.elseif \w == 4
+    str             s17, [x2], #4
+.elseif \w == 6
+    str             s17, [x2], #4
+    umov            w12, v17.h[2]
+    strh            w12, [x2], #2
+.elseif \w == 12
+    str             d17, [x2], #8
+    sub             x11, x11, #8
+    vextin8_chroma \v
+    qpel_filter_chroma_\v\()_32b
+    hpp_end
+    str             s17, [x2], #4
+.endif
+    sub             w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    cbnz            w6, .loop4_chroma_hpp_\v\()_\w\()x\h
+    ret
+.else
+.loop2_chroma_hpp_\v\()_\w\()x\h:
+    mov             x7, #\w
+    lsr             x7, x7, #3
+    mov             x11, x0
+    sub             x11, x11, #2
+.loop3_chroma_hpp_\v\()_\w\()x\h:
+.if \w == 8 || \w == 24
+    vextin8_chroma \v
+    qpel_filter_chroma_\v\()_32b
+    hpp_end
+    str             d17, [x2], #8
+    sub             x7, x7, #1
+    sub             x11, x11, #8
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
+    vextin8_chroma_64 \v
+    qpel_filter_chroma_\v\()_64b
+    hpp_end
+    add             v18.8h, v18.8h, v31.8h
+    sqshrun2        v17.16b, v18.8h, #6
+    str             q17, [x2], #16
+    sub             x7, x7, #2
+    sub             x11, x11, #16
+.endif
+    cbnz            x7, .loop3_chroma_hpp_\v\()_\w\()x\h
+    sub             w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    cbnz            w6, .loop2_chroma_hpp_\v\()_\w\()x\h
+    ret
+.endif
+.endm
+
+.macro CHROMA_HPS_2_4_6_12 w, v
+    mov             x11, x0
+    sub             x11, x11, #2
+    vextin8_chroma \v
+    qpel_filter_chroma_\v\()_32b
+    hps_end
+    sub             x11, x11, #8
+.if \w == 2
+    str             s17, [x2], #4
+.elseif \w == 4
+    str             d17, [x2], #8
+.elseif \w == 6
+    str             d17, [x2], #8
+    st1             {v17.s}[2], [x2], #4
+.elseif \w == 12
+    str             q17, [x2], #16
+    vextin8_chroma \v
+    qpel_filter_chroma_\v\()_32b
+    sub             v17.8h, v17.8h, v31.8h
+    str             d17, [x2], #8
+.endif
+    add             x0, x0, x1
+    add             x2, x2, x3
+.endm
+
+.macro FILTER_CHROMA_HPS w, h, v
+    qpel_start_chroma_\v
+    mov             w12, #8192
+    dup             v31.8h, w12
+    sub             x3, x3, #\w
+    lsl             x3, x3, #1
+
+.if \w == 2 || \w == 4 || \w == 6 || \w == 12
+    cmp             x5, #0
+    beq             0f
+    sub             x0, x0, x1
+.rept 3
+    CHROMA_HPS_2_4_6_12 \w, \v
+.endr
+0:
+.rept \h
+    CHROMA_HPS_2_4_6_12 \w, \v
+.endr
+    ret
+.else
+    mov             w10, #\h
+    cmp             x5, #0
+    beq             9f
+    sub             x0, x0, x1
+    add             w10, w10, #3
+9:
+    mov             w6, w10
+.loop1_chroma_hps_\v\()_\w\()x\h\():
+    mov             x7, #\w
+    lsr             x7, x7, #3
+    mov             x11, x0
+    sub             x11, x11, #2
+.loop2_chroma_hps_\v\()_\w\()x\h\():
+.if \w == 8 || \w == 24
+    vextin8_chroma \v
+    qpel_filter_chroma_\v\()_32b
+    hps_end
+    str             q17, [x2], #16
+    sub             x7, x7, #1
+    sub             x11, x11, #8
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
+    vextin8_chroma_64 \v
+    qpel_filter_chroma_\v\()_64b
+    hps_end
+    sub             v18.8h, v18.8h, v31.8h
+    stp             q17, q18, [x2], #32
+    sub             x7, x7, #2
+    sub             x11, x11, #16
+.endif
+    cbnz            x7, .loop2_chroma_hps_\v\()_\w\()x\h\()
+    sub             w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    cbnz            w6, .loop1_chroma_hps_\v\()_\w\()x\h\()
+    ret
+.endif
+.endm
+
+const g_lumaFilter, align=8
+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0
+.word -1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0
+.word -1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1
+.word 0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
+endconst
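The table above stores each 8-tap luma coefficient twice, so the SVE code later in this request can broadcast (coeff, coeff) pairs with a single ld1rd. A minimal scalar C sketch of the filter the rows encode (names are illustrative, not from the patch):

    #include <stdint.h>

    /* One output sample of the 8-tap luma filter; c[] is one row of
     * g_lumaFilter with the duplicates dropped, e.g. the half-pel row
     * {-1, 4, -11, 40, 40, -11, 4, -1}. */
    static int32_t luma_8tap(const int16_t s[8], const int32_t c[8])
    {
        int32_t sum = 0;
        for (int i = 0; i < 8; i++)
            sum += (int32_t)s[i] * c[i];
        return sum;
    }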
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S Added

@@ -0,0 +1,1282 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// Functions in this file:
+// ***** luma_vpp *****
+// ***** luma_vps *****
+// ***** luma_vsp *****
+// ***** luma_vss *****
+// ***** luma_hpp *****
+// ***** luma_hps *****
+// ***** chroma_vpp *****
+// ***** chroma_vps *****
+// ***** chroma_vsp *****
+// ***** chroma_vss *****
+// ***** chroma_hpp *****
+// ***** chroma_hps *****
+
+#include "asm-sve.S"
+#include "ipfilter-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+.macro qpel_load_32b_sve2 v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1b            {z3.h}, p0/z, [x6]
+    add             x6, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1b            {z0.h}, p0/z, [x6]
+    add             x6, x6, x1
+.else
+    add             x6, x6, x1
+.endif
+    ld1b            {z1.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z4.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z5.h}, p0/z, [x6]
+    add             x6, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1b            {z6.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z7.h}, p0/z, [x6]
+.else
+    ld1b            {z6.h}, p0/z, [x6]
+.endif
+.endif
+.endm
+
+.macro qpel_load_64b_sve2_gt_16 v
+.if \v == 0
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
+    ld1b            {z3.h}, p2/z, [x6]
+    add             x6, x6, x1
+.elseif \v == 1 || \v == 2 || \v == 3
+.if \v != 3                           // not used in qpel_filter_3
+    ld1b            {z0.h}, p2/z, [x6]
+    add             x6, x6, x1
+.else
+    add             x6, x6, x1
+.endif
+    ld1b            {z1.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z4.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z5.h}, p2/z, [x6]
+    add             x6, x6, x1
+.if \v != 1                           // not used in qpel_filter_1
+    ld1b            {z6.h}, p2/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z7.h}, p2/z, [x6]
+.else
+    ld1b            {z6.h}, p2/z, [x6]
+.endif
+.endif
+.endm
+
+.macro qpel_chroma_load_32b_sve2 v
+.if \v == 0
+    // qpel_filter_chroma_0 only uses values in v1
+    add             x6, x6, x1
+    ld1b            {z1.h}, p0/z, [x6]
+.else
+    ld1b            {z0.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z1.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z2.h}, p0/z, [x6]
+    add             x6, x6, x1
+    ld1b            {z3.h}, p0/z, [x6]
+.endif
+.endm
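These load macros rely on SVE's widening predicated load: ld1b {z.h}, p/z, [x] reads one byte per active halfword lane and zero-extends it, zeroing inactive lanes. A per-lane C model (a hypothetical helper, for illustration only):

    #include <stdint.h>

    /* Per-lane model of "ld1b {z.h}, p/z, [x]": one byte per active
     * halfword lane, zero-extended; inactive lanes become zero. */
    static void ld1b_h(uint16_t *z, const uint8_t *src,
                       const uint8_t *pred, int lanes)
    {
        for (int i = 0; i < lanes; i++)
            z[i] = pred[i] ? (uint16_t)src[i] : 0;
    }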
+
+.macro qpel_start_sve2_0
+    mov             z24.h, #64
+.endm
+
+.macro qpel_filter_sve2_0_32b
+    mul             z17.h, z3.h, z24.h    // 64*d
+.endm
+
+.macro qpel_filter_sve2_0_64b
+    qpel_filter_sve2_0_32b
+    mul             z18.h, z11.h, z24.h
+.endm
+
+.macro qpel_start_sve2_1
+    mov             z24.h, #58
+    mov             z25.h, #10
+    mov             z26.h, #17
+    mov             z27.h, #5
+.endm
+
+.macro qpel_filter_sve2_1_32b
+    mul             z19.h, z2.h, z25.h  // c*10
+    mul             z17.h, z3.h, z24.h  // d*58
+    mul             z21.h, z4.h, z26.h  // e*17
+    mul             z23.h, z5.h, z27.h  // f*5
+    sub             z17.h, z17.h, z19.h // d*58 - c*10
+    lsl             z18.h, z1.h, #2      // b*4
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
+    sub             z21.h, z6.h, z0.h   // g - a
+    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
+    sub             z21.h, z21.h, z23.h // g - a - f*5
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
+.endm
+
+.macro qpel_filter_sve2_1_64b
+    qpel_filter_sve2_1_32b
+    mul             z20.h, z10.h, z25.h  // c*10
+    mul             z18.h, z11.h, z24.h  // d*58
+    mul             z21.h, z12.h, z26.h  // e*17
+    mul             z23.h, z13.h, z27.h  // f*5
+    sub             z18.h, z18.h, z20.h   // d*58 - c*10
+    lsl             z28.h, z30.h, #2       // b*4
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
+    sub             z21.h, z14.h, z29.h   // g - a
+    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
+    sub             z21.h, z21.h, z23.h   // g - a - f*5
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
+.endm
+
+.macro qpel_start_sve2_2
+    mov             z24.h, #11
+    mov             z25.h, #40
+.endm
+
+.macro qpel_filter_sve2_2_32b
+    add             z17.h, z3.h, z4.h     // d + e
+    add             z19.h, z2.h, z5.h     // c + f
+    add             z23.h, z1.h, z6.h     // b + g
+    add             z21.h, z0.h, z7.h     // a + h
+    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
+    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
+    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
+    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
+    sub             z17.h, z17.h, z19.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
+.endm
+
+.macro qpel_filter_sve2_2_64b
+    qpel_filter_sve2_2_32b
+    add             z27.h, z11.h, z12.h   // d + e
+    add             z16.h, z10.h, z13.h   // c + f
+    add             z23.h, z30.h, z14.h   // b + g
+    add             z21.h, z29.h, z15.h   // a + h
+    mul             z27.h, z27.h, z25.h   // 40 * (d + e)
+    mul             z16.h, z16.h, z24.h   // 11 * (c + f)
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
+    add             z16.h, z16.h, z21.h   // 11 * (c + f) + a + h
+    add             z27.h, z27.h, z23.h   // 40 * (d + e) + (b + g) * 4
+    sub             z18.h, z27.h, z16.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
+.endm
+
+.macro qpel_start_sve2_3
+    mov             z24.h, #17
+    mov             z25.h, #5
+    mov             z26.h, #58
+    mov             z27.h, #10
+.endm
+
+.macro qpel_filter_sve2_3_32b
+    mul             z19.h, z2.h, z25.h    // c * 5
+    mul             z17.h, z3.h, z24.h    // d * 17
+    mul             z21.h, z4.h, z26.h    // e * 58
+    mul             z23.h, z5.h, z27.h    // f * 10
+    sub             z17.h, z17.h, z19.h   // d * 17 - c * 5
+    lsl             z19.h, z6.h, #2        // g * 4
+    add             z17.h, z17.h, z21.h   // d * 17 - c * 5 + e * 58
+    sub             z21.h, z1.h, z7.h     // b - h
+    add             z17.h, z17.h, z19.h   // d * 17 - c * 5 + e * 58 + g * 4
+    sub             z21.h, z21.h, z23.h   // b - h - f * 10
+    add             z17.h, z17.h, z21.h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
+.endm
+
+.macro qpel_filter_sve2_3_64b
+    qpel_filter_sve2_3_32b
+    mul             z16.h, z10.h, z25.h  // c * 5
+    mul             z18.h, z11.h, z24.h  // d * 17
+    mul             z21.h, z12.h, z26.h  // e * 58
+    mul             z23.h, z13.h, z27.h  // f * 10
+    sub             z18.h, z18.h, z16.h   // d * 17 - c * 5
+    lsl             z16.h, z14.h, #2       // g * 4
+    add             z18.h, z18.h, z21.h   // d * 17 - c * 5 + e * 58
+    sub             z21.h, z30.h, z15.h   // b - h
+    add             z18.h, z18.h, z16.h   // d * 17 - c * 5 + e * 58 + g * 4
+    sub             z21.h, z21.h, z23.h   // b - h - f * 10
+    add             z18.h, z18.h, z21.h   // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
+.endm
+
+.macro qpel_start_chroma_sve2_0
+    mov             z29.h, #64
+.endm
+
+.macro qpel_filter_chroma_sve2_0_32b
+    mul             z17.h, z1.h, z29.h    // 64*b
+.endm
+
+.macro qpel_start_chroma_sve2_1
+    mov             z29.h, #58
+    mov             z30.h, #10
+.endm
+
+.macro qpel_filter_chroma_sve2_1_32b
+    mul             z17.h, z1.h, z29.h    // 58 * b
+    mul             z19.h, z2.h, z30.h    // 10 * c
+    add             z22.h, z0.h, z3.h     // a + d
+    lsl             z22.h, z22.h, #1       // 2 * (a+d)
+    sub             z17.h, z17.h, z22.h   // 58*b - 2*(a+d)
+    add             z17.h, z17.h, z19.h   // 58*b-2*(a+d) + 10*c
+.endm
+
+.macro qpel_start_chroma_sve2_2
+    mov             z30.h, #54
+.endm
+
+.macro qpel_filter_chroma_sve2_2_32b
+    mul             z17.h, z1.h, z30.h    // 54 * b
+    lsl             z19.h, z0.h, #2        // 4 * a
+    lsl             z21.h, z2.h, #4        // 16 * c
+    lsl             z23.h, z3.h, #1        // 2 * d
+    add             z17.h, z17.h, z21.h   // 54*b + 16*c
+    add             z19.h, z19.h, z23.h   // 4*a + 2*d
+    sub             z17.h, z17.h, z19.h   // 54*b+16*c - (4*a+2*d)
+.endm
+
+.macro qpel_start_chroma_sve2_3
+    mov             z28.h, #46
+    mov             z29.h, #28
+    mov             z30.h, #6
+.endm
+
+.macro qpel_filter_chroma_sve2_3_32b
+    mul             z17.h, z1.h, z28.h    // 46 * b
+    mul             z19.h, z2.h, z29.h    // 28 * c
+    lsl             z21.h, z3.h, #2        // 4 * d
+    mul             z23.h, z0.h, z30.h    // 6 * a
+    add             z17.h, z17.h, z19.h   // 46*b + 28*c
+    add             z21.h, z21.h, z23.h   // 4*d + 6*a
+    sub             z17.h, z17.h, z21.h   // 46*b+28*c - (4*d+6*a)
+.endm
+
+.macro qpel_start_chroma_sve2_4
+    mov             z29.h, #36
+.endm
+
+.macro qpel_filter_chroma_sve2_4_32b
+    add             z20.h, z0.h, z3.h     // a + d
+    add             z17.h, z1.h, z2.h     // b + c
+    lsl             z20.h, z20.h, #2       // 4 * (a+d)
+    mul             z17.h, z17.h, z29.h   // 36 * (b+c)
+    sub             z17.h, z17.h, z20.h   // 36*(b+c) - 4*(a+d)
+.endm
+
+.macro qpel_start_chroma_sve2_5
+    mov             z28.h, #28
+    mov             z29.h, #46
+    mov             z30.h, #6
+.endm
+
+.macro qpel_filter_chroma_sve2_5_32b
+    mul             z17.h, z1.h, z28.h    // 28 * b
+    mul             z19.h, z2.h, z29.h    // 46 * c
+    lsl             z21.h, z0.h, #2        // 4 * a
+    mul             z23.h, z3.h, z30.h    // 6 * d
+    add             z17.h, z17.h, z19.h   // 28*b + 46*c
+    add             z21.h, z21.h, z23.h   // 4*a + 6*d
+    sub             z17.h, z17.h, z21.h   // 28*b+46*c - (4*a+6*d)
+.endm
+
+.macro qpel_start_chroma_sve2_6
+    mov             z30.h, #54
+.endm
+
+.macro qpel_filter_chroma_sve2_6_32b
+    mul             z17.h, z2.h, z30.h    // 54 * c
+    lsl             z19.h, z0.h, #1        // 2 * a
+    lsl             z21.h, z1.h, #4        // 16 * b
+    lsl             z23.h, z3.h, #2        // 4 * d
+    add             z17.h, z17.h, z21.h   // 54*c + 16*b
+    add             z19.h, z19.h, z23.h   // 2*a + 4*d
+    sub             z17.h, z17.h, z19.h   // 54*c+16*b - (2*a+4*d)
+.endm
+
+.macro qpel_start_chroma_sve2_7
+    mov             z29.h, #58
+    mov             z30.h, #10
+.endm
+
+.macro qpel_filter_chroma_sve2_7_32b
+    add             z20.h, z0.h, z3.h     // a + d
+    mul             z17.h, z2.h, z29.h    // 58 * c
+    lsl             z20.h, z20.h, #1       // 2 * (a+d)
+    mul             z19.h, z1.h, z30.h    // 10 * b
+    sub             z17.h, z17.h, z20.h   // 58*c - 2*(a+d)
+    add             z17.h, z17.h, z19.h   // 58*c-2*(a+d) + 10*b
+.endm
+
+.macro vpp_end_sve2
+    add             z17.h, z17.h, z31.h
+    sqshrun         v17.8b, v17.8h, #6
+.endm
+
+.macro FILTER_LUMA_VPP_SVE2 w, h, v
+    lsl             x10, x1, #2      // x10 = 4 * x1
+    sub             x11, x10, x1     // x11 = 3 * x1
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
+    mov             x5, #\h
+    mov             z31.h, #32
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h
+    qpel_start_\v
+.loop_luma_vpp_sve2_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
+    add             x6, x0, x9
+.if \w == 8 || \w == 24
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vpp_end
+    str             d17, [x7], #8
+    add             x9, x9, #8
+.elseif \w == 12
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vpp_end
+    str             d17, [x7], #8
+    add             x6, x0, #8
+    qpel_load_32b \v
+    qpel_filter_\v\()_32b
+    vpp_end
+    fmov            w6, s17
+    str             w6, [x7], #4
+    add             x9, x9, #12
+.else
+    qpel_load_64b \v
+    qpel_filter_\v\()_64b
+    vpp_end
+    add             v18.8h, v18.8h, v31.8h
+    sqshrun2        v17.16b, v18.8h, #6
+    str             q17, [x7], #16
+    add             x9, x9, #16
+.endif
+    cmp             x9, #\w
+    blt             .loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_luma_vpp_sve2_\v\()_\w\()x\h
+    ret
+.vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h:
+    ptrue           p0.h, vl8
+    ptrue           p2.h, vl16
+    qpel_start_sve2_\v
+.gt_16_loop_luma_vpp_sve2_\v\()_\w\()x\h:
+    mov             x7, x2
+    mov             x9, #0
+.gt_16_loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
+    add             x6, x0, x9
+.if \w == 8 || \w == 24
+    qpel_load_32b_sve2 \v
+    qpel_filter_sve2_\v\()_32b
+    vpp_end_sve2
+    str             d17, [x7], #8
+    add             x9, x9, #8
+.elseif \w == 12
+    qpel_load_32b_sve2 \v
+    qpel_filter_sve2_\v\()_32b
+    vpp_end_sve2
+    str             d17, [x7], #8
+    add             x6, x0, #8
+    qpel_load_32b_sve2 \v
+    qpel_filter_sve2_\v\()_32b
+    vpp_end_sve2
+    fmov            w6, s17
+    str             w6, [x7], #4
+    add             x9, x9, #12
+.else
+    qpel_load_64b_sve2_gt_16 \v
+    qpel_filter_sve2_\v\()_32b
+    vpp_end_sve2
+    add             z18.h, z18.h, z31.h
+    sqshrun2        v17.16b, v18.8h, #6
+    str             q17, [x7], #16
+    add             x9, x9, #16
+.endif
+    cmp             x9, #\w
+    blt             .gt_16_loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .gt_16_loop_luma_vpp_sve2_\v\()_\w\()x\h
+    ret
+.endm
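The macro opens with a runtime dispatch: rdvl x9, #1 yields the SVE register width in bytes, and implementations with 128-bit vectors fall through to the existing NEON body, while wider ones take the predicated SVE2 path set up by ptrue vl8/vl16. Roughly, in C (function and parameter names are placeholders, not x265 symbols):

    #include <stddef.h>

    typedef void (*vpp_fn)(const unsigned char *src, ptrdiff_t srcStride,
                           unsigned char *dst, ptrdiff_t dstStride);

    /* Illustrative shape of the rdvl/cmp/bgt dispatch. */
    static void vpp_dispatch(long sve_vl_bytes, vpp_fn neon_path, vpp_fn sve2_path,
                             const unsigned char *s, ptrdiff_t ss,
                             unsigned char *d, ptrdiff_t ds)
    {
        if (sve_vl_bytes <= 16)
            neon_path(s, ss, d, ds);   /* 128-bit SVE: NEON loop is already optimal */
        else
            sve2_path(s, ss, d, ds);   /* wider vectors: predicated SVE2 loop */
    }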
457
+
458
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
459
+.macro LUMA_VPP_SVE2 w, h
460
+function x265_interp_8tap_vert_pp_\w\()x\h\()_sve2
461
+    cmp             x4, #0
462
+    b.eq            0f
463
+    cmp             x4, #1
464
+    b.eq            1f
465
+    cmp             x4, #2
466
+    b.eq            2f
467
+    cmp             x4, #3
468
+    b.eq            3f
469
+0:
470
+    FILTER_LUMA_VPP_SVE2 \w, \h, 0
471
+1:
472
+    FILTER_LUMA_VPP_SVE2 \w, \h, 1
473
+2:
474
+    FILTER_LUMA_VPP_SVE2 \w, \h, 2
475
+3:
476
+    FILTER_LUMA_VPP_SVE2 \w, \h, 3
477
+endfunc
478
+.endm
479
+
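Editor's note: each FILTER_LUMA_VPP_SVE2 expansion above computes HEVC's 8-tap vertical luma interpolation in "pp" (pixel in, pixel out) form: an 8-tap weighted column sum, rounded by 32 and shifted right by 6 (the taps sum to 64), then clipped to the pixel range; that is what the mov z31.h, #32 constant and the saturating sqshrun #6 implement. A scalar C sketch of the same computation (illustrative names, not the upstream code; the taps match the g_luma_s16 table at the end of ipfilter.S below):

    #include <stddef.h>
    #include <stdint.h>

    /* HEVC 8-tap luma filters, one row per quarter-pel phase (coeffIdx) */
    static const int16_t lumaTaps[4][8] = {
        {  0, 0,   0, 64,  0,   0, 0,  0 },
        { -1, 4, -10, 58, 17,  -5, 1,  0 },
        { -1, 4, -11, 40, 40, -11, 4, -1 },
        {  0, 1,  -5, 17, 58, -10, 4, -1 },
    };

    static void vert_pp_ref(const uint8_t *src, ptrdiff_t srcStride,
                            uint8_t *dst, ptrdiff_t dstStride,
                            int w, int h, int coeffIdx)
    {
        src -= 3 * srcStride;   /* window starts 3 rows above the output row */
        for (int y = 0; y < h; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < w; x++)
            {
                int sum = 0;
                for (int t = 0; t < 8; t++)
                    sum += lumaTaps[coeffIdx][t] * src[x + t * srcStride];
                int v = (sum + 32) >> 6;        /* taps sum to 64 */
                dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
    }

The LUMA_VPP_SVE2 wrapper above simply branches on coeffIdx (x4) to the matching specialization; each FILTER_LUMA_VPP_SVE2 expansion ends in ret, so the labeled blocks never fall through.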
480
+LUMA_VPP_SVE2 8, 4
481
+LUMA_VPP_SVE2 8, 8
482
+LUMA_VPP_SVE2 8, 16
483
+LUMA_VPP_SVE2 8, 32
484
+LUMA_VPP_SVE2 12, 16
485
+LUMA_VPP_SVE2 16, 4
486
+LUMA_VPP_SVE2 16, 8
487
+LUMA_VPP_SVE2 16, 16
488
+LUMA_VPP_SVE2 16, 32
489
+LUMA_VPP_SVE2 16, 64
490
+LUMA_VPP_SVE2 16, 12
491
+LUMA_VPP_SVE2 24, 32
492
+LUMA_VPP_SVE2 32, 8
493
+LUMA_VPP_SVE2 32, 16
494
+LUMA_VPP_SVE2 32, 32
495
+LUMA_VPP_SVE2 32, 64
496
+LUMA_VPP_SVE2 32, 24
497
+LUMA_VPP_SVE2 48, 64
498
+LUMA_VPP_SVE2 64, 16
499
+LUMA_VPP_SVE2 64, 32
500
+LUMA_VPP_SVE2 64, 64
501
+LUMA_VPP_SVE2 64, 48
502
+
503
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
504
+.macro LUMA_VPS_4xN_SVE2 h
505
+function x265_interp_8tap_vert_ps_4x\h\()_sve2
506
+    lsl             x3, x3, #1
507
+    lsl             x5, x4, #6
508
+    lsl             x4, x1, #2
509
+    sub             x4, x4, x1
510
+    sub             x0, x0, x4
511
+
512
+    mov             z28.s, #8192
513
+    mov             x4, #\h
514
+    movrel          x12, g_lumaFilter
515
+    add             x12, x12, x5
516
+    ptrue           p0.s, vl4
517
+    ld1rd           {z16.d}, p0/z, [x12]
518
+    ld1rd           {z17.d}, p0/z, [x12, #8]
519
+    ld1rd           {z18.d}, p0/z, [x12, #16]
520
+    ld1rd           {z19.d}, p0/z, [x12, #24]
521
+    ld1rd           {z20.d}, p0/z, [x12, #32]
522
+    ld1rd           {z21.d}, p0/z, [x12, #40]
523
+    ld1rd           {z22.d}, p0/z, [x12, #48]
524
+    ld1rd           {z23.d}, p0/z, [x12, #56]
525
+
526
+.loop_vps_sve2_4x\h:
527
+    mov             x6, x0
528
+
529
+    ld1b            {z0.s}, p0/z, [x6]
530
+    add             x6, x6, x1
531
+    ld1b            {z1.s}, p0/z, [x6]
532
+    add             x6, x6, x1
533
+    ld1b            {z2.s}, p0/z, [x6]
534
+    add             x6, x6, x1
535
+    ld1b            {z3.s}, p0/z, [x6]
536
+    add             x6, x6, x1
537
+    ld1b            {z4.s}, p0/z, [x6]
538
+    add             x6, x6, x1
539
+    ld1b            {z5.s}, p0/z, [x6]
540
+    add             x6, x6, x1
541
+    ld1b            {z6.s}, p0/z, [x6]
542
+    add             x6, x6, x1
543
+    ld1b            {z7.s}, p0/z, [x6]
544
+    add             x6, x6, x1
545
+
546
+    mul             z0.s, z0.s, z16.s
547
+    mla             z0.s, p0/m, z1.s, z17.s
548
+    mla             z0.s, p0/m, z2.s, z18.s
549
+    mla             z0.s, p0/m, z3.s, z19.s
550
+    mla             z0.s, p0/m, z4.s, z20.s
551
+    mla             z0.s, p0/m, z5.s, z21.s
552
+    mla             z0.s, p0/m, z6.s, z22.s
553
+    mla             z0.s, p0/m, z7.s, z23.s
554
+
555
+    sub             z0.s, z0.s, z28.s
556
+    sqxtn           v0.4h, v0.4s
557
+    st1             {v0.8b}, [x2], x3
558
+
559
+    add             x0, x0, x1
560
+    sub             x4, x4, #1
561
+    cbnz            x4, .loop_vps_sve2_4x\h
562
+    ret
563
+endfunc
564
+.endm
565
+
566
+LUMA_VPS_4xN_SVE2 4
567
+LUMA_VPS_4xN_SVE2 8
568
+LUMA_VPS_4xN_SVE2 16
569
+
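Editor's note: the "ps" (pixel in, short out) variant above keeps the intermediate at 16-bit precision instead of producing pixels. For 8-bit input there is no rounding shift, only a bias of -8192 (IF_INTERNAL_OFFS, the z28.s constant) followed by a saturating narrow (sqxtn). A scalar sketch of the normalization, reusing lumaTaps from the sketch earlier (again illustrative, not the upstream code):

    static void vert_ps_ref(const uint8_t *src, ptrdiff_t srcStride,
                            int16_t *dst, ptrdiff_t dstStride,
                            int w, int h, int coeffIdx)
    {
        src -= 3 * srcStride;
        for (int y = 0; y < h; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < w; x++)
            {
                int sum = 0;
                for (int t = 0; t < 8; t++)
                    sum += lumaTaps[coeffIdx][t] * src[x + t * srcStride];
                int v = sum - 8192;             /* IF_INTERNAL_OFFS */
                dst[x] = (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
            }
    }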
570
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
571
+.macro LUMA_VSP_4xN_SVE2 h
572
+function x265_interp_8tap_vert_sp_4x\h\()_sve2
573
+    lsl             x5, x4, #6
574
+    lsl             x1, x1, #1
575
+    lsl             x4, x1, #2
576
+    sub             x4, x4, x1
577
+    sub             x0, x0, x4
578
+
579
+    mov             w12, #1
580
+    lsl             w12, w12, #19
581
+    add             w12, w12, #2048
582
+    dup             v24.4s, w12
583
+    mov             x4, #\h
584
+    movrel          x12, g_lumaFilter
585
+    add             x12, x12, x5
586
+
587
+    ptrue           p0.s, vl4
588
+    ld1rd           {z16.d}, p0/z, [x12]
589
+    ld1rd           {z17.d}, p0/z, [x12, #8]
590
+    ld1rd           {z18.d}, p0/z, [x12, #16]
591
+    ld1rd           {z19.d}, p0/z, [x12, #24]
592
+    ld1rd           {z20.d}, p0/z, [x12, #32]
593
+    ld1rd           {z21.d}, p0/z, [x12, #40]
594
+    ld1rd           {z22.d}, p0/z, [x12, #48]
595
+    ld1rd           {z23.d}, p0/z, [x12, #56]
596
+
597
+.loop_vsp_sve2_4x\h:
598
+    mov             x6, x0
599
+
600
+    ld1             {v0.8b}, [x6], x1
601
+    ld1             {v1.8b}, [x6], x1
602
+    ld1             {v2.8b}, [x6], x1
603
+    ld1             {v3.8b}, [x6], x1
604
+    ld1             {v4.8b}, [x6], x1
605
+    ld1             {v5.8b}, [x6], x1
606
+    ld1             {v6.8b}, [x6], x1
607
+    ld1             {v7.8b}, [x6], x1
608
+
609
+    sunpklo         z0.s, z0.h
610
+    sunpklo         z1.s, z1.h
611
+    mul             z0.s, z0.s, z16.s
612
+    sunpklo         z2.s, z2.h
613
+    mla             z0.s, p0/m, z1.s, z17.s
614
+    sunpklo         z3.s, z3.h
615
+    mla             z0.s, p0/m, z2.s, z18.s
616
+    sunpklo         z4.s, z4.h
617
+    mla             z0.s, p0/m, z3.s, z19.s
618
+    sunpklo         z5.s, z5.h
619
+    mla             z0.s, p0/m, z4.s, z20.s
620
+    sunpklo         z6.s, z6.h
621
+    mla             z0.s, p0/m, z5.s, z21.s
622
+    sunpklo         z7.s, z7.h
623
+    mla             z0.s, p0/m, z6.s, z22.s
624
+
625
+    mla             z0.s, p0/m, z7.s, z23.s
626
+
627
+    add             z0.s, z0.s, z24.s
628
+    sqshrun         v0.4h, v0.4s, #12
629
+    sqxtun          v0.8b, v0.8h
630
+    st1             {v0.s}[0], [x2], x3
631
+
632
+    add             x0, x0, x1
633
+    sub             x4, x4, #1
634
+    cbnz            x4, .loop_vsp_sve2_4x\h
635
+    ret
636
+endfunc
637
+.endm
638
+
639
+LUMA_VSP_4xN_SVE2 4
640
+LUMA_VSP_4xN_SVE2 8
641
+LUMA_VSP_4xN_SVE2 16
642
+
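Editor's note: the "sp" (short in, pixel out) variant above is the inverse stage. Its rounding constant, built in w12 as (1 << 19) + 2048, equals (8192 << 6) + (1 << 11): it cancels the -8192 bias introduced by the ps stage and adds half of the final divisor before the combined >> 12 shift and unsigned clip (sqshrun #12 followed by sqxtun). In scalar form the normalization is simply (a sketch, not upstream code):

    /* sum = 8-tap weighted sum of int16 (ps-domain) samples */
    static uint8_t sp_normalize(int sum)
    {
        int v = (sum + (1 << 19) + 2048) >> 12; /* (8192 << 6) + (1 << 11) */
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }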
643
+.macro vps_end_sve2
644
+    sub             z17.h, z17.h, z31.h
645
+.endm
646
+
647
+.macro FILTER_VPS_SVE2 w, h, v
648
+    lsl             x3, x3, #1
649
+    lsl             x10, x1, #2      // x10 = 4 * x1
650
+    sub             x11, x10, x1     // x11 = 3 * x1
651
+    sub             x0, x0, x11      // src -= (8 / 2 - 1) * srcStride
652
+    mov             x5, #\h
653
+    mov             z31.h, #8192
654
+    rdvl            x14, #1
655
+    cmp             x14, #16
656
+    bgt             .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h
657
+    qpel_start_\v
658
+.loop_ps_sve2_\v\()_\w\()x\h:
659
+    mov             x7, x2
660
+    mov             x9, #0
661
+.loop_ps_w8_sve2_\v\()_\w\()x\h:
662
+    add             x6, x0, x9
663
+.if \w == 8 || \w == 24
664
+    qpel_load_32b \v
665
+    qpel_filter_\v\()_32b
666
+    vps_end
667
+    str             q17, [x7], #16
668
+    add             x9, x9, #8
669
+.elseif \w == 12
670
+    qpel_load_32b \v
671
+    qpel_filter_\v\()_32b
672
+    vps_end
673
+    str             q17, [x7], #16
674
+    add             x6, x0, #8
675
+    qpel_load_32b \v
676
+    qpel_filter_\v\()_32b
677
+    vps_end
678
+    str             d17, [x7], #8
679
+    add             x9, x9, #12
680
+.else
681
+    qpel_load_64b \v
682
+    qpel_filter_\v\()_64b
683
+    vps_end
684
+    sub             v18.8h, v18.8h, v31.8h
685
+    stp             q17, q18, [x7], #32
686
+    add             x9, x9, #16
687
+.endif
688
+    cmp             x9, #\w
689
+    blt             .loop_ps_w8_sve2_\v\()_\w\()x\h
690
+    add             x0, x0, x1
691
+    add             x2, x2, x3
692
+    sub             x5, x5, #1
693
+    cbnz            x5, .loop_ps_sve2_\v\()_\w\()x\h
694
+    ret
695
+.vl_gt_16_FILTER_VPS_\v\()_\w\()x\h:
696
+    ptrue           p0.h, vl8
697
+    ptrue           p2.h, vl16
698
+    qpel_start_sve2_\v
699
+.gt_16_loop_ps_sve2_\v\()_\w\()x\h:
700
+    mov             x7, x2
701
+    mov             x9, #0
702
+.gt_16_loop_ps_w8_sve2_\v\()_\w\()x\h:
703
+    add             x6, x0, x9
704
+.if \w == 8 || \w == 24
705
+    qpel_load_32b_sve2 \v
706
+    qpel_filter_sve2_\v\()_32b
707
+    vps_end_sve2
708
+    str             q17, [x7], #16
709
+    add             x9, x9, #8
710
+.elseif \w == 12
711
+    qpel_load_32b_sve2 \v
712
+    qpel_filter_sve2_\v\()_32b
713
+    vps_end_sve2
714
+    str             q17, [x7], #16
715
+    add             x6, x0, #8
716
+    qpel_load_32b_sve2 \v
717
+    qpel_filter_sve2_\v\()_32b
718
+    vps_end_sve2
719
+    str             d17, [x7], #8
720
+    add             x9, x9, #12
721
+.else
722
+    qpel_load_64b_sve2_gt_16 \v
723
+    qpel_filter_sve2_\v\()_32b
724
+    vps_end_sve2
725
+    sub             z18.h, z18.h, z31.h
726
+    stp             q17, q18, [x7], #32
727
+    add             x9, x9, #16
728
+.endif
729
+    cmp             x9, #\w
730
+    blt             .gt_16_loop_ps_w8_sve2_\v\()_\w\()x\h
731
+    add             x0, x0, x1
732
+    add             x2, x2, x3
733
+    sub             x5, x5, #1
734
+    cbnz            x5, .gt_16_loop_ps_sve2_\v\()_\w\()x\h
735
+    ret
736
+.endm
737
+
738
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
739
+.macro LUMA_VPS_SVE2 w, h
740
+function x265_interp_8tap_vert_ps_\w\()x\h\()_sve2
741
+    cmp             x4, #0
742
+    beq             0f
743
+    cmp             x4, #1
744
+    beq             1f
745
+    cmp             x4, #2
746
+    beq             2f
747
+    cmp             x4, #3
748
+    beq             3f
749
+0:
750
+    FILTER_VPS_SVE2 \w, \h, 0
751
+1:
752
+    FILTER_VPS_SVE2 \w, \h, 1
753
+2:
754
+    FILTER_VPS_SVE2 \w, \h, 2
755
+3:
756
+    FILTER_VPS_SVE2 \w, \h, 3
757
+endfunc
758
+.endm
759
+
760
+LUMA_VPS_SVE2 8, 4
761
+LUMA_VPS_SVE2 8, 8
762
+LUMA_VPS_SVE2 8, 16
763
+LUMA_VPS_SVE2 8, 32
764
+LUMA_VPS_SVE2 12, 16
765
+LUMA_VPS_SVE2 16, 4
766
+LUMA_VPS_SVE2 16, 8
767
+LUMA_VPS_SVE2 16, 16
768
+LUMA_VPS_SVE2 16, 32
769
+LUMA_VPS_SVE2 16, 64
770
+LUMA_VPS_SVE2 16, 12
771
+LUMA_VPS_SVE2 24, 32
772
+LUMA_VPS_SVE2 32, 8
773
+LUMA_VPS_SVE2 32, 16
774
+LUMA_VPS_SVE2 32, 32
775
+LUMA_VPS_SVE2 32, 64
776
+LUMA_VPS_SVE2 32, 24
777
+LUMA_VPS_SVE2 48, 64
778
+LUMA_VPS_SVE2 64, 16
779
+LUMA_VPS_SVE2 64, 32
780
+LUMA_VPS_SVE2 64, 64
781
+LUMA_VPS_SVE2 64, 48
782
+
783
+// ***** luma_vss *****
784
+.macro vss_end_sve2
785
+    asr             z17.s, z17.s, #6
786
+    asr             z18.s, z18.s, #6
787
+    uzp1            v17.8h, v17.8h, v18.8h
788
+.endm
789
+
790
+.macro FILTER_VSS_SVE2 w, h, v
791
+    lsl             x1, x1, #1
792
+    lsl             x10, x1, #2      // x10 = 4 * x1
793
+    sub             x11, x10, x1     // x11 = 3 * x1
794
+    sub             x0, x0, x11
795
+    lsl             x3, x3, #1
796
+    mov             x5, #\h
797
+    mov             x12, #\w
798
+    lsl             x12, x12, #1
799
+    qpel_start_\v\()_1
800
+.loop_luma_vss_sve2_\v\()_\w\()x\h:
801
+    mov             x7, x2
802
+    mov             x9, #0
803
+.loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
804
+    add             x6, x0, x9
805
+    qpel_load_64b \v
806
+    qpel_filter_\v\()_32b_1
807
+    vss_end_sve2
808
+.if \w == 4
809
+    str             s17, [x7], #4
810
+    add             x9, x9, #4
811
+.else
812
+    str             q17, [x7], #16
813
+    add             x9, x9, #16
814
+.if \w == 12
815
+    add             x6, x0, x9
816
+    qpel_load_64b \v
817
+    qpel_filter_\v\()_32b_1
818
+    vss_end_sve2
819
+    str             d17, [x7], #8
820
+    add             x9, x9, #8
821
+.endif
822
+.endif
823
+    cmp             x9, x12
824
+    blt             .loop_luma_vss_w8_sve2_\v\()_\w\()x\h
825
+    add             x0, x0, x1
826
+    add             x2, x2, x3
827
+    sub             x5, x5, #1
828
+    cbnz            x5, .loop_luma_vss_sve2_\v\()_\w\()x\h
829
+    ret
830
+.endm
831
+
832
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
833
+.macro LUMA_VSS_SVE2 w, h
834
+function x265_interp_8tap_vert_ss_\w\()x\h\()_sve2
835
+    cmp             x4, #0
836
+    beq             0f
837
+    cmp             x4, #1
838
+    beq             1f
839
+    cmp             x4, #2
840
+    beq             2f
841
+    cmp             x4, #3
842
+    beq             3f
843
+0:
844
+    FILTER_VSS_SVE2 \w, \h, 0
845
+1:
846
+    FILTER_VSS_SVE2 \w, \h, 1
847
+2:
848
+    FILTER_VSS_SVE2 \w, \h, 2
849
+3:
850
+    FILTER_VSS_SVE2 \w, \h, 3
851
+endfunc
852
+.endm
853
+
854
+LUMA_VSS_SVE2 4, 4
855
+LUMA_VSS_SVE2 4, 8
856
+LUMA_VSS_SVE2 4, 16
857
+LUMA_VSS_SVE2 8, 4
858
+LUMA_VSS_SVE2 8, 8
859
+LUMA_VSS_SVE2 8, 16
860
+LUMA_VSS_SVE2 8, 32
861
+LUMA_VSS_SVE2 12, 16
862
+LUMA_VSS_SVE2 16, 4
863
+LUMA_VSS_SVE2 16, 8
864
+LUMA_VSS_SVE2 16, 16
865
+LUMA_VSS_SVE2 16, 32
866
+LUMA_VSS_SVE2 16, 64
867
+LUMA_VSS_SVE2 16, 12
868
+LUMA_VSS_SVE2 32, 8
869
+LUMA_VSS_SVE2 32, 16
870
+LUMA_VSS_SVE2 32, 32
871
+LUMA_VSS_SVE2 32, 64
872
+LUMA_VSS_SVE2 32, 24
873
+LUMA_VSS_SVE2 64, 16
874
+LUMA_VSS_SVE2 64, 32
875
+LUMA_VSS_SVE2 64, 64
876
+LUMA_VSS_SVE2 64, 48
877
+LUMA_VSS_SVE2 24, 32
878
+LUMA_VSS_SVE2 48, 64
879
+
880
+// ***** luma_hps *****
881
+
882
+.macro FILTER_CHROMA_VPP_SVE2 w, h, v
883
+    ptrue           p0.h, vl8
884
+    qpel_start_chroma_sve2_\v
885
+    mov             z31.h, #32
886
+    sub             x0, x0, x1
887
+    mov             x5, #\h
888
+.loop_chroma_vpp_sve2_\v\()_\w\()x\h:
889
+    mov             x7, x2
890
+    mov             x9, #0
891
+.loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
892
+    add             x6, x0, x9
893
+    qpel_chroma_load_32b_sve2 \v
894
+    qpel_filter_chroma_sve2_\v\()_32b
895
+    vpp_end_sve2
896
+    add             x9, x9, #8
897
+.if \w == 2
898
+    fmov            w12, s17
899
+    strh            w12, [x7], #2
900
+.elseif \w == 4
901
+    str             s17, [x7], #4
902
+.elseif \w == 6
903
+    str             s17, [x7], #4
904
+    umov            w12, v17.h[2]
905
+    strh            w12, [x7], #2
906
+.elseif \w == 12
907
+    str             d17, [x7], #8
908
+    add             x6, x0, x9
909
+    qpel_chroma_load_32b_sve2 \v
910
+    qpel_filter_chroma_sve2_\v\()_32b
911
+    vpp_end_sve2
912
+    str             s17, [x7], #4
913
+    add             x9, x9, #8
914
+.else
915
+    str             d17, [x7], #8
916
+.endif
917
+    cmp             x9, #\w
918
+    blt             .loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
919
+    add             x0, x0, x1
920
+    add             x2, x2, x3
921
+    sub             x5, x5, #1
922
+    cbnz            x5, .loop_chroma_vpp_sve2_\v\()_\w\()x\h
923
+    ret
924
+.endm
925
+
926
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
927
+.macro CHROMA_VPP_SVE2 w, h
928
+function x265_interp_4tap_vert_pp_\w\()x\h\()_sve2
929
+    cmp             x4, #0
930
+    beq             0f
931
+    cmp             x4, #1
932
+    beq             1f
933
+    cmp             x4, #2
934
+    beq             2f
935
+    cmp             x4, #3
936
+    beq             3f
937
+    cmp             x4, #4
938
+    beq             4f
939
+    cmp             x4, #5
940
+    beq             5f
941
+    cmp             x4, #6
942
+    beq             6f
943
+    cmp             x4, #7
944
+    beq             7f
945
+0:
946
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 0
947
+1:
948
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 1
949
+2:
950
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 2
951
+3:
952
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 3
953
+4:
954
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 4
955
+5:
956
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 5
957
+6:
958
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 6
959
+7:
960
+    FILTER_CHROMA_VPP_SVE2  \w, \h, 7
961
+endfunc
962
+.endm
963
+
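Editor's note: chroma interpolation uses HEVC's 4-tap filters with eight fractional phases, which is why the chroma entry points dispatch on coeffIdx values 0 through 7 rather than 0 through 3. The qpel_start_chroma_sve2_*_1 macros further down materialize only the distinct tap magnitudes each phase needs; for reference, the full coefficient set (standard HEVC chroma taps, also summing to 64) is:

    static const int16_t chromaTaps[8][4] = {
        {  0, 64,  0,  0 }, { -2, 58, 10, -2 }, { -4, 54, 16, -2 },
        { -6, 46, 28, -4 }, { -4, 36, 36, -4 }, { -4, 28, 46, -6 },
        { -2, 16, 54, -4 }, { -2, 10, 58, -2 },
    };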
964
+CHROMA_VPP_SVE2 2, 4
965
+CHROMA_VPP_SVE2 2, 8
966
+CHROMA_VPP_SVE2 2, 16
967
+CHROMA_VPP_SVE2 4, 2
968
+CHROMA_VPP_SVE2 4, 4
969
+CHROMA_VPP_SVE2 4, 8
970
+CHROMA_VPP_SVE2 4, 16
971
+CHROMA_VPP_SVE2 4, 32
972
+CHROMA_VPP_SVE2 6, 8
973
+CHROMA_VPP_SVE2 6, 16
974
+CHROMA_VPP_SVE2 8, 2
975
+CHROMA_VPP_SVE2 8, 4
976
+CHROMA_VPP_SVE2 8, 6
977
+CHROMA_VPP_SVE2 8, 8
978
+CHROMA_VPP_SVE2 8, 16
979
+CHROMA_VPP_SVE2 8, 32
980
+CHROMA_VPP_SVE2 8, 12
981
+CHROMA_VPP_SVE2 8, 64
982
+CHROMA_VPP_SVE2 12, 16
983
+CHROMA_VPP_SVE2 12, 32
984
+CHROMA_VPP_SVE2 16, 4
985
+CHROMA_VPP_SVE2 16, 8
986
+CHROMA_VPP_SVE2 16, 12
987
+CHROMA_VPP_SVE2 16, 16
988
+CHROMA_VPP_SVE2 16, 32
989
+CHROMA_VPP_SVE2 16, 64
990
+CHROMA_VPP_SVE2 16, 24
991
+CHROMA_VPP_SVE2 32, 8
992
+CHROMA_VPP_SVE2 32, 16
993
+CHROMA_VPP_SVE2 32, 24
994
+CHROMA_VPP_SVE2 32, 32
995
+CHROMA_VPP_SVE2 32, 64
996
+CHROMA_VPP_SVE2 32, 48
997
+CHROMA_VPP_SVE2 24, 32
998
+CHROMA_VPP_SVE2 24, 64
999
+CHROMA_VPP_SVE2 64, 16
1000
+CHROMA_VPP_SVE2 64, 32
1001
+CHROMA_VPP_SVE2 64, 48
1002
+CHROMA_VPP_SVE2 64, 64
1003
+CHROMA_VPP_SVE2 48, 64
1004
+
1005
+.macro FILTER_CHROMA_VPS_SVE2 w, h, v
1006
+    ptrue           p0.h, vl8
1007
+    qpel_start_chroma_sve2_\v
1008
+    mov             z31.h, #8192
1009
+    lsl             x3, x3, #1
1010
+    sub             x0, x0, x1
1011
+    mov             x5, #\h
1012
+.loop_vps_sve2_\v\()_\w\()x\h:
1013
+    mov             x7, x2
1014
+    mov             x9, #0
1015
+.loop_vps_w8_sve2_\v\()_\w\()x\h:
1016
+    add             x6, x0, x9
1017
+    qpel_chroma_load_32b_sve2 \v
1018
+    qpel_filter_chroma_sve2_\v\()_32b
1019
+    vps_end_sve2
1020
+    add             x9, x9, #8
1021
+.if \w == 2
1022
+    str             s17, [x7], #4
1023
+.elseif \w == 4
1024
+    str             d17, [x7], #8
1025
+.elseif \w == 6
1026
+    str             d17, [x7], #8
1027
+    st1             {v17.s}[2], [x7], #4
1028
+.elseif \w == 12
1029
+    str             q17, [x7], #16
1030
+    add             x6, x0, x9
1031
+    qpel_chroma_load_32b_sve2 \v
1032
+    qpel_filter_chroma_sve2_\v\()_32b
1033
+    vps_end_sve2
1034
+    str             d17, [x7], #8
1035
+    add             x9, x9, #8
1036
+.else
1037
+    str             q17, [x7], #16
1038
+.endif
1039
+    cmp             x9, #\w
1040
+    blt             .loop_vps_w8_sve2_\v\()_\w\()x\h
1041
+
1042
+    add             x0, x0, x1
1043
+    add             x2, x2, x3
1044
+    sub             x5, x5, #1
1045
+    cbnz            x5, .loop_vps_sve2_\v\()_\w\()x\h
1046
+    ret
1047
+.endm
1048
+
1049
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1050
+.macro CHROMA_VPS_SVE2 w, h
1051
+function x265_interp_4tap_vert_ps_\w\()x\h\()_sve2
1052
+    cmp             x4, #0
1053
+    beq             0f
1054
+    cmp             x4, #1
1055
+    beq             1f
1056
+    cmp             x4, #2
1057
+    beq             2f
1058
+    cmp             x4, #3
1059
+    beq             3f
1060
+    cmp             x4, #4
1061
+    beq             4f
1062
+    cmp             x4, #5
1063
+    beq             5f
1064
+    cmp             x4, #6
1065
+    beq             6f
1066
+    cmp             x4, #7
1067
+    beq             7f
1068
+0:
1069
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 0
1070
+1:
1071
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 1
1072
+2:
1073
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 2
1074
+3:
1075
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 3
1076
+4:
1077
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 4
1078
+5:
1079
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 5
1080
+6:
1081
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 6
1082
+7:
1083
+    FILTER_CHROMA_VPS_SVE2  \w, \h, 7
1084
+endfunc
1085
+.endm
1086
+
1087
+CHROMA_VPS_SVE2 2, 4
1088
+CHROMA_VPS_SVE2 2, 8
1089
+CHROMA_VPS_SVE2 2, 16
1090
+CHROMA_VPS_SVE2 4, 2
1091
+CHROMA_VPS_SVE2 4, 4
1092
+CHROMA_VPS_SVE2 4, 8
1093
+CHROMA_VPS_SVE2 4, 16
1094
+CHROMA_VPS_SVE2 4, 32
1095
+CHROMA_VPS_SVE2 6, 8
1096
+CHROMA_VPS_SVE2 6, 16
1097
+CHROMA_VPS_SVE2 8, 2
1098
+CHROMA_VPS_SVE2 8, 4
1099
+CHROMA_VPS_SVE2 8, 6
1100
+CHROMA_VPS_SVE2 8, 8
1101
+CHROMA_VPS_SVE2 8, 16
1102
+CHROMA_VPS_SVE2 8, 32
1103
+CHROMA_VPS_SVE2 8, 12
1104
+CHROMA_VPS_SVE2 8, 64
1105
+CHROMA_VPS_SVE2 12, 16
1106
+CHROMA_VPS_SVE2 12, 32
1107
+CHROMA_VPS_SVE2 16, 4
1108
+CHROMA_VPS_SVE2 16, 8
1109
+CHROMA_VPS_SVE2 16, 12
1110
+CHROMA_VPS_SVE2 16, 16
1111
+CHROMA_VPS_SVE2 16, 32
1112
+CHROMA_VPS_SVE2 16, 64
1113
+CHROMA_VPS_SVE2 16, 24
1114
+CHROMA_VPS_SVE2 32, 8
1115
+CHROMA_VPS_SVE2 32, 16
1116
+CHROMA_VPS_SVE2 32, 24
1117
+CHROMA_VPS_SVE2 32, 32
1118
+CHROMA_VPS_SVE2 32, 64
1119
+CHROMA_VPS_SVE2 32, 48
1120
+CHROMA_VPS_SVE2 24, 32
1121
+CHROMA_VPS_SVE2 24, 64
1122
+CHROMA_VPS_SVE2 64, 16
1123
+CHROMA_VPS_SVE2 64, 32
1124
+CHROMA_VPS_SVE2 64, 48
1125
+CHROMA_VPS_SVE2 64, 64
1126
+CHROMA_VPS_SVE2 48, 64
1127
+
1128
+.macro qpel_start_chroma_sve2_0_1
1129
+    mov             z24.h, #64
1130
+.endm
1131
+
1132
+.macro qpel_start_chroma_sve2_1_1
1133
+    mov             z24.h, #58
1134
+    mov             z25.h, #10
1135
+.endm
1136
+
1137
+.macro qpel_start_chroma_sve2_2_1
1138
+    mov             z25.h, #54
1139
+.endm
1140
+
1141
+.macro qpel_start_chroma_sve2_3_1
1142
+    mov             z25.h, #46
1143
+    mov             z26.h, #28
1144
+    mov             z27.h, #6
1145
+.endm
1146
+
1147
+.macro qpel_start_chroma_sve2_4_1
1148
+    mov             z24.h, #36
1149
+.endm
1150
+
1151
+.macro qpel_start_chroma_sve2_5_1
1152
+    mov             z25.h, #28
1153
+    mov             z26.h, #46
1154
+    mov             z27.h, #6
1155
+.endm
1156
+
1157
+.macro qpel_start_chroma_sve2_6_1
1158
+    mov             z25.h, #54
1159
+.endm
1160
+
1161
+.macro qpel_start_chroma_sve2_7_1
1162
+    mov             z24.h, #58
1163
+    mov             z25.h, #10
1164
+.endm
1165
+
1166
+.macro FILTER_CHROMA_VSS_SVE2 w, h, v
1167
+    lsl             x1, x1, #1
1168
+    sub             x0, x0, x1
1169
+    lsl             x3, x3, #1
1170
+    mov             x5, #\h
1171
+    mov             x12, #\w
1172
+    lsl             x12, x12, #1
1173
+    qpel_start_chroma_sve2_\v\()_1
1174
+.loop_vss_sve2_\v\()_\w\()x\h:
1175
+    mov             x7, x2
1176
+    mov             x9, #0
1177
+.if \w == 4
1178
+.rept 2
1179
+    add             x6, x0, x9
1180
+    qpel_chroma_load_64b \v
1181
+    qpel_filter_chroma_\v\()_32b_1
1182
+    vss_end_sve2
1183
+    str             s17, [x7], #4
1184
+    add             x9, x9, #4
1185
+.endr
1186
+.else
1187
+.loop_vss_w8_sve2_\v\()_\w\()x\h:
1188
+    add             x6, x0, x9
1189
+    qpel_chroma_load_64b \v
1190
+    qpel_filter_chroma_\v\()_32b_1
1191
+    vss_end_sve2
1192
+    str             q17, [x7], #16
1193
+    add             x9, x9, #16
1194
+.if \w == 12
1195
+    add             x6, x0, x9
1196
+    qpel_chroma_load_64b \v
1197
+    qpel_filter_chroma_\v\()_32b_1
1198
+    vss_end_sve2
1199
+    str             d17, [x7], #8
1200
+    add             x9, x9, #8
1201
+.endif
1202
+    cmp             x9, x12
1203
+    blt             .loop_vss_w8_sve2_\v\()_\w\()x\h
1204
+.endif
1205
+    add             x0, x0, x1
1206
+    add             x2, x2, x3
1207
+    sub             x5, x5, #1
1208
+    cbnz            x5, .loop_vss_sve2_\v\()_\w\()x\h
1209
+    ret
1210
+.endm
1211
+
1212
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1213
+.macro CHROMA_VSS_SVE2 w, h
1214
+function x265_interp_4tap_vert_ss_\w\()x\h\()_sve2
1215
+    cmp             x4, #0
1216
+    beq             0f
1217
+    cmp             x4, #1
1218
+    beq             1f
1219
+    cmp             x4, #2
1220
+    beq             2f
1221
+    cmp             x4, #3
1222
+    beq             3f
1223
+    cmp             x4, #4
1224
+    beq             4f
1225
+    cmp             x4, #5
1226
+    beq             5f
1227
+    cmp             x4, #6
1228
+    beq             6f
1229
+    cmp             x4, #7
1230
+    beq             7f
1231
+0:
1232
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 0
1233
+1:
1234
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 1
1235
+2:
1236
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 2
1237
+3:
1238
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 3
1239
+4:
1240
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 4
1241
+5:
1242
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 5
1243
+6:
1244
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 6
1245
+7:
1246
+    FILTER_CHROMA_VSS_SVE2  \w, \h, 7
1247
+endfunc
1248
+.endm
1249
+
1250
+CHROMA_VSS_SVE2 4, 4
1251
+CHROMA_VSS_SVE2 4, 8
1252
+CHROMA_VSS_SVE2 4, 16
1253
+CHROMA_VSS_SVE2 4, 32
1254
+CHROMA_VSS_SVE2 8, 2
1255
+CHROMA_VSS_SVE2 8, 4
1256
+CHROMA_VSS_SVE2 8, 6
1257
+CHROMA_VSS_SVE2 8, 8
1258
+CHROMA_VSS_SVE2 8, 16
1259
+CHROMA_VSS_SVE2 8, 32
1260
+CHROMA_VSS_SVE2 8, 12
1261
+CHROMA_VSS_SVE2 8, 64
1262
+CHROMA_VSS_SVE2 12, 16
1263
+CHROMA_VSS_SVE2 12, 32
1264
+CHROMA_VSS_SVE2 16, 4
1265
+CHROMA_VSS_SVE2 16, 8
1266
+CHROMA_VSS_SVE2 16, 12
1267
+CHROMA_VSS_SVE2 16, 16
1268
+CHROMA_VSS_SVE2 16, 32
1269
+CHROMA_VSS_SVE2 16, 64
1270
+CHROMA_VSS_SVE2 16, 24
1271
+CHROMA_VSS_SVE2 32, 8
1272
+CHROMA_VSS_SVE2 32, 16
1273
+CHROMA_VSS_SVE2 32, 24
1274
+CHROMA_VSS_SVE2 32, 32
1275
+CHROMA_VSS_SVE2 32, 64
1276
+CHROMA_VSS_SVE2 32, 48
1277
+CHROMA_VSS_SVE2 24, 32
1278
+CHROMA_VSS_SVE2 24, 64
1279
+CHROMA_VSS_SVE2 64, 16
1280
+CHROMA_VSS_SVE2 64, 32
1281
+CHROMA_VSS_SVE2 64, 48
1282
+CHROMA_VSS_SVE2 64, 64
1283
+CHROMA_VSS_SVE2 48, 64
1284
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S Added
1056
 
1
@@ -0,0 +1,1054 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm.S"
40
+#include "ipfilter-common.S"
41
+
42
+#ifdef __APPLE__
43
+.section __RODATA,__rodata
44
+#else
45
+.section .rodata
46
+#endif
47
+
48
+.align 4
49
+
50
+.text
51
+
52
+// ***** luma_vpp *****
53
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
+.macro LUMA_VPP_4xN h
55
+function x265_interp_8tap_vert_pp_4x\h\()_neon
56
+    movrel          x10, g_luma_s16
57
+    sub             x0, x0, x1
58
+    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
59
+    lsl             x4, x4, #4
60
+    ldr             q0, [x10, x4]            // q0 = luma interpolate coeff
61
+    dup             v24.8h, v0.h[0]
62
+    dup             v25.8h, v0.h[1]
63
+    trn1            v24.2d, v24.2d, v25.2d
64
+    dup             v26.8h, v0.h[2]
65
+    dup             v27.8h, v0.h[3]
66
+    trn1            v26.2d, v26.2d, v27.2d
67
+    dup             v28.8h, v0.h[4]
68
+    dup             v29.8h, v0.h[5]
69
+    trn1            v28.2d, v28.2d, v29.2d
70
+    dup             v30.8h, v0.h[6]
71
+    dup             v31.8h, v0.h[7]
72
+    trn1            v30.2d, v30.2d, v31.2d
73
+
74
+    // prepare to load 8 lines
75
+    ld1             {v0.s}[0], [x0], x1
76
+    ld1             {v0.s}[1], [x0], x1
77
+    ushll           v0.8h, v0.8b, #0
78
+    ld1             {v1.s}[0], [x0], x1
79
+    ld1             {v1.s}[1], [x0], x1
80
+    ushll           v1.8h, v1.8b, #0
81
+    ld1             {v2.s}[0], [x0], x1
82
+    ld1             {v2.s}[1], [x0], x1
83
+    ushll           v2.8h, v2.8b, #0
84
+    ld1             {v3.s}[0], [x0], x1
85
+    ld1             {v3.s}[1], [x0], x1
86
+    ushll           v3.8h, v3.8b, #0
87
+
88
+    mov             x9, #\h
89
+.loop_4x\h:
90
+    ld1             {v4.s}[0], [x0], x1
91
+    ld1             {v4.s}[1], [x0], x1
92
+    ushll           v4.8h, v4.8b, #0
93
+
94
+    // row0-1
95
+    mul             v16.8h, v0.8h, v24.8h
96
+    ext             v21.16b, v0.16b, v1.16b, #8
97
+    mul             v17.8h, v21.8h, v24.8h
98
+    mov             v0.16b, v1.16b
99
+
100
+    // row2-3
101
+    mla             v16.8h, v1.8h, v26.8h
102
+    ext             v21.16b, v1.16b, v2.16b, #8
103
+    mla             v17.8h, v21.8h, v26.8h
104
+    mov             v1.16b, v2.16b
105
+
106
+    // row4-5
107
+    mla             v16.8h, v2.8h, v28.8h
108
+    ext             v21.16b, v2.16b, v3.16b, #8
109
+    mla             v17.8h, v21.8h, v28.8h
110
+    mov             v2.16b, v3.16b
111
+
112
+    // row6-7
113
+    mla             v16.8h, v3.8h, v30.8h
114
+    ext             v21.16b, v3.16b, v4.16b, #8
115
+    mla             v17.8h, v21.8h, v30.8h
116
+    mov             v3.16b, v4.16b
117
+
118
+    // sum row0-7
119
+    trn1            v20.2d, v16.2d, v17.2d
120
+    trn2            v21.2d, v16.2d, v17.2d
121
+    add             v16.8h, v20.8h, v21.8h
122
+
123
+    sqrshrun        v16.8b,  v16.8h,  #6
124
+    st1             {v16.s}[0], [x2], x3
125
+    st1             {v16.s}[1], [x2], x3
126
+
127
+    sub             x9, x9, #2
128
+    cbnz            x9, .loop_4x\h
129
+    ret
130
+endfunc
131
+.endm
132
+
133
+LUMA_VPP_4xN 4
134
+LUMA_VPP_4xN 8
135
+LUMA_VPP_4xN 16
136
+
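Editor's note: the 4xN kernels above process two rows per iteration. Each coefficient vector is dup'd and trn1-paired so lanes 0-3 hold tap 2k and lanes 4-7 hold tap 2k+1, while each data vector packs two consecutive 4-pixel rows. The final trn1/trn2/add recombines the halves, so every output pixel still receives the full sum c0*r0 + c1*r1 + ... + c7*r7 before the rounding right-shift by 6 performed by sqrshrun.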
137
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
+.macro LUMA_VPP w, h
139
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
+    cmp             x4, #0
141
+    b.eq            0f
142
+    cmp             x4, #1
143
+    b.eq            1f
144
+    cmp             x4, #2
145
+    b.eq            2f
146
+    cmp             x4, #3
147
+    b.eq            3f
148
+0:
149
+    FILTER_LUMA_VPP \w, \h, 0
150
+1:
151
+    FILTER_LUMA_VPP \w, \h, 1
152
+2:
153
+    FILTER_LUMA_VPP \w, \h, 2
154
+3:
155
+    FILTER_LUMA_VPP \w, \h, 3
156
+endfunc
157
+.endm
158
+
159
+LUMA_VPP 8, 4
160
+LUMA_VPP 8, 8
161
+LUMA_VPP 8, 16
162
+LUMA_VPP 8, 32
163
+LUMA_VPP 12, 16
164
+LUMA_VPP 16, 4
165
+LUMA_VPP 16, 8
166
+LUMA_VPP 16, 16
167
+LUMA_VPP 16, 32
168
+LUMA_VPP 16, 64
169
+LUMA_VPP 16, 12
170
+LUMA_VPP 24, 32
171
+LUMA_VPP 32, 8
172
+LUMA_VPP 32, 16
173
+LUMA_VPP 32, 32
174
+LUMA_VPP 32, 64
175
+LUMA_VPP 32, 24
176
+LUMA_VPP 48, 64
177
+LUMA_VPP 64, 16
178
+LUMA_VPP 64, 32
179
+LUMA_VPP 64, 64
180
+LUMA_VPP 64, 48
181
+
182
+// ***** luma_vps *****
183
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
+.macro LUMA_VPS_4xN h
185
+function x265_interp_8tap_vert_ps_4x\h\()_neon
186
+    lsl             x3, x3, #1
187
+    lsl             x5, x4, #6
188
+    lsl             x4, x1, #2
189
+    sub             x4, x4, x1
190
+    sub             x0, x0, x4
191
+
192
+    mov             w6, #8192
193
+    dup             v28.4s, w6
194
+    mov             x4, #\h
195
+    movrel          x12, g_lumaFilter
196
+    add             x12, x12, x5
197
+    ld1r            {v16.2d}, [x12], #8
198
+    ld1r            {v17.2d}, [x12], #8
199
+    ld1r            {v18.2d}, [x12], #8
200
+    ld1r            {v19.2d}, [x12], #8
201
+    ld1r            {v20.2d}, [x12], #8
202
+    ld1r            {v21.2d}, [x12], #8
203
+    ld1r            {v22.2d}, [x12], #8
204
+    ld1r            {v23.2d}, [x12], #8
205
+
206
+.loop_vps_4x\h:
207
+    mov             x6, x0
208
+
209
+    ld1             {v0.s}[0], [x6], x1
210
+    ld1             {v1.s}[0], [x6], x1
211
+    ld1             {v2.s}[0], [x6], x1
212
+    ld1             {v3.s}[0], [x6], x1
213
+    ld1             {v4.s}[0], [x6], x1
214
+    ld1             {v5.s}[0], [x6], x1
215
+    ld1             {v6.s}[0], [x6], x1
216
+    ld1             {v7.s}[0], [x6], x1
217
+    uxtl            v0.8h, v0.8b
218
+    uxtl            v0.4s, v0.4h
219
+
220
+    uxtl            v1.8h, v1.8b
221
+    uxtl            v1.4s, v1.4h
222
+    mul             v0.4s, v0.4s, v16.4s
223
+
224
+    uxtl            v2.8h, v2.8b
225
+    uxtl            v2.4s, v2.4h
226
+    mla             v0.4s, v1.4s, v17.4s
227
+
228
+    uxtl            v3.8h, v3.8b
229
+    uxtl            v3.4s, v3.4h
230
+    mla             v0.4s, v2.4s, v18.4s
231
+
232
+    uxtl            v4.8h, v4.8b
233
+    uxtl            v4.4s, v4.4h
234
+    mla             v0.4s, v3.4s, v19.4s
235
+
236
+    uxtl            v5.8h, v5.8b
237
+    uxtl            v5.4s, v5.4h
238
+    mla             v0.4s, v4.4s, v20.4s
239
+
240
+    uxtl            v6.8h, v6.8b
241
+    uxtl            v6.4s, v6.4h
242
+    mla             v0.4s, v5.4s, v21.4s
243
+
244
+    uxtl            v7.8h, v7.8b
245
+    uxtl            v7.4s, v7.4h
246
+    mla             v0.4s, v6.4s, v22.4s
247
+
248
+    mla             v0.4s, v7.4s, v23.4s
249
+
250
+    sub             v0.4s, v0.4s, v28.4s
251
+    sqxtn           v0.4h, v0.4s
252
+    st1             {v0.8b}, [x2], x3
253
+
254
+    add             x0, x0, x1
255
+    sub             x4, x4, #1
256
+    cbnz            x4, .loop_vps_4x\h
257
+    ret
258
+endfunc
259
+.endm
260
+
261
+LUMA_VPS_4xN 4
262
+LUMA_VPS_4xN 8
263
+LUMA_VPS_4xN 16
264
+
265
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
266
+.macro LUMA_VPS w, h
267
+function x265_interp_8tap_vert_ps_\w\()x\h\()_neon
268
+    cmp             x4, #0
269
+    beq             0f
270
+    cmp             x4, #1
271
+    beq             1f
272
+    cmp             x4, #2
273
+    beq             2f
274
+    cmp             x4, #3
275
+    beq             3f
276
+0:
277
+    FILTER_VPS \w, \h, 0
278
+1:
279
+    FILTER_VPS \w, \h, 1
280
+2:
281
+    FILTER_VPS \w, \h, 2
282
+3:
283
+    FILTER_VPS \w, \h, 3
284
+endfunc
285
+.endm
286
+
287
+LUMA_VPS 8, 4
288
+LUMA_VPS 8, 8
289
+LUMA_VPS 8, 16
290
+LUMA_VPS 8, 32
291
+LUMA_VPS 12, 16
292
+LUMA_VPS 16, 4
293
+LUMA_VPS 16, 8
294
+LUMA_VPS 16, 16
295
+LUMA_VPS 16, 32
296
+LUMA_VPS 16, 64
297
+LUMA_VPS 16, 12
298
+LUMA_VPS 24, 32
299
+LUMA_VPS 32, 8
300
+LUMA_VPS 32, 16
301
+LUMA_VPS 32, 32
302
+LUMA_VPS 32, 64
303
+LUMA_VPS 32, 24
304
+LUMA_VPS 48, 64
305
+LUMA_VPS 64, 16
306
+LUMA_VPS 64, 32
307
+LUMA_VPS 64, 64
308
+LUMA_VPS 64, 48
309
+
310
+// ***** luma_vsp *****
311
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
312
+.macro LUMA_VSP_4xN h
313
+function x265_interp_8tap_vert_sp_4x\h\()_neon
314
+    lsl             x5, x4, #6
315
+    lsl             x1, x1, #1
316
+    lsl             x4, x1, #2
317
+    sub             x4, x4, x1
318
+    sub             x0, x0, x4
319
+
320
+    mov             w12, #1
321
+    lsl             w12, w12, #19
322
+    add             w12, w12, #2048
323
+    dup             v24.4s, w12
324
+    mov             x4, #\h
325
+    movrel          x12, g_lumaFilter
326
+    add             x12, x12, x5
327
+    ld1r            {v16.2d}, [x12], #8
328
+    ld1r            {v17.2d}, [x12], #8
329
+    ld1r            {v18.2d}, [x12], #8
330
+    ld1r            {v19.2d}, [x12], #8
331
+    ld1r            {v20.2d}, [x12], #8
332
+    ld1r            {v21.2d}, [x12], #8
333
+    ld1r            {v22.2d}, [x12], #8
334
+    ld1r            {v23.2d}, [x12], #8
335
+.loop_vsp_4x\h:
336
+    mov             x6, x0
337
+
338
+    ld1             {v0.8b}, [x6], x1
339
+    ld1             {v1.8b}, [x6], x1
340
+    ld1             {v2.8b}, [x6], x1
341
+    ld1             {v3.8b}, [x6], x1
342
+    ld1             {v4.8b}, [x6], x1
343
+    ld1             {v5.8b}, [x6], x1
344
+    ld1             {v6.8b}, [x6], x1
345
+    ld1             {v7.8b}, [x6], x1
346
+
347
+    sshll           v0.4s, v0.4h, #0
348
+    sshll           v1.4s, v1.4h, #0
349
+    mul             v0.4s, v0.4s, v16.4s
350
+    sshll           v2.4s, v2.4h, #0
351
+    mla             v0.4s, v1.4s, v17.4s
352
+    sshll           v3.4s, v3.4h, #0
353
+    mla             v0.4s, v2.4s, v18.4s
354
+    sshll           v4.4s, v4.4h, #0
355
+    mla             v0.4s, v3.4s, v19.4s
356
+    sshll           v5.4s, v5.4h, #0
357
+    mla             v0.4s, v4.4s, v20.4s
358
+    sshll           v6.4s, v6.4h, #0
359
+    mla             v0.4s, v5.4s, v21.4s
360
+    sshll           v7.4s, v7.4h, #0
361
+    mla             v0.4s, v6.4s, v22.4s
362
+
363
+    mla             v0.4s, v7.4s, v23.4s
364
+
365
+    add             v0.4s, v0.4s, v24.4s
366
+    sqshrun         v0.4h, v0.4s, #12
367
+    sqxtun          v0.8b, v0.8h
368
+    st1             {v0.s}[0], [x2], x3
369
+
370
+    add             x0, x0, x1
371
+    sub             x4, x4, #1
372
+    cbnz            x4, .loop_vsp_4x\h
373
+    ret
374
+endfunc
375
+.endm
376
+
377
+LUMA_VSP_4xN 4
378
+LUMA_VSP_4xN 8
379
+LUMA_VSP_4xN 16
380
+
381
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
382
+.macro LUMA_VSP w, h
383
+function x265_interp_8tap_vert_sp_\w\()x\h\()_neon
384
+    cmp             x4, #0
385
+    beq             0f
386
+    cmp             x4, #1
387
+    beq             1f
388
+    cmp             x4, #2
389
+    beq             2f
390
+    cmp             x4, #3
391
+    beq             3f
392
+0:
393
+    FILTER_VSP \w, \h, 0
394
+1:
395
+    FILTER_VSP \w, \h, 1
396
+2:
397
+    FILTER_VSP \w, \h, 2
398
+3:
399
+    FILTER_VSP \w, \h, 3
400
+endfunc
401
+.endm
402
+
403
+LUMA_VSP 8, 4
404
+LUMA_VSP 8, 8
405
+LUMA_VSP 8, 16
406
+LUMA_VSP 8, 32
407
+LUMA_VSP 12, 16
408
+LUMA_VSP 16, 4
409
+LUMA_VSP 16, 8
410
+LUMA_VSP 16, 16
411
+LUMA_VSP 16, 32
412
+LUMA_VSP 16, 64
413
+LUMA_VSP 16, 12
414
+LUMA_VSP 32, 8
415
+LUMA_VSP 32, 16
416
+LUMA_VSP 32, 32
417
+LUMA_VSP 32, 64
418
+LUMA_VSP 32, 24
419
+LUMA_VSP 64, 16
420
+LUMA_VSP 64, 32
421
+LUMA_VSP 64, 64
422
+LUMA_VSP 64, 48
423
+LUMA_VSP 24, 32
424
+LUMA_VSP 48, 64
425
+
426
+// ***** luma_vss *****
427
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
428
+.macro LUMA_VSS w, h
429
+function x265_interp_8tap_vert_ss_\w\()x\h\()_neon
430
+    cmp             x4, #0
431
+    beq             0f
432
+    cmp             x4, #1
433
+    beq             1f
434
+    cmp             x4, #2
435
+    beq             2f
436
+    cmp             x4, #3
437
+    beq             3f
438
+0:
439
+    FILTER_VSS \w, \h, 0
440
+1:
441
+    FILTER_VSS \w, \h, 1
442
+2:
443
+    FILTER_VSS \w, \h, 2
444
+3:
445
+    FILTER_VSS \w, \h, 3
446
+endfunc
447
+.endm
448
+
449
+LUMA_VSS 4, 4
450
+LUMA_VSS 4, 8
451
+LUMA_VSS 4, 16
452
+LUMA_VSS 8, 4
453
+LUMA_VSS 8, 8
454
+LUMA_VSS 8, 16
455
+LUMA_VSS 8, 32
456
+LUMA_VSS 12, 16
457
+LUMA_VSS 16, 4
458
+LUMA_VSS 16, 8
459
+LUMA_VSS 16, 16
460
+LUMA_VSS 16, 32
461
+LUMA_VSS 16, 64
462
+LUMA_VSS 16, 12
463
+LUMA_VSS 32, 8
464
+LUMA_VSS 32, 16
465
+LUMA_VSS 32, 32
466
+LUMA_VSS 32, 64
467
+LUMA_VSS 32, 24
468
+LUMA_VSS 64, 16
469
+LUMA_VSS 64, 32
470
+LUMA_VSS 64, 64
471
+LUMA_VSS 64, 48
472
+LUMA_VSS 24, 32
473
+LUMA_VSS 48, 64
474
+
475
+// ***** luma_hpp *****
476
+// void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
477
+.macro LUMA_HPP w, h
478
+function x265_interp_horiz_pp_\w\()x\h\()_neon
479
+    cmp             x4, #0
480
+    beq             0f
481
+    cmp             x4, #1
482
+    beq             1f
483
+    cmp             x4, #2
484
+    beq             2f
485
+    cmp             x4, #3
486
+    beq             3f
487
+0:
488
+    FILTER_HPP \w, \h, 0
489
+1:
490
+    FILTER_HPP \w, \h, 1
491
+2:
492
+    FILTER_HPP \w, \h, 2
493
+3:
494
+    FILTER_HPP \w, \h, 3
495
+endfunc
496
+.endm
497
+
498
+LUMA_HPP 4, 4
499
+LUMA_HPP 4, 8
500
+LUMA_HPP 4, 16
501
+LUMA_HPP 8, 4
502
+LUMA_HPP 8, 8
503
+LUMA_HPP 8, 16
504
+LUMA_HPP 8, 32
505
+LUMA_HPP 12, 16
506
+LUMA_HPP 16, 4
507
+LUMA_HPP 16, 8
508
+LUMA_HPP 16, 12
509
+LUMA_HPP 16, 16
510
+LUMA_HPP 16, 32
511
+LUMA_HPP 16, 64
512
+LUMA_HPP 24, 32
513
+LUMA_HPP 32, 8
514
+LUMA_HPP 32, 16
515
+LUMA_HPP 32, 24
516
+LUMA_HPP 32, 32
517
+LUMA_HPP 32, 64
518
+LUMA_HPP 48, 64
519
+LUMA_HPP 64, 16
520
+LUMA_HPP 64, 32
521
+LUMA_HPP 64, 48
522
+LUMA_HPP 64, 64
523
+
524
+// ***** luma_hps *****
525
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
526
+.macro LUMA_HPS w, h
527
+function x265_interp_horiz_ps_\w\()x\h\()_neon
528
+    mov             w10, #\h
529
+    cmp             w5, #0
530
+    b.eq            6f
531
+    sub             x0, x0, x1, lsl #2
532
+    add             x0, x0, x1
533
+    add             w10, w10, #7
534
+6:
535
+    mov             w6, w10
536
+    cmp             w4, #0
537
+    b.eq            0f
538
+    cmp             w4, #1
539
+    b.eq            1f
540
+    cmp             w4, #2
541
+    b.eq            2f
542
+    cmp             w4, #3
543
+    b.eq            3f
544
+0:
545
+    FILTER_HPS \w, \h, 0
546
+1:
547
+    FILTER_HPS \w, \h, 1
548
+2:
549
+    FILTER_HPS \w, \h, 2
550
+3:
551
+    FILTER_HPS \w, \h, 3
552
+endfunc
553
+.endm
554
+
555
+LUMA_HPS 4, 4
556
+LUMA_HPS 4, 8
557
+LUMA_HPS 4, 16
558
+LUMA_HPS 8, 4
559
+LUMA_HPS 8, 8
560
+LUMA_HPS 8, 16
561
+LUMA_HPS 8, 32
562
+LUMA_HPS 12, 16
563
+LUMA_HPS 16, 4
564
+LUMA_HPS 16, 8
565
+LUMA_HPS 16, 12
566
+LUMA_HPS 16, 16
567
+LUMA_HPS 16, 32
568
+LUMA_HPS 16, 64
569
+LUMA_HPS 24, 32
570
+LUMA_HPS 32, 8
571
+LUMA_HPS 32, 16
572
+LUMA_HPS 32, 24
573
+LUMA_HPS 32, 32
574
+LUMA_HPS 32, 64
575
+LUMA_HPS 48, 64
576
+LUMA_HPS 64, 16
577
+LUMA_HPS 64, 32
578
+LUMA_HPS 64, 48
579
+LUMA_HPS 64, 64
580
+
581
+// ***** chroma_vpp *****
582
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
583
+.macro CHROMA_VPP w, h
584
+function x265_interp_4tap_vert_pp_\w\()x\h\()_neon
585
+    cmp             x4, #0
586
+    beq             0f
587
+    cmp             x4, #1
588
+    beq             1f
589
+    cmp             x4, #2
590
+    beq             2f
591
+    cmp             x4, #3
592
+    beq             3f
593
+    cmp             x4, #4
594
+    beq             4f
595
+    cmp             x4, #5
596
+    beq             5f
597
+    cmp             x4, #6
598
+    beq             6f
599
+    cmp             x4, #7
600
+    beq             7f
601
+0:
602
+    FILTER_CHROMA_VPP  \w, \h, 0
603
+1:
604
+    FILTER_CHROMA_VPP  \w, \h, 1
605
+2:
606
+    FILTER_CHROMA_VPP  \w, \h, 2
607
+3:
608
+    FILTER_CHROMA_VPP  \w, \h, 3
609
+4:
610
+    FILTER_CHROMA_VPP  \w, \h, 4
611
+5:
612
+    FILTER_CHROMA_VPP  \w, \h, 5
613
+6:
614
+    FILTER_CHROMA_VPP  \w, \h, 6
615
+7:
616
+    FILTER_CHROMA_VPP  \w, \h, 7
617
+endfunc
618
+.endm
619
+
620
+CHROMA_VPP 2, 4
621
+CHROMA_VPP 2, 8
622
+CHROMA_VPP 2, 16
623
+CHROMA_VPP 4, 2
624
+CHROMA_VPP 4, 4
625
+CHROMA_VPP 4, 8
626
+CHROMA_VPP 4, 16
627
+CHROMA_VPP 4, 32
628
+CHROMA_VPP 6, 8
629
+CHROMA_VPP 6, 16
630
+CHROMA_VPP 8, 2
631
+CHROMA_VPP 8, 4
632
+CHROMA_VPP 8, 6
633
+CHROMA_VPP 8, 8
634
+CHROMA_VPP 8, 16
635
+CHROMA_VPP 8, 32
636
+CHROMA_VPP 8, 12
637
+CHROMA_VPP 8, 64
638
+CHROMA_VPP 12, 16
639
+CHROMA_VPP 12, 32
640
+CHROMA_VPP 16, 4
641
+CHROMA_VPP 16, 8
642
+CHROMA_VPP 16, 12
643
+CHROMA_VPP 16, 16
644
+CHROMA_VPP 16, 32
645
+CHROMA_VPP 16, 64
646
+CHROMA_VPP 16, 24
647
+CHROMA_VPP 32, 8
648
+CHROMA_VPP 32, 16
649
+CHROMA_VPP 32, 24
650
+CHROMA_VPP 32, 32
651
+CHROMA_VPP 32, 64
652
+CHROMA_VPP 32, 48
653
+CHROMA_VPP 24, 32
654
+CHROMA_VPP 24, 64
655
+CHROMA_VPP 64, 16
656
+CHROMA_VPP 64, 32
657
+CHROMA_VPP 64, 48
658
+CHROMA_VPP 64, 64
659
+CHROMA_VPP 48, 64
660
+
661
+// ***** chroma_vps *****
662
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
663
+.macro CHROMA_VPS w, h
664
+function x265_interp_4tap_vert_ps_\w\()x\h\()_neon
665
+    cmp             x4, #0
666
+    beq             0f
667
+    cmp             x4, #1
668
+    beq             1f
669
+    cmp             x4, #2
670
+    beq             2f
671
+    cmp             x4, #3
672
+    beq             3f
673
+    cmp             x4, #4
674
+    beq             4f
675
+    cmp             x4, #5
676
+    beq             5f
677
+    cmp             x4, #6
678
+    beq             6f
679
+    cmp             x4, #7
680
+    beq             7f
681
+0:
682
+    FILTER_CHROMA_VPS  \w, \h, 0
683
+1:
684
+    FILTER_CHROMA_VPS  \w, \h, 1
685
+2:
686
+    FILTER_CHROMA_VPS  \w, \h, 2
687
+3:
688
+    FILTER_CHROMA_VPS  \w, \h, 3
689
+4:
690
+    FILTER_CHROMA_VPS  \w, \h, 4
691
+5:
692
+    FILTER_CHROMA_VPS  \w, \h, 5
693
+6:
694
+    FILTER_CHROMA_VPS  \w, \h, 6
695
+7:
696
+    FILTER_CHROMA_VPS  \w, \h, 7
697
+endfunc
698
+.endm
699
+
700
+CHROMA_VPS 2, 4
701
+CHROMA_VPS 2, 8
702
+CHROMA_VPS 2, 16
703
+CHROMA_VPS 4, 2
704
+CHROMA_VPS 4, 4
705
+CHROMA_VPS 4, 8
706
+CHROMA_VPS 4, 16
707
+CHROMA_VPS 4, 32
708
+CHROMA_VPS 6, 8
709
+CHROMA_VPS 6, 16
710
+CHROMA_VPS 8, 2
711
+CHROMA_VPS 8, 4
712
+CHROMA_VPS 8, 6
713
+CHROMA_VPS 8, 8
714
+CHROMA_VPS 8, 16
715
+CHROMA_VPS 8, 32
716
+CHROMA_VPS 8, 12
717
+CHROMA_VPS 8, 64
718
+CHROMA_VPS 12, 16
719
+CHROMA_VPS 12, 32
720
+CHROMA_VPS 16, 4
721
+CHROMA_VPS 16, 8
722
+CHROMA_VPS 16, 12
723
+CHROMA_VPS 16, 16
724
+CHROMA_VPS 16, 32
725
+CHROMA_VPS 16, 64
726
+CHROMA_VPS 16, 24
727
+CHROMA_VPS 32, 8
728
+CHROMA_VPS 32, 16
729
+CHROMA_VPS 32, 24
730
+CHROMA_VPS 32, 32
731
+CHROMA_VPS 32, 64
732
+CHROMA_VPS 32, 48
733
+CHROMA_VPS 24, 32
734
+CHROMA_VPS 24, 64
735
+CHROMA_VPS 64, 16
736
+CHROMA_VPS 64, 32
737
+CHROMA_VPS 64, 48
738
+CHROMA_VPS 64, 64
739
+CHROMA_VPS 48, 64
740
+
741
+// ***** chroma_vsp *****
742
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
743
+.macro CHROMA_VSP w, h
744
+function x265_interp_4tap_vert_sp_\w\()x\h\()_neon
745
+    cmp             x4, #0
746
+    beq             0f
747
+    cmp             x4, #1
748
+    beq             1f
749
+    cmp             x4, #2
750
+    beq             2f
751
+    cmp             x4, #3
752
+    beq             3f
753
+    cmp             x4, #4
754
+    beq             4f
755
+    cmp             x4, #5
756
+    beq             5f
757
+    cmp             x4, #6
758
+    beq             6f
759
+    cmp             x4, #7
760
+    beq             7f
761
+0:
762
+    FILTER_CHROMA_VSP  \w, \h, 0
763
+1:
764
+    FILTER_CHROMA_VSP  \w, \h, 1
765
+2:
766
+    FILTER_CHROMA_VSP  \w, \h, 2
767
+3:
768
+    FILTER_CHROMA_VSP  \w, \h, 3
769
+4:
770
+    FILTER_CHROMA_VSP  \w, \h, 4
771
+5:
772
+    FILTER_CHROMA_VSP  \w, \h, 5
773
+6:
774
+    FILTER_CHROMA_VSP  \w, \h, 6
775
+7:
776
+    FILTER_CHROMA_VSP  \w, \h, 7
777
+endfunc
778
+.endm
779
+
780
+CHROMA_VSP 4, 4
781
+CHROMA_VSP 4, 8
782
+CHROMA_VSP 4, 16
783
+CHROMA_VSP 4, 32
784
+CHROMA_VSP 8, 2
785
+CHROMA_VSP 8, 4
786
+CHROMA_VSP 8, 6
787
+CHROMA_VSP 8, 8
788
+CHROMA_VSP 8, 16
789
+CHROMA_VSP 8, 32
790
+CHROMA_VSP 8, 12
791
+CHROMA_VSP 8, 64
792
+CHROMA_VSP 12, 16
793
+CHROMA_VSP 12, 32
794
+CHROMA_VSP 16, 4
795
+CHROMA_VSP 16, 8
796
+CHROMA_VSP 16, 12
797
+CHROMA_VSP 16, 16
798
+CHROMA_VSP 16, 32
799
+CHROMA_VSP 16, 64
800
+CHROMA_VSP 16, 24
801
+CHROMA_VSP 32, 8
802
+CHROMA_VSP 32, 16
803
+CHROMA_VSP 32, 24
804
+CHROMA_VSP 32, 32
805
+CHROMA_VSP 32, 64
806
+CHROMA_VSP 32, 48
807
+CHROMA_VSP 24, 32
808
+CHROMA_VSP 24, 64
809
+CHROMA_VSP 64, 16
810
+CHROMA_VSP 64, 32
811
+CHROMA_VSP 64, 48
812
+CHROMA_VSP 64, 64
813
+CHROMA_VSP 48, 64
814
+
815
+// ***** chroma_vss *****
816
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
817
+.macro CHROMA_VSS w, h
818
+function x265_interp_4tap_vert_ss_\w\()x\h\()_neon
819
+    cmp             x4, #0
820
+    beq             0f
821
+    cmp             x4, #1
822
+    beq             1f
823
+    cmp             x4, #2
824
+    beq             2f
825
+    cmp             x4, #3
826
+    beq             3f
827
+    cmp             x4, #4
828
+    beq             4f
829
+    cmp             x4, #5
830
+    beq             5f
831
+    cmp             x4, #6
832
+    beq             6f
833
+    cmp             x4, #7
834
+    beq             7f
835
+0:
836
+    FILTER_CHROMA_VSS  \w, \h, 0
837
+1:
838
+    FILTER_CHROMA_VSS  \w, \h, 1
839
+2:
840
+    FILTER_CHROMA_VSS  \w, \h, 2
841
+3:
842
+    FILTER_CHROMA_VSS  \w, \h, 3
843
+4:
844
+    FILTER_CHROMA_VSS  \w, \h, 4
845
+5:
846
+    FILTER_CHROMA_VSS  \w, \h, 5
847
+6:
848
+    FILTER_CHROMA_VSS  \w, \h, 6
849
+7:
850
+    FILTER_CHROMA_VSS  \w, \h, 7
851
+endfunc
852
+.endm
853
+
854
+CHROMA_VSS 4, 4
855
+CHROMA_VSS 4, 8
856
+CHROMA_VSS 4, 16
857
+CHROMA_VSS 4, 32
858
+CHROMA_VSS 8, 2
859
+CHROMA_VSS 8, 4
860
+CHROMA_VSS 8, 6
861
+CHROMA_VSS 8, 8
862
+CHROMA_VSS 8, 16
863
+CHROMA_VSS 8, 32
864
+CHROMA_VSS 8, 12
865
+CHROMA_VSS 8, 64
866
+CHROMA_VSS 12, 16
867
+CHROMA_VSS 12, 32
868
+CHROMA_VSS 16, 4
869
+CHROMA_VSS 16, 8
870
+CHROMA_VSS 16, 12
871
+CHROMA_VSS 16, 16
872
+CHROMA_VSS 16, 32
873
+CHROMA_VSS 16, 64
874
+CHROMA_VSS 16, 24
875
+CHROMA_VSS 32, 8
876
+CHROMA_VSS 32, 16
877
+CHROMA_VSS 32, 24
878
+CHROMA_VSS 32, 32
879
+CHROMA_VSS 32, 64
880
+CHROMA_VSS 32, 48
881
+CHROMA_VSS 24, 32
882
+CHROMA_VSS 24, 64
883
+CHROMA_VSS 64, 16
884
+CHROMA_VSS 64, 32
885
+CHROMA_VSS 64, 48
886
+CHROMA_VSS 64, 64
887
+CHROMA_VSS 48, 64
888
+
889
+// ***** chroma_hpp *****
890
+// void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
891
+.macro CHROMA_HPP w, h
892
+function x265_interp_4tap_horiz_pp_\w\()x\h\()_neon
893
+    cmp             x4, #0
894
+    beq             0f
895
+    cmp             x4, #1
896
+    beq             1f
897
+    cmp             x4, #2
898
+    beq             2f
899
+    cmp             x4, #3
900
+    beq             3f
901
+    cmp             x4, #4
902
+    beq             4f
903
+    cmp             x4, #5
904
+    beq             5f
905
+    cmp             x4, #6
906
+    beq             6f
907
+    cmp             x4, #7
908
+    beq             7f
909
+0:
910
+    FILTER_CHROMA_HPP  \w, \h, 0
911
+1:
912
+    FILTER_CHROMA_HPP  \w, \h, 1
913
+2:
914
+    FILTER_CHROMA_HPP  \w, \h, 2
915
+3:
916
+    FILTER_CHROMA_HPP  \w, \h, 3
917
+4:
918
+    FILTER_CHROMA_HPP  \w, \h, 4
919
+5:
920
+    FILTER_CHROMA_HPP  \w, \h, 5
921
+6:
922
+    FILTER_CHROMA_HPP  \w, \h, 6
923
+7:
924
+    FILTER_CHROMA_HPP  \w, \h, 7
925
+endfunc
926
+.endm
927
+
928
+CHROMA_HPP 2, 4
929
+CHROMA_HPP 2, 8
930
+CHROMA_HPP 2, 16
931
+CHROMA_HPP 4, 2
932
+CHROMA_HPP 4, 4
933
+CHROMA_HPP 4, 8
934
+CHROMA_HPP 4, 16
935
+CHROMA_HPP 4, 32
936
+CHROMA_HPP 6, 8
937
+CHROMA_HPP 6, 16
938
+CHROMA_HPP 8, 2
939
+CHROMA_HPP 8, 4
940
+CHROMA_HPP 8, 6
941
+CHROMA_HPP 8, 8
942
+CHROMA_HPP 8, 12
943
+CHROMA_HPP 8, 16
944
+CHROMA_HPP 8, 32
945
+CHROMA_HPP 8, 64
946
+CHROMA_HPP 12, 16
947
+CHROMA_HPP 12, 32
948
+CHROMA_HPP 16, 4
949
+CHROMA_HPP 16, 8
950
+CHROMA_HPP 16, 12
951
+CHROMA_HPP 16, 16
952
+CHROMA_HPP 16, 24
953
+CHROMA_HPP 16, 32
954
+CHROMA_HPP 16, 64
955
+CHROMA_HPP 24, 32
956
+CHROMA_HPP 24, 64
957
+CHROMA_HPP 32, 8
958
+CHROMA_HPP 32, 16
959
+CHROMA_HPP 32, 24
960
+CHROMA_HPP 32, 32
961
+CHROMA_HPP 32, 48
962
+CHROMA_HPP 32, 64
963
+CHROMA_HPP 48, 64
964
+CHROMA_HPP 64, 16
965
+CHROMA_HPP 64, 32
966
+CHROMA_HPP 64, 48
967
+CHROMA_HPP 64, 64
968
+
969
+// ***** chroma_hps *****
970
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
971
+.macro CHROMA_HPS w, h
972
+function x265_interp_4tap_horiz_ps_\w\()x\h\()_neon
973
+    cmp             x4, #0
974
+    beq             0f
975
+    cmp             x4, #1
976
+    beq             1f
977
+    cmp             x4, #2
978
+    beq             2f
979
+    cmp             x4, #3
980
+    beq             3f
981
+    cmp             x4, #4
982
+    beq             4f
983
+    cmp             x4, #5
984
+    beq             5f
985
+    cmp             x4, #6
986
+    beq             6f
987
+    cmp             x4, #7
988
+    beq             7f
989
+0:
990
+    FILTER_CHROMA_HPS  \w, \h, 0
991
+1:
992
+    FILTER_CHROMA_HPS  \w, \h, 1
993
+2:
994
+    FILTER_CHROMA_HPS  \w, \h, 2
995
+3:
996
+    FILTER_CHROMA_HPS  \w, \h, 3
997
+4:
998
+    FILTER_CHROMA_HPS  \w, \h, 4
999
+5:
1000
+    FILTER_CHROMA_HPS  \w, \h, 5
1001
+6:
1002
+    FILTER_CHROMA_HPS  \w, \h, 6
1003
+7:
1004
+    FILTER_CHROMA_HPS  \w, \h, 7
1005
+endfunc
1006
+.endm
1007
+
1008
+CHROMA_HPS 2, 4
1009
+CHROMA_HPS 2, 8
1010
+CHROMA_HPS 2, 16
1011
+CHROMA_HPS 4, 2
1012
+CHROMA_HPS 4, 4
1013
+CHROMA_HPS 4, 8
1014
+CHROMA_HPS 4, 16
1015
+CHROMA_HPS 4, 32
1016
+CHROMA_HPS 6, 8
1017
+CHROMA_HPS 6, 16
1018
+CHROMA_HPS 8, 2
1019
+CHROMA_HPS 8, 4
1020
+CHROMA_HPS 8, 6
1021
+CHROMA_HPS 8, 8
1022
+CHROMA_HPS 8, 12
1023
+CHROMA_HPS 8, 16
1024
+CHROMA_HPS 8, 32
1025
+CHROMA_HPS 8, 64
1026
+CHROMA_HPS 12, 16
1027
+CHROMA_HPS 12, 32
1028
+CHROMA_HPS 16, 4
1029
+CHROMA_HPS 16, 8
1030
+CHROMA_HPS 16, 12
1031
+CHROMA_HPS 16, 16
1032
+CHROMA_HPS 16, 24
1033
+CHROMA_HPS 16, 32
1034
+CHROMA_HPS 16, 64
1035
+CHROMA_HPS 24, 32
1036
+CHROMA_HPS 24, 64
1037
+CHROMA_HPS 32, 8
1038
+CHROMA_HPS 32, 16
1039
+CHROMA_HPS 32, 24
1040
+CHROMA_HPS 32, 32
1041
+CHROMA_HPS 32, 48
1042
+CHROMA_HPS 32, 64
1043
+CHROMA_HPS 48, 64
1044
+CHROMA_HPS 64, 16
1045
+CHROMA_HPS 64, 32
1046
+CHROMA_HPS 64, 48
1047
+CHROMA_HPS 64, 64
1048
+
1049
+const g_luma_s16, align=8
1050
+//       a, b,   c,  d,  e,   f, g,  h
1051
+.hword   0, 0,   0, 64,  0,   0, 0,  0
1052
+.hword  -1, 4, -10, 58, 17,  -5, 1,  0
1053
+.hword  -1, 4, -11, 40, 40, -11, 4, -1
1054
+.hword   0, 1,  -5, 17, 58, -10, 4, -1
1055
+endconst
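Editor's note: g_luma_s16 holds the same four quarter-pel filters shown in the vert_pp_ref sketch earlier, packed as int16. Each row is 8 halfwords, i.e. 16 bytes, so the lsl x4, x4, #4 at the top of LUMA_VPP_4xN converts coeffIdx into the byte offset of its row (coeffIdx * 8 taps * 2 bytes = coeffIdx << 4).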
1056
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp Added
293
 
1
@@ -0,0 +1,291 @@
2
+#include "loopfilter-prim.h"
3
+
4
+#define PIXEL_MIN 0
5
+
6
+
7
+
8
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
9
+#include<arm_neon.h>
10
+
11
+namespace
12
+{
13
+
14
+
15
+/* get the sign of input variable (TODO: this is a dup, make common) */
16
+static inline int8_t signOf(int x)
17
+{
18
+    return (x >> 31) | ((int)(((uint32_t)-x) >> 31));
19
+}
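Editor's note, worked case by case: for x > 0, x >> 31 is 0 while (uint32_t)-x >> 31 is 1, giving +1; for x < 0, x >> 31 is -1 (all bits set), and OR with anything stays -1; for x == 0 both terms are 0. signOf therefore collapses any int to {-1, 0, +1} without branches.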
20
+
21
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
22
+{
23
+    int16x8_t in = vsubl_u8(in0, in1);
24
+    return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
25
+}
26
+
27
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
28
+{
29
+    int x = 0;
30
+    for (; (x + 8) <= endX; x += 8)
31
+    {
32
+        *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
33
+    }
34
+
35
+    for (; x < endX; x++)
36
+    {
37
+        dst[x] = signOf(src1[x] - src2[x]);
38
+    }
39
+}
40
+
41
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
42
+{
43
+
44
+
45
+    int y;
46
+    int8_t signRight, signLeft0;
47
+    int8_t edgeType;
48
+
49
+    for (y = 0; y < 2; y++)
50
+    {
51
+        signLeft0 = signLeft[y];
52
+        int x = 0;
53
+
54
+        if (width >= 8)
55
+        {
56
+            int8x8_t vsignRight;
57
+            int8x8x2_t shifter;
58
+            shifter.val[1][0] = signLeft0;
59
+            static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
60
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
61
+            for (; (x + 8) <= width; x += 8)
62
+            {
63
+                uint8x8_t in = *(uint8x8_t *)&rec[x];
64
+                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
65
+                shifter.val[0] = vneg_s8(vsignRight);
66
+                int8x8_t tmp = shifter.val[0];
67
+                int8x8_t edge = vtbl2_s8(shifter, index);
68
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
69
+                shifter.val[1][0] = tmp[7];
70
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
71
+                t1 = vaddw_u8(t1, in);
72
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
73
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
74
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
75
+            }
76
+            signLeft0 = shifter.val[1][0];
77
+        }
78
+        for (; x < width; x++)
79
+        {
80
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
81
+            edgeType = signRight + signLeft0 + 2;
82
+            signLeft0 = -signRight;
83
+            recx = x265_clip(recx + offsetEoedgeType);
84
+        }
85
+        rec += stride;
86
+    }
87
+}
88
+
89
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
90
+{
91
+    int x = 0;
92
+    int8_t signDown;
93
+    int edgeType;
94
+
95
+    if (width >= 8)
96
+    {
97
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
98
+        for (; (x + 8) <= width; x += 8)
99
+        {
100
+            uint8x8_t in0 = *(uint8x8_t *)&recx;
101
+            uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
102
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
103
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
104
+            *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown);
105
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
106
+            t1 = vaddw_u8(t1, in0);
107
+            *(uint8x8_t *)&recx = vqmovun_s16(t1);
108
+        }
109
+    }
110
+    for (; x < width; x++)
111
+    {
112
+        signDown = signOf(recx - recx + stride);
113
+        edgeType = signDown + upBuff1x + 2;
114
+        upBuff1x = -signDown;
115
+        recx = x265_clip(recx + offsetEoedgeType);
116
+    }
117
+}
118
+
119
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
120
+{
121
+    int y;
122
+    int8_t signDown;
123
+    int edgeType;
124
+
125
+    for (y = 0; y < 2; y++)
126
+    {
127
+        int x = 0;
128
+        if (width >= 8)
129
+        {
130
+            int8x8_t tbl = *(int8x8_t *)offsetEo;
131
+            for (; (x + 8) <= width; x += 8)
132
+            {
133
+                uint8x8_t in0 = *(uint8x8_t *)&recx;
134
+                uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
135
+                int8x8_t vsignDown = sign_diff_neon(in0, in1);
136
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
137
+                *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown);
138
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
139
+                t1 = vaddw_u8(t1, in0);
140
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
141
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
142
+                *(uint8x8_t *)&recx = vmovn_u16(t1);
143
+
144
+            }
145
+        }
146
+        for (; x < width; x++)
147
+        {
148
+            signDown = signOf(recx - recx + stride);
149
+            edgeType = signDown + upBuff1x + 2;
150
+            upBuff1x = -signDown;
151
+            recx = x265_clip(recx + offsetEoedgeType);
152
+        }
153
+        rec += stride;
154
+    }
155
+}
156
+
157
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
158
+{
159
+    int x;
160
+
161
+    if (abs(buff1 - bufft) < 16)
162
+    {
163
+        for (x = 0; x < width; x++)
164
+        {
165
+            int8_t signDown = signOf(recx - recx + stride + 1);
166
+            int edgeType = signDown + buff1x + 2;
167
+            bufftx + 1 = -signDown;
168
+            recx = x265_clip(recx + offsetEoedgeType);;
169
+        }
170
+    }
171
+    else
172
+    {
173
+        int8x8_t tbl = *(int8x8_t *)offsetEo;
174
+        x = 0;
175
+        for (; (x + 8) <= width; x += 8)
176
+        {
177
+            uint8x8_t in0 = *(uint8x8_t *)&recx;
178
+            uint8x8_t in1 = *(uint8x8_t *)&recx + stride + 1;
179
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
180
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1x), vdup_n_s8(2));
181
+            *(int8x8_t *)&bufftx + 1 = vneg_s8(vsignDown);
182
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
183
+            t1 = vaddw_u8(t1, in0);
184
+            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
185
+            t1 = vminq_s16(t1, vdupq_n_s16(255));
186
+            *(uint8x8_t *)&recx = vmovn_u16(t1);
187
+        }
188
+        for (; x < width; x++)
189
+        {
190
+            int8_t signDown = signOf(recx - recx + stride + 1);
191
+            int edgeType = signDown + buff1x + 2;
192
+            bufftx + 1 = -signDown;
193
+            recx = x265_clip(recx + offsetEoedgeType);;
194
+        }
195
+
196
+    }
197
+}
198
+
199
+
200
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
201
+{
202
+    int8_t signDown;
203
+    int8_t edgeType;
204
+    int8x8_t tbl = *(int8x8_t *)offsetEo;
205
+
206
+    int x = startX + 1;
207
+    for (; (x + 8) <= endX; x += 8)
208
+    {
209
+        uint8x8_t in0 = *(uint8x8_t *)&recx;
210
+        uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
211
+        int8x8_t vsignDown = sign_diff_neon(in0, in1);
212
+        int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
213
+        *(int8x8_t *)&upBuff1x - 1 = vneg_s8(vsignDown);
214
+        int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
215
+        t1 = vaddw_u8(t1, in0);
216
+        t1 = vmaxq_s16(t1, vdupq_n_s16(0));
217
+        t1 = vminq_s16(t1, vdupq_n_s16(255));
218
+        *(uint8x8_t *)&recx = vmovn_u16(t1);
219
+
220
+    }
221
+    for (; x < endX; x++)
222
+    {
223
+        signDown = signOf(recx - recx + stride);
224
+        edgeType = signDown + upBuff1x + 2;
225
+        upBuff1x - 1 = -signDown;
226
+        recx = x265_clip(recx + offsetEoedgeType);
227
+    }
228
+}
229
+
230
+static void processSaoCUB0_neon(pixel *rec, const int8_t *offset, int ctuWidth, int ctuHeight, intptr_t stride)
231
+{
232
+#define SAO_BO_BITS 5
233
+    const int boShift = X265_DEPTH - SAO_BO_BITS;
234
+    int x, y;
235
+    int8x8x4_t table;
236
+    table = *(int8x8x4_t *)offset;
237
+
238
+    for (y = 0; y < ctuHeight; y++)
239
+    {
240
+
241
+        for (x = 0; (x + 8) <= ctuWidth; x += 8)
242
+        {
243
+            int8x8_t in = *(int8x8_t *)&recx;
244
+            int8x8_t offsets = vtbl4_s8(table, vshr_n_u8(in, boShift));
245
+            int16x8_t tmp = vmovl_s8(offsets);
246
+            tmp = vaddw_u8(tmp, in);
247
+            tmp = vmaxq_s16(tmp, vdupq_n_s16(0));
248
+            tmp = vminq_s16(tmp, vdupq_n_s16(255));
249
+            *(uint8x8_t *)&recx = vmovn_u16(tmp);
250
+        }
251
+        for (; x < ctuWidth; x++)
252
+        {
253
+            recx = x265_clip(recx + offsetrecx >> boShift);
254
+        }
255
+        rec += stride;
256
+    }
257
+}
258
+
259
+}
260
+
261
+
262
+
263
+namespace X265_NS
264
+{
265
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
266
+{
267
+    p.saoCuOrgE0 = processSaoCUE0_neon;
268
+    p.saoCuOrgE1 = processSaoCUE1_neon;
269
+    p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows_neon;
270
+    p.saoCuOrgE20 = processSaoCUE2_neon;
271
+    p.saoCuOrgE21 = processSaoCUE2_neon;
272
+    p.saoCuOrgE30 = processSaoCUE3_neon;
273
+    p.saoCuOrgE31 = processSaoCUE3_neon;
274
+    p.saoCuOrgB0 = processSaoCUB0_neon;
275
+    p.sign = calSign_neon;
276
+
277
+}
278
+
279
+
280
+#else //HIGH_BIT_DEPTH
281
+
282
+
283
+namespace X265_NS
284
+{
285
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &)
286
+{
287
+}
288
+
289
+#endif
290
+
291
+
292
+}
293
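Reviewer note: every SAO kernel in loopfilter-prim.cpp above computes the same per-pixel classifier as its scalar tail loop: the signs of the differences against two opposite neighbours select one of five edge classes, and the carried sign buffers (signLeft, upBuff1, bufft) merely cache one of those signs across rows or columns. A standalone sketch of that classification (sign3 and the 8-bit clamp are illustrative stand-ins for x265's signOf and x265_clip):

    #include <stdint.h>

    static inline int sign3(int v) { return (v > 0) - (v < 0); }   // -1, 0, +1

    // One pixel of SAO edge offset: n0/n1 are the offsets of the two
    // neighbours along the chosen direction (e.g. -1/+1 for E0,
    // -stride/+stride for E1).
    static inline uint8_t saoEdgePixel(const uint8_t *rec, intptr_t n0,
                                       intptr_t n1, const int8_t offsetEo[5])
    {
        int edgeType = sign3(rec[0] - rec[n0]) + sign3(rec[0] - rec[n1]) + 2;
        int v = rec[0] + offsetEo[edgeType];
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }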
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.h Added
 
@@ -0,0 +1,16 @@
+#ifndef _LOOPFILTER_NEON_H__
+#define _LOOPFILTER_NEON_H__
+
+#include "common.h"
+#include "primitives.h"
+
+#define PIXEL_MIN 0
+
+namespace X265_NS
+{
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
+
+};
+
+
+#endif
x265_3.6.tar.gz/source/common/aarch64/mc-a-common.S Added
 
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// This file contains the macros written using NEON instruction set
+// that are also used by the SVE2 functions
+
+.arch           armv8-a
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.macro addAvg_start
+    lsl             x3, x3, #1
+    lsl             x4, x4, #1
+    mov             w11, #0x40
+    dup             v30.16b, w11
+.endm
+
+.macro addavg_1 v0, v1
+    add             \v0\().8h, \v0\().8h, \v1\().8h
+    saddl           v16.4s, \v0\().4h, v30.4h
+    saddl2          v17.4s, \v0\().8h, v30.8h
+    shrn            \v0\().4h, v16.4s, #7
+    shrn2           \v0\().8h, v17.4s, #7
.endm
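Reviewer note: addAvg_start builds its per-lane constant by duplicating the byte 0x40, which read as a halfword is 0x4040 = 2 * 8192 + (1 << 6) -- the combined bias for x265's 14-bit interpolation intermediates (IF_INTERNAL_OFFS = 8192, assuming the usual x265 constants) plus the rounding term for the final >> 7. A scalar model of what addavg_1 plus the callers' sqxtun narrowing computes (the helper is illustrative; the SVE2 bodies below reach the same result via sqrshrnb #7 followed by add #0x80):

    #include <stdint.h>

    // dst = clip8((src0 + src1 + 2*IF_INTERNAL_OFFS + 64) >> 7), where the
    // inputs are biased 14-bit interpolation intermediates.
    static inline uint8_t addAvgScalar(int16_t a, int16_t b)
    {
        int v = (a + b + 0x4040) >> 7;                       // bias + round
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // sqxtun clamp
    }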
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S Added
 
@@ -0,0 +1,924 @@
+/*****************************************************************************
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm-sve.S"
+#include "mc-a-common.S"
+
+.arch armv8-a+sve2
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+function PFX(pixel_avg_pp_12x16_sve2)
+    sub             x1, x1, #4
+    sub             x3, x3, #4
+    sub             x5, x5, #4
+    ptrue           p0.s, vl1
+    ptrue           p1.b, vl8
+    mov             x11, #4
+.rept 16
+    ld1w            {z0.s}, p0/z, [x2]
+    ld1b            {z1.b}, p1/z, [x2, x11]
+    ld1w            {z2.s}, p0/z, [x4]
+    ld1b            {z3.b}, p1/z, [x4, x11]
+    add             x2, x2, #4
+    add             x2, x2, x3
+    add             x4, x4, #4
+    add             x4, x4, x5
+    urhadd          z0.b, p1/m, z0.b, z2.b
+    urhadd          z1.b, p1/m, z1.b, z3.b
+    st1b            {z0.b}, p1, [x0]
+    st1b            {z1.b}, p1, [x0, x11]
+    add             x0, x0, #4
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_avg_pp_24x32_sve2)
+    mov             w12, #4
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_24x32
+    sub             x1, x1, #16
+    sub             x3, x3, #16
+    sub             x5, x5, #16
+.lpavg_24x32_sve2:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, [x2], #16
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.16b}, [x4], #16
+    ld1             {v3.8b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.8b, v1.8b, v3.8b
+    st1             {v0.16b}, [x0], #16
+    st1             {v1.8b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_24x32_sve2
+    ret
+.vl_gt_16_pixel_avg_pp_24x32:
+    mov             x10, #24
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.vl_gt_16_loop_pixel_avg_pp_24x32:
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_gt_16_loop_pixel_avg_pp_24x32
+    ret
+endfunc
+
+.macro pixel_avg_pp_32xN_sve2 h
+function PFX(pixel_avg_pp_32x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_32_\h
+.rept \h
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    ret
+.vl_gt_16_pixel_avg_pp_32_\h:
+    ptrue           p0.b, vl32
+.rept \h
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN_sve2 8
+pixel_avg_pp_32xN_sve2 16
+pixel_avg_pp_32xN_sve2 24
+
+.macro pixel_avg_pp_32xN1_sve2 h
+function PFX(pixel_avg_pp_32x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_32xN1_\h
+    mov             w12, #\h / 8
+.lpavg_sve2_32x\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_sve2_32x\h
+    ret
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
+    ptrue           p0.b, vl32
+    mov             w12, #\h / 8
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN1_sve2 32
+pixel_avg_pp_32xN1_sve2 64
+
+function PFX(pixel_avg_pp_48x64_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_48x64
+    mov             w12, #8
+.lpavg_48x64_sve2:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v2.16b}, [x2], x3
+    ld1             {v3.16b-v5.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v3.16b
+    urhadd          v1.16b, v1.16b, v4.16b
+    urhadd          v2.16b, v2.16b, v5.16b
+    st1             {v0.16b-v2.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_48x64_sve2
+    ret
+.vl_gt_16_pixel_avg_pp_48x64:
+    cmp             x9, #32
+    bgt             .vl_gt_32_pixel_avg_pp_48x64
+    ptrue           p0.b, vl32
+    ptrue           p1.b, vl16
+    mov             w12, #8
+.vl_eq_32_pixel_avg_pp_48x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z1.b}, p1/z, [x2, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x4]
+    ld1b            {z3.b}, p1/z, [x4, #1, mul vl]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    urhadd          z1.b, p1/m, z1.b, z3.b
+    st1b            {z0.b}, p0, [x0]
+    st1b            {z1.b}, p1, [x0, #1, mul vl]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_eq_32_pixel_avg_pp_48x64
+    ret
+.vl_gt_32_pixel_avg_pp_48x64:
+    mov             x10, #48
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+    mov             w12, #8
+.loop_gt_32_pixel_avg_pp_48x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .loop_gt_32_pixel_avg_pp_48x64
+    ret
+endfunc
+
+.macro pixel_avg_pp_64xN_sve2 h
+function PFX(pixel_avg_pp_64x\h\()_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_pixel_avg_pp_64x\h
+    mov             w12, #\h / 4
+.lpavg_sve2_64x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v3.16b}, [x2], x3
+    ld1             {v4.16b-v7.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v4.16b
+    urhadd          v1.16b, v1.16b, v5.16b
+    urhadd          v2.16b, v2.16b, v6.16b
+    urhadd          v3.16b, v3.16b, v7.16b
+    st1             {v0.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_sve2_64x\h
+    ret
+.vl_gt_16_pixel_avg_pp_64x\h\():
+    cmp             x9, #48
+    bgt             .vl_gt_48_pixel_avg_pp_64x\h
+    ptrue           p0.b, vl32
+    mov             w12, #\h / 4
+.vl_eq_32_pixel_avg_pp_64x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z1.b}, p0/z, [x2, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x4]
+    ld1b            {z3.b}, p0/z, [x4, #1, mul vl]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    urhadd          z1.b, p0/m, z1.b, z3.b
+    st1b            {z0.b}, p0, [x0]
+    st1b            {z1.b}, p0, [x0, #1, mul vl]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_eq_32_pixel_avg_pp_64x\h
+    ret
+.vl_gt_48_pixel_avg_pp_64x\h\():
+    ptrue           p0.b, vl64
+    mov             w12, #\h / 4
+.vl_eq_64_pixel_avg_pp_64x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1b            {z0.b}, p0/z, [x2]
+    ld1b            {z2.b}, p0/z, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
+    urhadd          z0.b, p0/m, z0.b, z2.b
+    st1b            {z0.b}, p0, [x0]
+    add             x0, x0, x1
+.endr
+    cbnz            w12, .vl_eq_64_pixel_avg_pp_64x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_64xN_sve2 16
+pixel_avg_pp_64xN_sve2 32
+pixel_avg_pp_64xN_sve2 48
+pixel_avg_pp_64xN_sve2 64
+
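Reviewer note: every pixel_avg_pp variant above, NEON and SVE2 alike, is one urhadd (unsigned rounding halving add) per byte; the variants only differ in how many bytes per row they move and in the vector-length dispatch. As a scalar model (illustrative only):

    #include <stdint.h>

    // urhadd semantics: average of two bytes, rounding up on ties.
    static inline uint8_t pixelAvgScalar(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }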
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+
+.macro addAvg_2xN_sve2 h
+function PFX(addAvg_2x\h\()_sve2)
+    ptrue           p0.s, vl2
+    ptrue           p1.h, vl4
+    ptrue           p2.h, vl2
+.rept \h / 2
+    ld1rw           {z0.s}, p0/z, [x0]
+    ld1rw           {z1.s}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    ld1rw           {z2.s}, p0/z, [x0]
+    ld1rw           {z3.s}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p1/m, z0.h, z1.h
+    add             z2.h, p1/m, z2.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p2, [x2]
+    add             x2, x2, x5
+    st1b            {z2.h}, p2, [x2]
+    add             x2, x2, x5
+.endr
+    ret
+endfunc
+.endm
+
+addAvg_2xN_sve2 4
+addAvg_2xN_sve2 8
+addAvg_2xN_sve2 16
+
+.macro addAvg_6xN_sve2 h
+function PFX(addAvg_6x\h\()_sve2)
+    mov             w12, #\h / 2
+    ptrue           p0.b, vl16
+    ptrue           p2.h, vl6
+.loop_sve2_addavg_6x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    ld1b            {z2.b}, p0/z, [x0]
+    ld1b            {z3.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    add             z2.h, p0/m, z2.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    sqrshrnb        z2.b, z2.h, #7
+    add             z0.b, z0.b, #0x80
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p2, [x2]
+    add             x2, x2, x5
+    st1b            {z2.h}, p2, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_sve2_addavg_6x\h
+    ret
+endfunc
+.endm
+
+addAvg_6xN_sve2 8
+addAvg_6xN_sve2 16
+
+.macro addAvg_8xN_sve2 h
+function PFX(addAvg_8x\h\()_sve2)
+    ptrue           p0.b, vl16
+.rept \h / 2
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    ld1b            {z2.b}, p0/z, [x0]
+    ld1b            {z3.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    add             z2.h, p0/m, z2.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    st1b            {z2.h}, p0, [x2]
+    add             x2, x2, x5
+.endr
+    ret
+endfunc
+.endm
+
+.macro addAvg_8xN1_sve2 h
+function PFX(addAvg_8x\h\()_sve2)
+    mov             w12, #\h / 2
+    ptrue           p0.b, vl16
+.loop_sve2_addavg_8x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    ld1b            {z2.b}, p0/z, [x0]
+    ld1b            {z3.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    add             z2.h, p0/m, z2.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    st1b            {z2.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_sve2_addavg_8x\h
+    ret
+endfunc
+.endm
+
+addAvg_8xN_sve2 2
+addAvg_8xN_sve2 4
+addAvg_8xN_sve2 6
+addAvg_8xN_sve2 8
+addAvg_8xN_sve2 12
+addAvg_8xN_sve2 16
+addAvg_8xN1_sve2 32
+addAvg_8xN1_sve2 64
+
+.macro addAvg_12xN_sve2 h
+function PFX(addAvg_12x\h\()_sve2)
+    mov             w12, #\h
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_addAvg_12x\h
+    ptrue           p0.b, vl16
+    ptrue           p1.b, vl8
+.loop_sve2_addavg_12x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    ld1b            {z2.b}, p1/z, [x0, #1, mul vl]
+    ld1b            {z3.b}, p1/z, [x1, #1, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    add             z2.h, p1/m, z2.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z2.h}, p1, [x2, #1, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_sve2_addavg_12x\h
+    ret
+.vl_gt_16_addAvg_12x\h\():
+    mov             x10, #24
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.loop_sve2_gt_16_addavg_12x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_sve2_gt_16_addavg_12x\h
+    ret
+endfunc
+.endm
+
+addAvg_12xN_sve2 16
+addAvg_12xN_sve2 32
+
+.macro addAvg_16xN_sve2 h
+function PFX(addAvg_16x\h\()_sve2)
+    mov             w12, #\h
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_addAvg_16x\h
+    ptrue           p0.b, vl16
+.loop_eq_16_sve2_addavg_16x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    ld1b            {z2.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z3.b}, p0/z, [x1, #1, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    add             z2.h, p0/m, z2.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z2.h}, p0, [x2, #1, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_eq_16_sve2_addavg_16x\h
+    ret
+.vl_gt_16_addAvg_16x\h\():
+    cmp             x9, #32
+    bgt             .vl_gt_32_addAvg_16x\h
+    ptrue           p0.b, vl32
+.loop_gt_16_sve2_addavg_16x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p1, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_16_sve2_addavg_16x\h
+    ret
+.vl_gt_32_addAvg_16x\h\():
+    mov             x10, #48
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.loop_gt_32_sve2_addavg_16x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_32_sve2_addavg_16x\h
+    ret
+endfunc
+.endm
+
+addAvg_16xN_sve2 4
+addAvg_16xN_sve2 8
+addAvg_16xN_sve2 12
+addAvg_16xN_sve2 16
+addAvg_16xN_sve2 24
+addAvg_16xN_sve2 32
+addAvg_16xN_sve2 64
+
+.macro addAvg_24xN_sve2 h
+function PFX(addAvg_24x\h\()_sve2)
+    mov             w12, #\h
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_addAvg_24x\h
+    addAvg_start
+.loop_eq_16_sve2_addavg_24x\h\():
+    sub             w12, w12, #1
+    ld1             {v0.16b-v2.16b}, [x0], x3
+    ld1             {v3.16b-v5.16b}, [x1], x4
+    addavg_1        v0, v3
+    addavg_1        v1, v4
+    addavg_1        v2, v5
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    sqxtun          v2.8b, v2.8h
+    st1             {v0.8b-v2.8b}, [x2], x5
+    cbnz            w12, .loop_eq_16_sve2_addavg_24x\h
+    ret
+.vl_gt_16_addAvg_24x\h\():
+    cmp             x9, #48
+    bgt             .vl_gt_48_addAvg_24x\h
+    ptrue           p0.b, vl32
+    ptrue           p1.b, vl16
+.loop_gt_16_sve2_addavg_24x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x1]
+    ld1b            {z3.b}, p1/z, [x1, #1, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z2.h
+    add             z1.h, p1/m, z1.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p1, [x2, #1, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_16_sve2_addavg_24x\h
+    ret
+.vl_gt_48_addAvg_24x\h\():
+    mov             x10, #48
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.loop_gt_48_sve2_addavg_24x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z2.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z2.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_48_sve2_addavg_24x\h
+    ret
+endfunc
+.endm
+
+addAvg_24xN_sve2 32
+addAvg_24xN_sve2 64
+
+.macro addAvg_32xN_sve2 h
+function PFX(addAvg_32x\h\()_sve2)
+    mov             w12, #\h
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_addAvg_32x\h
+    ptrue           p0.b, vl16
+.loop_eq_16_sve2_addavg_32x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x0, #2, mul vl]
+    ld1b            {z3.b}, p0/z, [x0, #3, mul vl]
+    ld1b            {z4.b}, p0/z, [x1]
+    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
+    ld1b            {z6.b}, p0/z, [x1, #2, mul vl]
+    ld1b            {z7.b}, p0/z, [x1, #3, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    add             z1.h, p0/m, z1.h, z5.h
+    add             z2.h, p0/m, z2.h, z6.h
+    add             z3.h, p0/m, z3.h, z7.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    sqrshrnb        z3.b, z3.h, #7
+    add             z3.b, z3.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p0, [x2, #1, mul vl]
+    st1b            {z2.h}, p0, [x2, #2, mul vl]
+    st1b            {z3.h}, p0, [x2, #3, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_eq_16_sve2_addavg_32x\h
+    ret
+.vl_gt_16_addAvg_32x\h\():
+    cmp             x9, #48
+    bgt             .vl_gt_48_addAvg_32x\h
+    ptrue           p0.b, vl32
+.loop_gt_eq_32_sve2_addavg_32x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x1]
+    ld1b            {z3.b}, p0/z, [x1, #1, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z2.h
+    add             z1.h, p0/m, z1.h, z3.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z1.b, z1.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p0, [x2, #1, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_eq_32_sve2_addavg_32x\h
+    ret
+.vl_gt_48_addAvg_32x\h\():
+    ptrue           p0.b, vl64
+.loop_eq_64_sve2_addavg_32x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z1.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_eq_64_sve2_addavg_32x\h
+    ret
+endfunc
+.endm
+
+addAvg_32xN_sve2 8
+addAvg_32xN_sve2 16
+addAvg_32xN_sve2 24
+addAvg_32xN_sve2 32
+addAvg_32xN_sve2 48
+addAvg_32xN_sve2 64
+
+function PFX(addAvg_48x64_sve2)
+    mov             w12, #64
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_addAvg_48x64
+    addAvg_start
+    sub             x3, x3, #64
+    sub             x4, x4, #64
+.loop_eq_16_sve2_addavg_48x64:
+    sub             w12, w12, #1
+    ld1             {v0.8h-v3.8h}, [x0], #64
+    ld1             {v4.8h-v7.8h}, [x1], #64
+    ld1             {v20.8h-v21.8h}, [x0], x3
+    ld1             {v22.8h-v23.8h}, [x1], x4
+    addavg_1        v0, v4
+    addavg_1        v1, v5
+    addavg_1        v2, v6
+    addavg_1        v3, v7
+    addavg_1        v20, v22
+    addavg_1        v21, v23
+    sqxtun          v0.8b, v0.8h
+    sqxtun2         v0.16b, v1.8h
+    sqxtun          v1.8b, v2.8h
+    sqxtun2         v1.16b, v3.8h
+    sqxtun          v2.8b, v20.8h
+    sqxtun2         v2.16b, v21.8h
+    st1             {v0.16b-v2.16b}, [x2], x5
+    cbnz            w12, .loop_eq_16_sve2_addavg_48x64
+    ret
+.vl_gt_16_addAvg_48x64:
+    cmp             x9, #48
+    bgt             .vl_gt_48_addAvg_48x64
+    ptrue           p0.b, vl32
+.loop_gt_eq_32_sve2_addavg_48x64:
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x0, #2, mul vl]
+    ld1b            {z4.b}, p0/z, [x1]
+    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
+    ld1b            {z6.b}, p0/z, [x1, #2, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    add             z1.h, p0/m, z1.h, z5.h
+    add             z2.h, p0/m, z2.h, z6.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p0, [x2, #1, mul vl]
+    st1b            {z2.h}, p0, [x2, #2, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_eq_32_sve2_addavg_48x64
+    ret
+.vl_gt_48_addAvg_48x64:
+    cmp             x9, #112
+    bgt             .vl_gt_112_addAvg_48x64
+    ptrue           p0.b, vl64
+    ptrue           p1.b, vl32
+.loop_gt_48_sve2_addavg_48x64:
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
+    ld1b            {z4.b}, p0/z, [x1]
+    ld1b            {z5.b}, p1/z, [x1, #1, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    add             z1.h, p1/m, z1.h, z5.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p1, [x2, #1, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_48_sve2_addavg_48x64
+    ret
+.vl_gt_112_addAvg_48x64:
+    mov             x10, #96
+    mov             x11, #0
+    whilelt         p0.b, x11, x10
+.loop_gt_112_sve2_addavg_48x64:
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_112_sve2_addavg_48x64
+    ret
+endfunc
+
+.macro addAvg_64xN_sve2 h
+function PFX(addAvg_64x\h\()_sve2)
+    mov             w12, #\h
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_addAvg_64x\h
+    addAvg_start
+    sub             x3, x3, #64
+    sub             x4, x4, #64
+.loop_eq_16_sve2_addavg_64x\h\():
+    sub             w12, w12, #1
+    ld1             {v0.8h-v3.8h}, [x0], #64
+    ld1             {v4.8h-v7.8h}, [x1], #64
+    ld1             {v20.8h-v23.8h}, [x0], x3
+    ld1             {v24.8h-v27.8h}, [x1], x4
+    addavg_1        v0, v4
+    addavg_1        v1, v5
+    addavg_1        v2, v6
+    addavg_1        v3, v7
+    addavg_1        v20, v24
+    addavg_1        v21, v25
+    addavg_1        v22, v26
+    addavg_1        v23, v27
+    sqxtun          v0.8b, v0.8h
+    sqxtun2         v0.16b, v1.8h
+    sqxtun          v1.8b, v2.8h
+    sqxtun2         v1.16b, v3.8h
+    sqxtun          v2.8b, v20.8h
+    sqxtun2         v2.16b, v21.8h
+    sqxtun          v3.8b, v22.8h
+    sqxtun2         v3.16b, v23.8h
+    st1             {v0.16b-v3.16b}, [x2], x5
+    cbnz            w12, .loop_eq_16_sve2_addavg_64x\h
+    ret
+.vl_gt_16_addAvg_64x\h\():
+    cmp             x9, #48
+    bgt             .vl_gt_48_addAvg_64x\h
+    ptrue           p0.b, vl32
+.loop_gt_eq_32_sve2_addavg_64x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z2.b}, p0/z, [x0, #2, mul vl]
+    ld1b            {z3.b}, p0/z, [x0, #3, mul vl]
+    ld1b            {z4.b}, p0/z, [x1]
+    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
+    ld1b            {z6.b}, p0/z, [x1, #2, mul vl]
+    ld1b            {z7.b}, p0/z, [x1, #3, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    add             z1.h, p0/m, z1.h, z5.h
+    add             z2.h, p0/m, z2.h, z6.h
+    add             z3.h, p0/m, z3.h, z7.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
+    sqrshrnb        z2.b, z2.h, #7
+    add             z2.b, z2.b, #0x80
+    sqrshrnb        z3.b, z3.h, #7
+    add             z3.b, z3.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p0, [x2, #1, mul vl]
+    st1b            {z2.h}, p0, [x2, #2, mul vl]
+    st1b            {z3.h}, p0, [x2, #3, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_eq_32_sve2_addavg_64x\h
+    ret
+.vl_gt_48_addAvg_64x\h\():
+    cmp             x9, #112
+    bgt             .vl_gt_112_addAvg_64x\h
+    ptrue           p0.b, vl64
+.loop_gt_eq_48_sve2_addavg_64x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z4.b}, p0/z, [x1]
+    ld1b            {z5.b}, p0/z, [x1, #1, mul vl]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    add             z1.h, p0/m, z1.h, z5.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    st1b            {z1.h}, p0, [x2, #1, mul vl]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_eq_48_sve2_addavg_64x\h
+    ret
+.vl_gt_112_addAvg_64x\h\():
+    ptrue           p0.b, vl128
+.loop_gt_eq_128_sve2_addavg_64x\h\():
+    sub             w12, w12, #1
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x1]
+    add             x0, x0, x3, lsl #1
+    add             x1, x1, x4, lsl #1
+    add             z0.h, p0/m, z0.h, z4.h
+    sqrshrnb        z0.b, z0.h, #7
+    add             z0.b, z0.b, #0x80
+    st1b            {z0.h}, p0, [x2]
+    add             x2, x2, x5
+    cbnz            w12, .loop_gt_eq_128_sve2_addavg_64x\h
+    ret
+endfunc
+.endm
+
+addAvg_64xN_sve2 16
+addAvg_64xN_sve2 32
+addAvg_64xN_sve2 48
+addAvg_64xN_sve2 64
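Reviewer note: the dispatch pattern repeated through mc-a-sve2.S is rdvl to probe the register width once per call, a NEON-shaped fallback when the implementation is 128-bit, wider ptrue vlN predicates when it is not, and a whilelt partial predicate for row widths such as 24, 48 or 96 bytes that are not a whole number of vectors. An illustrative C++ rendering of that idea (names and the byte-copy body are stand-ins, not from the diff):

    #include <cstddef>

    // vecLen plays the role of rdvl's result; the inner bound plays the role
    // of the whilelt predicate, so odd widths need no scalar tail loop.
    void processRowPredicated(const unsigned char *src, unsigned char *dst,
                              std::size_t rowBytes, std::size_t vecLen)
    {
        for (std::size_t i = 0; i < rowBytes; i += vecLen)
        {
            std::size_t active = (rowBytes - i < vecLen) ? rowBytes - i : vecLen;
            for (std::size_t lane = 0; lane < active; lane++)
                dst[i + lane] = src[i + lane];   // stand-in for the predicated op
        }
    }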
x265_3.5.tar.gz/source/common/aarch64/mc-a.S -> x265_3.6.tar.gz/source/common/aarch64/mc-a.S Changed
 
@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,15 +23,20 @@
 *****************************************************************************/
 
 #include "asm.S"
+#include "mc-a-common.S"
 
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif
 
 .align 4
 
 .text
 
 .macro pixel_avg_pp_4xN_neon h
-function x265_pixel_avg_pp_4x\h\()_neon
+function PFX(pixel_avg_pp_4x\h\()_neon)
 .rept \h
    ld1             {v0.s}[0], [x2], x3
    ld1             {v1.s}[0], [x4], x5
@@ -46,7 +52,7 @@
 pixel_avg_pp_4xN_neon 16
 
 .macro pixel_avg_pp_8xN_neon h
-function x265_pixel_avg_pp_8x\h\()_neon
+function PFX(pixel_avg_pp_8x\h\()_neon)
 .rept \h
    ld1             {v0.8b}, [x2], x3
    ld1             {v1.8b}, [x4], x5
@@ -61,3 +67,491 @@
 pixel_avg_pp_8xN_neon 8
 pixel_avg_pp_8xN_neon 16
 pixel_avg_pp_8xN_neon 32
+
+function PFX(pixel_avg_pp_12x16_neon)
+    sub             x1, x1, #4
+    sub             x3, x3, #4
+    sub             x5, x5, #4
+.rept 16
+    ld1             {v0.s}[0], [x2], #4
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.s}[0], [x4], #4
+    ld1             {v3.8b}, [x4], x5
+    urhadd          v4.8b, v0.8b, v2.8b
+    urhadd          v5.8b, v1.8b, v3.8b
+    st1             {v4.s}[0], [x0], #4
+    st1             {v5.8b}, [x0], x1
+.endr
+    ret
+endfunc
+
+.macro pixel_avg_pp_16xN_neon h
+function PFX(pixel_avg_pp_16x\h\()_neon)
+.rept \h
+    ld1             {v0.16b}, [x2], x3
+    ld1             {v1.16b}, [x4], x5
+    urhadd          v2.16b, v0.16b, v1.16b
+    st1             {v2.16b}, [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_16xN_neon 4
+pixel_avg_pp_16xN_neon 8
+pixel_avg_pp_16xN_neon 12
+pixel_avg_pp_16xN_neon 16
+pixel_avg_pp_16xN_neon 32
+
+function PFX(pixel_avg_pp_16x64_neon)
+    mov             w12, #8
+.lpavg_16x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, [x2], x3
+    ld1             {v1.16b}, [x4], x5
+    urhadd          v2.16b, v0.16b, v1.16b
+    st1             {v2.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_16x64
+    ret
+endfunc
+
+function PFX(pixel_avg_pp_24x32_neon)
+    sub             x1, x1, #16
+    sub             x3, x3, #16
+    sub             x5, x5, #16
+    mov             w12, #4
+.lpavg_24x32:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b}, [x2], #16
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.16b}, [x4], #16
+    ld1             {v3.8b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.8b, v1.8b, v3.8b
+    st1             {v0.16b}, [x0], #16
+    st1             {v1.8b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_24x32
+    ret
+endfunc
+
+.macro pixel_avg_pp_32xN_neon h
+function PFX(pixel_avg_pp_32x\h\()_neon)
+.rept \h
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN_neon 8
+pixel_avg_pp_32xN_neon 16
+pixel_avg_pp_32xN_neon 24
+
+.macro pixel_avg_pp_32xN1_neon h
+function PFX(pixel_avg_pp_32x\h\()_neon)
+    mov             w12, #\h / 8
+.lpavg_32x\h\():
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v1.16b}, [x2], x3
+    ld1             {v2.16b-v3.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v2.16b
+    urhadd          v1.16b, v1.16b, v3.16b
+    st1             {v0.16b-v1.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_32x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_32xN1_neon 32
+pixel_avg_pp_32xN1_neon 64
+
+function PFX(pixel_avg_pp_48x64_neon)
+    mov             w12, #8
+.lpavg_48x64:
+    sub             w12, w12, #1
+.rept 8
+    ld1             {v0.16b-v2.16b}, [x2], x3
+    ld1             {v3.16b-v5.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v3.16b
+    urhadd          v1.16b, v1.16b, v4.16b
+    urhadd          v2.16b, v2.16b, v5.16b
+    st1             {v0.16b-v2.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_48x64
+    ret
+endfunc
+
+.macro pixel_avg_pp_64xN_neon h
+function PFX(pixel_avg_pp_64x\h\()_neon)
+    mov             w12, #\h / 4
+.lpavg_64x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v3.16b}, [x2], x3
+    ld1             {v4.16b-v7.16b}, [x4], x5
+    urhadd          v0.16b, v0.16b, v4.16b
+    urhadd          v1.16b, v1.16b, v5.16b
+    urhadd          v2.16b, v2.16b, v6.16b
+    urhadd          v3.16b, v3.16b, v7.16b
+    st1             {v0.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .lpavg_64x\h
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_64xN_neon 16
+pixel_avg_pp_64xN_neon 32
+pixel_avg_pp_64xN_neon 48
+pixel_avg_pp_64xN_neon 64
+
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+.macro addAvg_2xN h
+function PFX(addAvg_2x\h\()_neon)
+    addAvg_start
+.rept \h / 2
+    ldr             w10, [x0]
+    ldr             w11, [x1]
+    add             x0, x0, x3
+    add             x1, x1, x4
+    ldr             w12, [x0]
+    ldr             w13, [x1]
+    add             x0, x0, x3
+    add             x1, x1, x4
+    dup             v0.2s, w10
+    dup             v1.2s, w11
+    dup             v2.2s, w12
+    dup             v3.2s, w13
+    add             v0.4h, v0.4h, v1.4h
+    add             v2.4h, v2.4h, v3.4h
+    saddl           v0.4s, v0.4h, v30.4h
+    saddl           v2.4s, v2.4h, v30.4h
+    shrn            v0.4h, v0.4s, #7
+    shrn2           v0.8h, v2.4s, #7
+    sqxtun          v0.8b, v0.8h
+    st1             {v0.h}[0], [x2], x5
+    st1             {v0.h}[2], [x2], x5
+.endr
+    ret
+endfunc
+.endm
+
+addAvg_2xN 4
+addAvg_2xN 8
+addAvg_2xN 16
+
+.macro addAvg_4xN h
+function PFX(addAvg_4x\h\()_neon)
+    addAvg_start
+.rept \h / 2
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x4
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x4
+    add             v0.4h, v0.4h, v1.4h
+    add             v2.4h, v2.4h, v3.4h
+    saddl           v0.4s, v0.4h, v30.4h
+    saddl           v2.4s, v2.4h, v30.4h
+    shrn            v0.4h, v0.4s, #7
+    shrn2           v0.8h, v2.4s, #7
+    sqxtun          v0.8b, v0.8h
+    st1             {v0.s}[0], [x2], x5
+    st1             {v0.s}[1], [x2], x5
+.endr
+    ret
+endfunc
+.endm
+
+addAvg_4xN 2
+addAvg_4xN 4
+addAvg_4xN 8
+addAvg_4xN 16
+addAvg_4xN 32
+
+.macro addAvg_6xN h
+function PFX(addAvg_6x\h\()_neon)
+    addAvg_start
+    mov             w12, #\h / 2
+    sub             x5, x5, #4
+.loop_addavg_6x\h:
+    sub             w12, w12, #1
+    ld1             {v0.16b}, [x0], x3
+    ld1             {v1.16b}, [x1], x4
+    ld1             {v2.16b}, [x0], x3
+    ld1             {v3.16b}, [x1], x4
+    add             v0.8h, v0.8h, v1.8h
+    add             v2.8h, v2.8h, v3.8h
+    saddl           v16.4s, v0.4h, v30.4h
+    saddl2          v17.4s, v0.8h, v30.8h
+    saddl           v18.4s, v2.4h, v30.4h
+    saddl2          v19.4s, v2.8h, v30.8h
+    shrn            v0.4h, v16.4s, #7
+    shrn2           v0.8h, v17.4s, #7
+    shrn            v1.4h, v18.4s, #7
+    shrn2           v1.8h, v19.4s, #7
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    str             s0, [x2], #4
+    st1             {v0.h}[2], [x2], x5
+    str             s1, [x2], #4
+    st1             {v1.h}[2], [x2], x5
+    cbnz            w12, .loop_addavg_6x\h
+    ret
+endfunc
+.endm
+
+addAvg_6xN 8
+addAvg_6xN 16
+
+.macro addAvg_8xN h
+function PFX(addAvg_8x\h\()_neon)
+    addAvg_start
+.rept \h / 2
+    ld1             {v0.16b}, [x0], x3
+    ld1             {v1.16b}, [x1], x4
+    ld1             {v2.16b}, [x0], x3
+    ld1             {v3.16b}, [x1], x4
+    add             v0.8h, v0.8h, v1.8h
+    add             v2.8h, v2.8h, v3.8h
+    saddl           v16.4s, v0.4h, v30.4h
+    saddl2          v17.4s, v0.8h, v30.8h
+    saddl           v18.4s, v2.4h, v30.4h
+    saddl2          v19.4s, v2.8h, v30.8h
+    shrn            v0.4h, v16.4s, #7
+    shrn2           v0.8h, v17.4s, #7
+    shrn            v1.4h, v18.4s, #7
+    shrn2           v1.8h, v19.4s, #7
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    st1             {v0.8b}, [x2], x5
+    st1             {v1.8b}, [x2], x5
+.endr
+    ret
+endfunc
+.endm
+
+.macro addAvg_8xN1 h
+function PFX(addAvg_8x\h\()_neon)
+    addAvg_start
+    mov             w12, #\h / 2
+.loop_addavg_8x\h:
+    sub             w12, w12, #1
+    ld1             {v0.16b}, [x0], x3
+    ld1             {v1.16b}, [x1], x4
+    ld1             {v2.16b}, [x0], x3
+    ld1             {v3.16b}, [x1], x4
+    add             v0.8h, v0.8h, v1.8h
+    add             v2.8h, v2.8h, v3.8h
+    saddl           v16.4s, v0.4h, v30.4h
+    saddl2          v17.4s, v0.8h, v30.8h
+    saddl           v18.4s, v2.4h, v30.4h
+    saddl2          v19.4s, v2.8h, v30.8h
+    shrn            v0.4h, v16.4s, #7
+    shrn2           v0.8h, v17.4s, #7
+    shrn            v1.4h, v18.4s, #7
+    shrn2           v1.8h, v19.4s, #7
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    st1             {v0.8b}, [x2], x5
+    st1             {v1.8b}, [x2], x5
+    cbnz            w12, .loop_addavg_8x\h
+    ret
+endfunc
+.endm
+
+addAvg_8xN 2
+addAvg_8xN 4
+addAvg_8xN 6
+addAvg_8xN 8
+addAvg_8xN 12
+addAvg_8xN 16
+addAvg_8xN1 32
+addAvg_8xN1 64
+
+.macro addAvg_12xN h
+function PFX(addAvg_12x\h\()_neon)
+    addAvg_start
+    sub             x3, x3, #16
+    sub             x4, x4, #16
+    sub             x5, x5, #8
+    mov             w12, #\h
+.loop_addAvg_12X\h\():
+    sub             w12, w12, #1
+    ld1             {v0.16b}, [x0], #16
+    ld1             {v1.16b}, [x1], #16
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x4
+    add             v0.8h, v0.8h, v1.8h
+    add             v2.4h, v2.4h, v3.4h
+    saddl           v16.4s, v0.4h, v30.4h
+    saddl2          v17.4s, v0.8h, v30.8h
+    saddl           v18.4s, v2.4h, v30.4h
+    shrn            v0.4h, v16.4s, #7
+    shrn2           v0.8h, v17.4s, #7
+    shrn            v1.4h, v18.4s, #7
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    st1             {v0.8b}, [x2], #8
+    st1             {v1.s}[0], [x2], x5
+    cbnz            w12, .loop_addAvg_12X\h
+    ret
+endfunc
+.endm
+
+addAvg_12xN 16
+addAvg_12xN 32
+
+.macro addAvg_16xN h
+function PFX(addAvg_16x\h\()_neon)
+    addAvg_start
+    mov             w12, #\h
+.loop_addavg_16x\h:
+    sub             w12, w12, #1
+    ld1             {v0.8h-v1.8h}, [x0], x3
+    ld1             {v2.8h-v3.8h}, [x1], x4
+    addavg_1        v0, v2
+    addavg_1        v1, v3
+    sqxtun          v0.8b, v0.8h
+    sqxtun2         v0.16b, v1.8h
+    st1             {v0.16b}, [x2], x5
+    cbnz            w12, .loop_addavg_16x\h
+    ret
+endfunc
+.endm
+
+addAvg_16xN 4
+addAvg_16xN 8
+addAvg_16xN 12
+addAvg_16xN 16
+addAvg_16xN 24
+addAvg_16xN 32
+addAvg_16xN 64
+
+.macro addAvg_24xN h
+function PFX(addAvg_24x\h\()_neon)
+    addAvg_start
+    mov             w12, #\h
+.loop_addavg_24x\h\():
+    sub             w12, w12, #1
+    ld1             {v0.16b-v2.16b}, [x0], x3
+    ld1             {v3.16b-v5.16b}, [x1], x4
+    addavg_1        v0, v3
+    addavg_1        v1, v4
+    addavg_1        v2, v5
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    sqxtun          v2.8b, v2.8h
+    st1             {v0.8b-v2.8b}, [x2], x5
+    cbnz            w12, .loop_addavg_24x\h
+    ret
+endfunc
+.endm
+
+addAvg_24xN 32
+addAvg_24xN 64
+
+.macro addAvg_32xN h
+function PFX(addAvg_32x\h\()_neon)
+    addAvg_start
+    mov             w12, #\h
+.loop_addavg_32x\h\():
+    sub             w12, w12, #1
+    ld1             {v0.8h-v3.8h}, [x0], x3
+    ld1             {v4.8h-v7.8h}, [x1], x4
+    addavg_1        v0, v4
+    addavg_1        v1, v5
+    addavg_1        v2, v6
+    addavg_1        v3, v7
+    sqxtun          v0.8b, v0.8h
+    sqxtun          v1.8b, v1.8h
+    sqxtun          v2.8b, v2.8h
+    sqxtun          v3.8b, v3.8h
+    st1             {v0.8b-v3.8b}, [x2], x5
+    cbnz            w12, .loop_addavg_32x\h
+    ret
+endfunc
+.endm
+
+addAvg_32xN 8
+addAvg_32xN 16
+addAvg_32xN 24
+addAvg_32xN 32
+addAvg_32xN 48
+addAvg_32xN 64
+
+function PFX(addAvg_48x64_neon)
+    addAvg_start
+    sub             x3, x3, #64
+    sub             x4, x4, #64
+    mov             w12, #64
+.loop_addavg_48x64:
+    sub             w12, w12, #1
+    ld1             {v0.8h-v3.8h}, [x0], #64
+    ld1             {v4.8h-v7.8h}, [x1], #64
+    ld1             {v20.8h-v21.8h}, [x0], x3
+    ld1             {v22.8h-v23.8h}, [x1], x4
+    addavg_1        v0, v4
+    addavg_1        v1, v5
+    addavg_1        v2, v6
+    addavg_1        v3, v7
+    addavg_1        v20, v22
+    addavg_1        v21, v23
+    sqxtun          v0.8b, v0.8h
+    sqxtun2         v0.16b, v1.8h
+    sqxtun          v1.8b, v2.8h
+    sqxtun2         v1.16b, v3.8h
+    sqxtun          v2.8b, v20.8h
+    sqxtun2         v2.16b, v21.8h
+    st1             {v0.16b-v2.16b}, [x2], x5
+    cbnz            w12, .loop_addavg_48x64
+    ret
+endfunc
+
+.macro addAvg_64xN h
+function PFX(addAvg_64x\h\()_neon)
+    addAvg_start
+    mov             w12, #\h
+    sub             x3, x3, #64
+    sub             x4, x4, #64
+.loop_addavg_64x\h\():
+    sub             w12, w12, #1
+    ld1             {v0.8h-v3.8h}, [x0], #64
+    ld1             {v4.8h-v7.8h}, [x1], #64
+    ld1             {v20.8h-v23.8h}, [x0], x3
+    ld1             {v24.8h-v27.8h}, [x1], x4
+    addavg_1        v0, v4
+    addavg_1        v1, v5
+    addavg_1        v2, v6
+    addavg_1        v3, v7
+    addavg_1        v20, v24
+    addavg_1        v21, v25
+    addavg_1        v22, v26
+    addavg_1        v23, v27
+    sqxtun          v0.8b, v0.8h
+    sqxtun2         v0.16b, v1.8h
+    sqxtun          v1.8b, v2.8h
+    sqxtun2         v1.16b, v3.8h
+    sqxtun          v2.8b, v20.8h
+    sqxtun2         v2.16b, v21.8h
+    sqxtun          v3.8b, v22.8h
+    sqxtun2         v3.16b, v23.8h
+    st1             {v0.16b-v3.16b}, [x2], x5
525
+    cbnz            w12, .loop_addavg_64x\h
526
+    ret
527
+endfunc
528
+.endm
529
+
530
+addAvg_64xN 16
531
+addAvg_64xN 32
532
+addAvg_64xN 48
533
+addAvg_64xN 64
534
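For reference, the operation these addAvg kernels vectorize is x265's bi-prediction average: two 16-bit intermediate predictions are summed, rounded, shifted back down to pixel precision, and saturated. A minimal scalar sketch of the 8-bit path (constants follow the IF_INTERNAL_PREC = 14 interpolation conventions; the function name is illustrative):

    #include <algorithm>
    #include <cstdint>

    // Scalar sketch of addAvg for 8-bit output. The saddl/shrn/sqxtun
    // sequences above do the same arithmetic 8 or 16 lanes at a time:
    // v30 holds `offset`, "shrn #7" is ">> shiftNum", sqxtun saturates.
    void addAvg_ref(const int16_t* src0, const int16_t* src1, uint8_t* dst,
                    intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                    int bx, int by)
    {
        const int shiftNum = 14 + 1 - 8;                          // = 7
        const int offset = (1 << (shiftNum - 1)) + 2 * (1 << 13); // rounding + 2 * IF_INTERNAL_OFFS
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shiftNum;
                dst[x] = (uint8_t)std::min(std::max(v, 0), 255);  // saturate to pixel range
            }
            src0 += src0Stride;
            src1 += src1Stride;
            dst += dstStride;
        }
    }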
x265_3.6.tar.gz/source/common/aarch64/p2s-common.S Added
104
 
1
@@ -0,0 +1,102 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains macros, written using the NEON instruction set,
26
+// that are also used by the SVE2 functions.
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+#if HIGH_BIT_DEPTH
39
+# if BIT_DEPTH == 10
40
+#  define P2S_SHIFT 4
41
+# elif BIT_DEPTH == 12
42
+#  define P2S_SHIFT 2
43
+# endif
44
+.macro p2s_start
45
+    add             x3, x3, x3
46
+    add             x1, x1, x1
47
+    movi            v31.8h, #0xe0, lsl #8
48
+.endm
49
+
50
+#else // if !HIGH_BIT_DEPTH
51
+# define P2S_SHIFT 6
52
+.macro p2s_start
53
+    add             x3, x3, x3
54
+    movi            v31.8h, #0xe0, lsl #8
55
+.endm
56
+#endif // HIGH_BIT_DEPTH
57
+
58
+.macro p2s_2x2
59
+#if HIGH_BIT_DEPTH
60
+    ld1             {v0.s}[0], [x0], x1
61
+    ld1             {v0.s}[1], [x0], x1
62
+    shl             v3.8h, v0.8h, #P2S_SHIFT
63
+#else
64
+    ldrh            w10, [x0]
65
+    add             x0, x0, x1
66
+    ldrh            w11, [x0]
67
+    orr             w10, w10, w11, lsl #16
68
+    add             x0, x0, x1
69
+    dup             v0.4s, w10
70
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
71
+#endif
72
+    add             v3.8h, v3.8h, v31.8h
73
+    st1             {v3.s}[0], [x2], x3
74
+    st1             {v3.s}[1], [x2], x3
75
+.endm
76
+
77
+.macro p2s_6x2
78
+#if HIGH_BIT_DEPTH
79
+    ld1             {v0.d}[0], [x0], #8
80
+    ld1             {v1.s}[0], [x0], x1
81
+    ld1             {v0.d}[1], [x0], #8
82
+    ld1             {v1.s}[1], [x0], x1
83
+    shl             v3.8h, v0.8h, #P2S_SHIFT
84
+    shl             v4.8h, v1.8h, #P2S_SHIFT
85
+#else
86
+    ldr             s0, [x0]
87
+    ldrh            w10, [x0, #4]
88
+    add             x0, x0, x1
89
+    ld1             {v0.s}[1], [x0]
90
+    ldrh            w11, [x0, #4]
91
+    add             x0, x0, x1
92
+    orr             w10, w10, w11, lsl #16
93
+    dup             v1.4s, w10
94
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
95
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
96
+#endif
97
+    add             v3.8h, v3.8h, v31.8h
98
+    add             v4.8h, v4.8h, v31.8h
99
+    st1             {v3.d}[0], [x2], #8
100
+    st1             {v4.s}[0], [x2], x3
101
+    st1             {v3.d}[1], [x2], #8
102
+    st1             {v4.s}[1], [x2], x3
103
+.endm
104
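The filterPixelToShort family built from these macros converts pixels to the encoder's 16-bit intermediate format: scale up to 14-bit precision, then re-centre around zero. The constant loaded by "movi v31.8h, #0xe0, lsl #8" is 0xE000 = -8192, i.e. the negated IF_INTERNAL_OFFS. A scalar sketch of the 8-bit path (names illustrative):

    #include <cstdint>

    // Scalar sketch of filterPixelToShort: ushll/shl supply the shift,
    // the add of v31 supplies the -8192 re-centring.
    void p2s_ref(const uint8_t* src, intptr_t srcStride,
                 int16_t* dst, intptr_t dstStride, int bx, int by)
    {
        const int shift = 14 - 8;        // P2S_SHIFT for 8-bit input
        const int16_t offset = -8192;    // == (int16_t)0xE000
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = (int16_t)((src[x] << shift) + offset);
            src += srcStride;
            dst += dstStride;
        }
    }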
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S Added
447
 
1
@@ -0,0 +1,445 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "p2s-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+#if HIGH_BIT_DEPTH
41
+# if BIT_DEPTH == 10
42
+#  define P2S_SHIFT 4
43
+# elif BIT_DEPTH == 12
44
+#  define P2S_SHIFT 2
45
+# endif
46
+
47
+.macro p2s_start_sve
48
+    add             x3, x3, x3
49
+    add             x1, x1, x1
50
+    mov             z31.h, #0xe0, lsl #8
51
+.endm
52
+
53
+#else // if !HIGH_BIT_DEPTH
54
+# define P2S_SHIFT 6
55
+.macro p2s_start_sve
56
+    add             x3, x3, x3
57
+    mov             z31.h, #0xe0, lsl #8
58
+.endm
59
+
60
+#endif // HIGH_BIT_DEPTH
61
+
62
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
63
+.macro p2s_2xN_sve h
64
+function PFX(filterPixelToShort_2x\h\()_sve)
65
+    p2s_start_sve
66
+.rept \h / 2
67
+    p2s_2x2
68
+.endr
69
+    ret
70
+endfunc
71
+.endm
72
+
73
+p2s_2xN_sve 4
74
+p2s_2xN_sve 8
75
+p2s_2xN_sve 16
76
+
77
+.macro p2s_6xN_sve h
78
+function PFX(filterPixelToShort_6x\h\()_sve)
79
+    p2s_start_sve
80
+    sub             x3, x3, #8
81
+#if HIGH_BIT_DEPTH
82
+    sub             x1, x1, #8
83
+#endif
84
+.rept \h / 2
85
+    p2s_6x2
86
+.endr
87
+    ret
88
+endfunc
89
+.endm
90
+
91
+p2s_6xN_sve 8
92
+p2s_6xN_sve 16
93
+
94
+function PFX(filterPixelToShort_4x2_sve)
95
+    p2s_start_sve
96
+#if HIGH_BIT_DEPTH
97
+    ptrue           p0.h, vl8
98
+    index           z1.d, #0, x1
99
+    index           z2.d, #0, x3
100
+    ld1d            {z3.d}, p0/z, [x0, z1.d]
101
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
102
+    add             z3.h, p0/m, z3.h, z31.h
103
+    st1d            {z3.d}, p0, [x2, z2.d]
104
+#else
105
+    ptrue           p0.h, vl4
106
+    ld1b            {z0.h}, p0/z, [x0]
107
+    add             x0, x0, x1
108
+    ld1b            {z1.h}, p0/z, [x0]
109
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
110
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
111
+    add             z0.h, p0/m, z0.h, z31.h
112
+    add             z1.h, p0/m, z1.h, z31.h
113
+    st1h            {z0.h}, p0, [x2]
114
+    add             x2, x2, x3
115
+    st1h            {z1.h}, p0, [x2]
116
+#endif
117
+    ret
118
+endfunc
119
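The high-bit-depth branch just above replaces two strided row loads with a single gather: "index" materializes the offset vector {0, stride}, and the ld1d/st1d pair then moves one four-pixel row chunk per 64-bit lane. Roughly the same idea with ACLE intrinsics (a sketch; only the load side is shown, and the wrapper name is illustrative):

    #include <arm_sve.h>   // ACLE; compile with SVE enabled

    // Gather one 64-bit chunk (four 16-bit pixels) from each row in a
    // single load, mirroring "index z1.d, #0, x1" followed by
    // "ld1d {z3.d}, p0/z, [x0, z1.d]".
    static inline svuint64_t gather_row_chunks(const uint64_t* base,
                                               uint64_t strideBytes)
    {
        svbool_t pg = svptrue_b64();
        svuint64_t offsets = svindex_u64(0, strideBytes);   // 0, stride, 2*stride, ...
        return svld1_gather_u64offset_u64(pg, base, offsets);
    }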
+
120
+
121
+.macro p2s_8xN_sve h
122
+function PFX(filterPixelToShort_8x\h\()_sve)
123
+    p2s_start_sve
124
+    ptrue           p0.h, vl8
125
+.rept \h
126
+#if HIGH_BIT_DEPTH
127
+    ld1d            {z0.d}, p0/z, x0
128
+    add             x0, x0, x1
129
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
130
+    add             z0.h, p0/m, z0.h, z31.h
131
+    st1h            {z0.h}, p0, x2
132
+    add             x2, x2, x3
133
+#else
134
+    ld1b            {z0.h}, p0/z, x0
135
+    add             x0, x0, x1
136
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
137
+    add             z0.h, p0/m, z0.h, z31.h
138
+    st1h            {z0.h}, p0, x2
139
+    add             x2, x2, x3
140
+#endif
141
+.endr
142
+    ret
143
+endfunc
144
+.endm
145
+
146
+p2s_8xN_sve 2
147
+
148
+.macro p2s_32xN_sve h
149
+function PFX(filterPixelToShort_32x\h\()_sve)
150
+#if HIGH_BIT_DEPTH
151
+    p2s_start_sve
152
+    rdvl            x9, #1
153
+    cmp             x9, #16
154
+    bgt             .vl_gt_16_filterPixelToShort_high_32x\h
155
+    ptrue           p0.h, vl8
156
+.rept \h
157
+    ld1h            {z0.h}, p0/z, x0
158
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
159
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
160
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
161
+    add             x0, x0, x1
162
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
163
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
164
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
165
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
166
+    add             z0.h, p0/m, z0.h, z31.h
167
+    add             z1.h, p0/m, z1.h, z31.h
168
+    add             z2.h, p0/m, z2.h, z31.h
169
+    add             z3.h, p0/m, z3.h, z31.h
170
+    st1h            {z0.h}, p0, x2
171
+    st1h            {z1.h}, p0, x2, #1, mul vl
172
+    st1h            {z2.h}, p0, x2, #2, mul vl
173
+    st1h            {z3.h}, p0, x2, #3, mul vl
174
+    add             x2, x2, x3
175
+.endr
176
+    ret
177
+.vl_gt_16_filterPixelToShort_high_32x\h\():
178
+    cmp             x9, #48
179
+    bgt             .vl_gt_48_filterPixelToShort_high_32x\h
180
+    ptrue           p0.h, vl16
181
+.rept \h
182
+    ld1h            {z0.h}, p0/z, x0
183
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
184
+    add             x0, x0, x1
185
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
186
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
187
+    add             z0.h, p0/m, z0.h, z31.h
188
+    add             z1.h, p0/m, z1.h, z31.h
189
+    st1h            {z0.h}, p0, x2
190
+    st1h            {z1.h}, p0, x2, #1, mul vl
191
+    add             x2, x2, x3
192
+.endr
193
+    ret
194
+.vl_gt_48_filterPixelToShort_high_32x\h\():
195
+    ptrue           p0.h, vl32
196
+.rept \h
197
+    ld1h            {z0.h}, p0/z, x0
198
+    add             x0, x0, x1
199
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
200
+    add             z0.h, p0/m, z0.h, z31.h
201
+    st1h            {z0.h}, p0, x2
202
+    add             x2, x2, x3
203
+.endr
204
+    ret
205
+#else
206
+    p2s_start
207
+    mov             x9, #\h
208
+.loop_filter_sve_P2S_32x\h:
209
+    sub             x9, x9, #1
210
+    ld1             {v0.16b-v1.16b}, x0, x1
211
+    ushll           v22.8h, v0.8b,  #P2S_SHIFT
212
+    ushll2          v23.8h, v0.16b, #P2S_SHIFT
213
+    ushll           v24.8h, v1.8b,  #P2S_SHIFT
214
+    ushll2          v25.8h, v1.16b, #P2S_SHIFT
215
+    add             v22.8h, v22.8h, v31.8h
216
+    add             v23.8h, v23.8h, v31.8h
217
+    add             v24.8h, v24.8h, v31.8h
218
+    add             v25.8h, v25.8h, v31.8h
219
+    st1             {v22.16b-v25.16b}, x2, x3
220
+    cbnz            x9, .loop_filter_sve_P2S_32x\h
221
+    ret
222
+#endif
223
+endfunc
224
+.endm
225
+
226
+p2s_32xN_sve 8
227
+p2s_32xN_sve 16
228
+p2s_32xN_sve 24
229
+p2s_32xN_sve 32
230
+p2s_32xN_sve 48
231
+p2s_32xN_sve 64
232
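These 32-wide kernels choose an implementation at run time from the SVE vector length: "rdvl x9, #1" reads the register width in bytes, and the branches select the widest fixed predicate (vl8, vl16, or vl32 halfwords) the hardware can honour. The same decision in C++ via the ACLE, where svcntb() reads the vector length just as rdvl does (a sketch; names are illustrative):

    #include <arm_sve.h>   // ACLE; compile with SVE enabled

    enum class P2SPath { VL8, VL16, VL32 };   // halfword lanes per predicate

    static P2SPath pick_32xN_path()
    {
        int vlBytes = (int)svcntb();               // what "rdvl x9, #1" yields
        if (vlBytes <= 16) return P2SPath::VL8;    // 128-bit SVE
        if (vlBytes <= 48) return P2SPath::VL16;   // up to 384-bit
        return P2SPath::VL32;                      // 512-bit and wider
    }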
+
233
+.macro p2s_64xN_sve h
234
+function PFX(filterPixelToShort_64x\h\()_sve)
235
+#if HIGH_BIT_DEPTH
236
+    p2s_start_sve
237
+    rdvl            x9, #1
238
+    cmp             x9, #16
239
+    bgt             .vl_gt_16_filterPixelToShort_high_64x\h
240
+    ptrue           p0.h, vl8
241
+.rept \h
242
+    ld1h            {z0.h}, p0/z, x0
243
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
244
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
245
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
246
+    ld1h            {z4.h}, p0/z, x0, #4, mul vl
247
+    ld1h            {z5.h}, p0/z, x0, #5, mul vl
248
+    ld1h            {z6.h}, p0/z, x0, #6, mul vl
249
+    ld1h            {z7.h}, p0/z, x0, #7, mul vl
250
+    add             x0, x0, x1
251
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
252
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
253
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
254
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
255
+    lsl             z4.h, p0/m, z4.h, #P2S_SHIFT
256
+    lsl             z5.h, p0/m, z5.h, #P2S_SHIFT
257
+    lsl             z6.h, p0/m, z6.h, #P2S_SHIFT
258
+    lsl             z7.h, p0/m, z7.h, #P2S_SHIFT
259
+    add             z0.h, p0/m, z0.h, z31.h
260
+    add             z1.h, p0/m, z1.h, z31.h
261
+    add             z2.h, p0/m, z2.h, z31.h
262
+    add             z3.h, p0/m, z3.h, z31.h
263
+    add             z4.h, p0/m, z4.h, z31.h
264
+    add             z5.h, p0/m, z5.h, z31.h
265
+    add             z6.h, p0/m, z6.h, z31.h
266
+    add             z7.h, p0/m, z7.h, z31.h
267
+    st1h            {z0.h}, p0, x2
268
+    st1h            {z1.h}, p0, x2, #1, mul vl
269
+    st1h            {z2.h}, p0, x2, #2, mul vl
270
+    st1h            {z3.h}, p0, x2, #3, mul vl
271
+    st1h            {z4.h}, p0, x2, #4, mul vl
272
+    st1h            {z5.h}, p0, x2, #5, mul vl
273
+    st1h            {z6.h}, p0, x2, #6, mul vl
274
+    st1h            {z7.h}, p0, x2, #7, mul vl
275
+    add             x2, x2, x3
276
+.endr
277
+    ret
278
+.vl_gt_16_filterPixelToShort_high_64x\h\():
279
+    cmp             x9, #48
280
+    bgt             .vl_gt_48_filterPixelToShort_high_64x\h
281
+    ptrue           p0.h, vl16
282
+.rept \h
283
+    ld1h            {z0.h}, p0/z, x0
284
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
285
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
286
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
287
+    add             x0, x0, x1
288
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
289
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
290
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
291
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
292
+    add             z0.h, p0/m, z0.h, z31.h
293
+    add             z1.h, p0/m, z1.h, z31.h
294
+    add             z2.h, p0/m, z2.h, z31.h
295
+    add             z3.h, p0/m, z3.h, z31.h
296
+    st1h            {z0.h}, p0, x2
297
+    st1h            {z1.h}, p0, x2, #1, mul vl
298
+    st1h            {z2.h}, p0, x2, #2, mul vl
299
+    st1h            {z3.h}, p0, x2, #3, mul vl
300
+    add             x2, x2, x3
301
+.endr
302
+    ret
303
+.vl_gt_48_filterPixelToShort_high_64x\h\():
304
+    cmp             x9, #112
305
+    bgt             .vl_gt_112_filterPixelToShort_high_64x\h
306
+    ptrue           p0.h, vl32
307
+.rept \h
308
+    ld1h            {z0.h}, p0/z, x0
309
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
310
+    add             x0, x0, x1
311
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
312
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
313
+    add             z0.h, p0/m, z0.h, z31.h
314
+    add             z1.h, p0/m, z1.h, z31.h
315
+    st1h            {z0.h}, p0, x2
316
+    st1h            {z1.h}, p0, x2, #1, mul vl
317
+    add             x2, x2, x3
318
+.endr
319
+    ret
320
+.vl_gt_112_filterPixelToShort_high_64x\h\():
321
+    ptrue           p0.h, vl64
322
+.rept \h
323
+    ld1h            {z0.h}, p0/z, x0
324
+    add             x0, x0, x1
325
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
326
+    add             z0.h, p0/m, z0.h, z31.h
327
+    st1h            {z0.h}, p0, x2
328
+    add             x2, x2, x3
329
+.endr
330
+    ret
331
+#else
332
+    p2s_start
333
+    sub             x3, x3, #64
334
+    mov             x9, #\h
335
+.loop_filter_sve_P2S_64x\h:
336
+    sub             x9, x9, #1
337
+    ld1             {v0.16b-v3.16b}, x0, x1
338
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
339
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
340
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
341
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
342
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
343
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
344
+    ushll           v22.8h, v3.8b,  #P2S_SHIFT
345
+    ushll2          v23.8h, v3.16b, #P2S_SHIFT
346
+    add             v16.8h, v16.8h, v31.8h
347
+    add             v17.8h, v17.8h, v31.8h
348
+    add             v18.8h, v18.8h, v31.8h
349
+    add             v19.8h, v19.8h, v31.8h
350
+    add             v20.8h, v20.8h, v31.8h
351
+    add             v21.8h, v21.8h, v31.8h
352
+    add             v22.8h, v22.8h, v31.8h
353
+    add             v23.8h, v23.8h, v31.8h
354
+    st1             {v16.16b-v19.16b}, x2, #64
355
+    st1             {v20.16b-v23.16b}, x2, x3
356
+    cbnz            x9, .loop_filter_sve_P2S_64x\h
357
+    ret
358
+#endif
359
+endfunc
360
+.endm
361
+
362
+p2s_64xN_sve 16
363
+p2s_64xN_sve 32
364
+p2s_64xN_sve 48
365
+p2s_64xN_sve 64
366
+
367
+function PFX(filterPixelToShort_48x64_sve)
368
+#if HIGH_BIT_DEPTH
369
+    p2s_start_sve
370
+    rdvl            x9, #1
371
+    cmp             x9, #16
372
+    bgt             .vl_gt_16_filterPixelToShort_high_48x64
373
+    ptrue           p0.h, vl8
374
+.rept 64
375
+    ld1h            {z0.h}, p0/z, x0
376
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
377
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
378
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
379
+    ld1h            {z4.h}, p0/z, x0, #4, mul vl
380
+    ld1h            {z5.h}, p0/z, x0, #5, mul vl
381
+    add             x0, x0, x1
382
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
383
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
384
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
385
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
386
+    lsl             z4.h, p0/m, z4.h, #P2S_SHIFT
387
+    lsl             z5.h, p0/m, z5.h, #P2S_SHIFT
388
+    add             z0.h, p0/m, z0.h, z31.h
389
+    add             z1.h, p0/m, z1.h, z31.h
390
+    add             z2.h, p0/m, z2.h, z31.h
391
+    add             z3.h, p0/m, z3.h, z31.h
392
+    add             z4.h, p0/m, z4.h, z31.h
393
+    add             z5.h, p0/m, z5.h, z31.h
394
+    st1h            {z0.h}, p0, x2
395
+    st1h            {z1.h}, p0, x2, #1, mul vl
396
+    st1h            {z2.h}, p0, x2, #2, mul vl
397
+    st1h            {z3.h}, p0, x2, #3, mul vl
398
+    st1h            {z4.h}, p0, x2, #4, mul vl
399
+    st1h            {z5.h}, p0, x2, #5, mul vl
400
+    add             x2, x2, x3
401
+.endr
402
+    ret
403
+.vl_gt_16_filterPixelToShort_high_48x64:
404
+    ptrue           p0.h, vl16
405
+.rept 64
406
+    ld1h            {z0.h}, p0/z, x0
407
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
408
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
409
+    add             x0, x0, x1
410
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
411
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
412
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
413
+    add             z0.h, p0/m, z0.h, z31.h
414
+    add             z1.h, p0/m, z1.h, z31.h
415
+    add             z2.h, p0/m, z2.h, z31.h
416
+    st1h            {z0.h}, p0, x2
417
+    st1h            {z1.h}, p0, x2, #1, mul vl
418
+    st1h            {z2.h}, p0, x2, #2, mul vl
419
+    add             x2, x2, x3
420
+.endr
421
+    ret
422
+#else
423
+    p2s_start
424
+    sub             x3, x3, #64
425
+    mov             x9, #64
426
+.loop_filterP2S_sve_48x64:
427
+    sub             x9, x9, #1
428
+    ld1             {v0.16b-v2.16b}, x0, x1
429
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
430
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
431
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
432
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
433
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
434
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
435
+    add             v16.8h, v16.8h, v31.8h
436
+    add             v17.8h, v17.8h, v31.8h
437
+    add             v18.8h, v18.8h, v31.8h
438
+    add             v19.8h, v19.8h, v31.8h
439
+    add             v20.8h, v20.8h, v31.8h
440
+    add             v21.8h, v21.8h, v31.8h
441
+    st1             {v16.16b-v19.16b}, x2, #64
442
+    st1             {v20.16b-v21.16b}, x2, x3
443
+    cbnz            x9, .loop_filterP2S_sve_48x64
444
+    ret
445
+#endif
446
+endfunc
447
x265_3.6.tar.gz/source/common/aarch64/p2s.S Added
388
 
1
@@ -0,0 +1,386 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "p2s-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
39
+.macro p2s_2xN h
40
+function PFX(filterPixelToShort_2x\h\()_neon)
41
+    p2s_start
42
+.rept \h / 2
43
+    p2s_2x2
44
+.endr
45
+    ret
46
+endfunc
47
+.endm
48
+
49
+p2s_2xN 4
50
+p2s_2xN 8
51
+p2s_2xN 16
52
+
53
+.macro p2s_6xN h
54
+function PFX(filterPixelToShort_6x\h\()_neon)
55
+    p2s_start
56
+    sub             x3, x3, #8
57
+#if HIGH_BIT_DEPTH
58
+    sub             x1, x1, #8
59
+#endif
60
+.rept \h / 2
61
+    p2s_6x2
62
+.endr
63
+    ret
64
+endfunc
65
+.endm
66
+
67
+p2s_6xN 8
68
+p2s_6xN 16
69
+
70
+function PFX(filterPixelToShort_4x2_neon)
71
+    p2s_start
72
+#if HIGH_BIT_DEPTH
73
+    ld1             {v0.d}[0], [x0], x1
74
+    ld1             {v0.d}[1], [x0], x1
75
+    shl             v3.8h, v0.8h, #P2S_SHIFT
76
+#else
77
+    ld1             {v0.s}[0], [x0], x1
78
+    ld1             {v0.s}[1], [x0], x1
79
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
80
+#endif
81
+    add             v3.8h, v3.8h, v31.8h
82
+    st1             {v3.d}[0], [x2], x3
83
+    st1             {v3.d}[1], [x2], x3
84
+    ret
85
+endfunc
86
+
87
+function PFX(filterPixelToShort_4x4_neon)
88
+    p2s_start
89
+#if HIGH_BIT_DEPTH
90
+    ld1             {v0.d}[0], [x0], x1
91
+    ld1             {v0.d}[1], [x0], x1
92
+    shl             v3.8h, v0.8h, #P2S_SHIFT
93
+#else
94
+    ld1             {v0.s}[0], [x0], x1
95
+    ld1             {v0.s}[1], [x0], x1
96
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
97
+#endif
98
+    add             v3.8h, v3.8h, v31.8h
99
+    st1             {v3.d}[0], [x2], x3
100
+    st1             {v3.d}[1], [x2], x3
101
+#if HIGH_BIT_DEPTH
102
+    ld1             {v1.d}[0], [x0], x1
103
+    ld1             {v1.d}[1], [x0], x1
104
+    shl             v4.8h, v1.8h, #P2S_SHIFT
105
+#else
106
+    ld1             {v1.s}[0], [x0], x1
107
+    ld1             {v1.s}[1], [x0], x1
108
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
109
+#endif
110
+    add             v4.8h, v4.8h, v31.8h
111
+    st1             {v4.d}[0], [x2], x3
112
+    st1             {v4.d}[1], [x2], x3
113
+    ret
114
+endfunc
115
+
116
+.macro p2s_4xN h
117
+function PFX(filterPixelToShort_4x\h\()_neon)
118
+    p2s_start
119
+.rept \h / 2
120
+#if HIGH_BIT_DEPTH
121
+    ld1             {v0.16b}, x0, x1
122
+    shl             v0.8h, v0.8h, #P2S_SHIFT
123
+#else
124
+    ld1             {v0.8b}, x0, x1
125
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
126
+#endif
127
+    add             v2.4h, v0.4h, v31.4h
128
+    st1             {v2.4h}, x2, x3
129
+#if HIGH_BIT_DEPTH
130
+    ld1             {v1.16b}, x0, x1
131
+    shl             v1.8h, v1.8h, #P2S_SHIFT
132
+#else
133
+    ld1             {v1.8b}, x0, x1
134
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
135
+#endif
136
+    add             v3.4h, v1.4h, v31.4h
137
+    st1             {v3.4h}, x2, x3
138
+.endr
139
+    ret
140
+endfunc
141
+.endm
142
+
143
+p2s_4xN 8
144
+p2s_4xN 16
145
+p2s_4xN 32
146
+
147
+.macro p2s_8xN h
148
+function PFX(filterPixelToShort_8x\h\()_neon)
149
+    p2s_start
150
+.rept \h / 2
151
+#if HIGH_BIT_DEPTH
152
+    ld1             {v0.16b}, x0, x1
153
+    ld1             {v1.16b}, x0, x1
154
+    shl             v0.8h, v0.8h, #P2S_SHIFT
155
+    shl             v1.8h, v1.8h, #P2S_SHIFT
156
+#else
157
+    ld1             {v0.8b}, x0, x1
158
+    ld1             {v1.8b}, x0, x1
159
+    ushll           v0.8h, v0.8b, #P2S_SHIFT
160
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
161
+#endif
162
+    add             v2.8h, v0.8h, v31.8h
163
+    st1             {v2.8h}, x2, x3
164
+    add             v3.8h, v1.8h, v31.8h
165
+    st1             {v3.8h}, x2, x3
166
+.endr
167
+    ret
168
+endfunc
169
+.endm
170
+
171
+p2s_8xN 2
172
+p2s_8xN 4
173
+p2s_8xN 6
174
+p2s_8xN 8
175
+p2s_8xN 12
176
+p2s_8xN 16
177
+p2s_8xN 32
178
+p2s_8xN 64
179
+
180
+.macro p2s_12xN h
181
+function PFX(filterPixelToShort_12x\h\()_neon)
182
+    p2s_start
183
+    sub             x3, x3, #16
184
+.rept \h
185
+#if HIGH_BIT_DEPTH
186
+    ld1             {v0.16b-v1.16b}, x0, x1
187
+    shl             v2.8h, v0.8h, #P2S_SHIFT
188
+    shl             v3.8h, v1.8h, #P2S_SHIFT
189
+#else
190
+    ld1             {v0.16b}, x0, x1
191
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
192
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
193
+#endif
194
+    add             v2.8h, v2.8h, v31.8h
195
+    add             v3.8h, v3.8h, v31.8h
196
+    st1             {v2.16b}, x2, #16
197
+    st1             {v3.8b}, x2, x3
198
+.endr
199
+    ret
200
+endfunc
201
+.endm
202
+
203
+p2s_12xN 16
204
+p2s_12xN 32
205
+
206
+.macro p2s_16xN h
207
+function PFX(filterPixelToShort_16x\h\()_neon)
208
+    p2s_start
209
+.rept \h
210
+#if HIGH_BIT_DEPTH
211
+    ld1             {v0.16b-v1.16b}, x0, x1
212
+    shl             v2.8h, v0.8h, #P2S_SHIFT
213
+    shl             v3.8h, v1.8h, #P2S_SHIFT
214
+#else
215
+    ld1             {v0.16b}, x0, x1
216
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
217
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
218
+#endif
219
+    add             v2.8h, v2.8h, v31.8h
220
+    add             v3.8h, v3.8h, v31.8h
221
+    st1             {v2.16b-v3.16b}, x2, x3
222
+.endr
223
+    ret
224
+endfunc
225
+.endm
226
+
227
+p2s_16xN 4
228
+p2s_16xN 8
229
+p2s_16xN 12
230
+p2s_16xN 16
231
+p2s_16xN 24
232
+p2s_16xN 32
233
+p2s_16xN 64
234
+
235
+.macro p2s_24xN h
236
+function PFX(filterPixelToShort_24x\h\()_neon)
237
+    p2s_start
238
+.rept \h
239
+#if HIGH_BIT_DEPTH
240
+    ld1             {v0.16b-v2.16b}, x0, x1
241
+    shl             v3.8h, v0.8h, #P2S_SHIFT
242
+    shl             v4.8h, v1.8h, #P2S_SHIFT
243
+    shl             v5.8h, v2.8h, #P2S_SHIFT
244
+#else
245
+    ld1             {v0.8b-v2.8b}, x0, x1
246
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
247
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
248
+    ushll           v5.8h, v2.8b, #P2S_SHIFT
249
+#endif
250
+    add             v3.8h, v3.8h, v31.8h
251
+    add             v4.8h, v4.8h, v31.8h
252
+    add             v5.8h, v5.8h, v31.8h
253
+    st1             {v3.16b-v5.16b}, x2, x3
254
+.endr
255
+    ret
256
+endfunc
257
+.endm
258
+
259
+p2s_24xN 32
260
+p2s_24xN 64
261
+
262
+.macro p2s_32xN h
263
+function PFX(filterPixelToShort_32x\h\()_neon)
264
+    p2s_start
265
+    mov             x9, #\h
266
+.loop_filterP2S_32x\h:
267
+    sub             x9, x9, #1
268
+#if HIGH_BIT_DEPTH
269
+    ld1             {v0.16b-v3.16b}, x0, x1
270
+    shl             v22.8h, v0.8h, #P2S_SHIFT
271
+    shl             v23.8h, v1.8h, #P2S_SHIFT
272
+    shl             v24.8h, v2.8h, #P2S_SHIFT
273
+    shl             v25.8h, v3.8h, #P2S_SHIFT
274
+#else
275
+    ld1             {v0.16b-v1.16b}, x0, x1
276
+    ushll           v22.8h, v0.8b,  #P2S_SHIFT
277
+    ushll2          v23.8h, v0.16b, #P2S_SHIFT
278
+    ushll           v24.8h, v1.8b,  #P2S_SHIFT
279
+    ushll2          v25.8h, v1.16b, #P2S_SHIFT
280
+#endif
281
+    add             v22.8h, v22.8h, v31.8h
282
+    add             v23.8h, v23.8h, v31.8h
283
+    add             v24.8h, v24.8h, v31.8h
284
+    add             v25.8h, v25.8h, v31.8h
285
+    st1             {v22.16b-v25.16b}, x2, x3
286
+    cbnz            x9, .loop_filterP2S_32x\h
287
+    ret
288
+endfunc
289
+.endm
290
+
291
+p2s_32xN 8
292
+p2s_32xN 16
293
+p2s_32xN 24
294
+p2s_32xN 32
295
+p2s_32xN 48
296
+p2s_32xN 64
297
+
298
+.macro p2s_64xN h
299
+function PFX(filterPixelToShort_64x\h\()_neon)
300
+    p2s_start
301
+#if HIGH_BIT_DEPTH
302
+    sub             x1, x1, #64
303
+#endif
304
+    sub             x3, x3, #64
305
+    mov             x9, #\h
306
+.loop_filterP2S_64x\h:
307
+    sub             x9, x9, #1
308
+#if HIGH_BIT_DEPTH
309
+    ld1             {v0.16b-v3.16b}, x0, #64
310
+    ld1             {v4.16b-v7.16b}, x0, x1
311
+    shl             v16.8h, v0.8h, #P2S_SHIFT
312
+    shl             v17.8h, v1.8h, #P2S_SHIFT
313
+    shl             v18.8h, v2.8h, #P2S_SHIFT
314
+    shl             v19.8h, v3.8h, #P2S_SHIFT
315
+    shl             v20.8h, v4.8h, #P2S_SHIFT
316
+    shl             v21.8h, v5.8h, #P2S_SHIFT
317
+    shl             v22.8h, v6.8h, #P2S_SHIFT
318
+    shl             v23.8h, v7.8h, #P2S_SHIFT
319
+#else
320
+    ld1             {v0.16b-v3.16b}, x0, x1
321
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
322
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
323
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
324
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
325
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
326
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
327
+    ushll           v22.8h, v3.8b,  #P2S_SHIFT
328
+    ushll2          v23.8h, v3.16b, #P2S_SHIFT
329
+#endif
330
+    add             v16.8h, v16.8h, v31.8h
331
+    add             v17.8h, v17.8h, v31.8h
332
+    add             v18.8h, v18.8h, v31.8h
333
+    add             v19.8h, v19.8h, v31.8h
334
+    add             v20.8h, v20.8h, v31.8h
335
+    add             v21.8h, v21.8h, v31.8h
336
+    add             v22.8h, v22.8h, v31.8h
337
+    add             v23.8h, v23.8h, v31.8h
338
+    st1             {v16.16b-v19.16b}, x2, #64
339
+    st1             {v20.16b-v23.16b}, x2, x3
340
+    cbnz            x9, .loop_filterP2S_64x\h
341
+    ret
342
+endfunc
343
+.endm
344
+
345
+p2s_64xN 16
346
+p2s_64xN 32
347
+p2s_64xN 48
348
+p2s_64xN 64
349
+
350
+function PFX(filterPixelToShort_48x64_neon)
351
+    p2s_start
352
+#if HIGH_BIT_DEPTH
353
+    sub             x1, x1, #64
354
+#endif
355
+    sub             x3, x3, #64
356
+    mov             x9, #64
357
+.loop_filterP2S_48x64:
358
+    sub             x9, x9, #1
359
+#if HIGH_BIT_DEPTH
360
+    ld1             {v0.16b-v3.16b}, x0, #64
361
+    ld1             {v4.16b-v5.16b}, x0, x1
362
+    shl             v16.8h, v0.8h, #P2S_SHIFT
363
+    shl             v17.8h, v1.8h, #P2S_SHIFT
364
+    shl             v18.8h, v2.8h, #P2S_SHIFT
365
+    shl             v19.8h, v3.8h, #P2S_SHIFT
366
+    shl             v20.8h, v4.8h, #P2S_SHIFT
367
+    shl             v21.8h, v5.8h, #P2S_SHIFT
368
+#else
369
+    ld1             {v0.16b-v2.16b}, x0, x1
370
+    ushll           v16.8h, v0.8b,  #P2S_SHIFT
371
+    ushll2          v17.8h, v0.16b, #P2S_SHIFT
372
+    ushll           v18.8h, v1.8b,  #P2S_SHIFT
373
+    ushll2          v19.8h, v1.16b, #P2S_SHIFT
374
+    ushll           v20.8h, v2.8b,  #P2S_SHIFT
375
+    ushll2          v21.8h, v2.16b, #P2S_SHIFT
376
+#endif
377
+    add             v16.8h, v16.8h, v31.8h
378
+    add             v17.8h, v17.8h, v31.8h
379
+    add             v18.8h, v18.8h, v31.8h
380
+    add             v19.8h, v19.8h, v31.8h
381
+    add             v20.8h, v20.8h, v31.8h
382
+    add             v21.8h, v21.8h, v31.8h
383
+    st1             {v16.16b-v19.16b}, x2, #64
384
+    st1             {v20.16b-v21.16b}, x2, x3
385
+    cbnz            x9, .loop_filterP2S_48x64
386
+    ret
387
+endfunc
388
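Throughout this file (and in the addAvg kernels earlier), rows wider than four vector registers are written as a fixed 64-byte post-increment store followed by a tail store that also applies the row stride; the prologue pre-reduces the stride by 64 so the two post-increments add up to exactly one row step. In pointer terms (an illustrative sketch, not part of the original):

    #include <cstdint>

    // One output row of a 64-byte-wide kernel:
    //   st1 {...}, [x2], #64   writes the first chunk, bumps x2 by 64
    //   st1 {...}, [x2], x3    writes the tail, bumps x2 by x3
    // where x3 = dstStrideBytes - 64 after the prologue's "sub x3, x3, #64".
    static inline char* advance_row(char* dst, intptr_t dstStrideBytes)
    {
        dst += 64;                      // post-increment of the first store
        dst += dstStrideBytes - 64;     // post-increment of the tail store
        return dst;                     // == start of row + full stride
    }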
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp Added
2061
 
1
@@ -0,0 +1,2059 @@
2
+#include "common.h"
3
+#include "slicetype.h"      // LOWRES_COST_MASK
4
+#include "primitives.h"
5
+#include "x265.h"
6
+
7
+#include "pixel-prim.h"
8
+#include "arm64-utils.h"
9
+#if HAVE_NEON
10
+
11
+#include <arm_neon.h>
12
+
13
+using namespace X265_NS;
14
+
15
+
16
+
17
+namespace
18
+{
19
+
20
+
21
+/* SATD SA8D variants - based on x264 */
22
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
23
+{
24
+    sum = vaddq_s16(a, b);
25
+    sub = vsubq_s16(a, b);
26
+}
27
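// Scalar view of the butterfly above: one SUMSUB_AB stage maps a pair
// (a, b) to (a + b, a - b). Two rounds of it, with a partner swap in
// between, form the 4-point Hadamard transform that the SATD kernels
// below evaluate eight lanes at a time. A minimal sketch:
static inline void hadamard4(int d[4], const int s[4])
{
    int t0 = s[0] + s[1], t1 = s[0] - s[1];   // first butterfly round
    int t2 = s[2] + s[3], t3 = s[2] - s[3];
    d[0] = t0 + t2;                           // second butterfly round
    d[1] = t1 + t3;
    d[2] = t0 - t2;
    d[3] = t1 - t3;
}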
+
28
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
29
+{
30
+    t1 = vtrn1q_s16(s1, s2);
31
+    t2 = vtrn2q_s16(s1, s2);
32
+}
33
+
34
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
35
+{
36
+    t1 = vtrn1q_s32(s1, s2);
37
+    t2 = vtrn2q_s32(s1, s2);
38
+}
39
+
40
+#if (X265_DEPTH <= 10)
41
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
42
+{
43
+    t1 = vtrn1q_s64(s1, s2);
44
+    t2 = vtrn2q_s64(s1, s2);
45
+}
46
+#endif
47
+
48
+
49
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
50
+                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
51
+{
52
+    SUMSUB_AB(s1, d1, a, b);
53
+    SUMSUB_AB(s2, d2, c, d);
54
+}
55
+
56
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
57
+                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
58
+{
59
+    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
60
+    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
61
+}
62
+
63
+
64
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
65
+
66
+{
67
+
68
+    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
69
+
70
+
71
+    SUMSUB_AB(v16, v17, v0,  v1);
72
+    SUMSUB_AB(v18, v19, v2,  v3);
73
+
74
+    SUMSUB_AB(v4 , v6 , v16, v18);
75
+    SUMSUB_AB(v5 , v7 , v17, v19);
76
+
77
+    v0 = vtrn1q_s16(v4, v5);
78
+    v1 = vtrn2q_s16(v4, v5);
79
+    v2 = vtrn1q_s16(v6, v7);
80
+    v3 = vtrn2q_s16(v6, v7);
81
+
82
+    SUMSUB_AB(v16, v17, v0,  v1);
83
+    SUMSUB_AB(v18, v19, v2,  v3);
84
+
85
+    v0 = vtrn1q_s32(v16, v18);
86
+    v1 = vtrn2q_s32(v16, v18);
87
+    v2 = vtrn1q_s32(v17, v19);
88
+    v3 = vtrn2q_s32(v17, v19);
89
+
90
+    v0 = vabsq_s16(v0);
91
+    v1 = vabsq_s16(v1);
92
+    v2 = vabsq_s16(v2);
93
+    v3 = vabsq_s16(v3);
94
+
95
+    v0 = vmaxq_u16(v0, v1);
96
+    v1 = vmaxq_u16(v2, v3);
97
+
98
+    v0 = vaddq_u16(v0, v1);
99
+    return vaddlvq_u16(v0);
100
+}
101
+
102
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
103
+{
104
+    int16x8_t v2, v3;
105
+    SUMSUB_AB(v2,  v3,  v0,  v1);
106
+
107
+    v0 = vzip1q_s64(v2, v3);
108
+    v1 = vzip2q_s64(v2, v3);
109
+    SUMSUB_AB(v2,  v3,  v0,  v1);
110
+
111
+    v0 = vtrn1q_s16(v2, v3);
112
+    v1 = vtrn2q_s16(v2, v3);
113
+    SUMSUB_AB(v2,  v3,  v0,  v1);
114
+
115
+    v0 = vtrn1q_s32(v2, v3);
116
+    v1 = vtrn2q_s32(v2, v3);
117
+
118
+    v0 = vabsq_s16(v0);
119
+    v1 = vabsq_s16(v1);
120
+    v0 = vmaxq_u16(v0, v1);
121
+
122
+    return vaddlvq_s16(v0);
123
+}
124
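// Scalar reference for what _satd_4x4_neon computes: the sum of absolute
// values of the 2-D Hadamard transform of the residual (the abs/max
// ending above absorbs the final >>1 normalization). A sketch reusing
// the hadamard4() helper above; `diff` holds pix1 - pix2 per sample.
static int satd_4x4_ref(const int diff[4][4])
{
    int rows[4][4];
    for (int i = 0; i < 4; i++)                  // transform each row
        hadamard4(rows[i], diff[i]);

    int sum = 0;
    for (int j = 0; j < 4; j++)                  // then each column
    {
        int col[4] = { rows[0][j], rows[1][j], rows[2][j], rows[3][j] };
        int out[4];
        hadamard4(out, col);
        for (int i = 0; i < 4; i++)
            sum += out[i] < 0 ? -out[i] : out[i];
    }
    return sum >> 1;                             // x264/x265 convention
}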
+
125
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
126
+                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
127
+{
128
+    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
129
+
130
+    SUMSUB_AB(v16, v18, v0,  v2);
131
+    SUMSUB_AB(v17, v19, v1,  v3);
132
+
133
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
134
+
135
+    transpose_8h(v0,  v1,  v16, v17);
136
+    transpose_8h(v2,  v3,  v18, v19);
137
+    transpose_8h(v4,  v5,  v20, v21);
138
+    transpose_8h(v6,  v7,  v22, v23);
139
+
140
+    SUMSUB_AB(v16, v17, v0,  v1);
141
+    SUMSUB_AB(v18, v19, v2,  v3);
142
+    SUMSUB_AB(v20, v21, v4,  v5);
143
+    SUMSUB_AB(v22, v23, v6,  v7);
144
+
145
+    transpose_4s(v0,  v2,  v16, v18);
146
+    transpose_4s(v1,  v3,  v17, v19);
147
+    transpose_4s(v4,  v6,  v20, v22);
148
+    transpose_4s(v5,  v7,  v21, v23);
149
+
150
+    v0 = vabsq_s16(v0);
151
+    v1 = vabsq_s16(v1);
152
+    v2 = vabsq_s16(v2);
153
+    v3 = vabsq_s16(v3);
154
+    v4 = vabsq_s16(v4);
155
+    v5 = vabsq_s16(v5);
156
+    v6 = vabsq_s16(v6);
157
+    v7 = vabsq_s16(v7);
158
+
159
+    v0 = vmaxq_u16(v0, v2);
160
+    v1 = vmaxq_u16(v1, v3);
161
+    v2 = vmaxq_u16(v4, v6);
162
+    v3 = vmaxq_u16(v5, v7);
163
+
164
+}
165
+
166
+#if HIGH_BIT_DEPTH
167
+
168
+#if (X265_DEPTH > 10)
169
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
170
+{
171
+    t1 = vtrn1q_s64(s1, s2);
172
+    t2 = vtrn2q_s64(s1, s2);
173
+}
174
+
175
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
176
+{
177
+    sum = vaddq_s32(a, b);
178
+    sub = vsubq_s32(a, b);
179
+}
180
+
181
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
182
+        const int16x8_t a, const int16x8_t b)
183
+{
184
+    suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
185
+    sumh = vaddl_high_s16(a, b);
186
+    subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
187
+    subh = vsubl_high_s16(a, b);
188
+}
189
+
190
+#endif
191
+
192
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
193
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
194
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
195
+{
196
+    uint16x8_t r0, r1, r2, r3;
197
+    uint16x8_t t0, t1, t2, t3;
198
+    int16x8_t v16, v17;
199
+    int16x8_t v18, v19;
200
+
201
+    r0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
202
+    r1 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
203
+    r2 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
204
+    r3 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
205
+
206
+    t0 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
207
+    t1 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
208
+    t2 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
209
+    t3 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
210
+
211
+    v16 = vsubq_u16(r0, t0);
212
+    v17 = vsubq_u16(r1, t1);
213
+    v18 = vsubq_u16(r2, t2);
214
+    v19 = vsubq_u16(r3, t3);
215
+
216
+    r0 = *(uint16x8_t *)(pix1 + 4 * stride_pix1);
217
+    r1 = *(uint16x8_t *)(pix1 + 5 * stride_pix1);
218
+    r2 = *(uint16x8_t *)(pix1 + 6 * stride_pix1);
219
+    r3 = *(uint16x8_t *)(pix1 + 7 * stride_pix1);
220
+
221
+    t0 = *(uint16x8_t *)(pix2 + 4 * stride_pix2);
222
+    t1 = *(uint16x8_t *)(pix2 + 5 * stride_pix2);
223
+    t2 = *(uint16x8_t *)(pix2 + 6 * stride_pix2);
224
+    t3 = *(uint16x8_t *)(pix2 + 7 * stride_pix2);
225
+
226
+    v20 = vsubq_u16(r0, t0);
227
+    v21 = vsubq_u16(r1, t1);
228
+    v22 = vsubq_u16(r2, t2);
229
+    v23 = vsubq_u16(r3, t3);
230
+
231
+    SUMSUB_AB(v0,  v1,  v16, v17);
232
+    SUMSUB_AB(v2,  v3,  v18, v19);
233
+
234
+}
235
+
236
+
237
+
238
+
239
+static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
240
+                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
241
+{
242
+    uint16x8_t r0, r1, r2, r3;
243
+    uint16x8_t t0, t1, t2, t3;
244
+    int16x8_t v16, v17, v20, v21;
245
+    int16x8_t v18, v19, v22, v23;
246
+
247
+    r0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
248
+    r1 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
249
+    r2 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
250
+    r3 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
251
+
252
+    t0 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
253
+    t1 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
254
+    t2 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
255
+    t3 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
256
+
257
+
258
+    v16 = vsubq_u16((r0), (t0));
259
+    v17 = vsubq_u16((r1), (t1));
260
+    v18 = vsubq_u16((r2), (t2));
261
+    v19 = vsubq_u16((r3), (t3));
262
+
263
+    r0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1 + 8);
264
+    r1 = *(uint16x8_t *)(pix1 + 1 * stride_pix1 + 8);
265
+    r2 = *(uint16x8_t *)(pix1 + 2 * stride_pix1 + 8);
266
+    r3 = *(uint16x8_t *)(pix1 + 3 * stride_pix1 + 8);
267
+
268
+    t0 = *(uint16x8_t *)(pix2 + 0 * stride_pix2 + 8);
269
+    t1 = *(uint16x8_t *)(pix2 + 1 * stride_pix2 + 8);
270
+    t2 = *(uint16x8_t *)(pix2 + 2 * stride_pix2 + 8);
271
+    t3 = *(uint16x8_t *)(pix2 + 3 * stride_pix2 + 8);
272
+
273
+
274
+    v20 = vsubq_u16(r0, t0);
275
+    v21 = vsubq_u16(r1, t1);
276
+    v22 = vsubq_u16(r2, t2);
277
+    v23 = vsubq_u16(r3, t3);
278
+
279
+    SUMSUB_AB(v0,  v1,  v16, v17);
280
+    SUMSUB_AB(v2,  v3,  v18, v19);
281
+
282
+    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
283
+
284
+}
285
+
286
+
287
+int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
288
+{
289
+    uint64x2_t t0, t1, r0, r1;
290
+    t0[0] = *(uint64_t *)(pix1 + 0 * stride_pix1);
291
+    t1[0] = *(uint64_t *)(pix1 + 1 * stride_pix1);
292
+    t0[1] = *(uint64_t *)(pix1 + 2 * stride_pix1);
293
+    t1[1] = *(uint64_t *)(pix1 + 3 * stride_pix1);
294
+
295
+    r0[0] = *(uint64_t *)(pix2 + 0 * stride_pix2);
296
+    r1[0] = *(uint64_t *)(pix2 + 1 * stride_pix2);
297
+    r0[1] = *(uint64_t *)(pix2 + 2 * stride_pix2);
298
+    r1[1] = *(uint64_t *)(pix2 + 3 * stride_pix2);
299
+
300
+    return _satd_4x4_neon(vsubq_u16(t0, r0), vsubq_u16(r1, t1));
301
+}
302
+
303
+
304
+
305
+
306
+
307
+
308
+int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
309
+{
310
+    uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
311
+
312
+    i0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
313
+    i1 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
314
+    i2 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
315
+    i3 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
316
+    i4 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
317
+    i5 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
318
+    i6 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
319
+    i7 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
320
+
321
+    int16x8_t v0 = vsubq_u16(i0, i1);
322
+    int16x8_t v1 = vsubq_u16(i2, i3);
323
+    int16x8_t v2 = vsubq_u16(i4, i5);
324
+    int16x8_t v3 = vsubq_u16(i6, i7);
325
+
326
+    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
327
+}
328
+
329
+
330
+int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
331
+{
332
+    int32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
333
+    int16x8_t v0, v1, v2, v3;
334
+    for (int offset = 0; offset <= 12; offset += 4) {
335
+        _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1, pix2 + offset * stride_pix2, stride_pix2, v0, v1, v2, v3);
336
+        v30 = vpadalq_u16(v30, v0);
337
+        v30 = vpadalq_u16(v30, v1);
338
+        v31 = vpadalq_u16(v31, v2);
339
+        v31 = vpadalq_u16(v31, v3);
340
+    }
341
+    return vaddvq_s32(vaddq_s32(v30, v31));
342
+
343
+}
344
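// The loop above folds each 16x4 partial result into 32-bit lanes as it
// goes: vpadalq_u16 pairwise-adds eight 16-bit values into four 32-bit
// accumulators. At 10/12-bit depth the partial sums can exceed 16 bits,
// so the high-bit-depth path widens on every iteration rather than once
// at the end. An illustrative helper (not part of the original):
static inline uint32x4_t accumulate_partial(uint32x4_t acc, uint16x8_t partial)
{
    return vpadalq_u16(acc, partial);   // acc[i] += partial[2i] + partial[2i+1]
}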
+
345
+#else       //HIGH_BIT_DEPTH
346
+
347
+static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
348
+                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
349
+{
350
+    uint8x16_t r0, r1, r2, r3;
351
+    uint8x16_t t0, t1, t2, t3;
352
+    int16x8_t v16, v17, v20, v21;
353
+    int16x8_t v18, v19, v22, v23;
354
+
355
+    r0 = *(uint8x16_t *)(pix1 + 0 * stride_pix1);
356
+    r1 = *(uint8x16_t *)(pix1 + 1 * stride_pix1);
357
+    r2 = *(uint8x16_t *)(pix1 + 2 * stride_pix1);
358
+    r3 = *(uint8x16_t *)(pix1 + 3 * stride_pix1);
359
+
360
+    t0 = *(uint8x16_t *)(pix2 + 0 * stride_pix2);
361
+    t1 = *(uint8x16_t *)(pix2 + 1 * stride_pix2);
362
+    t2 = *(uint8x16_t *)(pix2 + 2 * stride_pix2);
363
+    t3 = *(uint8x16_t *)(pix2 + 3 * stride_pix2);
364
+
365
+
366
+
367
+    v16 = vsubl_u8(vget_low_u8(r0), vget_low_u8(t0));
368
+    v20 = vsubl_high_u8(r0, t0);
369
+    v17 = vsubl_u8(vget_low_u8(r1), vget_low_u8(t1));
370
+    v21 = vsubl_high_u8(r1, t1);
371
+    v18 = vsubl_u8(vget_low_u8(r2), vget_low_u8(t2));
372
+    v22 = vsubl_high_u8(r2, t2);
373
+    v19 = vsubl_u8(vget_low_u8(r3), vget_low_u8(t3));
374
+    v23 = vsubl_high_u8(r3, t3);
375
+
376
+    SUMSUB_AB(v0,  v1,  v16, v17);
377
+    SUMSUB_AB(v2,  v3,  v18, v19);
378
+
379
+    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
380
+
381
+}
382
+
383
+
384
+static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
385
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
386
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
387
+{
388
+    uint8x8_t r0, r1, r2, r3;
389
+    uint8x8_t t0, t1, t2, t3;
390
+    int16x8_t v16, v17;
391
+    int16x8_t v18, v19;
392
+
393
+    r0 = *(uint8x8_t *)(pix1 + 0 * stride_pix1);
394
+    r1 = *(uint8x8_t *)(pix1 + 1 * stride_pix1);
395
+    r2 = *(uint8x8_t *)(pix1 + 2 * stride_pix1);
396
+    r3 = *(uint8x8_t *)(pix1 + 3 * stride_pix1);
397
+
398
+    t0 = *(uint8x8_t *)(pix2 + 0 * stride_pix2);
399
+    t1 = *(uint8x8_t *)(pix2 + 1 * stride_pix2);
400
+    t2 = *(uint8x8_t *)(pix2 + 2 * stride_pix2);
401
+    t3 = *(uint8x8_t *)(pix2 + 3 * stride_pix2);
402
+
403
+    v16 = vsubl_u8(r0, t0);
404
+    v17 = vsubl_u8(r1, t1);
405
+    v18 = vsubl_u8(r2, t2);
406
+    v19 = vsubl_u8(r3, t3);
407
+
408
+    r0 = *(uint8x8_t *)(pix1 + 4 * stride_pix1);
409
+    r1 = *(uint8x8_t *)(pix1 + 5 * stride_pix1);
410
+    r2 = *(uint8x8_t *)(pix1 + 6 * stride_pix1);
411
+    r3 = *(uint8x8_t *)(pix1 + 7 * stride_pix1);
412
+
413
+    t0 = *(uint8x8_t *)(pix2 + 4 * stride_pix2);
414
+    t1 = *(uint8x8_t *)(pix2 + 5 * stride_pix2);
415
+    t2 = *(uint8x8_t *)(pix2 + 6 * stride_pix2);
416
+    t3 = *(uint8x8_t *)(pix2 + 7 * stride_pix2);
417
+
418
+    v20 = vsubl_u8(r0, t0);
419
+    v21 = vsubl_u8(r1, t1);
420
+    v22 = vsubl_u8(r2, t2);
421
+    v23 = vsubl_u8(r3, t3);
422
+
423
+
424
+    SUMSUB_AB(v0,  v1,  v16, v17);
425
+    SUMSUB_AB(v2,  v3,  v18, v19);
426
+
427
+}
428
+
429
+int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
430
+{
431
+    uint32x2_t t0, t1, r0, r1;
432
+    t0[0] = *(uint32_t *)(pix1 + 0 * stride_pix1);
433
+    t1[0] = *(uint32_t *)(pix1 + 1 * stride_pix1);
434
+    t0[1] = *(uint32_t *)(pix1 + 2 * stride_pix1);
435
+    t1[1] = *(uint32_t *)(pix1 + 3 * stride_pix1);
436
+
437
+    r0[0] = *(uint32_t *)(pix2 + 0 * stride_pix2);
438
+    r1[0] = *(uint32_t *)(pix2 + 1 * stride_pix2);
439
+    r0[1] = *(uint32_t *)(pix2 + 2 * stride_pix2);
440
+    r1[1] = *(uint32_t *)(pix2 + 3 * stride_pix2);
441
+
442
+    return _satd_4x4_neon(vsubl_u8(t0, r0), vsubl_u8(r1, t1));
443
+}
444
+
445
+
446
+int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
447
+{
448
+    uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
449
+
450
+    i0 = *(uint8x8_t *)(pix1 + 0 * stride_pix1);
451
+    i1 = *(uint8x8_t *)(pix2 + 0 * stride_pix2);
452
+    i2 = *(uint8x8_t *)(pix1 + 1 * stride_pix1);
453
+    i3 = *(uint8x8_t *)(pix2 + 1 * stride_pix2);
454
+    i4 = *(uint8x8_t *)(pix1 + 2 * stride_pix1);
455
+    i5 = *(uint8x8_t *)(pix2 + 2 * stride_pix2);
456
+    i6 = *(uint8x8_t *)(pix1 + 3 * stride_pix1);
457
+    i7 = *(uint8x8_t *)(pix2 + 3 * stride_pix2);
458
+
459
+    int16x8_t v0 = vsubl_u8(i0, i1);
460
+    int16x8_t v1 = vsubl_u8(i2, i3);
461
+    int16x8_t v2 = vsubl_u8(i4, i5);
462
+    int16x8_t v3 = vsubl_u8(i6, i7);
463
+
464
+    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
465
+}
466
+
467
+int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
468
+{
469
+    int16x8_t v30, v31;
470
+    int16x8_t v0, v1, v2, v3;
471
+
472
+    _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
473
+    v30 = vaddq_s16(v0, v1);
474
+    v31 = vaddq_s16(v2, v3);
475
+
476
+    _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
477
+    v0 = vaddq_s16(v0, v1);
478
+    v1 = vaddq_s16(v2, v3);
479
+    v30 = vaddq_s16(v30, v0);
480
+    v31 = vaddq_s16(v31, v1);
481
+
482
+    _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
483
+    v0 = vaddq_s16(v0, v1);
484
+    v1 = vaddq_s16(v2, v3);
485
+    v30 = vaddq_s16(v30, v0);
486
+    v31 = vaddq_s16(v31, v1);
487
+
488
+    _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
489
+    v0 = vaddq_s16(v0, v1);
490
+    v1 = vaddq_s16(v2, v3);
491
+    v30 = vaddq_s16(v30, v0);
492
+    v31 = vaddq_s16(v31, v1);
493
+
494
+    int32x4_t sum0 = vpaddlq_u16(v30);
495
+    int32x4_t sum1 = vpaddlq_u16(v31);
496
+    sum0 = vaddq_s32(sum0, sum1);
497
+    return vaddvq_s32(sum0);
498
+
499
+}
500
+#endif      //HIGH_BIT_DEPTH
501
+
502
+
503
+static inline void _sa8d_8x8_neon_end(int16x8_t &v0, int16x8_t &v1, int16x8_t v2, int16x8_t v3,
504
+                                      int16x8_t v20, int16x8_t v21, int16x8_t v22, int16x8_t v23)
505
+{
506
+    int16x8_t v16, v17, v18, v19;
507
+    int16x8_t v4, v5, v6, v7;
508
+
509
+    SUMSUB_AB(v16, v18, v0,  v2);
510
+    SUMSUB_AB(v17, v19, v1,  v3);
511
+
512
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
513
+
514
+    SUMSUB_AB(v0,  v16, v16, v20);
515
+    SUMSUB_AB(v1,  v17, v17, v21);
516
+    SUMSUB_AB(v2,  v18, v18, v22);
517
+    SUMSUB_AB(v3,  v19, v19, v23);
518
+
519
+    transpose_8h(v20, v21, v16, v17);
520
+    transpose_8h(v4,  v5,  v0,  v1);
521
+    transpose_8h(v22, v23, v18, v19);
522
+    transpose_8h(v6,  v7,  v2,  v3);
523
+
524
+#if (X265_DEPTH <= 10)
525
+
526
+    int16x8_t v24, v25;
527
+
528
+    SUMSUB_AB(v2,  v3,  v20, v21);
529
+    SUMSUB_AB(v24, v25, v4,  v5);
530
+    SUMSUB_AB(v0,  v1,  v22, v23);
531
+    SUMSUB_AB(v4,  v5,  v6,  v7);
532
+
533
+    transpose_4s(v20, v22, v2,  v0);
534
+    transpose_4s(v21, v23, v3,  v1);
535
+    transpose_4s(v16, v18, v24, v4);
536
+    transpose_4s(v17, v19, v25, v5);
537
+
538
+    SUMSUB_AB(v0,  v2,  v20, v22);
539
+    SUMSUB_AB(v1,  v3,  v21, v23);
540
+    SUMSUB_AB(v4,  v6,  v16, v18);
541
+    SUMSUB_AB(v5,  v7,  v17, v19);
542
+
543
+    transpose_2d(v16, v20,  v0,  v4);
544
+    transpose_2d(v17, v21,  v1,  v5);
545
+    transpose_2d(v18, v22,  v2,  v6);
546
+    transpose_2d(v19, v23,  v3,  v7);
547
+
548
+
549
+    v16 = vabsq_s16(v16);
550
+    v17 = vabsq_s16(v17);
551
+    v18 = vabsq_s16(v18);
552
+    v19 = vabsq_s16(v19);
553
+    v20 = vabsq_s16(v20);
554
+    v21 = vabsq_s16(v21);
555
+    v22 = vabsq_s16(v22);
556
+    v23 = vabsq_s16(v23);
557
+
558
+    v16 = vmaxq_u16(v16, v20);
559
+    v17 = vmaxq_u16(v17, v21);
560
+    v18 = vmaxq_u16(v18, v22);
561
+    v19 = vmaxq_u16(v19, v23);
562
+
563
+#if HIGH_BIT_DEPTH
564
+    v0 = vpaddlq_u16(v16);
565
+    v1 = vpaddlq_u16(v17);
566
+    v0 = vpadalq_u16(v0, v18);
567
+    v1 = vpadalq_u16(v1, v19);
568
+
569
+#else //HIGH_BIT_DEPTH
570
+
571
+    v0 = vaddq_u16(v16, v17);
572
+    v1 = vaddq_u16(v18, v19);
573
+
574
+#endif //HIGH_BIT_DEPTH
575
+
576
+#else // HIGH_BIT_DEPTH 12-bit only: switching math to int32, each int16x8 is up-converted to 2 int32x4 (low and high)
577
+
578
+    int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
579
+    int32x4_t v22l, v22h, v23l, v23h;
580
+    int32x4_t v4l, v4h, v5l, v5h;
581
+    int32x4_t v6l, v6h, v7l, v7h;
582
+    int32x4_t v16l, v16h, v17l, v17h;
583
+    int32x4_t v18l, v18h, v19l, v19h;
584
+    int32x4_t v20l, v20h, v21l, v21h;
585
+
586
+    ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
587
+    ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
588
+
589
+    v22l = vmovl_s16(vget_low_s16(v22));
590
+    v22h = vmovl_high_s16(v22);
591
+    v23l = vmovl_s16(vget_low_s16(v23));
592
+    v23h = vmovl_high_s16(v23);
593
+
594
+    ISUMSUB_AB(v0l,  v1l,  v22l, v23l);
595
+    ISUMSUB_AB(v0h,  v1h,  v22h, v23h);
596
+
597
+    v6l = vmovl_s16(vget_low_s16(v6));
598
+    v6h = vmovl_high_s16(v6);
599
+    v7l = vmovl_s16(vget_low_s16(v7));
600
+    v7h = vmovl_high_s16(v7);
601
+
602
+    ISUMSUB_AB(v4l,  v5l,  v6l,  v7l);
603
+    ISUMSUB_AB(v4h,  v5h,  v6h,  v7h);
604
+
605
+    transpose_2d(v20l, v22l, v2l,  v0l);
606
+    transpose_2d(v21l, v23l, v3l,  v1l);
607
+    transpose_2d(v16l, v18l, v24l, v4l);
608
+    transpose_2d(v17l, v19l, v25l, v5l);
609
+
610
+    transpose_2d(v20h, v22h, v2h,  v0h);
611
+    transpose_2d(v21h, v23h, v3h,  v1h);
612
+    transpose_2d(v16h, v18h, v24h, v4h);
613
+    transpose_2d(v17h, v19h, v25h, v5h);
614
+
615
+    ISUMSUB_AB(v0l,  v2l,  v20l, v22l);
616
+    ISUMSUB_AB(v1l,  v3l,  v21l, v23l);
617
+    ISUMSUB_AB(v4l,  v6l,  v16l, v18l);
618
+    ISUMSUB_AB(v5l,  v7l,  v17l, v19l);
619
+
620
+    ISUMSUB_AB(v0h,  v2h,  v20h, v22h);
621
+    ISUMSUB_AB(v1h,  v3h,  v21h, v23h);
622
+    ISUMSUB_AB(v4h,  v6h,  v16h, v18h);
623
+    ISUMSUB_AB(v5h,  v7h,  v17h, v19h);
624
+
625
+    v16l = v0l;
626
+    v16h = v4l;
627
+    v20l = v0h;
628
+    v20h = v4h;
629
+
630
+    v17l = v1l;
631
+    v17h = v5l;
632
+    v21l = v1h;
633
+    v21h = v5h;
634
+
635
+    v18l = v2l;
636
+    v18h = v6l;
637
+    v22l = v2h;
638
+    v22h = v6h;
639
+
640
+    v19l = v3l;
641
+    v19h = v7l;
642
+    v23l = v3h;
643
+    v23h = v7h;
644
+
645
+    v16l = vabsq_s32(v16l);
646
+    v17l = vabsq_s32(v17l);
647
+    v18l = vabsq_s32(v18l);
648
+    v19l = vabsq_s32(v19l);
649
+    v20l = vabsq_s32(v20l);
650
+    v21l = vabsq_s32(v21l);
651
+    v22l = vabsq_s32(v22l);
652
+    v23l = vabsq_s32(v23l);
653
+
654
+    v16h = vabsq_s32(v16h);
655
+    v17h = vabsq_s32(v17h);
656
+    v18h = vabsq_s32(v18h);
657
+    v19h = vabsq_s32(v19h);
658
+    v20h = vabsq_s32(v20h);
659
+    v21h = vabsq_s32(v21h);
660
+    v22h = vabsq_s32(v22h);
661
+    v23h = vabsq_s32(v23h);
662
+
663
+    v16l = vmaxq_u32(v16l, v20l);
664
+    v17l = vmaxq_u32(v17l, v21l);
665
+    v18l = vmaxq_u32(v18l, v22l);
666
+    v19l = vmaxq_u32(v19l, v23l);
667
+
668
+    v16h = vmaxq_u32(v16h, v20h);
669
+    v17h = vmaxq_u32(v17h, v21h);
670
+    v18h = vmaxq_u32(v18h, v22h);
671
+    v19h = vmaxq_u32(v19h, v23h);
672
+
673
+    v16l = vaddq_u32(v16l, v16h);
674
+    v17l = vaddq_u32(v17l, v17h);
675
+    v18l = vaddq_u32(v18l, v18h);
676
+    v19l = vaddq_u32(v19l, v19h);
677
+
678
+    v0 = vaddq_u32(v16l, v17l);
679
+    v1 = vaddq_u32(v18l, v19l);
680
+
681
+
682
+#endif
683
+
684
+}
685
+
686
+
687
+
688
+static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
689
+                                  int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
690
+{
691
+
692
+    int16x8_t v20, v21, v22, v23;
693
+    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
694
+    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
695
+
696
+}
697
+
698
+
699
+
700
+int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
701
+{
702
+    int16x8_t v30, v31;
703
+    int16x8_t v0, v1, v2, v3;
704
+
705
+    _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
706
+#if !(HIGH_BIT_DEPTH)
707
+    v30 = vaddq_u16(v0, v1);
708
+    v31 = vaddq_u16(v2, v3);
709
+
710
+    uint16x8_t sum = vaddq_u16(v30, v31);
711
+    return vaddvq_s32(vpaddlq_u16(sum));
712
+#else
713
+
714
+    v30 = vaddq_u16(v0, v1);
715
+    v31 = vaddq_u16(v2, v3);
716
+
717
+    int32x4_t sum = vpaddlq_u16(v30);
718
+    sum = vpadalq_u16(sum, v31);
719
+    return vaddvq_s32(sum);
720
+#endif
721
+}
722
+
723
+
724
+int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
725
+{
726
+    int16x8_t v0, v1, v2, v3;
727
+    int16x8_t v20, v21, v22, v23;
728
+
729
+    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
730
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
731
+
732
+#if HIGH_BIT_DEPTH
733
+    int32x4_t s = vaddq_u32(v0, v1);
734
+    return (vaddvq_u32(s) + 1) >> 1;
735
+#else
736
+    return (vaddlvq_s16(vaddq_u16(v0, v1)) + 1) >> 1;
737
+#endif
738
+}
739
+
740
+
741
+
742
+
743
+
744
+int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
745
+{
746
+    int16x8_t v0, v1, v2, v3;
747
+    int16x8_t v20, v21, v22, v23;
748
+    int32x4_t v30, v31;
749
+
750
+    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
751
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
752
+
753
+#if !(HIGH_BIT_DEPTH)
754
+    v30 = vpaddlq_u16(v0);
755
+    v31 = vpaddlq_u16(v1);
756
+#else
757
+    v30 = vaddq_s32(v0, v1);
758
+#endif
759
+
760
+    _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
761
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
762
+
763
+#if !(HIGH_BIT_DEPTH)
764
+    v30 = vpadalq_u16(v30, v0);
765
+    v31 = vpadalq_u16(v31, v1);
766
+#else
767
+    v31 = vaddq_s32(v0, v1);
768
+#endif
769
+
770
+
771
+    _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
772
+                 v23);
773
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
774
+
775
+#if !(HIGH_BIT_DEPTH)
776
+    v30 = vpadalq_u16(v30, v0);
777
+    v31 = vpadalq_u16(v31, v1);
778
+#else
779
+    v30 = vaddq_s32(v30, v0);
780
+    v31 = vaddq_s32(v31, v1);
781
+#endif
782
+
783
+    _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
784
+                 v22, v23);
785
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
786
+
787
+#if !(HIGH_BIT_DEPTH)
788
+    v30 = vpadalq_u16(v30, v0);
789
+    v31 = vpadalq_u16(v31, v1);
790
+#else
791
+    v30 = vaddq_s32(v30, v0);
792
+    v31 = vaddq_s32(v31, v1);
793
+#endif
794
+
795
+    v30 = vaddq_u32(v30, v31);
796
+
797
+    return (vaddvq_u32(v30) + 1) >> 1;
798
+}
799
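Reviewer note: the 16x16 sa8d above runs _sub_8x8_fly/_sa8d_8x8_neon_end on the four 8x8 quadrants and applies the (sum + 1) >> 1 rounding once, at the very end. A scalar sketch of that tiling, assuming any unrounded 8x8 sa8d helper (the deferred halving means the result can differ by one from summing four independently rounded 8x8 calls):

    #include <cstdint>

    // Hypothetical helper type: a scalar 8x8 sa8d returning the
    // *unrounded* Hadamard sum for one quadrant.
    using sa8d8_fn = int (*)(const uint8_t *, intptr_t, const uint8_t *, intptr_t);

    int sa8d_16x16_sketch(const uint8_t *p1, intptr_t s1,
                          const uint8_t *p2, intptr_t s2, sa8d8_fn sa8d8_raw)
    {
        int sum = sa8d8_raw(p1,              s1, p2,              s2)   // top-left
                + sa8d8_raw(p1 + 8,          s1, p2 + 8,          s2)   // top-right
                + sa8d8_raw(p1 + 8 * s1,     s1, p2 + 8 * s2,     s2)   // bottom-left
                + sa8d8_raw(p1 + 8 * s1 + 8, s1, p2 + 8 * s2 + 8, s2);  // bottom-right
        return (sum + 1) >> 1;  // single rounded halving, as in the NEON code above
    }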
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+template<int size>
808
+void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
809
+{
810
+    for (int y = 0; y < size; y++)
811
+    {
812
+        int x = 0;
813
+        int16x8_t v = vdupq_n_s16(val);
814
+        for (; (x + 8) <= size; x += 8)
815
+        {
816
+            *(int16x8_t *)&dst[y * dstride + x] = v;
817
+        }
818
+        for (; x < size; x++)
819
+        {
820
+            dst[y * dstride + x] = val;
821
+        }
822
+    }
823
+}
824
+
825
+template<int lx, int ly>
826
+int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
827
+{
828
+    int sum = 0;
829
+
830
+
831
+    for (int y = 0; y < ly; y++)
832
+    {
833
+#if HIGH_BIT_DEPTH
834
+        int x = 0;
835
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
836
+        for (; (x + 8) <= lx; x += 8)
837
+        {
838
+            uint16x8_t p1 = *(uint16x8_t *)&pix1[x];
839
+            uint16x8_t p2 = *(uint16x8_t *)&pix2[x];
840
+            vsum16_1 = vabaq_s16(vsum16_1, p1, p2);
841
+
842
+        }
843
+        if (lx & 4)
844
+        {
845
+            uint16x4_t p1 = *(uint16x4_t *)&pix1[x];
846
+            uint16x4_t p2 = *(uint16x4_t *)&pix2[x];
847
+            sum += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
848
+            x += 4;
849
+        }
850
+        if (lx >= 4)
851
+        {
852
+            sum += vaddlvq_s16(vsum16_1);
853
+        }
854
+
855
+#else
856
+
857
+        int x = 0;
858
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
859
+        uint16x8_t vsum16_2 = vdupq_n_u16(0);
860
+
861
+        for (; (x + 16) <= lx; x += 16)
862
+        {
863
+            uint8x16_t p1 = *(uint8x16_t *)&pix1[x];
864
+            uint8x16_t p2 = *(uint8x16_t *)&pix2[x];
865
+            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p2));
866
+            vsum16_2 = vabal_high_u8(vsum16_2, p1, p2);
867
+        }
868
+        if (lx & 8)
869
+        {
870
+            uint8x8_t p1 = *(uint8x8_t *)&pix1[x];
871
+            uint8x8_t p2 = *(uint8x8_t *)&pix2[x];
872
+            vsum16_1 = vabal_u8(vsum16_1, p1, p2);
873
+            x += 8;
874
+        }
875
+        if (lx & 4)
876
+        {
877
+            uint32x2_t p1 = vdup_n_u32(0);
878
+            p1[0] = *(uint32_t *)&pix1[x];
879
+            uint32x2_t p2 = vdup_n_u32(0);
880
+            p2[0] = *(uint32_t *)&pix2[x];
881
+            vsum16_1 = vabal_u8(vsum16_1, p1, p2);
882
+            x += 4;
883
+        }
884
+        if (lx >= 16)
885
+        {
886
+            vsum16_1 = vaddq_u16(vsum16_1, vsum16_2);
887
+        }
888
+        if (lx >= 4)
889
+        {
890
+            sum += vaddvq_u16(vsum16_1);
891
+        }
892
+
893
+#endif
894
+        if (lx & 3) for (; x < lx; x++)
895
+            {
896
+                sum += abs(pix1[x] - pix2[x]);
897
+            }
898
+
899
+        pix1 += stride_pix1;
900
+        pix2 += stride_pix2;
901
+    }
902
+
903
+    return sum;
904
+}
905
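Reviewer note: for readers not fluent in NEON intrinsics, the loop above is a vectorization of plain SAD. A minimal scalar sketch (pixel stands for uint8_t or uint16_t depending on HIGH_BIT_DEPTH):

    #include <cstdint>
    #include <cstdlib>

    // Scalar reference for sad_pp_neon<lx, ly>: sum of absolute differences
    // over an lx-by-ly block; strides are in pixels, as in the template above.
    template<int lx, int ly, typename pixel>
    int sad_pp_sketch(const pixel *pix1, intptr_t stride_pix1,
                      const pixel *pix2, intptr_t stride_pix2)
    {
        int sum = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
                sum += std::abs((int)pix1[x] - (int)pix2[x]);
            pix1 += stride_pix1;
            pix2 += stride_pix2;
        }
        return sum;
    }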
+
906
+template<int lx, int ly>
907
+void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, intptr_t frefstride,
908
+                 int32_t *res)
909
+{
910
+    res[0] = 0;
911
+    res[1] = 0;
912
+    res[2] = 0;
913
+    for (int y = 0; y < ly; y++)
914
+    {
915
+        int x = 0;
916
+        uint16x8_t vsum16_0 = vdupq_n_u16(0);
917
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
918
+        uint16x8_t vsum16_2 = vdupq_n_u16(0);
919
+#if HIGH_BIT_DEPTH
920
+        for (; (x + 8) <= lx; x += 8)
921
+        {
922
+            uint16x8_t p1 = *(uint16x8_t *)&pix1[x];
923
+            uint16x8_t p2 = *(uint16x8_t *)&pix2[x];
924
+            uint16x8_t p3 = *(uint16x8_t *)&pix3[x];
925
+            uint16x8_t p4 = *(uint16x8_t *)&pix4[x];
926
+            vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
927
+            vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
928
+            vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
929
+
930
+        }
931
+        if (lx & 4)
932
+        {
933
+            uint16x4_t p1 = *(uint16x4_t *)&pix1[x];
934
+            uint16x4_t p2 = *(uint16x4_t *)&pix2[x];
935
+            uint16x4_t p3 = *(uint16x4_t *)&pix3[x];
936
+            uint16x4_t p4 = *(uint16x4_t *)&pix4[x];
937
+            res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
938
+            res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p3));
939
+            res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p4));
940
+            x += 4;
941
+        }
942
+        if (lx >= 4)
943
+        {
944
+            res[0] += vaddlvq_s16(vsum16_0);
945
+            res[1] += vaddlvq_s16(vsum16_1);
946
+            res[2] += vaddlvq_s16(vsum16_2);
947
+        }
948
+#else
949
+
950
+        for (; (x + 16) <= lx; x += 16)
951
+        {
952
+            uint8x16_t p1 = *(uint8x16_t *)&pix1[x];
953
+            uint8x16_t p2 = *(uint8x16_t *)&pix2[x];
954
+            uint8x16_t p3 = *(uint8x16_t *)&pix3[x];
955
+            uint8x16_t p4 = *(uint8x16_t *)&pix4[x];
956
+            vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
957
+            vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
958
+            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
959
+            vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
960
+            vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
961
+            vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
962
+        }
963
+        if (lx & 8)
964
+        {
965
+            uint8x8_t p1 = *(uint8x8_t *)&pix1[x];
966
+            uint8x8_t p2 = *(uint8x8_t *)&pix2[x];
967
+            uint8x8_t p3 = *(uint8x8_t *)&pix3[x];
968
+            uint8x8_t p4 = *(uint8x8_t *)&pix4[x];
969
+            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
970
+            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
971
+            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
972
+            x += 8;
973
+        }
974
+        if (lx & 4)
975
+        {
976
+            uint32x2_t p1 = vdup_n_u32(0);
977
+            p1[0] = *(uint32_t *)&pix1[x];
978
+            uint32x2_t p2 = vdup_n_u32(0);
979
+            p2[0] = *(uint32_t *)&pix2[x];
980
+            uint32x2_t p3 = vdup_n_u32(0);
981
+            p3[0] = *(uint32_t *)&pix3[x];
982
+            uint32x2_t p4 = vdup_n_u32(0);
983
+            p4[0] = *(uint32_t *)&pix4[x];
984
+            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
985
+            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
986
+            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
987
+            x += 4;
988
+        }
989
+        if (lx >= 4)
990
+        {
991
+            res[0] += vaddvq_u16(vsum16_0);
992
+            res[1] += vaddvq_u16(vsum16_1);
993
+            res[2] += vaddvq_u16(vsum16_2);
994
+        }
995
+
996
+#endif
997
+        if (lx & 3) for (; x < lx; x++)
998
+            {
999
+                res[0] += abs(pix1[x] - pix2[x]);
1000
+                res[1] += abs(pix1[x] - pix3[x]);
1001
+                res[2] += abs(pix1[x] - pix4[x]);
1002
+            }
1003
+
1004
+        pix1 += FENC_STRIDE;
1005
+        pix2 += frefstride;
1006
+        pix3 += frefstride;
1007
+        pix4 += frefstride;
1008
+    }
1009
+}
1010
+
1011
+template<int lx, int ly>
1012
+void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
1013
+                 intptr_t frefstride, int32_t *res)
1014
+{
1015
+    int32x4_t result = {0};
1016
+    for (int y = 0; y < ly; y++)
1017
+    {
1018
+        int x = 0;
1019
+        uint16x8_t vsum16_0 = vdupq_n_u16(0);
1020
+        uint16x8_t vsum16_1 = vdupq_n_u16(0);
1021
+        uint16x8_t vsum16_2 = vdupq_n_u16(0);
1022
+        uint16x8_t vsum16_3 = vdupq_n_u16(0);
1023
+#if HIGH_BIT_DEPTH
1024
+        for (; (x + 16) <= lx; x += 16)
1025
+        {
1026
+            uint16x8x2_t p1 = vld1q_u16_x2(&pix1[x]);
1027
+            uint16x8x2_t p2 = vld1q_u16_x2(&pix2[x]);
1028
+            uint16x8x2_t p3 = vld1q_u16_x2(&pix3[x]);
1029
+            uint16x8x2_t p4 = vld1q_u16_x2(&pix4[x]);
1030
+            uint16x8x2_t p5 = vld1q_u16_x2(&pix5[x]);
1031
+            vsum16_0 = vabaq_s16(vsum16_0, p1.val[0], p2.val[0]);
1032
+            vsum16_1 = vabaq_s16(vsum16_1, p1.val[0], p3.val[0]);
1033
+            vsum16_2 = vabaq_s16(vsum16_2, p1.val[0], p4.val[0]);
1034
+            vsum16_3 = vabaq_s16(vsum16_3, p1.val[0], p5.val[0]);
1035
+            vsum16_0 = vabaq_s16(vsum16_0, p1.val[1], p2.val[1]);
1036
+            vsum16_1 = vabaq_s16(vsum16_1, p1.val[1], p3.val[1]);
1037
+            vsum16_2 = vabaq_s16(vsum16_2, p1.val[1], p4.val[1]);
1038
+            vsum16_3 = vabaq_s16(vsum16_3, p1.val[1], p5.val[1]);
1039
+        }
1040
+        if (lx & 8)
1041
+        {
1042
+            uint16x8_t p1 = *(uint16x8_t *)&pix1[x];
1043
+            uint16x8_t p2 = *(uint16x8_t *)&pix2[x];
1044
+            uint16x8_t p3 = *(uint16x8_t *)&pix3[x];
1045
+            uint16x8_t p4 = *(uint16x8_t *)&pix4[x];
1046
+            uint16x8_t p5 = *(uint16x8_t *)&pix5[x];
1047
+            vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
1048
+            vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
1049
+            vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
1050
+            vsum16_3 = vabaq_s16(vsum16_3, p1, p5);
1051
+            x += 8;
1052
+        }
1053
+        if (lx & 4)
1054
+        {
1055
+            /* This is equivalent to getting the absolute difference of pix1[x] with each of
1056
+             * pix2 - pix5, then summing across the vector (4 values each) and adding the
1057
+             * result to result. */
1058
+            uint16x8_t p1 = vreinterpretq_s16_u64(
1059
+                    vld1q_dup_u64((uint64_t *)&pix1[x]));
1060
+            uint16x8_t p2_3 = vcombine_s16(*(uint16x4_t *)&pix2[x], *(uint16x4_t *)&pix3[x]);
1061
+            uint16x8_t p4_5 = vcombine_s16(*(uint16x4_t *)&pix4[x], *(uint16x4_t *)&pix5[x]);
1062
+
1063
+            uint16x8_t a = vabdq_u16(p1, p2_3);
1064
+            uint16x8_t b = vabdq_u16(p1, p4_5);
1065
+
1066
+            result = vpadalq_s16(result, vpaddq_s16(a, b));
1067
+            x += 4;
1068
+        }
1069
+        if (lx >= 4)
1070
+        {
1071
+            /* This is equivalent to adding across each of the sum vectors and then adding
1072
+             * to result. */
1073
+            uint16x8_t a = vpaddq_s16(vsum16_0, vsum16_1);
1074
+            uint16x8_t b = vpaddq_s16(vsum16_2, vsum16_3);
1075
+            uint16x8_t c = vpaddq_s16(a, b);
1076
+            result = vpadalq_s16(result, c);
1077
+        }
1078
+
1079
+#else
1080
+
1081
+        for (; (x + 16) <= lx; x += 16)
1082
+        {
1083
+            uint8x16_t p1 = *(uint8x16_t *)&pix1[x];
1084
+            uint8x16_t p2 = *(uint8x16_t *)&pix2[x];
1085
+            uint8x16_t p3 = *(uint8x16_t *)&pix3[x];
1086
+            uint8x16_t p4 = *(uint8x16_t *)&pix4[x];
1087
+            uint8x16_t p5 = *(uint8x16_t *)&pix5[x];
1088
+            vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
1089
+            vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
1090
+            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
1091
+            vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
1092
+            vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
1093
+            vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
1094
+            vsum16_3 = vabal_u8(vsum16_3, vget_low_u8(p1), vget_low_u8(p5));
1095
+            vsum16_3 = vabal_high_u8(vsum16_3, p1, p5);
1096
+        }
1097
+        if (lx & 8)
1098
+        {
1099
+            uint8x8_t p1 = *(uint8x8_t *)&pix1[x];
1100
+            uint8x8_t p2 = *(uint8x8_t *)&pix2[x];
1101
+            uint8x8_t p3 = *(uint8x8_t *)&pix3[x];
1102
+            uint8x8_t p4 = *(uint8x8_t *)&pix4[x];
1103
+            uint8x8_t p5 = *(uint8x8_t *)&pix5[x];
1104
+            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
1105
+            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
1106
+            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
1107
+            vsum16_3 = vabal_u8(vsum16_3, p1, p5);
1108
+            x += 8;
1109
+        }
1110
+        if (lx & 4)
1111
+        {
1112
+            uint8x16_t p1 = vreinterpretq_u32_u8(
1113
+                vld1q_dup_u32((uint32_t *)&pix1[x]));
1114
+
1115
+            uint32x4_t p_x4;
1116
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix2[x], p_x4, 0);
1117
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix3[x], p_x4, 1);
1118
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix4[x], p_x4, 2);
1119
+            p_x4 = vld1q_lane_u32((uint32_t *)&pix5[x], p_x4, 3);
1120
+
1121
+            uint16x8_t sum = vabdl_u8(vget_low_u8(p1), vget_low_u8(p_x4));
1122
+            uint16x8_t sum2 = vabdl_high_u8(p1, p_x4);
1123
+
1124
+            uint16x8_t a = vpaddq_u16(sum, sum2);
1125
+            result = vpadalq_u16(result, a);
1126
+        }
1127
+        if (lx >= 4)
1128
+        {
1129
+            result[0] += vaddvq_u16(vsum16_0);
1130
+            result[1] += vaddvq_u16(vsum16_1);
1131
+            result[2] += vaddvq_u16(vsum16_2);
1132
+            result[3] += vaddvq_u16(vsum16_3);
1133
+        }
1134
+
1135
+#endif
1136
+        if (lx & 3) for (; x < lx; x++)
1137
+        {
1138
+            result[0] += abs(pix1[x] - pix2[x]);
1139
+            result[1] += abs(pix1[x] - pix3[x]);
1140
+            result[2] += abs(pix1[x] - pix4[x]);
1141
+            result[3] += abs(pix1[x] - pix5[x]);
1142
+        }
1143
+
1144
+        pix1 += FENC_STRIDE;
1145
+        pix2 += frefstride;
1146
+        pix3 += frefstride;
1147
+        pix4 += frefstride;
1148
+        pix5 += frefstride;
1149
+    }
1150
+    vst1q_s32(res, result);
1151
+}
1152
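Reviewer note: sad_x3/sad_x4 amortize the encoder-block loads across several motion-search candidates. Scalar meaning of sad_x4_neon (FENC_STRIDE is x265's fixed encoder-buffer stride; here it is passed as a parameter so the sketch is self-contained):

    #include <cstdint>
    #include <cstdlib>

    template<int lx, int ly, typename pixel>
    void sad_x4_sketch(const pixel *fenc, intptr_t fencStride,  // FENC_STRIDE above
                       const pixel *r0, const pixel *r1,
                       const pixel *r2, const pixel *r3,
                       intptr_t frefstride, int32_t *res)
    {
        res[0] = res[1] = res[2] = res[3] = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                res[0] += std::abs((int)fenc[x] - (int)r0[x]);
                res[1] += std::abs((int)fenc[x] - (int)r1[x]);
                res[2] += std::abs((int)fenc[x] - (int)r2[x]);
                res[3] += std::abs((int)fenc[x] - (int)r3[x]);
            }
            fenc += fencStride;
            r0 += frefstride;
            r1 += frefstride;
            r2 += frefstride;
            r3 += frefstride;
        }
    }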
+
1153
+
1154
+template<int lx, int ly, class T1, class T2>
1155
+sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
1156
+{
1157
+    sse_t sum = 0;
1158
+
1159
+    int32x4_t vsum1 = vdupq_n_s32(0);
1160
+    int32x4_t vsum2 = vdupq_n_s32(0);
1161
+    for (int y = 0; y < ly; y++)
1162
+    {
1163
+        int x = 0;
1164
+        for (; (x + 8) <= lx; x += 8)
1165
+        {
1166
+            int16x8_t tmp;
1167
+            if (sizeof(T1) == 2 && sizeof(T2) == 2)
1168
+            {
1169
+                tmp = vsubq_s16(*(int16x8_t *)&pix1[x], *(int16x8_t *)&pix2[x]);
1170
+            }
1171
+            else if (sizeof(T1) == 1 && sizeof(T2) == 1)
1172
+            {
1173
+                tmp = vsubl_u8(*(uint8x8_t *)&pix1[x], *(uint8x8_t *)&pix2[x]);
1174
+            }
1175
+            else
1176
+            {
1177
+                X265_CHECK(false, "unsupported sse");
1178
+            }
1179
+            vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
1180
+            vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
1181
+        }
1182
+        for (; x < lx; x++)
1183
+        {
1184
+            int tmp = pix1[x] - pix2[x];
1185
+            sum += (tmp * tmp);
1186
+        }
1187
+
1188
+        if (sizeof(T1) == 2 && sizeof(T2) == 2)
1189
+        {
1190
+            int32x4_t vsum = vaddq_u32(vsum1, vsum2);
1191
+            sum += vaddvq_u32(vsum);
1192
+            vsum1 = vsum2 = vdupq_n_u16(0);
1193
+        }
1194
+
1195
+        pix1 += stride_pix1;
1196
+        pix2 += stride_pix2;
1197
+    }
1198
+    int32x4_t vsum = vaddq_u32(vsum1, vsum2);
1199
+
1200
+    return sum + vaddvq_u32(vsum);
1201
+}
1202
+
1203
+
1204
+template<int bx, int by>
1205
+void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
1206
+{
1207
+    for (int y = 0; y < by; y++)
1208
+    {
1209
+        int x = 0;
1210
+        for (; (x + 8) <= bx; x += 8)
1211
+        {
1212
+#if HIGH_BIT_DEPTH
1213
+            *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
1214
+#else
1215
+            *(int16x8_t *)&a[x] = vmovl_u8(*(int8x8_t *)&b[x]);
1216
+#endif
1217
+        }
1218
+        for (; x < bx; x++)
1219
+        {
1220
+            a[x] = (int16_t)b[x];
1221
+        }
1222
+
1223
+        a += stridea;
1224
+        b += strideb;
1225
+    }
1226
+}
1227
+
1228
+
1229
+template<int bx, int by>
1230
+void blockcopy_pp_neon(pixel *a, intptr_t stridea, const pixel *b, intptr_t strideb)
1231
+{
1232
+    for (int y = 0; y < by; y++)
1233
+    {
1234
+        int x = 0;
1235
+#if HIGH_BIT_DEPTH
1236
+        for (; (x + 8) <= bx; x += 8)
1237
+        {
1238
+            *(int16x8_t *)&a[x] = *(int16x8_t *)&b[x];
1239
+        }
1240
+        if (bx & 4)
1241
+        {
1242
+            *(uint64_t *)&a[x] = *(uint64_t *)&b[x];
1243
+            x += 4;
1244
+        }
1245
+#else
1246
+        for (; (x + 16) <= bx; x += 16)
1247
+        {
1248
+            *(uint8x16_t *)&a[x] = *(uint8x16_t *)&b[x];
1249
+        }
1250
+        if (bx & 8)
1251
+        {
1252
+            *(uint8x8_t *)&a[x] = *(uint8x8_t *)&b[x];
1253
+            x += 8;
1254
+        }
1255
+        if (bx & 4)
1256
+        {
1257
+            *(uint32_t *)&a[x] = *(uint32_t *)&b[x];
1258
+            x += 4;
1259
+        }
1260
+#endif
1261
+        for (; x < bx; x++)
1262
+        {
1263
+            a[x] = b[x];
1264
+        }
1265
+
1266
+        a += stridea;
1267
+        b += strideb;
1268
+    }
1269
+}
1270
+
1271
+
1272
+template<int bx, int by>
1273
+void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixel *b1, intptr_t sstride0,
1274
+                       intptr_t sstride1)
1275
+{
1276
+    for (int y = 0; y < by; y++)
1277
+    {
1278
+        int x = 0;
1279
+        for (; (x + 8) <= bx; x += 8)
1280
+        {
1281
+#if HIGH_BIT_DEPTH
1282
+            *(int16x8_t *)&a[x] = vsubq_s16(*(int16x8_t *)&b0[x], *(int16x8_t *)&b1[x]);
1283
+#else
1284
+            *(int16x8_t *)&a[x] = vsubl_u8(*(uint8x8_t *)&b0[x], *(uint8x8_t *)&b1[x]);
1285
+#endif
1286
+        }
1287
+        for (; x < bx; x++)
1288
+        {
1289
+            a[x] = (int16_t)(b0[x] - b1[x]);
1290
+        }
1291
+
1292
+        b0 += sstride0;
1293
+        b1 += sstride1;
1294
+        a += dstride;
1295
+    }
1296
+}
1297
+
1298
+template<int bx, int by>
1299
+void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_t *b1, intptr_t sstride0,
1300
+                       intptr_t sstride1)
1301
+{
1302
+    for (int y = 0; y < by; y++)
1303
+    {
1304
+        int x = 0;
1305
+        for (; (x + 8) <= bx; x += 8)
1306
+        {
1307
+            int16x8_t t;
1308
+            int16x8_t b1e = *(int16x8_t *)&b1[x];
1309
+            int16x8_t b0e;
1310
+#if HIGH_BIT_DEPTH
1311
+            b0e = *(int16x8_t *)&b0[x];
1312
+            t = vaddq_s16(b0e, b1e);
1313
+            t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
1314
+            t = vmaxq_s16(t, vdupq_n_s16(0));
1315
+            *(int16x8_t *)&a[x] = t;
1316
+#else
1317
+            b0e = vmovl_u8(*(uint8x8_t *)&b0[x]);
1318
+            t = vaddq_s16(b0e, b1e);
1319
+            *(uint8x8_t *)&a[x] = vqmovun_s16(t);
1320
+#endif
1321
+        }
1322
+        for (; x < bx; x++)
1323
+        {
1324
+            a[x] = (int16_t)x265_clip(b0[x] + b1[x]);
1325
+        }
1326
+
1327
+        b0 += sstride0;
1328
+        b1 += sstride1;
1329
+        a += dstride;
1330
+    }
1331
+}
1332
+
1333
+template<int bx, int by>
1334
+void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t src0Stride, intptr_t src1Stride,
1335
+                 intptr_t dstStride)
1336
+{
1337
+
1338
+    const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
1339
+    const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
1340
+
1341
+    const int32x4_t addon = vdupq_n_s32(offset);
1342
+    for (int y = 0; y < by; y++)
1343
+    {
1344
+        int x = 0;
1345
+
1346
+        for (; (x + 8) <= bx; x += 8)
1347
+        {
1348
+            int16x8_t in0 = *(int16x8_t *)&src0[x];
1349
+            int16x8_t in1 = *(int16x8_t *)&src1[x];
1350
+            int32x4_t t1 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
1351
+            int32x4_t t2 = vaddl_high_s16(in0, in1);
1352
+            t1 = vaddq_s32(t1, addon);
1353
+            t2 = vaddq_s32(t2, addon);
1354
+            t1 = vshrq_n_s32(t1, shiftNum);
1355
+            t2 = vshrq_n_s32(t2, shiftNum);
1356
+            int16x8_t t = vuzp1q_s16(t1, t2);
1357
+#if HIGH_BIT_DEPTH
1358
+            t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
1359
+            t = vmaxq_s16(t, vdupq_n_s16(0));
1360
+            *(int16x8_t *)&dst[x] = t;
1361
+#else
1362
+            *(uint8x8_t *)&dst[x] = vqmovun_s16(t);
1363
+#endif
1364
+        }
1365
+        for (; x < bx; x += 2)
1366
+        {
1367
+            dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
1368
+            dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
1369
+        }
1370
+
1371
+        src0 += src0Stride;
1372
+        src1 += src1Stride;
1373
+        dst  += dstStride;
1374
+    }
1375
+}
1376
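Reviewer note: the shiftNum/offset pair above undoes x265's IF_INTERNAL_PREC intermediate precision when averaging two weighted-prediction buffers. Per sample, the NEON loop computes this (a sketch; the clipping bound depends on X265_DEPTH):

    // One sample of addAvg: average two 16-bit intermediates back to pixel
    // range with rounding, then clamp. maxVal is (1 << X265_DEPTH) - 1.
    static inline int addavg_one(int s0, int s1, int shiftNum, int offset, int maxVal)
    {
        int v = (s0 + s1 + offset) >> shiftNum;
        if (v < 0)      return 0;
        if (v > maxVal) return maxVal;
        return v;
    }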
+
1377
+template<int lx, int ly>
1378
+void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t sstride0, const pixel *src1,
1379
+                      intptr_t sstride1, int)
1380
+{
1381
+    for (int y = 0; y < ly; y++)
1382
+    {
1383
+        int x = 0;
1384
+        for (; (x + 8) <= lx; x += 8)
1385
+        {
1386
+#if HIGH_BIT_DEPTH
1387
+            uint16x8_t in0 = *(uint16x8_t *)&src0[x];
1388
+            uint16x8_t in1 = *(uint16x8_t *)&src1[x];
1389
+            uint16x8_t t = vrhaddq_u16(in0, in1);
1390
+            *(uint16x8_t *)&dst[x] = t;
1391
+#else
1392
+            int16x8_t in0 = vmovl_u8(*(uint8x8_t *)&src0[x]);
1393
+            int16x8_t in1 = vmovl_u8(*(uint8x8_t *)&src1[x]);
1394
+            int16x8_t t = vrhaddq_s16(in0, in1);
1395
+            *(uint8x8_t *)&dst[x] = vmovn_u16(t);
1396
+#endif
1397
+        }
1398
+        for (; x < lx; x++)
1399
+        {
1400
+            dst[x] = (src0[x] + src1[x] + 1) >> 1;
1401
+        }
1402
+
1403
+        src0 += sstride0;
1404
+        src1 += sstride1;
1405
+        dst += dstride;
1406
+    }
1407
+}
1408
+
1409
+
1410
+template<int size>
1411
+void cpy1Dto2D_shl_neon(int16_t *dst, const int16_t *src, intptr_t dstStride, int shift)
1412
+{
1413
+    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
1414
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
1415
+    X265_CHECK(shift >= 0, "invalid shift\n");
1416
+
1417
+    for (int i = 0; i < size; i++)
1418
+    {
1419
+        int j = 0;
1420
+        for (; (j + 8) <= size; j += 8)
1421
+        {
1422
+            *(int16x8_t *)&dst[j] = vshlq_s16(*(int16x8_t *)&src[j], vdupq_n_s16(shift));
1423
+        }
1424
+        for (; j < size; j++)
1425
+        {
1426
+            dst[j] = src[j] << shift;
1427
+        }
1428
+        src += size;
1429
+        dst += dstStride;
1430
+    }
1431
+}
1432
+
1433
+
1434
+template<int size>
1435
+uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
1436
+{
1437
+    uint32_t sum = 0, sqr = 0;
1438
+
1439
+    int32x4_t vsqr = vdupq_n_s32(0);
1440
+    for (int y = 0; y < size; y++)
1441
+    {
1442
+        int x = 0;
1443
+        int16x8_t vsum = vdupq_n_s16(0);
1444
+        for (; (x + 8) <= size; x += 8)
1445
+        {
1446
+            int16x8_t in;
1447
+            in = vmovl_u8(*(uint8x8_t *)&pix[x]);
1448
+            vsum = vaddq_u16(vsum, in);
1449
+            vsqr = vmlal_s16(vsqr, vget_low_s16(in), vget_low_s16(in));
1450
+            vsqr = vmlal_high_s16(vsqr, in, in);
1451
+        }
1452
+        for (; x < size; x++)
1453
+        {
1454
+            sum += pix[x];
1455
+            sqr += pix[x] * pix[x];
1456
+        }
1457
+        sum += vaddvq_s16(vsum);
1458
+
1459
+        pix += i_stride;
1460
+    }
1461
+    sqr += vaddvq_u32(vsqr);
1462
+    return sum + ((uint64_t)sqr << 32);
1463
+}
1464
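Reviewer note: pixel_var_neon returns two accumulators packed into one uint64_t, the pixel sum in the low 32 bits and the sum of squares in the high 32 bits. A sketch of how a caller would unpack them into a variance:

    #include <cstdint>

    inline double unpack_variance(uint64_t packed, int size)
    {
        uint32_t sum = (uint32_t)packed;          // low 32 bits: sum of pixels
        uint32_t sqr = (uint32_t)(packed >> 32);  // high 32 bits: sum of squares
        double n = (double)size * size;
        double mean = sum / n;
        return sqr / n - mean * mean;             // E[p^2] - E[p]^2
    }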
+
1465
+template<int blockSize>
1466
+void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, intptr_t stride)
1467
+{
1468
+    for (int y = 0; y < blockSize; y++)
1469
+    {
1470
+        int x = 0;
1471
+        for (; (x + 8) < blockSize; x += 8)
1472
+        {
1473
+            int16x8_t vfenc, vpred;
1474
+#if HIGH_BIT_DEPTH
1475
+            vfenc = *(int16x8_t *)&fenc[x];
1476
+            vpred = *(int16x8_t *)&pred[x];
1477
+#else
1478
+            vfenc = vmovl_u8(*(uint8x8_t *)&fenc[x]);
1479
+            vpred = vmovl_u8(*(uint8x8_t *)&pred[x]);
1480
+#endif
1481
+            *(int16x8_t *)&residual[x] = vsubq_s16(vfenc, vpred);
1482
+        }
1483
+        for (; x < blockSize; x++)
1484
+        {
1485
+            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
1486
+        }
1487
+        fenc += stride;
1488
+        residual += stride;
1489
+        pred += stride;
1490
+    }
1491
+}
1492
+
1493
+template<int size>
1494
+int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
1495
+{
1496
+    static pixel zeroBuf[8] /* = { 0 } */;
1497
+
1498
+    if (size)
1499
+    {
1500
+        int dim = 1 << (size + 2);
1501
+        uint32_t totEnergy = 0;
1502
+        for (int i = 0; i < dim; i += 8)
1503
+        {
1504
+            for (int j = 0; j < dim; j += 8)
1505
+            {
1506
+                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
1507
+                int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
1508
+                                   (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
1509
+                int reconEnergy =  pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
1510
+                                   (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
1511
+
1512
+                totEnergy += abs(sourceEnergy - reconEnergy);
1513
+            }
1514
+        }
1515
+        return totEnergy;
1516
+    }
1517
+    else
1518
+    {
1519
+        /* 4x4 is too small for sa8d */
1520
+        int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf,
1521
+                           0) >> 2);
1522
+        int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf,
1523
+                          0) >> 2);
1524
+        return abs(sourceEnergy - reconEnergy);
1525
+    }
1526
+}
1527
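Reviewer note: the psy-cost above compares the "AC energy" of source and recon, where a block's energy is its sa8d against a zero block (AC + DC magnitude) minus a quarter of its SAD against zero as a stand-in for the DC term, matching the inline comment. Scalar form per block (sa8d/sad here are stand-ins for any matching implementations, not the exact x265 entry points):

    #include <cstdint>

    using cost_fn = int (*)(const uint8_t *, intptr_t, const uint8_t *, intptr_t);

    inline int ac_energy_sketch(const uint8_t *blk, intptr_t stride,
                                cost_fn sa8d, cost_fn sad)
    {
        static const uint8_t zeroBuf[8] = { 0 };
        // energy = sa8d(B, 0) - (sad(B, 0) >> 2), as in psyCost_pp_neon above
        return sa8d(blk, stride, zeroBuf, 0) - (sad(blk, stride, zeroBuf, 0) >> 2);
    }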
+
1528
+
1529
+template<int w, int h>
1530
+// Calculate sa8d in blocks of 8x8
1531
+int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
1532
+{
1533
+    int cost = 0;
1534
+
1535
+    for (int y = 0; y < h; y += 8)
1536
+        for (int x = 0; x < w; x += 8)
1537
+        {
1538
+            cost += pixel_sa8d_8x8_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
1539
+        }
1540
+
1541
+    return cost;
1542
+}
1543
+
1544
+template<int w, int h>
1545
+// Calculate sa8d in blocks of 16x16
1546
+int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
1547
+{
1548
+    int cost = 0;
1549
+
1550
+    for (int y = 0; y < h; y += 16)
1551
+        for (int x = 0; x < w; x += 16)
1552
+        {
1553
+            cost += pixel_sa8d_16x16_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
1554
+        }
1555
+
1556
+    return cost;
1557
+}
1558
+
1559
+template<int size>
1560
+void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, int shift)
1561
+{
1562
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
1563
+    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
1564
+    X265_CHECK(shift >= 0, "invalid shift\n");
1565
+
1566
+    for (int i = 0; i < size; i++)
1567
+    {
1568
+        for (int j = 0; j < size; j++)
1569
+        {
1570
+            dst[j] = src[j] << shift;
1571
+        }
1572
+
1573
+        src += srcStride;
1574
+        dst += size;
1575
+    }
1576
+}
1577
+
1578
+
1579
+template<int w, int h>
1580
+// calculate satd in blocks of 4x4
1581
+int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
1582
+{
1583
+    int satd = 0;
1584
+
1585
+    for (int row = 0; row < h; row += 4)
1586
+        for (int col = 0; col < w; col += 4)
1587
+            satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1588
+                                        pix2 + row * stride_pix2 + col, stride_pix2);
1589
+
1590
+    return satd;
1591
+}
1592
+
1593
+template<int w, int h>
1594
+// calculate satd in blocks of 8x4
1595
+int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
1596
+{
1597
+    int satd = 0;
1598
+
1599
+    if (((w | h) & 15) == 0)
1600
+    {
1601
+        for (int row = 0; row < h; row += 16)
1602
+            for (int col = 0; col < w; col += 16)
1603
+                satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1604
+                                              pix2 + row * stride_pix2 + col, stride_pix2);
1605
+
1606
+    }
1607
+    else if (((w | h) & 7) == 0)
1608
+    {
1609
+        for (int row = 0; row < h; row += 8)
1610
+            for (int col = 0; col < w; col += 8)
1611
+                satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1612
+                                            pix2 + row * stride_pix2 + col, stride_pix2);
1613
+
1614
+    }
1615
+    else
1616
+    {
1617
+        for (int row = 0; row < h; row += 4)
1618
+            for (int col = 0; col < w; col += 8)
1619
+                satd += pixel_satd_8x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1620
+                                            pix2 + row * stride_pix2 + col, stride_pix2);
1621
+    }
1622
+
1623
+    return satd;
1624
+}
1625
+
1626
+
1627
+template<int blockSize>
1628
+void transpose_neon(pixel *dst, const pixel *src, intptr_t stride)
1629
+{
1630
+    for (int k = 0; k < blockSize; k++)
1631
+        for (int l = 0; l < blockSize; l++)
1632
+        {
1633
+            dst[k * blockSize + l] = src[l * stride + k];
1634
+        }
1635
+}
1636
+
1637
+
1638
+template<>
1639
+void transpose_neon<8>(pixel *dst, const pixel *src, intptr_t stride)
1640
+{
1641
+    transpose8x8(dst, src, 8, stride);
1642
+}
1643
+
1644
+template<>
1645
+void transpose_neon<16>(pixel *dst, const pixel *src, intptr_t stride)
1646
+{
1647
+    transpose16x16(dst, src, 16, stride);
1648
+}
1649
+
1650
+template<>
1651
+void transpose_neon<32>(pixel *dst, const pixel *src, intptr_t stride)
1652
+{
1653
+    transpose32x32(dst, src, 32, stride);
1654
+}
1655
+
1656
+
1657
+template<>
1658
+void transpose_neon<64>(pixel *dst, const pixel *src, intptr_t stride)
1659
+{
1660
+    transpose32x32(dst, src, 64, stride);
1661
+    transpose32x32(dst + 32 * 64 + 32, src + 32 * stride + 32, 64, stride);
1662
+    transpose32x32(dst + 32 * 64, src + 32, 64, stride);
1663
+    transpose32x32(dst + 32, src + 32 * stride, 64, stride);
1664
+}
1665
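Reviewer note: the 64x64 specialization above builds the transpose from four 32x32 transposes with the two off-diagonal quadrants swapped, since dst quadrant (i, j) of a transpose comes from src quadrant (j, i). Same idea in generic form, assuming a (dst, src, dstStride, srcStride) 32x32 transpose primitive:

    #include <cstdint>

    template<typename pixel>
    void transpose64_sketch(pixel *dst, const pixel *src, intptr_t stride,
                            void (*t32)(pixel *, const pixel *, intptr_t, intptr_t))
    {
        t32(dst,                src,                    64, stride);  // (0,0) <- (0,0)
        t32(dst + 32,           src + 32 * stride,      64, stride);  // (0,1) <- (1,0)
        t32(dst + 32 * 64,      src + 32,               64, stride);  // (1,0) <- (0,1)
        t32(dst + 32 * 64 + 32, src + 32 * stride + 32, 64, stride);  // (1,1) <- (1,1)
    }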
+
1666
+
1667
+template<int size>
1668
+sse_t pixel_ssd_s_neon(const int16_t *a, intptr_t dstride)
1669
+{
1670
+    sse_t sum = 0;
1671
+
1672
+
1673
+    int32x4_t vsum = vdupq_n_s32(0);
1674
+
1675
+    for (int y = 0; y < size; y++)
1676
+    {
1677
+        int x = 0;
1678
+
1679
+        for (; (x + 8) <= size; x += 8)
1680
+        {
1681
+            int16x8_t in = *(int16x8_t *)&a[x];
1682
+            vsum = vmlal_s16(vsum, vget_low_s16(in), vget_low_s16(in));
1683
+            vsum = vmlal_high_s16(vsum, (in), (in));
1684
+        }
1685
+        for (; x < size; x++)
1686
+        {
1687
+            sum += a[x] * a[x];
1688
+        }
1689
+
1690
+        a += dstride;
1691
+    }
1692
+    return sum + vaddvq_s32(vsum);
1693
+}
1694
+
1695
+
1696
+};
1697
+
1698
+
1699
+
1700
+
1701
+namespace X265_NS
1702
+{
1703
+
1704
+
1705
+void setupPixelPrimitives_neon(EncoderPrimitives &p)
1706
+{
1707
+#define LUMA_PU(W, H) \
1708
+    p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1709
+    p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
1710
+    p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
1711
+    p.pu[LUMA_ ## W ## x ## H].sad = sad_pp_neon<W, H>; \
1712
+    p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_neon<W, H>; \
1713
+    p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
1714
+    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
1715
+    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
1716
+
1717
+#if !(HIGH_BIT_DEPTH)
1718
+#define LUMA_PU_S(W, H) \
1719
+    p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1720
+    p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
1721
+    p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>;
1722
+#else // !(HIGH_BIT_DEPTH)
1723
+#define LUMA_PU_S(W, H) \
1724
+    p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1725
+    p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
1726
+    p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
1727
+    p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_neon<W, H>; \
1728
+    p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
1729
+    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
1730
+    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
1731
+#endif // !(HIGH_BIT_DEPTH)
1732
+
1733
+#define LUMA_CU(W, H) \
1734
+    p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_neon<W, H>; \
1735
+    p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED]    = pixel_add_ps_neon<W, H>; \
1736
+    p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
1737
+    p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
1738
+    p.cu[BLOCK_ ## W ## x ## H].copy_ps       = blockcopy_ps_neon<W, H>; \
1739
+    p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
1740
+    p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
1741
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
1742
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
1743
+    p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
1744
+    p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose_neon<W>;
1745
+
1746
+
1747
+    LUMA_PU_S(4, 4);
1748
+    LUMA_PU_S(8, 8);
1749
+    LUMA_PU(16, 16);
1750
+    LUMA_PU(32, 32);
1751
+    LUMA_PU(64, 64);
1752
+    LUMA_PU_S(4, 8);
1753
+    LUMA_PU_S(8, 4);
1754
+    LUMA_PU(16,  8);
1755
+    LUMA_PU_S(8, 16);
1756
+    LUMA_PU(16, 12);
1757
+    LUMA_PU(12, 16);
1758
+    LUMA_PU(16,  4);
1759
+    LUMA_PU_S(4, 16);
1760
+    LUMA_PU(32, 16);
1761
+    LUMA_PU(16, 32);
1762
+    LUMA_PU(32, 24);
1763
+    LUMA_PU(24, 32);
1764
+    LUMA_PU(32,  8);
1765
+    LUMA_PU_S(8, 32);
1766
+    LUMA_PU(64, 32);
1767
+    LUMA_PU(32, 64);
1768
+    LUMA_PU(64, 48);
1769
+    LUMA_PU(48, 64);
1770
+    LUMA_PU(64, 16);
1771
+    LUMA_PU(16, 64);
1772
+    
1773
+#if defined(__APPLE__)
1774
+    p.pu[LUMA_4x4].sad = sad_pp_neon<4, 4>;
1775
+    p.pu[LUMA_4x8].sad = sad_pp_neon<4, 8>;
1776
+    p.pu[LUMA_4x16].sad = sad_pp_neon<4, 16>;
1777
+#endif // defined(__APPLE__)
1778
+    p.pu[LUMA_8x4].sad = sad_pp_neon<8, 4>;
1779
+    p.pu[LUMA_8x8].sad = sad_pp_neon<8, 8>;
1780
+    p.pu[LUMA_8x16].sad = sad_pp_neon<8, 16>;
1781
+    p.pu[LUMA_8x32].sad = sad_pp_neon<8, 32>;
1782
+
1783
+#if !(HIGH_BIT_DEPTH)
1784
+    p.pu[LUMA_4x4].sad_x3 = sad_x3_neon<4, 4>;
1785
+    p.pu[LUMA_4x4].sad_x4 = sad_x4_neon<4, 4>;
1786
+    p.pu[LUMA_4x8].sad_x3 = sad_x3_neon<4, 8>;
1787
+    p.pu[LUMA_4x8].sad_x4 = sad_x4_neon<4, 8>;
1788
+    p.pu[LUMA_4x16].sad_x3 = sad_x3_neon<4, 16>;
1789
+    p.pu[LUMA_4x16].sad_x4 = sad_x4_neon<4, 16>;
1790
+#endif // !(HIGH_BIT_DEPTH)
1791
+
1792
+    p.pu[LUMA_4x4].satd   = pixel_satd_4x4_neon;
1793
+    p.pu[LUMA_8x4].satd   = pixel_satd_8x4_neon;
1794
+    
1795
+    p.pu[LUMA_8x8].satd   = satd8_neon<8, 8>;
1796
+    p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
1797
+    p.pu[LUMA_16x8].satd  = satd8_neon<16, 8>;
1798
+    p.pu[LUMA_8x16].satd  = satd8_neon<8, 16>;
1799
+    p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
1800
+    p.pu[LUMA_16x4].satd  = satd8_neon<16, 4>;
1801
+    p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
1802
+    p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
1803
+    p.pu[LUMA_16x32].satd = satd8_neon<16, 32>;
1804
+    p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
1805
+    p.pu[LUMA_24x32].satd = satd8_neon<24, 32>;
1806
+    p.pu[LUMA_32x8].satd  = satd8_neon<32, 8>;
1807
+    p.pu[LUMA_8x32].satd  = satd8_neon<8, 32>;
1808
+    p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
1809
+    p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
1810
+    p.pu[LUMA_32x64].satd = satd8_neon<32, 64>;
1811
+    p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
1812
+    p.pu[LUMA_48x64].satd = satd8_neon<48, 64>;
1813
+    p.pu[LUMA_64x16].satd = satd8_neon<64, 16>;
1814
+    p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
1815
+
1816
+#if HIGH_BIT_DEPTH
1817
+    p.pu[LUMA_4x8].satd   = satd4_neon<4, 8>;
1818
+    p.pu[LUMA_4x16].satd  = satd4_neon<4, 16>;
1819
+#endif // HIGH_BIT_DEPTH
1820
+
1821
+#if !defined(__APPLE__) || HIGH_BIT_DEPTH
1822
+    p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
1823
+#endif // !defined(__APPLE__)
1824
+
1825
+
1826
+    LUMA_CU(4, 4);
1827
+    LUMA_CU(8, 8);
1828
+    LUMA_CU(16, 16);
1829
+    LUMA_CU(32, 32);
1830
+    LUMA_CU(64, 64);
1831
+    
1832
+#if !(HIGH_BIT_DEPTH)
1833
+    p.cu[BLOCK_8x8].var   = pixel_var_neon<8>;
1834
+    p.cu[BLOCK_16x16].var = pixel_var_neon<16>;
1835
+#if defined(__APPLE__)
1836
+    p.cu[BLOCK_32x32].var = pixel_var_neon<32>;
1837
+    p.cu[BLOCK_64x64].var = pixel_var_neon<64>;
1838
+#endif // defined(__APPLE__)
1839
+#endif // !(HIGH_BIT_DEPTH)
1840
+
1841
+    p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = blockfill_s_neon<16>;
1842
+    p.cu[BLOCK_16x16].blockfill_s[ALIGNED]    = blockfill_s_neon<16>;
1843
+    p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = blockfill_s_neon<32>;
1844
+    p.cu[BLOCK_32x32].blockfill_s[ALIGNED]    = blockfill_s_neon<32>;
1845
+    p.cu[BLOCK_64x64].blockfill_s[NONALIGNED] = blockfill_s_neon<64>;
1846
+    p.cu[BLOCK_64x64].blockfill_s[ALIGNED]    = blockfill_s_neon<64>;
1847
+
1848
+
1849
+    p.cu[BLOCK_4x4].calcresidual[NONALIGNED]    = getResidual_neon<4>;
1850
+    p.cu[BLOCK_4x4].calcresidual[ALIGNED]       = getResidual_neon<4>;
1851
+    p.cu[BLOCK_8x8].calcresidual[NONALIGNED]    = getResidual_neon<8>;
1852
+    p.cu[BLOCK_8x8].calcresidual[ALIGNED]       = getResidual_neon<8>;
1853
+    p.cu[BLOCK_16x16].calcresidual[NONALIGNED]  = getResidual_neon<16>;
1854
+    p.cu[BLOCK_16x16].calcresidual[ALIGNED]     = getResidual_neon<16>;
1855
+    
1856
+#if defined(__APPLE__)
1857
+    p.cu[BLOCK_32x32].calcresidual[NONALIGNED]  = getResidual_neon<32>;
1858
+    p.cu[BLOCK_32x32].calcresidual[ALIGNED]     = getResidual_neon<32>;
1859
+#endif // defined(__APPLE__)
1860
+
1861
+    p.cu[BLOCK_4x4].sa8d   = pixel_satd_4x4_neon;
1862
+    p.cu[BLOCK_8x8].sa8d   = pixel_sa8d_8x8_neon;
1863
+    p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
1864
+    p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
1865
+    p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
1866
+
1867
+
1868
+#define CHROMA_PU_420(W, H) \
1869
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[NONALIGNED]  = addAvg_neon<W, H>;         \
1870
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[ALIGNED]  = addAvg_neon<W, H>;         \
1871
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1872
+
1873
+
1874
+    CHROMA_PU_420(4, 4);
1875
+    CHROMA_PU_420(8, 8);
1876
+    CHROMA_PU_420(16, 16);
1877
+    CHROMA_PU_420(32, 32);
1878
+    CHROMA_PU_420(4, 2);
1879
+    CHROMA_PU_420(8, 4);
1880
+    CHROMA_PU_420(4, 8);
1881
+    CHROMA_PU_420(8, 6);
1882
+    CHROMA_PU_420(6, 8);
1883
+    CHROMA_PU_420(8, 2);
1884
+    CHROMA_PU_420(2, 8);
1885
+    CHROMA_PU_420(16, 8);
1886
+    CHROMA_PU_420(8,  16);
1887
+    CHROMA_PU_420(16, 12);
1888
+    CHROMA_PU_420(12, 16);
1889
+    CHROMA_PU_420(16, 4);
1890
+    CHROMA_PU_420(4,  16);
1891
+    CHROMA_PU_420(32, 16);
1892
+    CHROMA_PU_420(16, 32);
1893
+    CHROMA_PU_420(32, 24);
1894
+    CHROMA_PU_420(24, 32);
1895
+    CHROMA_PU_420(32, 8);
1896
+    CHROMA_PU_420(8,  32);
1897
+
1898
+
1899
+
1900
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd   = NULL;
1901
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = pixel_satd_4x4_neon;
1902
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd8_neon<8, 8>;
1903
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
1904
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
1905
+
1906
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd   = NULL;
1907
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd   = NULL;
1908
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = pixel_satd_8x4_neon;
1909
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd8_neon<16, 8>;
1910
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd8_neon<8, 16>;
1911
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
1912
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
1913
+
1914
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd   = NULL;
1915
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd   = NULL;
1916
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd   = NULL;
1917
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd   = NULL;
1918
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4_neon<16, 12>;
1919
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd4_neon<16, 4>;
1920
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
1921
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
1922
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd8_neon<32, 8>;
1923
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd8_neon<8, 32>;
1924
+    
1925
+#if HIGH_BIT_DEPTH
1926
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = satd4_neon<4, 8>;
1927
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = satd4_neon<4, 16>;
1928
+#endif // HIGH_BIT_DEPTH
1929
+
1930
+#if !defined(__APPLE__) || HIGH_BIT_DEPTH
1931
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4_neon<12, 16>;
1932
+#endif // !defined(__APPLE__)
1933
+
1934
+
1935
+#define CHROMA_CU_420(W, H) \
1936
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp  = sse_neon<W, H, pixel, pixel>; \
1937
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1938
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
1939
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>;  \
1940
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
1941
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
1942
+    
1943
+#define CHROMA_CU_S_420(W, H) \
1944
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1945
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
1946
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>;  \
1947
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
1948
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
1949
+
1950
+
1951
+    CHROMA_CU_S_420(4, 4)
1952
+    CHROMA_CU_420(8, 8)
1953
+    CHROMA_CU_420(16, 16)
1954
+    CHROMA_CU_420(32, 32)
1955
+
1956
+
1957
+    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
1958
+    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
1959
+    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
1960
+    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
1961
+
1962
+
1963
+#define CHROMA_PU_422(W, H) \
1964
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[NONALIGNED]  = addAvg_neon<W, H>;         \
1965
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[ALIGNED]  = addAvg_neon<W, H>;         \
1966
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
1967
+
1968
+
1969
+    CHROMA_PU_422(4, 8);
1970
+    CHROMA_PU_422(8, 16);
1971
+    CHROMA_PU_422(16, 32);
1972
+    CHROMA_PU_422(32, 64);
1973
+    CHROMA_PU_422(4, 4);
1974
+    CHROMA_PU_422(2, 8);
1975
+    CHROMA_PU_422(8, 8);
1976
+    CHROMA_PU_422(4, 16);
1977
+    CHROMA_PU_422(8, 12);
1978
+    CHROMA_PU_422(6, 16);
1979
+    CHROMA_PU_422(8, 4);
1980
+    CHROMA_PU_422(2, 16);
1981
+    CHROMA_PU_422(16, 16);
1982
+    CHROMA_PU_422(8, 32);
1983
+    CHROMA_PU_422(16, 24);
1984
+    CHROMA_PU_422(12, 32);
1985
+    CHROMA_PU_422(16, 8);
1986
+    CHROMA_PU_422(4,  32);
1987
+    CHROMA_PU_422(32, 32);
1988
+    CHROMA_PU_422(16, 64);
1989
+    CHROMA_PU_422(32, 48);
1990
+    CHROMA_PU_422(24, 64);
1991
+    CHROMA_PU_422(32, 16);
1992
+    CHROMA_PU_422(8,  64);
1993
+
1994
+
1995
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd   = NULL;
1996
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd8_neon<8, 16>;
1997
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
1998
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
1999
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = pixel_satd_4x4_neon;
2000
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd   = NULL;
2001
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = satd8_neon<8, 8>;
2002
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
2003
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = satd8_neon<8, 32>;
2004
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
2005
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
2006
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd  = NULL;
2007
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd4_neon<8, 4>;
2008
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd  = NULL;
2009
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd8_neon<16, 8>;
2010
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
2011
+    
2012
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd4_neon<8, 12>;
2013
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = satd8_neon<8, 64>;
2014
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
2015
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8_neon<16, 24>;
2016
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8_neon<24, 64>;
2017
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8_neon<32, 48>;
2018
+
2019
+#if HIGH_BIT_DEPTH
2020
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd4_neon<4, 8>;
2021
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd4_neon<4, 16>;
2022
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd4_neon<4, 32>;
2023
+#endif // HIGH_BIT_DEPTH
2024
+
2025
+
2026
+#define CHROMA_CU_422(W, H) \
2027
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp  = sse_neon<W, H, pixel, pixel>;  \
2028
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
2029
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
2030
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
2031
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
2032
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
2033
+
2034
+#define CHROMA_CU_S_422(W, H) \
2035
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
2036
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
2037
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
2038
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
2039
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
2040
+    
2041
+    
2042
+    CHROMA_CU_S_422(4, 8)
2043
+    CHROMA_CU_422(8, 16)
2044
+    CHROMA_CU_422(16, 32)
2045
+    CHROMA_CU_422(32, 64)
2046
+
2047
+    p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
2048
+    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
2049
+    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
2050
+    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
2051
+
2052
+
2053
+}
2054
+
2055
+
2056
+}
2057
+
2058
+
2059
+#endif
2060
+
2061
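Reviewer note: setupPixelPrimitives_neon fills a function-pointer table that the encoder consults per block size; the baseline C primitives are installed first, then the NEON entries overwrite the ones they accelerate. A self-contained model of that dispatch pattern (names here are illustrative, not the real x265 structs):

    #include <cstdint>
    #include <cstdlib>

    using sad_fn = int (*)(const uint8_t *, intptr_t, const uint8_t *, intptr_t);

    struct PuPrimsModel { sad_fn sad = nullptr; };

    static int sad8x8_c(const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb)
    {
        int s = 0;
        for (int y = 0; y < 8; y++, a += sa, b += sb)
            for (int x = 0; x < 8; x++)
                s += std::abs((int)a[x] - (int)b[x]);
        return s;
    }

    void setupModel(PuPrimsModel &p)
    {
        p.sad = sad8x8_c;  // baseline C implementation
        // a NEON setup pass would overwrite p.sad here, exactly as
        // setupPixelPrimitives_neon overwrites p.pu[...].sad above
    }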
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.h Added
25
 
1
@@ -0,0 +1,23 @@
2
+#ifndef PIXEL_PRIM_NEON_H__
3
+#define PIXEL_PRIM_NEON_H__
4
+
5
+#include "common.h"
6
+#include "slicetype.h"      // LOWRES_COST_MASK
7
+#include "primitives.h"
8
+#include "x265.h"
9
+
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+
17
+void setupPixelPrimitives_neon(EncoderPrimitives &p);
18
+
19
+
20
+}
21
+
22
+
23
+#endif
24
+
25
x265_3.6.tar.gz/source/common/aarch64/pixel-util-common.S Added
86
 
1
@@ -0,0 +1,84 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.macro pixel_var_start
39
+    movi            v0.16b, #0
40
+    movi            v1.16b, #0
41
+    movi            v2.16b, #0
42
+    movi            v3.16b, #0
43
+.endm
44
+
45
+.macro pixel_var_1 v
46
+    uaddw           v0.8h, v0.8h, \v\().8b
47
+    umull           v30.8h, \v\().8b, \v\().8b
48
+    uaddw2          v1.8h, v1.8h, \v\().16b
49
+    umull2          v31.8h, \v\().16b, \v\().16b
50
+    uadalp          v2.4s, v30.8h
51
+    uadalp          v3.4s, v31.8h
52
+.endm
53
+
54
+.macro pixel_var_end
55
+    uaddlv          s0, v0.8h
56
+    uaddlv          s1, v1.8h
57
+    add             v2.4s, v2.4s, v3.4s
58
+    fadd            s0, s0, s1
59
+    uaddlv          d2, v2.4s
60
+    fmov            w0, s0
61
+    fmov            x2, d2
62
+    orr             x0, x0, x2, lsl #32
63
+.endm
64
+
65
+.macro ssimDist_start
66
+    movi            v0.16b, #0
67
+    movi            v1.16b, #0
68
+.endm
69
+
70
+.macro ssimDist_end
71
+    uaddlv          d0, v0.4s
72
+    uaddlv          d1, v1.4s
73
+    str             d0, [x6]
74
+    str             d1, [x4]
75
+.endm
76
+
77
+.macro normFact_start
78
+    movi            v0.16b, #0
79
+.endm
80
+
81
+.macro normFact_end
82
+    uaddlv          d0, v0.4s
83
+    str             d0, [x3]
84
+.endm
85
+
86
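
The pixel_var_* macros above keep two running totals per block, the pixel sum and the sum of squared pixels, and pack both into one 64-bit return value (sum in the low 32 bits, sum of squares in the high 32, as the final `orr x0, x0, x2, lsl #32` shows). A scalar C++ reference for those semantics, written from the assembly rather than taken from x265's own C fallback:

    #include <cstdint>

    // Returns pixel sum in the low 32 bits, sum of squares in the high 32.
    uint64_t pixel_var_ref(const uint8_t* pix, intptr_t stride, int size)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < size; y++, pix += stride)
            for (int x = 0; x < size; x++)
            {
                sum += pix[x];
                sqr += (uint32_t)pix[x] * pix[x];
            }
        return sum | ((uint64_t)sqr << 32);
    }

The caller can then derive the block variance as sqr/N - (sum/N)^2.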
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S Added
375
 
1
@@ -0,0 +1,373 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sub_ps_8x16_sve)
41
+    lsl             x1, x1, #1
42
+    ptrue           p0.h, vl8
43
+.rept 8
44
+    ld1b            {z0.h}, p0/z, [x2]
45
+    ld1b            {z1.h}, p0/z, [x3]
46
+    add             x2, x2, x4
47
+    add             x3, x3, x5
48
+    ld1b            {z2.h}, p0/z, [x2]
49
+    ld1b            {z3.h}, p0/z, [x3]
50
+    add             x2, x2, x4
51
+    add             x3, x3, x5
52
+    sub             z4.h, z0.h, z1.h
53
+    sub             z5.h, z2.h, z3.h
54
+    st1             {v4.8h}, [x0], x1
55
+    st1             {v5.8h}, [x0], x1
56
+.endr
57
+    ret
58
+endfunc
59
+
60
+//******* satd *******
61
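+// SATD is the sum of absolute transformed differences: the residual of a
+// 4x4 block is passed through a Hadamard transform (the add/sub and trn
+// pairs below) and the absolute values of the coefficients are summed,
+// which models coding cost more closely than a plain SAD.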
+.macro satd_4x4_sve
62
+    ld1b            {z0.h}, p0/z, [x0]
63
+    ld1b            {z2.h}, p0/z, [x2]
64
+    add             x0, x0, x1
65
+    add             x2, x2, x3
66
+    ld1b            {z1.h}, p0/z, [x0]
67
+    ld1b            {z3.h}, p0/z, [x2]
68
+    add             x0, x0, x1
69
+    add             x2, x2, x3
70
+    ld1b            {z4.h}, p0/z, [x0]
71
+    ld1b            {z6.h}, p0/z, [x2]
72
+    add             x0, x0, x1
73
+    add             x2, x2, x3
74
+    ld1b            {z5.h}, p0/z, [x0]
75
+    ld1b            {z7.h}, p0/z, [x2]
76
+    add             x0, x0, x1
77
+    add             x2, x2, x3
78
+
79
+    sub             z0.h, z0.h, z2.h
80
+    sub             z1.h, z1.h, z3.h
81
+    sub             z2.h, z4.h, z6.h
82
+    sub             z3.h, z5.h, z7.h
83
+
84
+    add             z4.h, z0.h, z2.h
85
+    add             z5.h, z1.h, z3.h
86
+    sub             z6.h, z0.h, z2.h
87
+    sub             z7.h, z1.h, z3.h
88
+
89
+    add             z0.h, z4.h, z5.h
90
+    sub             z1.h, z4.h, z5.h
91
+
92
+    add             z2.h, z6.h, z7.h
93
+    sub             z3.h, z6.h, z7.h
94
+
95
+    trn1            z4.h, z0.h, z2.h
96
+    trn2            z5.h, z0.h, z2.h
97
+
98
+    trn1            z6.h, z1.h, z3.h
99
+    trn2            z7.h, z1.h, z3.h
100
+
101
+    add             z0.h, z4.h, z5.h
102
+    sub             z1.h, z4.h, z5.h
103
+
104
+    add             z2.h, z6.h, z7.h
105
+    sub             z3.h, z6.h, z7.h
106
+
107
+    trn1            z4.s, z0.s, z1.s
108
+    trn2            z5.s, z0.s, z1.s
109
+
110
+    trn1            z6.s, z2.s, z3.s
111
+    trn2            z7.s, z2.s, z3.s
112
+
113
+    abs             z4.h, p0/m, z4.h
114
+    abs             z5.h, p0/m, z5.h
115
+    abs             z6.h, p0/m, z6.h
116
+    abs             z7.h, p0/m, z7.h
117
+
118
+    smax            z4.h, p0/m, z4.h, z5.h
119
+    smax            z6.h, p0/m, z6.h, z7.h
120
+
121
+    add             z0.h, z4.h, z6.h
122
+
123
+    uaddlp          v0.2s, v0.4h
124
+    uaddlp          v0.1d, v0.2s
125
+.endm
126
+
127
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
128
+function PFX(pixel_satd_4x4_sve)
129
+    ptrue           p0.h, vl4
130
+    satd_4x4_sve
131
+    fmov            x0, d0
132
+    ret
133
+endfunc
134
+
135
+function PFX(pixel_satd_8x4_sve)
136
+    ptrue           p0.h, vl4
137
+    mov             x4, x0
138
+    mov             x5, x2
139
+    satd_4x4_sve
140
+    add             x0, x4, #4
141
+    add             x2, x5, #4
142
+    umov            x6, v0.d[0]
143
+    satd_4x4_sve
144
+    umov            x0, v0.d[0]
145
+    add             x0, x0, x6
146
+    ret
147
+endfunc
148
+
149
+function PFX(pixel_satd_8x12_sve)
150
+    ptrue           p0.h, vl4
151
+    mov             x4, x0
152
+    mov             x5, x2
153
+    mov             x7, #0
154
+    satd_4x4_sve
155
+    umov            x6, v0.d[0]
156
+    add             x7, x7, x6
157
+    add             x0, x4, #4
158
+    add             x2, x5, #4
159
+    satd_4x4_sve
160
+    umov            x6, v0.d[0]
161
+    add             x7, x7, x6
162
+.rept 2
163
+    sub             x0, x0, #4
164
+    sub             x2, x2, #4
165
+    mov             x4, x0
166
+    mov             x5, x2
167
+    satd_4x4_sve
168
+    umov            x6, v0.d[0]
169
+    add             x7, x7, x6
170
+    add             x0, x4, #4
171
+    add             x2, x5, #4
172
+    satd_4x4_sve
173
+    umov            x6, v0.d[0]
174
+    add             x7, x7, x6
175
+.endr
176
+    mov             x0, x7
177
+    ret
178
+endfunc
179
+
180
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
181
+    mov             x11, #8 // fixed +8 byte offset, so the paired loads stay 8 pixels apart even on CPUs whose vector length exceeds 128 bits
182
+    ld1b            {z0.h}, p0/z, [x0]
183
+    ld1b            {z1.h}, p0/z, [x0, x11]
184
+    ld1b            {z2.h}, p0/z, [x2]
185
+    ld1b            {z3.h}, p0/z, [x2, x11]
186
+    add             x0, x0, x1
187
+    add             x2, x2, x3
188
+    ld1b            {z4.h}, p0/z, [x0]
189
+    ld1b            {z5.h}, p0/z, [x0, x11]
190
+    ld1b            {z6.h}, p0/z, [x2]
191
+    ld1b            {z7.h}, p0/z, [x2, x11]
192
+    add             x0, x0, x1
193
+    add             x2, x2, x3
194
+    ld1b            {z29.h}, p0/z, [x0]
195
+    ld1b            {z9.h}, p0/z, [x0, x11]
196
+    ld1b            {z10.h}, p0/z, [x2]
197
+    ld1b            {z11.h}, p0/z, [x2, x11]
198
+    add             x0, x0, x1
199
+    add             x2, x2, x3
200
+    ld1b            {z12.h}, p0/z, [x0]
201
+    ld1b            {z13.h}, p0/z, [x0, x11]
202
+    ld1b            {z14.h}, p0/z, [x2]
203
+    ld1b            {z15.h}, p0/z, [x2, x11]
204
+    add             x0, x0, x1
205
+    add             x2, x2, x3
206
+
207
+    sub             \v0\().h, z0.h, z2.h
208
+    sub             \v4\().h, z1.h, z3.h
209
+    sub             \v1\().h, z4.h, z6.h
210
+    sub             \v5\().h, z5.h, z7.h
211
+    sub             \v2\().h, z29.h, z10.h
212
+    sub             \v6\().h, z9.h, z11.h
213
+    sub             \v3\().h, z12.h, z14.h
214
+    sub             \v7\().h, z13.h, z15.h
215
+.endm
216
+
217
+// one vertical hadamard pass and two horizontal
218
+function PFX(satd_8x4v_8x8h_sve), export=0
219
+    HADAMARD4_V     z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
220
+    HADAMARD4_V     z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
221
+    trn4            z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
222
+    trn4            z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
223
+    SUMSUB_ABCD     z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
224
+    SUMSUB_ABCD     z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
225
+    trn4            z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
226
+    trn4            z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
227
+    ABS8_SVE        z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
228
+    smax            z0.h, p0/m, z0.h, z2.h
229
+    smax            z1.h, p0/m, z1.h, z3.h
230
+    smax            z4.h, p0/m, z4.h, z6.h
231
+    smax            z5.h, p0/m, z5.h, z7.h
232
+    ret
233
+endfunc
234
+
235
+function PFX(satd_16x4_sve), export=0
236
+    LOAD_DIFF_16x4_sve  z16, z17, z18, z19, z20, z21, z22, z23
237
+    b                    PFX(satd_8x4v_8x8h_sve)
238
+endfunc
239
+
240
+.macro pixel_satd_32x8_sve
241
+    mov             x4, x0
242
+    mov             x5, x2
243
+.rept 2
244
+    bl              PFX(satd_16x4_sve)
245
+    add             z30.h, z30.h, z0.h
246
+    add             z31.h, z31.h, z1.h
247
+    add             z30.h, z30.h, z4.h
248
+    add             z31.h, z31.h, z5.h
249
+.endr
250
+    add             x0, x4, #16
251
+    add             x2, x5, #16
252
+.rept 2
253
+    bl              PFX(satd_16x4_sve)
254
+    add             z30.h, z30.h, z0.h
255
+    add             z31.h, z31.h, z1.h
256
+    add             z30.h, z30.h, z4.h
257
+    add             z31.h, z31.h, z5.h
258
+.endr
259
+.endm
260
+
261
+.macro satd_32x16_sve
262
+    movi            v30.2d, #0
263
+    movi            v31.2d, #0
264
+    pixel_satd_32x8_sve
265
+    sub             x0, x0, #16
266
+    sub             x2, x2, #16
267
+    pixel_satd_32x8_sve
268
+    add             z0.h, z30.h, z31.h
269
+    uaddlv          s0, v0.8h
270
+    mov             w6, v0.s[0]
271
+.endm
272
+
273
+function PFX(pixel_satd_32x16_sve)
274
+    ptrue           p0.h, vl8
275
+    mov             x10, x30
276
+    satd_32x16_sve
277
+    mov             x0, x6
278
+    ret             x10
279
+endfunc
280
+
281
+function PFX(pixel_satd_32x32_sve)
282
+    ptrue           p0.h, vl8
283
+    mov             x10, x30
284
+    mov             x7, #0
285
+    satd_32x16_sve
286
+    sub             x0, x0, #16
287
+    sub             x2, x2, #16
288
+    add             x7, x7, x6
289
+    satd_32x16_sve
290
+    add             x0, x7, x6
291
+    ret             x10
292
+endfunc
293
+
294
+.macro satd_64x16_sve
295
+    mov             x8, x0
296
+    mov             x9, x2
297
+    satd_32x16_sve
298
+    add             x7, x7, x6
299
+    add             x0, x8, #32
300
+    add             x2, x9, #32
301
+    satd_32x16_sve
302
+    add             x7, x7, x6
303
+.endm
304
+
305
+function PFX(pixel_satd_64x48_sve)
306
+    ptrue           p0.h, vl8
307
+    mov             x10, x30
308
+    mov             x7, #0
309
+.rept 2
310
+    satd_64x16_sve
311
+    sub             x0, x0, #48
312
+    sub             x2, x2, #48
313
+.endr
314
+    satd_64x16_sve
315
+    mov             x0, x7
316
+    ret             x10
317
+endfunc
318
+
319
+/********* quant ***********/
320
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
321
+// No need to fully use SVE instructions for this function
322
+function PFX(quant_sve)
323
+    mov             w9, #1
324
+    lsl             w9, w9, w4
325
+    mov             z0.s, w9
326
+    neg             w9, w4
327
+    mov             z1.s, w9
328
+    add             w9, w9, #8
329
+    mov             z2.s, w9
330
+    mov             z3.s, w5
331
+
332
+    lsr             w6, w6, #2
333
+    eor             z4.d, z4.d, z4.d
334
+    eor             w10, w10, w10
335
+    eor             z17.d, z17.d, z17.d
336
+
337
+.loop_quant_sve:
338
+    ld1             {v18.4h}, [x0], #8
339
+    ld1             {v7.4s}, [x1], #16
340
+    sxtl            v6.4s, v18.4h
341
+
342
+    cmlt            v5.4s, v6.4s, #0
343
+
344
+    abs             v6.4s, v6.4s
345
+
346
+
347
+    mul             v6.4s, v6.4s, v7.4s
348
+
349
+    add             v7.4s, v6.4s, v3.4s
350
+    sshl            v7.4s, v7.4s, v1.4s
351
+
352
+    mls             v6.4s, v7.4s, v0.s[0]
353
+    sshl            v16.4s, v6.4s, v2.4s
354
+    st1             {v16.4s}, [x2], #16
355
+
356
+    // numsig
357
+    cmeq            v16.4s, v7.4s, v17.4s
358
+    add             v4.4s, v4.4s, v16.4s
359
+    add             w10, w10, #4
360
+
361
+    // level *= sign
362
+    eor             z16.d, z7.d, z5.d
363
+    sub             v16.4s, v16.4s, v5.4s
364
+    sqxtn           v5.4h, v16.4s
365
+    st1             {v5.4h}, [x3], #8
366
+
367
+    subs            w6, w6, #1
368
+    b.ne             .loop_quant_sve
369
+
370
+    addv            s4, v4.4s
371
+    mov             w9, v4.s[0]
372
+    add             w0, w10, w9
373
+    ret
374
+endfunc
375
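
To make the quantizer above easier to follow: each 4-coefficient step scales the coefficient, adds the rounding offset, shifts down by qBits, stores the deltaU rounding residue (used by RDO quantization), restores the sign, and counts the surviving levels. A scalar C++ sketch reconstructed from the assembly (close to, but not copied from, x265's quant_c; the saturating narrow of the final level is omitted here):

    #include <cstdint>
    #include <cstdlib>

    uint32_t quant_ref(const int16_t* coef, const int32_t* quantCoeff,
                       int32_t* deltaU, int16_t* qCoef,
                       int qBits, int add, int numCoeff)
    {
        uint32_t numSig = 0;                         // count of nonzero levels
        for (int i = 0; i < numCoeff; i++)
        {
            int sign  = coef[i] < 0 ? -1 : 1;
            int64_t t = (int64_t)std::abs(coef[i]) * quantCoeff[i];
            int level = (int)((t + add) >> qBits);   // scale + round + shift
            // rounding residue kept for rate-distortion-optimized quant
            deltaU[i] = (int)((t - ((int64_t)level << qBits)) >> (qBits - 8));
            numSig   += (level != 0);
            qCoef[i]  = (int16_t)(level * sign);     // restore sign
        }
        return numSig;
    }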
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S Added
1688
 
1
@@ -0,0 +1,1686 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
41
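+// Each *_sve2 function below reads the runtime vector length (rdvl) and
+// branches to a matching path: 128-bit vectors reuse the NEON-style code,
+// while longer vectors take fixed-length predicated paths (ptrue ... vlN).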
+function PFX(pixel_var_8x8_sve2)
42
+    ptrue           p0.h, vl8
43
+    ld1b            {z0.h}, p0/z, [x0]
44
+    add             x0, x0, x1
45
+    mul             z31.h, z0.h, z0.h
46
+    uaddlp          v1.4s, v31.8h
47
+.rept 7
48
+    ld1b            {z4.h}, p0/z, [x0]
49
+    add             x0, x0, x1
50
+    add             z0.h, z0.h, z4.h
51
+    mul             z31.h, z4.h, z4.h
52
+    uadalp          z1.s, p0/m, z31.h
53
+.endr
54
+    uaddlv          s0, v0.8h
55
+    uaddlv          d1, v1.4s
56
+    fmov            w0, s0
57
+    fmov            x1, d1
58
+    orr             x0, x0, x1, lsl #32
59
+    ret
60
+endfunc
61
+
62
+function PFX(pixel_var_16x16_sve2)
63
+    rdvl            x9, #1
64
+    cmp             x9, #16
65
+    bgt             .vl_gt_16_pixel_var_16x16
66
+    pixel_var_start
67
+    mov             w12, #16
68
+.loop_var_16_sve2:
69
+    sub             w12, w12, #1
70
+    ld1             {v4.16b}, [x0], x1
71
+    pixel_var_1 v4
72
+    cbnz            w12, .loop_var_16_sve2
73
+    pixel_var_end
74
+    ret
75
+.vl_gt_16_pixel_var_16x16:
76
+    ptrue           p0.h, vl16
77
+    mov             z0.d, #0
78
+.rept 16
79
+    ld1b            {z4.h}, p0/z, [x0]
80
+    add             x0, x0, x1
81
+    add             z0.h, z0.h, z4.h
82
+    mul             z30.h, z4.h, z4.h
83
+    uadalp          z1.s, p0/m, z30.h
84
+.endr
85
+    uaddv           d0, p0, z0.h
86
+    uaddv           d1, p0, z1.s
87
+    fmov            w0, s0
88
+    fmov            x1, d1
89
+    orr             x0, x0, x1, lsl #32
90
+    ret
91
+endfunc
92
+
93
+function PFX(pixel_var_32x32_sve2)
94
+    rdvl            x9, #1
95
+    cmp             x9, #16
96
+    bgt             .vl_gt_16_pixel_var_32x32
97
+    pixel_var_start
98
+    mov             w12, #32
99
+.loop_var_32_sve2:
100
+    sub             w12, w12, #1
101
+    ld1             {v4.16b-v5.16b}, [x0], x1
102
+    pixel_var_1 v4
103
+    pixel_var_1 v5
104
+    cbnz            w12, .loop_var_32_sve2
105
+    pixel_var_end
106
+    ret
107
+.vl_gt_16_pixel_var_32x32:
108
+    cmp             x9, #48
109
+    bgt             .vl_gt_48_pixel_var_32x32
110
+    ptrue           p0.b, vl32
111
+    mov             z0.d, #0
112
+    mov             z1.d, #0
113
+.rept 32
114
+    ld1b            {z4.b}, p0/z, [x0]
115
+    add             x0, x0, x1
116
+    uaddwb          z0.h, z0.h, z4.b
117
+    uaddwt          z0.h, z0.h, z4.b
118
+    umullb          z28.h, z4.b, z4.b
119
+    umullt          z29.h, z4.b, z4.b
120
+    uadalp          z1.s, p0/m, z28.h
121
+    uadalp          z1.s, p0/m, z29.h
122
+.endr
123
+    uaddv           d0, p0, z0.h
124
+    uaddv           d1, p0, z1.s
125
+    fmov            w0, s0
126
+    fmov            x1, d1
127
+    orr             x0, x0, x1, lsl #32
128
+    ret
129
+.vl_gt_48_pixel_var_32x32:
130
+    ptrue           p0.h, vl32
131
+    mov             z0.d, #0
132
+    mov             z1.d, #0
133
+.rept 32
134
+    ld1b            {z4.h}, p0/z, [x0]
135
+    add             x0, x0, x1
136
+    add             z0.h, z0.h, z4.h
137
+    mul             z28.h, z4.h, z4.h
138
+    uadalp          z1.s, p0/m, z28.h
139
+.endr
140
+    uaddv           d0, p0, z0.h
141
+    uaddv           d1, p0, z1.s
142
+    fmov            w0, s0
143
+    fmov            x1, d1
144
+    orr             x0, x0, x1, lsl #32
145
+    ret
146
+endfunc
147
+
148
+function PFX(pixel_var_64x64_sve2)
149
+    rdvl            x9, #1
150
+    cmp             x9, #16
151
+    bgt             .vl_gt_16_pixel_var_64x64
152
+    pixel_var_start
153
+    mov             w12, #64
154
+.loop_var_64_sve2:
155
+    sub             w12, w12, #1
156
+    ld1             {v4.16b-v7.16b}, [x0], x1
157
+    pixel_var_1 v4
158
+    pixel_var_1 v5
159
+    pixel_var_1 v6
160
+    pixel_var_1 v7
161
+    cbnz            w12, .loop_var_64_sve2
162
+    pixel_var_end
163
+    ret
164
+.vl_gt_16_pixel_var_64x64:
165
+    cmp             x9, #48
166
+    bgt             .vl_gt_48_pixel_var_64x64
167
+    ptrue           p0.b, vl32
168
+    mov             z0.d, #0
169
+    mov             z2.d, #0
170
+.rept 64
171
+    ld1b            {z4.b}, p0/z, [x0]
172
+    ld1b            {z5.b}, p0/z, [x0, #1, mul vl]
173
+    add             x0, x0, x1
174
+    uaddwb          z0.h, z0.h, z4.b
175
+    uaddwt          z0.h, z0.h, z4.b
176
+    uaddwb          z0.h, z0.h, z5.b
177
+    uaddwt          z0.h, z0.h, z5.b
178
+    umullb          z24.h, z4.b, z4.b
179
+    umullt          z25.h, z4.b, z4.b
180
+    umullb          z26.h, z5.b, z5.b
181
+    umullt          z27.h, z5.b, z5.b
182
+    uadalp          z2.s, p0/m, z24.h
183
+    uadalp          z2.s, p0/m, z25.h
184
+    uadalp          z2.s, p0/m, z26.h
185
+    uadalp          z2.s, p0/m, z27.h
186
+.endr
187
+    uaddv           d0, p0, z0.h
188
+    uaddv           d1, p0, z2.s
189
+    fmov            w0, s0
190
+    fmov            x1, d1
191
+    orr             x0, x0, x1, lsl #32
192
+    ret
193
+.vl_gt_48_pixel_var_64x64:
194
+    cmp             x9, #112
195
+    bgt             .vl_gt_112_pixel_var_64x64
196
+    ptrue           p0.b, vl64
197
+    mov             z0.d, #0
198
+    mov             z1.d, #0
199
+.rept 64
200
+    ld1b            {z4.b}, p0/z, [x0]
201
+    add             x0, x0, x1
202
+    uaddwb          z0.h, z0.h, z4.b
203
+    uaddwt          z0.h, z0.h, z4.b
204
+    umullb          z24.h, z4.b, z4.b
205
+    umullt          z25.h, z4.b, z4.b
206
+    uadalp          z2.s, p0/m, z24.h
207
+    uadalp          z2.s, p0/m, z25.h
208
+.endr
209
+    uaddv           d0, p0, z0.h
210
+    uaddv           d1, p0, z2.s
211
+    fmov            w0, s0
212
+    fmov            x1, d1
213
+    orr             x0, x0, x1, lsl #32
214
+    ret
215
+.vl_gt_112_pixel_var_64x64:
216
+    ptrue           p0.h, vl64
217
+    mov             z0.d, #0
218
+    mov             z1.d, #0
219
+.rept 64
220
+    ld1b            {z4.h}, p0/z, [x0]
221
+    add             x0, x0, x1
222
+    add             z0.h, z0.h, z4.h
223
+    mul             z24.h, z4.h, z4.h
224
+    uadalp          z1.s, p0/m, z24.h
225
+.endr
226
+    uaddv           d0, p0, z0.h
227
+    uaddv           d1, p0, z1.s
228
+    fmov            w0, s0
229
+    fmov            x1, d1
230
+    orr             x0, x0, x1, lsl #32
231
+    ret
232
+endfunc
233
+
234
+function PFX(getResidual16_sve2)
235
+    rdvl            x9, #1
236
+    cmp             x9, #16
237
+    bgt             .vl_gt_16_getResidual16
238
+    lsl             x4, x3, #1
239
+.rept 8
240
+    ld1             {v0.16b}, [x0], x3
241
+    ld1             {v1.16b}, [x1], x3
242
+    ld1             {v2.16b}, [x0], x3
243
+    ld1             {v3.16b}, [x1], x3
244
+    usubl           v4.8h, v0.8b, v1.8b
245
+    usubl2          v5.8h, v0.16b, v1.16b
246
+    usubl           v6.8h, v2.8b, v3.8b
247
+    usubl2          v7.8h, v2.16b, v3.16b
248
+    st1             {v4.8h-v5.8h}, [x2], x4
249
+    st1             {v6.8h-v7.8h}, [x2], x4
250
+.endr
251
+    ret
252
+.vl_gt_16_getResidual16:
253
+    ptrue           p0.h, vl16
254
+.rept 16
255
+    ld1b            {z0.h}, p0/z, [x0]
256
+    ld1b            {z2.h}, p0/z, [x1]
257
+    add             x0, x0, x3
258
+    add             x1, x1, x3
259
+    sub             z4.h, z0.h, z2.h
260
+    st1h            {z4.h}, p0, [x2]
261
+    add             x2, x2, x3, lsl #1
262
+.endr
263
+    ret
264
+endfunc
265
+
266
+function PFX(getResidual32_sve2)
267
+    rdvl            x9, #1
268
+    cmp             x9, #16
269
+    bgt             .vl_gt_16_getResidual32
270
+    lsl             x4, x3, #1
271
+    mov             w12, #4
272
+.loop_residual_32:
273
+    sub             w12, w12, #1
274
+.rept 4
275
+    ld1             {v0.16b-v1.16b}, [x0], x3
276
+    ld1             {v2.16b-v3.16b}, [x1], x3
277
+    ld1             {v4.16b-v5.16b}, [x0], x3
278
+    ld1             {v6.16b-v7.16b}, [x1], x3
279
+    usubl           v16.8h, v0.8b, v2.8b
280
+    usubl2          v17.8h, v0.16b, v2.16b
281
+    usubl           v18.8h, v1.8b, v3.8b
282
+    usubl2          v19.8h, v1.16b, v3.16b
283
+    usubl           v20.8h, v4.8b, v6.8b
284
+    usubl2          v21.8h, v4.16b, v6.16b
285
+    usubl           v22.8h, v5.8b, v7.8b
286
+    usubl2          v23.8h, v5.16b, v7.16b
287
+    st1             {v16.8h-v19.8h}, [x2], x4
288
+    st1             {v20.8h-v23.8h}, [x2], x4
289
+.endr
290
+    cbnz            w12, .loop_residual_32
291
+    ret
292
+.vl_gt_16_getResidual32:
293
+    cmp             x9, #48
294
+    bgt             .vl_gt_48_getResidual32
295
+    ptrue           p0.b, vl32
296
+.rept 32
297
+    ld1b            {z0.b}, p0/z, [x0]
298
+    ld1b            {z2.b}, p0/z, [x1]
299
+    add             x0, x0, x3
300
+    add             x1, x1, x3
301
+    usublb          z4.h, z0.b, z2.b
302
+    usublt          z5.h, z0.b, z2.b
303
+    st2h            {z4.h, z5.h}, p0, [x2]
304
+    add             x2, x2, x3, lsl #1
305
+.endr
306
+    ret
307
+.vl_gt_48_getResidual32:
308
+    ptrue           p0.h, vl32
309
+.rept 32
310
+    ld1b            {z0.h}, p0/z, [x0]
311
+    ld1b            {z4.h}, p0/z, [x1]
312
+    add             x0, x0, x3
313
+    add             x1, x1, x3
314
+    sub             z8.h, z0.h, z4.h
315
+    st1h            {z8.h}, p0, [x2]
316
+    add             x2, x2, x3, lsl #1
317
+.endr
318
+    ret
319
+endfunc
320
+
321
+function PFX(pixel_sub_ps_32x32_sve2)
322
+    rdvl            x9, #1
323
+    cmp             x9, #16
324
+    bgt             .vl_gt_16_pixel_sub_ps_32x32
325
+    lsl             x1, x1, #1
326
+    mov             w12, #4
327
+.loop_sub_ps_32_sve2:
328
+    sub             w12, w12, #1
329
+.rept 4
330
+    ld1             {v0.16b-v1.16b}, [x2], x4
331
+    ld1             {v2.16b-v3.16b}, [x3], x5
332
+    ld1             {v4.16b-v5.16b}, [x2], x4
333
+    ld1             {v6.16b-v7.16b}, [x3], x5
334
+    usubl           v16.8h, v0.8b, v2.8b
335
+    usubl2          v17.8h, v0.16b, v2.16b
336
+    usubl           v18.8h, v1.8b, v3.8b
337
+    usubl2          v19.8h, v1.16b, v3.16b
338
+    usubl           v20.8h, v4.8b, v6.8b
339
+    usubl2          v21.8h, v4.16b, v6.16b
340
+    usubl           v22.8h, v5.8b, v7.8b
341
+    usubl2          v23.8h, v5.16b, v7.16b
342
+    st1             {v16.8h-v19.8h}, [x0], x1
343
+    st1             {v20.8h-v23.8h}, [x0], x1
344
+.endr
345
+    cbnz            w12, .loop_sub_ps_32_sve2
346
+    ret
347
+.vl_gt_16_pixel_sub_ps_32x32:
348
+    cmp             x9, #48
349
+    bgt             .vl_gt_48_pixel_sub_ps_32x32
350
+    ptrue           p0.b, vl32
351
+    mov             w12, #8
352
+.vl_gt_16_loop_sub_ps_32_sve2:
353
+    sub             w12, w12, #1
354
+.rept 4
355
+    ld1b            {z0.b}, p0/z, [x2]
356
+    ld1b            {z2.b}, p0/z, [x3]
357
+    add             x2, x2, x4
358
+    add             x3, x3, x5
359
+    usublb          z16.h, z0.b, z2.b
360
+    usublt          z17.h, z0.b, z2.b
361
+    st2h            {z16.h, z17.h}, p0, [x0]
362
+    add             x0, x0, x1, lsl #1
363
+.endr
364
+    cbnz            w12, .vl_gt_16_loop_sub_ps_32_sve2
365
+    ret
366
+.vl_gt_48_pixel_sub_ps_32x32:
367
+    ptrue           p0.h, vl32
368
+    mov             w12, #8
369
+.vl_gt_48_loop_sub_ps_32_sve2:
370
+    sub             w12, w12, #1
371
+.rept 4
372
+    ld1b            {z0.h}, p0/z, [x2]
373
+    ld1b            {z4.h}, p0/z, [x3]
374
+    add             x2, x2, x4
375
+    add             x3, x3, x5
376
+    sub             z8.h, z0.h, z4.h
377
+    st1h            {z8.h}, p0, [x0]
378
+    add             x0, x0, x1, lsl #1
379
+.endr
380
+    cbnz            w12, .vl_gt_48_loop_sub_ps_32_sve2
381
+    ret
382
+endfunc
383
+
384
+function PFX(pixel_sub_ps_64x64_sve2)
385
+    rdvl            x9, #1
386
+    cmp             x9, #16
387
+    bgt             .vl_gt_16_pixel_sub_ps_64x64
388
+    lsl             x1, x1, #1
389
+    sub             x1, x1, #64
390
+    mov             w12, #16
391
+.loop_sub_ps_64_sve2:
392
+    sub             w12, w12, #1
393
+.rept 4
394
+    ld1             {v0.16b-v3.16b}, [x2], x4
395
+    ld1             {v4.16b-v7.16b}, [x3], x5
396
+    usubl           v16.8h, v0.8b, v4.8b
397
+    usubl2          v17.8h, v0.16b, v4.16b
398
+    usubl           v18.8h, v1.8b, v5.8b
399
+    usubl2          v19.8h, v1.16b, v5.16b
400
+    usubl           v20.8h, v2.8b, v6.8b
401
+    usubl2          v21.8h, v2.16b, v6.16b
402
+    usubl           v22.8h, v3.8b, v7.8b
403
+    usubl2          v23.8h, v3.16b, v7.16b
404
+    st1             {v16.8h-v19.8h}, [x0], #64
405
+    st1             {v20.8h-v23.8h}, [x0], x1
406
+.endr
407
+    cbnz            w12, .loop_sub_ps_64_sve2
408
+    ret
409
+.vl_gt_16_pixel_sub_ps_64x64:
410
+    rdvl            x9, #1
411
+    cmp             x9, #48
412
+    bgt             .vl_gt_48_pixel_sub_ps_64x64
413
+    ptrue           p0.b, vl32
414
+    mov             w12, #16
415
+.vl_gt_16_loop_sub_ps_64_sve2:
416
+    sub             w12, w12, #1
417
+.rept 4
418
+    ld1b            {z0.b}, p0/z, [x2]
419
+    ld1b            {z1.b}, p0/z, [x2, #1, mul vl]
420
+    ld1b            {z4.b}, p0/z, [x3]
421
+    ld1b            {z5.b}, p0/z, [x3, #1, mul vl]
422
+    add             x2, x2, x4
423
+    add             x3, x3, x5
424
+    usublb          z16.h, z0.b, z4.b
425
+    usublt          z17.h, z0.b, z4.b
426
+    usublb          z18.h, z1.b, z5.b
427
+    usublt          z19.h, z1.b, z5.b
428
+    st2h            {z16.h, z17.h}, p0, [x0]
429
+    st2h            {z18.h, z19.h}, p0, [x0, #2, mul vl]
430
+    add             x0, x0, x1, lsl #1
431
+.endr
432
+    cbnz            w12, .vl_gt_16_loop_sub_ps_64_sve2
433
+    ret
434
+.vl_gt_48_pixel_sub_ps_64x64:
435
+    cmp             x9, #112
436
+    bgt             .vl_gt_112_pixel_sub_ps_64x64
437
+    ptrue           p0.b, vl64
438
+    mov             w12, #16
439
+.vl_gt_48_loop_sub_ps_64_sve2:
440
+    sub             w12, w12, #1
441
+.rept 4
442
+    ld1b            {z0.b}, p0/z, [x2]
443
+    ld1b            {z4.b}, p0/z, [x3]
444
+    add             x2, x2, x4
445
+    add             x3, x3, x5
446
+    usublb          z16.h, z0.b, z4.b
447
+    usublt          z17.h, z0.b, z4.b
448
+    st2h            {z16.h, z17.h}, p0, [x0]
449
+    add             x0, x0, x1, lsl #1
450
+.endr
451
+    cbnz            w12, .vl_gt_48_loop_sub_ps_64_sve2
452
+    ret
453
+.vl_gt_112_pixel_sub_ps_64x64:
454
+    ptrue           p0.h, vl64
455
+    mov             w12, #16
456
+.vl_gt_112_loop_sub_ps_64_sve2:
457
+    sub             w12, w12, #1
458
+.rept 4
459
+    ld1b            {z0.h}, p0/z, [x2]
460
+    ld1b            {z8.h}, p0/z, [x3]
461
+    add             x2, x2, x4
462
+    add             x3, x3, x5
463
+    sub             z16.h, z0.h, z8.h
464
+    st1h            {z16.h}, p0, [x0]
465
+    add             x0, x0, x1, lsl #1
466
+.endr
467
+    cbnz            w12, .vl_gt_112_loop_sub_ps_64_sve2
468
+    ret
469
+endfunc
470
+
471
+function PFX(pixel_sub_ps_32x64_sve2)
472
+    rdvl            x9, #1
473
+    cmp             x9, #16
474
+    bgt             .vl_gt_16_pixel_sub_ps_32x64
475
+    lsl             x1, x1, #1
476
+    mov             w12, #8
477
+.loop_sub_ps_32x64_sve2:
478
+    sub             w12, w12, #1
479
+.rept 4
480
+    ld1             {v0.16b-v1.16b}, [x2], x4
481
+    ld1             {v2.16b-v3.16b}, [x3], x5
482
+    ld1             {v4.16b-v5.16b}, [x2], x4
483
+    ld1             {v6.16b-v7.16b}, [x3], x5
484
+    usubl           v16.8h, v0.8b, v2.8b
485
+    usubl2          v17.8h, v0.16b, v2.16b
486
+    usubl           v18.8h, v1.8b, v3.8b
487
+    usubl2          v19.8h, v1.16b, v3.16b
488
+    usubl           v20.8h, v4.8b, v6.8b
489
+    usubl2          v21.8h, v4.16b, v6.16b
490
+    usubl           v22.8h, v5.8b, v7.8b
491
+    usubl2          v23.8h, v5.16b, v7.16b
492
+    st1             {v16.8h-v19.8h}, [x0], x1
493
+    st1             {v20.8h-v23.8h}, [x0], x1
494
+.endr
495
+    cbnz            w12, .loop_sub_ps_32x64_sve2
496
+    ret
497
+.vl_gt_16_pixel_sub_ps_32x64:
498
+    cmp             x9, #48
499
+    bgt             .vl_gt_48_pixel_sub_ps_32x64
500
+    ptrue           p0.b, vl32
501
+    mov             w12, #8
502
+.vl_gt_16_loop_sub_ps_32x64_sve2:
503
+    sub             w12, w12, #1
504
+.rept 8
505
+    ld1b            {z0.b}, p0/z, [x2]
506
+    ld1b            {z2.b}, p0/z, [x3]
507
+    add             x2, x2, x4
508
+    add             x3, x3, x5
509
+    usublb          z16.h, z0.b, z2.b
510
+    usublt          z17.h, z0.b, z2.b
511
+    st2h            {z16.h, z17.h}, p0, [x0]
512
+    add             x0, x0, x1, lsl #1
513
+.endr
514
+    cbnz            w12, .vl_gt_16_loop_sub_ps_32x64_sve2
515
+    ret
516
+.vl_gt_48_pixel_sub_ps_32x64:
517
+    ptrue           p0.h, vl32
518
+    mov             w12, #8
519
+.vl_gt_48_loop_sub_ps_32x64_sve2:
520
+    sub             w12, w12, #1
521
+.rept 8
522
+    ld1b            {z0.h}, p0/z, [x2]
523
+    ld1b            {z4.h}, p0/z, [x3]
524
+    add             x2, x2, x4
525
+    add             x3, x3, x5
526
+    sub             z8.h, z0.h, z4.h
527
+    st1h            {z8.h}, p0, [x0]
528
+    add             x0, x0, x1, lsl #1
529
+.endr
530
+    cbnz            w12, .vl_gt_48_loop_sub_ps_32x64_sve2
531
+    ret
532
+endfunc
533
+
534
+function PFX(pixel_add_ps_4x4_sve2)
535
+    ptrue           p0.h, vl8
536
+    ptrue           p1.h, vl4
537
+.rept 4
538
+    ld1b            {z0.h}, p0/z, [x2]
539
+    ld1h            {z2.h}, p1/z, [x3]
540
+    add             x2, x2, x4
541
+    add             x3, x3, x5, lsl #1
542
+    add             z4.h, z0.h, z2.h
543
+    sqxtunb         z4.b, z4.h
544
+    st1b            {z4.h}, p1, [x0]
545
+    add             x0, x0, x1
546
+.endr
547
+    ret
548
+endfunc
549
+
550
+function PFX(pixel_add_ps_8x8_sve2)
551
+    ptrue           p0.h, vl8
552
+.rept 8
553
+    ld1b            {z0.h}, p0/z, [x2]
554
+    ld1h            {z2.h}, p0/z, [x3]
555
+    add             x2, x2, x4
556
+    add             x3, x3, x5, lsl #1
557
+    add             z4.h, z0.h, z2.h
558
+    sqxtunb         z4.b, z4.h
559
+    st1b            {z4.h}, p0, [x0]
560
+    add             x0, x0, x1
561
+.endr
562
+    ret
563
+endfunc
564
+
565
+.macro pixel_add_ps_16xN_sve2 h
566
+function PFX(pixel_add_ps_16x\h\()_sve2)
567
+    rdvl            x9, #1
568
+    cmp             x9, #16
569
+    bgt             .vl_gt_16_pixel_add_ps_16x\h
570
+    ptrue           p0.b, vl16
571
+.rept \h
572
+    ld1b            {z0.h}, p0/z, [x2]
573
+    ld1b            {z1.h}, p0/z, [x2, #1, mul vl]
574
+    ld1h            {z2.h}, p0/z, [x3]
575
+    ld1h            {z3.h}, p0/z, [x3, #1, mul vl]
576
+    add             x2, x2, x4
577
+    add             x3, x3, x5, lsl #1
578
+    add             z24.h, z0.h, z2.h
579
+    add             z25.h, z1.h, z3.h
580
+    sqxtunb         z6.b, z24.h
581
+    sqxtunb         z7.b, z25.h
582
+    st1b            {z6.h}, p0, [x0]
583
+    st1b            {z7.h}, p0, [x0, #1, mul vl]
584
+    add             x0, x0, x1
585
+.endr
586
+    ret
587
+.vl_gt_16_pixel_add_ps_16x\h\():
588
+    ptrue           p0.b, vl32
589
+.rept \h
590
+    ld1b            {z0.h}, p0/z, [x2]
591
+    ld1h            {z2.h}, p0/z, [x3]
592
+    add             x2, x2, x4
593
+    add             x3, x3, x5, lsl #1
594
+    add             z24.h, z0.h, z2.h
595
+    sqxtunb         z6.b, z24.h
596
+    st1b            {z6.h}, p0, [x0]
597
+    add             x0, x0, x1
598
+.endr
599
+    ret
600
+endfunc
601
+.endm
602
+
603
+pixel_add_ps_16xN_sve2 16
604
+pixel_add_ps_16xN_sve2 32
605
+
606
+.macro pixel_add_ps_32xN_sve2 h
607
+ function PFX(pixel_add_ps_32x\h\()_sve2)
608
+    rdvl            x9, #1
609
+    cmp             x9, #16
610
+    bgt             .vl_gt_16_pixel_add_ps_32x\h
611
+    lsl             x5, x5, #1
612
+    mov             w12, #\h / 4
613
+.loop_add_ps__sve2_32x\h\():
614
+    sub             w12, w12, #1
615
+.rept 4
616
+    ld1             {v0.16b-v1.16b}, [x2], x4
617
+    ld1             {v16.8h-v19.8h}, [x3], x5
618
+    uxtl            v4.8h, v0.8b
619
+    uxtl2           v5.8h, v0.16b
620
+    uxtl            v6.8h, v1.8b
621
+    uxtl2           v7.8h, v1.16b
622
+    add             v24.8h, v4.8h, v16.8h
623
+    add             v25.8h, v5.8h, v17.8h
624
+    add             v26.8h, v6.8h, v18.8h
625
+    add             v27.8h, v7.8h, v19.8h
626
+    sqxtun          v4.8b, v24.8h
627
+    sqxtun2         v4.16b, v25.8h
628
+    sqxtun          v5.8b, v26.8h
629
+    sqxtun2         v5.16b, v27.8h
630
+    st1             {v4.16b-v5.16b}, [x0], x1
631
+.endr
632
+    cbnz            w12, .loop_add_ps__sve2_32x\h
633
+    ret
634
+.vl_gt_16_pixel_add_ps_32x\h\():
635
+    cmp             x9, #48
636
+    bgt             .vl_gt_48_pixel_add_ps_32x\h
637
+    ptrue           p0.b, vl32
638
+.rept \h
639
+    ld1b            {z0.h}, p0/z, [x2]
640
+    ld1b            {z1.h}, p0/z, [x2, #1, mul vl]
641
+    ld1h            {z4.h}, p0/z, [x3]
642
+    ld1h            {z5.h}, p0/z, [x3, #1, mul vl]
643
+    add             x2, x2, x4
644
+    add             x3, x3, x5, lsl #1
645
+    add             z24.h, z0.h, z4.h
646
+    add             z25.h, z1.h, z5.h
647
+    sqxtunb         z6.b, z24.h
648
+    sqxtunb         z7.b, z25.h
649
+    st1b            {z6.h}, p0, [x0]
650
+    st1b            {z7.h}, p0, [x0, #1, mul vl]
651
+    add             x0, x0, x1
652
+.endr
653
+    ret
654
+.vl_gt_48_pixel_add_ps_32x\h\():
655
+    ptrue           p0.b, vl64
656
+.rept \h
657
+    ld1b            {z0.h}, p0/z, [x2]
658
+    ld1h            {z4.h}, p0/z, [x3]
659
+    add             x2, x2, x4
660
+    add             x3, x3, x5, lsl #1
661
+    add             z24.h, z0.h, z4.h
662
+    sqxtunb         z6.b, z24.h
663
+    st1b            {z6.h}, p0, [x0]
664
+    add             x0, x0, x1
665
+.endr
666
+    ret
667
+endfunc
668
+.endm
669
+
670
+pixel_add_ps_32xN_sve2 32
671
+pixel_add_ps_32xN_sve2 64
672
+
673
+function PFX(pixel_add_ps_64x64_sve2)
674
+    rdvl            x9, #1
675
+    cmp             x9, #16
676
+    bgt             .vl_gt_16_pixel_add_ps_64x64
677
+    ptrue           p0.b, vl16
678
+.rept 64
679
+    ld1b            {z0.h}, p0/z, [x2]
680
+    ld1b            {z1.h}, p0/z, [x2, #1, mul vl]
681
+    ld1b            {z2.h}, p0/z, [x2, #2, mul vl]
682
+    ld1b            {z3.h}, p0/z, [x2, #3, mul vl]
683
+    ld1b            {z4.h}, p0/z, [x2, #4, mul vl]
684
+    ld1b            {z5.h}, p0/z, [x2, #5, mul vl]
685
+    ld1b            {z6.h}, p0/z, [x2, #6, mul vl]
686
+    ld1b            {z7.h}, p0/z, [x2, #7, mul vl]
687
+    ld1h            {z8.h}, p0/z, [x3]
688
+    ld1h            {z9.h}, p0/z, [x3, #1, mul vl]
689
+    ld1h            {z10.h}, p0/z, [x3, #2, mul vl]
690
+    ld1h            {z11.h}, p0/z, [x3, #3, mul vl]
691
+    ld1h            {z12.h}, p0/z, [x3, #4, mul vl]
692
+    ld1h            {z13.h}, p0/z, [x3, #5, mul vl]
693
+    ld1h            {z14.h}, p0/z, [x3, #6, mul vl]
694
+    ld1h            {z15.h}, p0/z, [x3, #7, mul vl]
695
+    add             x2, x2, x4
696
+    add             x3, x3, x5, lsl #1
697
+    add             z24.h, z0.h, z8.h
698
+    add             z25.h, z1.h, z9.h
699
+    add             z26.h, z2.h, z10.h
700
+    add             z27.h, z3.h, z11.h
701
+    add             z28.h, z4.h, z12.h
702
+    add             z29.h, z5.h, z13.h
703
+    add             z30.h, z6.h, z14.h
704
+    add             z31.h, z7.h, z15.h
705
+    sqxtunb         z6.b, z24.h
706
+    sqxtunb         z7.b, z25.h
707
+    sqxtunb         z8.b, z26.h
708
+    sqxtunb         z9.b, z27.h
709
+    sqxtunb         z10.b, z28.h
710
+    sqxtunb         z11.b, z29.h
711
+    sqxtunb         z12.b, z30.h
712
+    sqxtunb         z13.b, z31.h
713
+    st1b            {z6.h}, p0, [x0]
714
+    st1b            {z7.h}, p0, [x0, #1, mul vl]
715
+    st1b            {z8.h}, p0, [x0, #2, mul vl]
716
+    st1b            {z9.h}, p0, [x0, #3, mul vl]
717
+    st1b            {z10.h}, p0, [x0, #4, mul vl]
718
+    st1b            {z11.h}, p0, [x0, #5, mul vl]
719
+    st1b            {z12.h}, p0, [x0, #6, mul vl]
720
+    st1b            {z13.h}, p0, [x0, #7, mul vl]
721
+    add             x0, x0, x1
722
+.endr
723
+    ret
724
+.vl_gt_16_pixel_add_ps_64x64:
725
+    cmp             x9, #48
726
+    bgt             .vl_gt_48_pixel_add_ps_64x64
727
+    ptrue           p0.b, vl32
728
+.rept 64
729
+    ld1b            {z0.h}, p0/z, [x2]
730
+    ld1b            {z1.h}, p0/z, [x2, #1, mul vl]
731
+    ld1b            {z2.h}, p0/z, [x2, #2, mul vl]
732
+    ld1b            {z3.h}, p0/z, [x2, #3, mul vl]
733
+    ld1h            {z8.h}, p0/z, [x3]
734
+    ld1h            {z9.h}, p0/z, [x3, #1, mul vl]
735
+    ld1h            {z10.h}, p0/z, [x3, #2, mul vl]
736
+    ld1h            {z11.h}, p0/z, [x3, #3, mul vl]
737
+    add             x2, x2, x4
738
+    add             x3, x3, x5, lsl #1
739
+    add             z24.h, z0.h, z8.h
740
+    add             z25.h, z1.h, z9.h
741
+    add             z26.h, z2.h, z10.h
742
+    add             z27.h, z3.h, z11.h
743
+    sqxtunb         z6.b, z24.h
744
+    sqxtunb         z7.b, z25.h
745
+    sqxtunb         z8.b, z26.h
746
+    sqxtunb         z9.b, z27.h
747
+    st1b            {z6.h}, p0, [x0]
748
+    st1b            {z7.h}, p0, [x0, #1, mul vl]
749
+    st1b            {z8.h}, p0, [x0, #2, mul vl]
750
+    st1b            {z9.h}, p0, [x0, #3, mul vl]
751
+    add             x0, x0, x1
752
+.endr
753
+    ret
754
+.vl_gt_48_pixel_add_ps_64x64:
755
+    cmp             x9, #112
756
+    bgt             .vl_gt_112_pixel_add_ps_64x64
757
+    ptrue           p0.b, vl64
758
+.rept 64
759
+    ld1b            {z0.h}, p0/z, [x2]
760
+    ld1b            {z1.h}, p0/z, [x2, #1, mul vl]
761
+    ld1h            {z8.h}, p0/z, [x3]
762
+    ld1h            {z9.h}, p0/z, [x3, #1, mul vl]
763
+    add             x2, x2, x4
764
+    add             x3, x3, x5, lsl #1
765
+    add             z24.h, z0.h, z8.h
766
+    add             z25.h, z1.h, z9.h
767
+    sqxtunb         z6.b, z24.h
768
+    sqxtunb         z7.b, z25.h
769
+    st1b            {z6.h}, p0, [x0]
771
+    st1b            {z7.h}, p0, [x0, #1, mul vl]
771
+    add             x0, x0, x1
772
+.endr
773
+    ret
774
+.vl_gt_112_pixel_add_ps_64x64:
775
+    ptrue           p0.b, vl128
776
+.rept 64
777
+    ld1b            {z0.h}, p0/z, [x2]
778
+    ld1h            {z8.h}, p0/z, [x3]
779
+    add             x2, x2, x4
780
+    add             x3, x3, x5, lsl #1
781
+    add             z24.h, z0.h, z8.h
782
+    sqxtunb         z6.b, z24.h
783
+    st1b            {z6.h}, p0, [x0]
784
+    add             x0, x0, x1
785
+.endr
786
+    ret
787
+endfunc
788
+
789
+// Chroma add_ps
790
+function PFX(pixel_add_ps_4x8_sve2)
791
+    ptrue           p0.h, vl4
792
+.rept 8
793
+    ld1b            {z0.h}, p0/z, [x2]
794
+    ld1h            {z2.h}, p0/z, [x3]
795
+    add             x2, x2, x4
796
+    add             x3, x3, x5, lsl #1
797
+    add             z4.h, z0.h, z2.h
798
+    sqxtunb         z4.b, z4.h
799
+    st1b            {z4.h}, p0, [x0]
800
+    add             x0, x0, x1
801
+.endr
802
+    ret
803
+endfunc
804
+
805
+function PFX(pixel_add_ps_8x16_sve2)
806
+    ptrue           p0.h, vl8
807
+.rept 16
808
+    ld1b            {z0.h}, p0/z, [x2]
809
+    ld1h            {z2.h}, p0/z, [x3]
810
+    add             x2, x2, x4
811
+    add             x3, x3, x5, lsl #1
812
+    add             z4.h, z0.h, z2.h
813
+    sqxtunb         z4.b, z4.h
814
+    st1b            {z4.h}, p0, [x0]
815
+    add             x0, x0, x1
816
+.endr
817
+    ret
818
+endfunc
819
+
820
+// void scale1D_128to64(pixel *dst, const pixel *src)
821
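+// Halves a 128-pixel row to 64 pixels by averaging adjacent pairs:
+// ld2b de-interleaves even/odd pixels and urhadd is a rounding
+// halving add, i.e. dst[i] = (src[2*i] + src[2*i+1] + 1) >> 1.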
+function PFX(scale1D_128to64_sve2)
822
+    rdvl            x9, #1
823
+    cmp             x9, #16
824
+    bgt             .vl_gt_16_scale1D_128to64
825
+    ptrue           p0.b, vl16
826
+.rept 2
827
+    ld2b            {z0.b, z1.b}, p0/z, [x1]
828
+    ld2b            {z2.b, z3.b}, p0/z, [x1, #2, mul vl]
829
+    ld2b            {z4.b, z5.b}, p0/z, [x1, #4, mul vl]
830
+    ld2b            {z6.b, z7.b}, p0/z, [x1, #6, mul vl]
831
+    add             x1, x1, #128
832
+    urhadd          z0.b, p0/m, z0.b, z1.b
833
+    urhadd          z2.b, p0/m, z2.b, z3.b
834
+    urhadd          z4.b, p0/m, z4.b, z5.b
835
+    urhadd          z6.b, p0/m, z6.b, z7.b
836
+    st1b            {z0.b}, p0, [x0]
837
+    st1b            {z2.b}, p0, [x0, #1, mul vl]
838
+    st1b            {z4.b}, p0, [x0, #2, mul vl]
839
+    st1b            {z6.b}, p0, [x0, #3, mul vl]
840
+    add             x0, x0, #64
841
+.endr
842
+    ret
843
+.vl_gt_16_scale1D_128to64:
844
+    cmp             x9, #48
845
+    bgt             .vl_gt_48_scale1D_128to64
846
+    ptrue           p0.b, vl32
847
+.rept 2
848
+    ld2b            {z0.b, z1.b}, p0/z, [x1]
849
+    ld2b            {z2.b, z3.b}, p0/z, [x1, #2, mul vl]
850
+    add             x1, x1, #128
851
+    urhadd          z0.b, p0/m, z0.b, z1.b
852
+    urhadd          z2.b, p0/m, z2.b, z3.b
853
+    st1b            {z0.b}, p0, [x0]
854
+    st1b            {z2.b}, p0, [x0, #1, mul vl]
855
+    add             x0, x0, #64
856
+.endr
857
+    ret
858
+.vl_gt_48_scale1D_128to64:
859
+    ptrue           p0.b, vl64
860
+.rept 2
861
+    ld2b            {z0.b, z1.b}, p0/z, [x1]
862
+    add             x1, x1, #128
863
+    urhadd          z0.b, p0/m, z0.b, z1.b
864
+    st1b            {z0.b}, p0, [x0]
865
+    add             x0, x0, #64
866
+.endr
867
+    ret
868
+endfunc
869
+
870
+/***** dequant_scaling*****/
871
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
872
+function PFX(dequant_scaling_sve2)
873
+    ptrue           p0.h, vl8
874
+    add             x5, x5, #4              // shift + 4
875
+    lsr             x3, x3, #3              // num / 8
876
+    cmp             x5, x4
877
+    blt             .dequant_skip_sve2
878
+
879
+    mov             x12, #1
880
+    sub             x6, x5, x4          // shift - per
881
+    sub             x6, x6, #1          // shift - per - 1
882
+    lsl             x6, x12, x6         // 1 << shift - per - 1 (add)
883
+    mov             z0.s, w6
884
+    sub             x7, x4, x5          // per - shift
885
+    mov             z3.s, w7
886
+
887
+.dequant_loop1_sve2:
888
+    ld1h            {z19.h}, p0/z, [x0]
889
+    ld1w            {z2.s}, p0/z, [x1]
890
+    add             x1, x1, #16
891
+    ld1w            {z20.s}, p0/z, [x1]
892
+    add             x0, x0, #16
893
+    add             x1, x1, #16
894
+
895
+    sub             x3, x3, #1
896
+    sunpklo         z1.s, z19.h
897
+    sunpkhi         z19.s, z19.h
898
+
899
+    mul             z1.s, z1.s, z2.s // quantCoef * deQuantCoef
900
+    mul             z19.s, z19.s, z20.s
901
+    add             z1.s, z1.s, z0.s // quantCoef * deQuantCoef + add
902
+    add             z19.s, z19.s, z0.s
903
+
904
+    // sshl has no direct SVE2 equivalent, and the nearest
905
+    // alternative, sqshl, has double the latency, so keep NEON here
906
+    sshl            v1.4s, v1.4s, v3.4s
907
+    sshl            v19.4s, v19.4s, v3.4s
908
+
909
+    sqxtnb          z16.h, z1.s
910
+    sqxtnb          z17.h, z19.s
911
+    st1h            {z16.s}, p0, [x2]
912
+    st1h            {z17.s}, p0, [x2, #1, mul vl]
913
+    add             x2, x2, #16
914
+    cbnz            x3, .dequant_loop1_sve2
915
+    ret
916
+
917
+.dequant_skip_sve2:
918
+    sub             x6, x4, x5          // per - shift
919
+    mov             z0.h, w6
920
+
921
+.dequant_loop2_sve2:
922
+    ld1h            {z19.h}, p0/z, [x0]
923
+    ld1w            {z2.s}, p0/z, [x1]
924
+    add             x1, x1, #16
925
+    ld1w            {z20.s}, p0/z, [x1]
926
+    add             x0, x0, #16
927
+    add             x1, x1, #16
928
+
929
+
930
+    sub             x3, x3, #1
931
+    sunpklo         z1.s, z19.h
932
+    sunpkhi         z19.s, z19.h
933
+
934
+    mul             z1.s, z1.s, z2.s // quantCoef * deQuantCoef
935
+    mul             z19.s, z19.s, z20.s
936
+
937
+    // Keep the NEON narrowing instructions here so that only
938
+    // one sqshl is needed later
939
+    sqxtn           v16.4h, v1.4s       // x265_clip3
940
+    sqxtn2          v16.8h, v19.4s
941
+
942
+    sqshl           z16.h, p0/m, z16.h, z0.h // coefQ << per - shift
943
+    st1h            {z16.h}, p0, [x2]
944
+    add             x2, x2, #16
945
+    cbnz            x3, .dequant_loop2_sve2
946
+    ret
947
+endfunc
948
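+// Reference behaviour of the function above, in scalar form: with the
+// effective shift = (input shift + 4), if shift > per each output is
+//   (quantCoef[n] * deQuantCoef[n] + (1 << (shift - per - 1))) >> (shift - per)
+// otherwise the int16-clipped product is shifted left by (per - shift).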
+
949
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
950
+function PFX(dequant_normal_sve2)
951
+    lsr             w2, w2, #4              // num / 16
952
+    neg             w4, w4
953
+    mov             z0.h, w3
954
+    mov             z1.s, w4
955
+    rdvl            x9, #1
956
+    cmp             x9, #16
957
+    bgt             .vl_gt_16_dequant_normal_sve2
958
+.dqn_loop1_sve2:
959
+    ld1             {v2.8h, v3.8h}, [x0], #32
960
+    smull           v16.4s, v2.4h, v0.4h
961
+    smull2          v17.4s, v2.8h, v0.8h
962
+    smull           v18.4s, v3.4h, v0.4h
963
+    smull2          v19.4s, v3.8h, v0.8h
964
+
965
+    srshl           v16.4s, v16.4s, v1.4s
966
+    srshl           v17.4s, v17.4s, v1.4s
967
+    srshl           v18.4s, v18.4s, v1.4s
968
+    srshl           v19.4s, v19.4s, v1.4s
969
+
970
+    sqxtn           v2.4h, v16.4s
971
+    sqxtn2          v2.8h, v17.4s
972
+    sqxtn           v3.4h, v18.4s
973
+    sqxtn2          v3.8h, v19.4s
974
+
975
+    sub             w2, w2, #1
976
+    st1             {v2.8h, v3.8h}, [x1], #32
977
+    cbnz            w2, .dqn_loop1_sve2
978
+    ret
979
+.vl_gt_16_dequant_normal_sve2:
980
+    ptrue           p0.h, vl16
981
+.gt_16_dqn_loop1_sve2:
982
+    ld1h            {z2.h}, p0/z, [x0]
983
+    add             x0, x0, #32
984
+    smullb          z16.s, z2.h, z0.h
985
+    smullt          z17.s, z2.h, z0.h
986
+
987
+    srshl           z16.s, p0/m, z16.s, z1.s
988
+    srshl           z17.s, p0/m, z17.s, z1.s
989
+
990
+    sqxtnb          z2.h, z16.s
991
+    sqxtnt          z2.h, z17.s
992
+
993
+    sub             w2, w2, #1
994
+    st1h            {z2.h}, p0, [x1]
995
+    add             x1, x1, #32
996
+    cbnz            w2, .gt_16_dqn_loop1_sve2
997
+    ret
998
+
999
+endfunc
1000
+
1001
+// void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24)
1002
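+// For two adjacent 4x4 blocks this computes the four sums SSIM needs:
+// sum(pix1), sum(pix2), sum(pix1^2 + pix2^2) and sum(pix1*pix2),
+// stored interleaved per block by the final st4.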
+function PFX(ssim_4x4x2_core_sve2)
1003
+    ptrue           p0.b, vl16
1004
+    movi            v30.2d, #0
1005
+    movi            v31.2d, #0
1006
+
1007
+    ld1b            {z0.h}, p0/z, [x0]
1008
+    add             x0, x0, x1
1009
+    ld1b            {z1.h}, p0/z, [x0]
1010
+    add             x0, x0, x1
1011
+    ld1b            {z2.h}, p0/z, [x0]
1012
+    add             x0, x0, x1
1013
+    ld1b            {z3.h}, p0/z, [x0]
1014
+    add             x0, x0, x1
1015
+
1016
+    ld1b            {z4.h}, p0/z, [x2]
1017
+    add             x2, x2, x3
1018
+    ld1b            {z5.h}, p0/z, [x2]
1019
+    add             x2, x2, x3
1020
+    ld1b            {z6.h}, p0/z, [x2]
1021
+    add             x2, x2, x3
1022
+    ld1b            {z7.h}, p0/z, [x2]
1023
+    add             x2, x2, x3
1024
+
1025
+    mul             z16.h, z0.h, z0.h
1026
+    mul             z17.h, z1.h, z1.h
1027
+    mul             z18.h, z2.h, z2.h
1028
+    uaddlp          v30.4s, v16.8h
1029
+
1030
+    mul             z19.h, z3.h, z3.h
1031
+    mul             z20.h, z4.h, z4.h
1032
+    mul             z21.h, z5.h, z5.h
1033
+    uadalp          v30.4s, v17.8h
1034
+
1035
+    mul             z22.h, z6.h, z6.h
1036
+    mul             z23.h, z7.h, z7.h
1037
+    mul             z24.h, z0.h, z4.h
1038
+    uadalp          v30.4s, v18.8h
1039
+
1040
+    mul             z25.h, z1.h, z5.h
1041
+    mul             z26.h, z2.h, z6.h
1042
+    mul             z27.h, z3.h, z7.h
1043
+    uadalp          v30.4s, v19.8h
1044
+
1045
+    add             z28.h, z0.h, z1.h
1046
+    add             z29.h, z4.h, z5.h
1047
+    uadalp          v30.4s, v20.8h
1048
+    uaddlp          v31.4s, v24.8h
1049
+
1050
+    add             z28.h, z28.h, z2.h
1051
+    add             z29.h, z29.h, z6.h
1052
+    uadalp          v30.4s, v21.8h
1053
+    uadalp          v31.4s, v25.8h
1054
+
1055
+    add             z28.h, z28.h, z3.h
1056
+    add             z29.h, z29.h, z7.h
1057
+    uadalp          v30.4s, v22.8h
1058
+    uadalp          v31.4s, v26.8h
1059
+
1060
+    // Better to use NEON instructions here
1061
+    uaddlp          v28.4s, v28.8h
1062
+    uaddlp          v29.4s, v29.8h
1063
+    uadalp          v30.4s, v23.8h
1064
+    uadalp          v31.4s, v27.8h
1065
+
1066
+    addp            v28.4s, v28.4s, v28.4s
1067
+    addp            v29.4s, v29.4s, v29.4s
1068
+    addp            v30.4s, v30.4s, v30.4s
1069
+    addp            v31.4s, v31.4s, v31.4s
1070
+
1071
+    st4             {v28.2s, v29.2s, v30.2s, v31.2s}, [x4]
1072
+    ret
1073
+endfunc
1074
+
1075
+// void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
1076
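+// The ssimDist kernels accumulate two totals per block: the source energy
+// sum(fenc^2), written through x6 (ac_k), and the distortion
+// sum((fenc - recon)^2), written through x4 (ssBlock).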
+.macro ssimDist_start_sve2
1077
+    mov             z0.d, #0
1078
+    mov             z1.d, #0
1079
+.endm
1080
+
1081
+.macro ssimDist_1_sve2  z0 z1 z2 z3
1082
+    sub             z16.s, \z0\().s, \z2\().s
1083
+    sub             z17.s, \z1\().s, \z3\().s
1084
+    mul             z18.s, \z0\().s, \z0\().s
1085
+    mul             z19.s, \z1\().s, \z1\().s
1086
+    mul             z20.s, z16.s, z16.s
1087
+    mul             z21.s, z17.s, z17.s
1088
+    add             z0.s, z0.s, z18.s
1089
+    add             z0.s, z0.s, z19.s
1090
+    add             z1.s, z1.s, z20.s
1091
+    add             z1.s, z1.s, z21.s
1092
+.endm
1093
+
1094
+.macro ssimDist_end_sve2
1095
+    uaddv           d0, p0, z0.s
1096
+    uaddv           d1, p0, z1.s
1097
+    str             d0, x6
1098
+    str             d1, x4
1099
+.endm
1100
+
1101
+function PFX(ssimDist4_sve2)
1102
+    ssimDist_start
1103
+    ptrue           p0.s, vl4
1104
+.rept 4
1105
+    ld1b            {z4.s}, p0/z, [x0]
1106
+    add             x0, x0, x1
1107
+    ld1b            {z5.s}, p0/z, [x2]
1108
+    add             x2, x2, x3
1109
+    sub             z2.s, z4.s, z5.s
1110
+    mul             z3.s, z4.s, z4.s
1111
+    mul             z2.s, z2.s, z2.s
1112
+    add             z0.s, z0.s, z3.s
1113
+    add             z1.s, z1.s, z2.s
1114
+.endr
1115
+    ssimDist_end
1116
+    ret
1117
+endfunc
1118
+
1119
+function PFX(ssimDist8_sve2)
1120
+    rdvl            x9, #1
1121
+    cmp             x9, #16
1122
+    bgt             .vl_gt_16_ssimDist8
1123
+    ssimDist_start
1124
+    ptrue           p0.s, vl4
1125
+.rept 8
1126
+    ld1b            {z4.s}, p0/z, [x0]
1127
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
1128
+    add             x0, x0, x1
1129
+    ld1b            {z6.s}, p0/z, [x2]
1130
+    ld1b            {z7.s}, p0/z, [x2, #1, mul vl]
1131
+    add             x2, x2, x3
1132
+    ssimDist_1_sve2 z4, z5, z6, z7
1133
+.endr
1134
+    ssimDist_end
1135
+    ret
1136
+.vl_gt_16_ssimDist8:
1137
+    ssimDist_start_sve2
1138
+    ptrue           p0.s, vl8
1139
+.rept 8
1140
+    ld1b            {z4.s}, p0/z, [x0]
1141
+    add             x0, x0, x1
1142
+    ld1b            {z6.s}, p0/z, [x2]
1143
+    add             x2, x2, x3
1144
+    sub             z20.s, z4.s, z6.s
1145
+    mul             z16.s, z4.s, z4.s
1146
+    mul             z18.s, z20.s, z20.s
1147
+    add             z0.s, z0.s, z16.s
1148
+    add             z1.s, z1.s, z18.s
1149
+.endr
1150
+    ssimDist_end_sve2
1151
+    ret
1152
+endfunc
1153
+
1154
+function PFX(ssimDist16_sve2)
1155
+    mov             w12, #16
1156
+    rdvl            x9, #1
1157
+    cmp             x9, #16
1158
+    bgt             .vl_gt_16_ssimDist16
1159
+    ssimDist_start
1160
+    ptrue           p0.s, vl4
1161
+.loop_ssimDist16_sve2:
1162
+    sub             w12, w12, #1
1163
+    ld1b            {z4.s}, p0/z, [x0]
1164
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
1165
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
1166
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
1167
+    add             x0, x0, x1
1168
+    ld1b            {z8.s}, p0/z, [x2]
1169
+    ld1b            {z9.s}, p0/z, [x2, #1, mul vl]
1170
+    ld1b            {z10.s}, p0/z, [x2, #2, mul vl]
1171
+    ld1b            {z11.s}, p0/z, [x2, #3, mul vl]
1172
+    add             x2, x2, x3
1173
+    ssimDist_1_sve2 z4, z5, z8, z9
1174
+    ssimDist_1_sve2 z6, z7, z10, z11
1175
+    cbnz            w12, .loop_ssimDist16_sve2
1176
+    ssimDist_end
1177
+    ret
1178
+.vl_gt_16_ssimDist16:
1179
+    cmp             x9, #48
1180
+    bgt             .vl_gt_48_ssimDist16
1181
+    ssimDist_start_sve2
1182
+    ptrue           p0.s, vl8
1183
+.vl_gt_16_loop_ssimDist16_sve2:
1184
+    sub             w12, w12, #1
1185
+    ld1b            {z4.s}, p0/z, x0
1186
+    ld1b            {z5.s}, p0/z, x0, #1, mul vl
1187
+    add             x0, x0, x1
1188
+    ld1b            {z8.s}, p0/z, x2
1189
+    ld1b            {z9.s}, p0/z, x2, #1, mul vl
1190
+    add             x2, x2, x3
1191
+    ssimDist_1_sve2 z4, z5, z8, z9
1192
+    cbnz            w12, .vl_gt_16_loop_ssimDist16_sve2
1193
+    ssimDist_end_sve2
1194
+    ret
1195
+.vl_gt_48_ssimDist16:
1196
+    ssimDist_start_sve2
1197
+    ptrue           p0.s, vl16
1198
+.vl_gt_48_loop_ssimDist16_sve2:
1199
+    sub             w12, w12, #1
1200
+    ld1b            {z4.s}, p0/z, x0
1201
+    add             x0, x0, x1
1202
+    ld1b            {z8.s}, p0/z, x2
1203
+    add             x2, x2, x3
1204
+    sub             z20.s, z4.s, z8.s
1205
+    mul             z16.s, z4.s, z4.s
1206
+    mul             z18.s, z20.s, z20.s
1207
+    add             z0.s, z0.s, z16.s
1208
+    add             z1.s, z1.s, z18.s
1209
+    cbnz            w12, .vl_gt_48_loop_ssimDist16_sve2
1210
+    ssimDist_end_sve2
1211
+    ret
1212
+endfunc
1213
+
1214
+function PFX(ssimDist32_sve2)
1215
+    mov             w12, #32
1216
+    rdvl            x9, #1
1217
+    cmp             x9, #16
1218
+    bgt             .vl_gt_16_ssimDist32
1219
+    ssimDist_start
1220
+    ptrue           p0.s, vl4
1221
+.loop_ssimDist32_sve2:
1222
+    sub             w12, w12, #1
1223
+    ld1b            {z2.s}, p0/z, x0
1224
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1225
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1226
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1227
+    ld1b            {z6.s}, p0/z, x0, #4, mul vl
1228
+    ld1b            {z7.s}, p0/z, x0, #5, mul vl
1229
+    ld1b            {z8.s}, p0/z, x0, #6, mul vl
1230
+    ld1b            {z9.s}, p0/z, x0, #7, mul vl
1231
+    add             x0, x0, x1
1232
+    ld1b            {z10.s}, p0/z, x2
1233
+    ld1b            {z11.s}, p0/z, x2, #1, mul vl
1234
+    ld1b            {z12.s}, p0/z, x2, #2, mul vl
1235
+    ld1b            {z13.s}, p0/z, x2, #3, mul vl
1236
+    ld1b            {z14.s}, p0/z, x2, #4, mul vl
1237
+    ld1b            {z15.s}, p0/z, x2, #5, mul vl
1238
+    ld1b            {z30.s}, p0/z, x2, #6, mul vl
1239
+    ld1b            {z31.s}, p0/z, x2, #7, mul vl
1240
+    add             x2, x2, x3
1241
+    ssimDist_1_sve2 z2, z3, z10, z11
1242
+    ssimDist_1_sve2 z4, z5, z12, z13
1243
+    ssimDist_1_sve2 z6, z7, z14, z15
1244
+    ssimDist_1_sve2 z8, z9, z30, z31
1245
+    cbnz            w12, .loop_ssimDist32_sve2
1246
+    ssimDist_end
1247
+    ret
1248
+.vl_gt_16_ssimDist32:
1249
+    cmp             x9, #48
1250
+    bgt             .vl_gt_48_ssimDist32
1251
+    ssimDist_start_sve2
1252
+    ptrue           p0.s, vl8
1253
+.vl_gt_16_loop_ssimDist32_sve2:
1254
+    sub             w12, w12, #1
1255
+    ld1b            {z2.s}, p0/z, x0
1256
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1257
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1258
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1259
+    add             x0, x0, x1
1260
+    ld1b            {z10.s}, p0/z, x2
1261
+    ld1b            {z11.s}, p0/z, x2, #1, mul vl
1262
+    ld1b            {z12.s}, p0/z, x2, #2, mul vl
1263
+    ld1b            {z13.s}, p0/z, x2, #3, mul vl
1264
+    add             x2, x2, x3
1265
+    ssimDist_1_sve2 z2, z3, z10, z11
1266
+    ssimDist_1_sve2 z4, z5, z12, z13
1267
+    cbnz            w12, .vl_gt_16_loop_ssimDist32_sve2
1268
+    ssimDist_end_sve2
1269
+    ret
1270
+.vl_gt_48_ssimDist32:
1271
+    cmp             x9, #112
1272
+    bgt             .vl_gt_112_ssimDist32
1273
+    ssimDist_start_sve2
1274
+    ptrue           p0.s, vl16
1275
+.vl_gt_48_loop_ssimDist32_sve2:
1276
+    sub             w12, w12, #1
1277
+    ld1b            {z2.s}, p0/z, x0
1278
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1279
+    add             x0, x0, x1
1280
+    ld1b            {z10.s}, p0/z, x2
1281
+    ld1b            {z11.s}, p0/z, x2, #1, mul vl
1282
+    add             x2, x2, x3
1283
+    ssimDist_1_sve2 z2, z3, z10, z11
1284
+    cbnz            w12, .vl_gt_48_loop_ssimDist32_sve2
1285
+    ssimDist_end_sve2
1286
+    ret
1287
+.vl_gt_112_ssimDist32:
1288
+    ssimDist_start_sve2
1289
+    ptrue           p0.s, vl32
1290
+.vl_gt_112_loop_ssimDist32_sve2:
1291
+    sub             w12, w12, #1
1292
+    ld1b            {z2.s}, p0/z, x0
1293
+    add             x0, x0, x1
1294
+    ld1b            {z10.s}, p0/z, x2
1295
+    add             x2, x2, x3
1296
+    sub             z20.s, z2.s, z10.s
1297
+    mul             z16.s, z2.s, z2.s
1298
+    mul             z18.s, z20.s, z20.s
1299
+    add             z0.s, z0.s, z16.s
1300
+    add             z1.s, z1.s, z18.s
1301
+    cbnz            w12, .vl_gt_112_loop_ssimDist32_sve2
1302
+    ssimDist_end_sve2
1303
+    ret
1304
+endfunc
1305
+
1306
+function PFX(ssimDist64_sve2)
1307
+    mov             w12, #64
1308
+    rdvl            x9, #1
1309
+    cmp             x9, #16
1310
+    bgt             .vl_gt_16_ssimDist64
1311
+    ssimDist_start
1312
+    ptrue           p0.s, vl4
1313
+.loop_ssimDist64_sve2:
1314
+    sub             w12, w12, #1
1315
+    ld1b            {z2.s}, p0/z, x0
1316
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1317
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1318
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1319
+    ld1b            {z6.s}, p0/z, x0, #4, mul vl
1320
+    ld1b            {z7.s}, p0/z, x0, #5, mul vl
1321
+    ld1b            {z8.s}, p0/z, x0, #6, mul vl
1322
+    ld1b            {z9.s}, p0/z, x0, #7, mul vl
1323
+    ld1b            {z23.s}, p0/z, x2
1324
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1325
+    ld1b            {z25.s}, p0/z, x2, #2, mul vl
1326
+    ld1b            {z26.s}, p0/z, x2, #3, mul vl
1327
+    ld1b            {z27.s}, p0/z, x2, #4, mul vl
1328
+    ld1b            {z28.s}, p0/z, x2, #5, mul vl
1329
+    ld1b            {z29.s}, p0/z, x2, #6, mul vl
1330
+    ld1b            {z30.s}, p0/z, x2, #7, mul vl
1331
+    ssimDist_1_sve2 z2, z3, z23, z24
1332
+    ssimDist_1_sve2 z4, z5, z25, z26
1333
+    ssimDist_1_sve2 z6, z7, z27, z28
1334
+    ssimDist_1_sve2 z8, z9, z29, z30
1335
+    mov             x4, x0
1336
+    mov             x5, x2
1337
+    add             x4, x4, #32
1338
+    add             x5, x5, #32
1339
+    ld1b            {z2.s}, p0/z, x4
1340
+    ld1b            {z3.s}, p0/z, x4, #1, mul vl
1341
+    ld1b            {z4.s}, p0/z, x4, #2, mul vl
1342
+    ld1b            {z5.s}, p0/z, x4, #3, mul vl
1343
+    ld1b            {z6.s}, p0/z, x4, #4, mul vl
1344
+    ld1b            {z7.s}, p0/z, x4, #5, mul vl
1345
+    ld1b            {z8.s}, p0/z, x4, #6, mul vl
1346
+    ld1b            {z9.s}, p0/z, x4, #7, mul vl
1347
+    ld1b            {z23.s}, p0/z, x5
1348
+    ld1b            {z24.s}, p0/z, x5, #1, mul vl
1349
+    ld1b            {z25.s}, p0/z, x5, #2, mul vl
1350
+    ld1b            {z26.s}, p0/z, x5, #3, mul vl
1351
+    ld1b            {z27.s}, p0/z, x5, #4, mul vl
1352
+    ld1b            {z28.s}, p0/z, x5, #5, mul vl
1353
+    ld1b            {z29.s}, p0/z, x5, #6, mul vl
1354
+    ld1b            {z30.s}, p0/z, x5, #7, mul vl
1355
+    ssimDist_1_sve2 z2, z3, z23, z24
1356
+    ssimDist_1_sve2 z4, z5, z25, z26
1357
+    ssimDist_1_sve2 z6, z7, z27, z28
1358
+    ssimDist_1_sve2 z8, z9, z29, z30
1359
+    add             x0, x0, x1
1360
+    add             x2, x2, x3
1361
+    cbnz            w12, .loop_ssimDist64_sve2
1362
+    ssimDist_end
1363
+    ret
1364
+.vl_gt_16_ssimDist64:
1365
+    cmp             x9, #48
1366
+    bgt             .vl_gt_48_ssimDist64
1367
+    ssimDist_start_sve2
1368
+    ptrue           p0.s, vl8
1369
+.vl_gt_16_loop_ssimDist64_sve2:
1370
+    sub             w12, w12, #1
1371
+    ld1b            {z2.s}, p0/z, x0
1372
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1373
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1374
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1375
+    ld1b            {z6.s}, p0/z, x0, #4, mul vl
1376
+    ld1b            {z7.s}, p0/z, x0, #5, mul vl
1377
+    ld1b            {z8.s}, p0/z, x0, #6, mul vl
1378
+    ld1b            {z9.s}, p0/z, x0, #7, mul vl
1379
+    ld1b            {z23.s}, p0/z, x2
1380
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1381
+    ld1b            {z25.s}, p0/z, x2, #2, mul vl
1382
+    ld1b            {z26.s}, p0/z, x2, #3, mul vl
1383
+    ld1b            {z27.s}, p0/z, x2, #4, mul vl
1384
+    ld1b            {z28.s}, p0/z, x2, #5, mul vl
1385
+    ld1b            {z29.s}, p0/z, x2, #6, mul vl
1386
+    ld1b            {z30.s}, p0/z, x2, #7, mul vl
1387
+    ssimDist_1_sve2 z2, z3, z23, z24
1388
+    ssimDist_1_sve2 z4, z5, z25, z26
1389
+    ssimDist_1_sve2 z6, z7, z27, z28
1390
+    ssimDist_1_sve2 z8, z9, z29, z30
1391
+    add             x0, x0, x1
1392
+    add             x2, x2, x3
1393
+    cbnz            w12, .vl_gt_16_loop_ssimDist64_sve2
1394
+    ssimDist_end_sve2
1395
+    ret
1396
+.vl_gt_48_ssimDist64:
1397
+    cmp             x9, #112
1398
+    bgt             .vl_gt_112_ssimDist64
1399
+    ssimDist_start_sve2
1400
+    ptrue           p0.s, vl16
1401
+.vl_gt_48_loop_ssimDist64_sve2:
1402
+    sub             w12, w12, #1
1403
+    ld1b            {z2.s}, p0/z, x0
1404
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1405
+    ld1b            {z4.s}, p0/z, x0, #2, mul vl
1406
+    ld1b            {z5.s}, p0/z, x0, #3, mul vl
1407
+    ld1b            {z23.s}, p0/z, x2
1408
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1409
+    ld1b            {z25.s}, p0/z, x2, #2, mul vl
1410
+    ld1b            {z26.s}, p0/z, x2, #3, mul vl
1411
+    ssimDist_1_sve2 z2, z3, z23, z24
1412
+    ssimDist_1_sve2 z4, z5, z25, z26
1413
+    add             x0, x0, x1
1414
+    add             x2, x2, x3
1415
+    cbnz            w12, .vl_gt_48_loop_ssimDist64_sve2
1416
+    ssimDist_end_sve2
1417
+    ret
1418
+.vl_gt_112_ssimDist64:
1419
+    ssimDist_start_sve2
1420
+    ptrue           p0.s, vl32
1421
+.vl_gt_112_loop_ssimDist64_sve2:
1422
+    sub             w12, w12, #1
1423
+    ld1b            {z2.s}, p0/z, x0
1424
+    ld1b            {z3.s}, p0/z, x0, #1, mul vl
1425
+    ld1b            {z23.s}, p0/z, x2
1426
+    ld1b            {z24.s}, p0/z, x2, #1, mul vl
1427
+    ssimDist_1_sve2 z2, z3, z23, z24
1428
+    add             x0, x0, x1
1429
+    add             x2, x2, x3
1430
+    cbnz            w12, .vl_gt_112_loop_ssimDist64_sve2
1431
+    ssimDist_end_sve2
1432
+    ret
1433
+endfunc
1434
+
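
The ssimDist kernels above keep two running totals per block: the energy of the source pixels in z0.s and the squared source/reconstruction error in z1.s, both written out as 64-bit sums at the end. As a reading aid, here is a scalar C sketch; the function name and parameter layout are assumptions inferred from the assembly, not the x265 API:

    #include <stdint.h>

    typedef uint8_t pixel;

    /* Illustrative only: ssimDist_sketch is not an x265 symbol. The two
     * accumulators mirror z0 (sum of fenc^2) and z1 (sum of (fenc-recon)^2),
     * stored via d0/d1 in the ssimDist_end macros. */
    static void ssimDist_sketch(const pixel *fenc, intptr_t fStride,
                                const pixel *recon, intptr_t rStride,
                                int blockSize,          /* 4, 8, 16, 32 or 64 */
                                uint64_t *ssBlock, uint64_t *acEnergy)
    {
        uint64_t ac = 0, ss = 0;
        for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
            {
                int f = fenc[y * fStride + x];
                int r = recon[y * rStride + x];
                ac += (uint64_t)(f * f);             /* accumulated in z0.s */
                ss += (uint64_t)((f - r) * (f - r)); /* accumulated in z1.s */
            }
        *acEnergy = ac;
        *ssBlock  = ss;
    }

The rdvl/cmp ladder at each entry point simply picks a predicate width (vl4/vl8/vl16/vl32) that matches the hardware's SVE vector length, so wider implementations cover a row in fewer loads.
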
+// void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+.macro normFact_start_sve2
+    mov             z0.d, #0
+.endm
+
+.macro normFact_1_sve2  z0, z1
+    mul             z16.s, \z0\().s, \z0\().s
+    mul             z17.s, \z1\().s, \z1\().s
+    add             z0.s, z0.s, z16.s
+    add             z0.s, z0.s, z17.s
+.endm
+
+.macro normFact_end_sve2
+    uaddv           d0, p0, z0.s
+    str             d0, [x3]
+.endm
+
+function PFX(normFact8_sve2)
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_normFact8
+    normFact_start
+    ptrue           p0.s, vl4
+.rept 8
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    add             x0, x0, x1
+    normFact_1_sve2 z4, z5
+.endr
+    normFact_end
+    ret
+.vl_gt_16_normFact8:
+    normFact_start_sve2
+    ptrue           p0.s, vl8
+.rept 8
+    ld1b            {z4.s}, p0/z, [x0]
+    add             x0, x0, x1
+    mul             z16.s, z4.s, z4.s
+    add             z0.s, z0.s, z16.s
+.endr
+    normFact_end_sve2
+    ret
+endfunc
+
+function PFX(normFact16_sve2)
+    mov             w12, #16
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_normFact16
+    normFact_start
+    ptrue           p0.s, vl4
+.loop_normFact16_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
+    add             x0, x0, x1
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    cbnz            w12, .loop_normFact16_sve2
+    normFact_end
+    ret
+.vl_gt_16_normFact16:
+    cmp             x9, #48
+    bgt             .vl_gt_48_normFact16
+    normFact_start_sve2
+    ptrue           p0.s, vl8
+.vl_gt_16_loop_normFact16_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    add             x0, x0, x1
+    normFact_1_sve2 z4, z5
+    cbnz            w12, .vl_gt_16_loop_normFact16_sve2
+    normFact_end_sve2
+    ret
+.vl_gt_48_normFact16:
+    normFact_start_sve2
+    ptrue           p0.s, vl16
+.vl_gt_48_loop_normFact16_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    add             x0, x0, x1
+    mul             z16.s, z4.s, z4.s
+    add             z0.s, z0.s, z16.s
+    cbnz            w12, .vl_gt_48_loop_normFact16_sve2
+    normFact_end_sve2
+    ret
+endfunc
+
+function PFX(normFact32_sve2)
+    mov             w12, #32
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_normFact32
+    normFact_start
+    ptrue           p0.s, vl4
+.loop_normFact32_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
+    ld1b            {z8.s}, p0/z, [x0, #4, mul vl]
+    ld1b            {z9.s}, p0/z, [x0, #5, mul vl]
+    ld1b            {z10.s}, p0/z, [x0, #6, mul vl]
+    ld1b            {z11.s}, p0/z, [x0, #7, mul vl]
+    add             x0, x0, x1
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    normFact_1_sve2 z8, z9
+    normFact_1_sve2 z10, z11
+    cbnz            w12, .loop_normFact32_sve2
+    normFact_end
+    ret
+.vl_gt_16_normFact32:
+    cmp             x9, #48
+    bgt             .vl_gt_48_normFact32
+    normFact_start_sve2
+    ptrue           p0.s, vl8
+.vl_gt_16_loop_normFact32_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
+    add             x0, x0, x1
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    cbnz            w12, .vl_gt_16_loop_normFact32_sve2
+    normFact_end_sve2
+    ret
+.vl_gt_48_normFact32:
+    cmp             x9, #112
+    bgt             .vl_gt_112_normFact32
+    normFact_start_sve2
+    ptrue           p0.s, vl16
+.vl_gt_48_loop_normFact32_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    add             x0, x0, x1
+    normFact_1_sve2 z4, z5
+    cbnz            w12, .vl_gt_48_loop_normFact32_sve2
+    normFact_end_sve2
+    ret
+.vl_gt_112_normFact32:
+    normFact_start_sve2
+    ptrue           p0.s, vl32
+.vl_gt_112_loop_normFact32_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    add             x0, x0, x1
+    mul             z16.s, z4.s, z4.s
+    add             z0.s, z0.s, z16.s
+    cbnz            w12, .vl_gt_112_loop_normFact32_sve2
+    normFact_end_sve2
+    ret
+endfunc
+
+function PFX(normFact64_sve2)
+    mov             w12, #64
+    rdvl            x9, #1
+    cmp             x9, #16
+    bgt             .vl_gt_16_normFact64
+    normFact_start
+    ptrue           p0.s, vl4
+.loop_normFact64_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
+    ld1b            {z8.s}, p0/z, [x0, #4, mul vl]
+    ld1b            {z9.s}, p0/z, [x0, #5, mul vl]
+    ld1b            {z10.s}, p0/z, [x0, #6, mul vl]
+    ld1b            {z11.s}, p0/z, [x0, #7, mul vl]
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    normFact_1_sve2 z8, z9
+    normFact_1_sve2 z10, z11
+    mov             x2, x0
+    add             x2, x2, #32
+    ld1b            {z4.s}, p0/z, [x2]
+    ld1b            {z5.s}, p0/z, [x2, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x2, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x2, #3, mul vl]
+    ld1b            {z8.s}, p0/z, [x2, #4, mul vl]
+    ld1b            {z9.s}, p0/z, [x2, #5, mul vl]
+    ld1b            {z10.s}, p0/z, [x2, #6, mul vl]
+    ld1b            {z11.s}, p0/z, [x2, #7, mul vl]
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    normFact_1_sve2 z8, z9
+    normFact_1_sve2 z10, z11
+    add             x0, x0, x1
+    cbnz            w12, .loop_normFact64_sve2
+    normFact_end
+    ret
+.vl_gt_16_normFact64:
+    cmp             x9, #48
+    bgt             .vl_gt_48_normFact64
+    normFact_start_sve2
+    ptrue           p0.s, vl8
+.vl_gt_16_loop_normFact64_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
+    ld1b            {z8.s}, p0/z, [x0, #4, mul vl]
+    ld1b            {z9.s}, p0/z, [x0, #5, mul vl]
+    ld1b            {z10.s}, p0/z, [x0, #6, mul vl]
+    ld1b            {z11.s}, p0/z, [x0, #7, mul vl]
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    normFact_1_sve2 z8, z9
+    normFact_1_sve2 z10, z11
+    add             x0, x0, x1
+    cbnz            w12, .vl_gt_16_loop_normFact64_sve2
+    normFact_end_sve2
+    ret
+.vl_gt_48_normFact64:
+    cmp             x9, #112
+    bgt             .vl_gt_112_normFact64
+    normFact_start_sve2
+    ptrue           p0.s, vl16
+.vl_gt_48_loop_normFact64_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    ld1b            {z6.s}, p0/z, [x0, #2, mul vl]
+    ld1b            {z7.s}, p0/z, [x0, #3, mul vl]
+    normFact_1_sve2 z4, z5
+    normFact_1_sve2 z6, z7
+    add             x0, x0, x1
+    cbnz            w12, .vl_gt_48_loop_normFact64_sve2
+    normFact_end_sve2
+    ret
+.vl_gt_112_normFact64:
+    normFact_start_sve2
+    ptrue           p0.s, vl32
+.vl_gt_112_loop_normFact64_sve2:
+    sub             w12, w12, #1
+    ld1b            {z4.s}, p0/z, [x0]
+    ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
+    normFact_1_sve2 z4, z5
+    add             x0, x0, x1
+    cbnz            w12, .vl_gt_112_loop_normFact64_sve2
+    normFact_end_sve2
+    ret
+endfunc
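
normFact's contract is given by the comment above the macros: accumulate the squared energy of a blockSize x blockSize block into *z_k. A direct C rendering follows; note that x1 (blockSize) is used as the row stride by the assembly, and that the shift argument is carried in the signature but not applied in the vector paths shown here:

    #include <stdint.h>

    typedef uint8_t pixel;

    static void normFact_ref(const pixel *src, uint32_t blockSize,
                             int shift, uint64_t *z_k)
    {
        (void)shift;                         /* unused in the SVE2 paths above */
        uint64_t acc = 0;
        for (uint32_t y = 0; y < blockSize; y++)
            for (uint32_t x = 0; x < blockSize; x++)
            {
                int s = src[y * blockSize + x];  /* blockSize doubles as stride */
                acc += (uint64_t)(s * s);
            }
        *z_k = acc;                          /* cf. "str d0, [x3]" in normFact_end_sve2 */
    }
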
x265_3.5.tar.gz/source/common/aarch64/pixel-util.S -> x265_3.6.tar.gz/source/common/aarch64/pixel-util.S Changed

@@ -1,8 +1,9 @@
 /*****************************************************************************
- * Copyright (C) 2020 MulticoreWare, Inc
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
  *
  * Authors: Yimeng Su <yimeng.su@huawei.com>
  *          Hongbin Liu <liuhongbin1@huawei.com>
+ *          Sebastian Pop <spop@amazon.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,13 +24,652 @@
  *****************************************************************************/

 #include "asm.S"
+#include "pixel-util-common.S"

+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
 .section .rodata
+#endif

 .align 4

 .text

+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
+function PFX(pixel_var_8x8_neon)
+    ld1             {v4.8b}, [x0], x1        // pix[x]
+    uxtl            v0.8h, v4.8b             // sum = pix[x]
+    umull           v1.8h, v4.8b, v4.8b
+    uaddlp          v1.4s, v1.8h             // sqr = pix[x] * pix[x]
+
+.rept 7
+    ld1             {v4.8b}, [x0], x1        // pix[x]
+    umull           v31.8h, v4.8b, v4.8b
+    uaddw           v0.8h, v0.8h, v4.8b      // sum += pix[x]
+    uadalp          v1.4s, v31.8h            // sqr += pix[x] * pix[x]
+.endr
+    uaddlv          s0, v0.8h
+    uaddlv          d1, v1.4s
+    fmov            w0, s0
+    fmov            x1, d1
+    orr             x0, x0, x1, lsl #32      // return sum + ((uint64_t)sqr << 32);
+    ret
+endfunc
+
+function PFX(pixel_var_16x16_neon)
+    pixel_var_start
+    mov             w12, #16
+.loop_var_16:
+    sub             w12, w12, #1
+    ld1             {v4.16b}, [x0], x1
+    pixel_var_1 v4
+    cbnz            w12, .loop_var_16
+    pixel_var_end
+    ret
+endfunc
+
+function PFX(pixel_var_32x32_neon)
+    pixel_var_start
+    mov             w12, #32
+.loop_var_32:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v5.16b}, [x0], x1
+    pixel_var_1 v4
+    pixel_var_1 v5
+    cbnz            w12, .loop_var_32
+    pixel_var_end
+    ret
+endfunc
+
+function PFX(pixel_var_64x64_neon)
+    pixel_var_start
+    mov             w12, #64
+.loop_var_64:
+    sub             w12, w12, #1
+    ld1             {v4.16b-v7.16b}, [x0], x1
+    pixel_var_1 v4
+    pixel_var_1 v5
+    pixel_var_1 v6
+    pixel_var_1 v7
+    cbnz            w12, .loop_var_64
+    pixel_var_end
+    ret
+endfunc
+
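
pixel_var packs both accumulators into a single 64-bit return value, as the trailing orr documents. A scalar equivalent of the 8x8 variant:

    #include <stdint.h>

    typedef uint8_t pixel;

    static uint64_t pixel_var_8x8_ref(const pixel *pix, intptr_t i_stride)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
            {
                uint32_t p = pix[y * i_stride + x];
                sum += p;          /* low word,  cf. uaddlv s0, v0.8h */
                sqr += p * p;      /* high word, cf. uaddlv d1, v1.4s */
            }
        return sum + ((uint64_t)sqr << 32);
    }
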
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
+function PFX(getResidual4_neon)
+    lsl             x4, x3, #1
+.rept 2
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x3
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.8b}, [x2], x4
+    st1             {v5.8b}, [x2], x4
+.endr
+    ret
+endfunc
+
+function PFX(getResidual8_neon)
+    lsl             x4, x3, #1
+.rept 4
+    ld1             {v0.8b}, [x0], x3
+    ld1             {v1.8b}, [x1], x3
+    ld1             {v2.8b}, [x0], x3
+    ld1             {v3.8b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.16b}, [x2], x4
+    st1             {v5.16b}, [x2], x4
+.endr
+    ret
+endfunc
+
+function PFX(getResidual16_neon)
+    lsl             x4, x3, #1
+.rept 8
+    ld1             {v0.16b}, [x0], x3
+    ld1             {v1.16b}, [x1], x3
+    ld1             {v2.16b}, [x0], x3
+    ld1             {v3.16b}, [x1], x3
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl2          v5.8h, v0.16b, v1.16b
+    usubl           v6.8h, v2.8b, v3.8b
+    usubl2          v7.8h, v2.16b, v3.16b
+    st1             {v4.8h-v5.8h}, [x2], x4
+    st1             {v6.8h-v7.8h}, [x2], x4
+.endr
+    ret
+endfunc
+
+function PFX(getResidual32_neon)
+    lsl             x4, x3, #1
+    mov             w12, #4
+.loop_residual_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v1.16b}, [x0], x3
+    ld1             {v2.16b-v3.16b}, [x1], x3
+    ld1             {v4.16b-v5.16b}, [x0], x3
+    ld1             {v6.16b-v7.16b}, [x1], x3
+    usubl           v16.8h, v0.8b, v2.8b
+    usubl2          v17.8h, v0.16b, v2.16b
+    usubl           v18.8h, v1.8b, v3.8b
+    usubl2          v19.8h, v1.16b, v3.16b
+    usubl           v20.8h, v4.8b, v6.8b
+    usubl2          v21.8h, v4.16b, v6.16b
+    usubl           v22.8h, v5.8b, v7.8b
+    usubl2          v23.8h, v5.16b, v7.16b
+    st1             {v16.8h-v19.8h}, [x2], x4
+    st1             {v20.8h-v23.8h}, [x2], x4
+.endr
+    cbnz            w12, .loop_residual_32
+    ret
+endfunc
+
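
The getResidualN family is a widening subtract: source minus prediction, stored as int16. In C terms (blockSize in {4, 8, 16, 32}):

    #include <stdint.h>

    typedef uint8_t pixel;

    static void getResidual_ref(const pixel *fenc, const pixel *pred,
                                int16_t *residual, intptr_t stride, int blockSize)
    {
        for (int y = 0; y < blockSize; y++)
        {
            for (int x = 0; x < blockSize; x++)
                residual[x] = (int16_t)(fenc[x] - pred[x]);  /* cf. usubl */
            fenc += stride;
            pred += stride;
            residual += stride;
        }
    }
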
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
+function PFX(pixel_sub_ps_4x4_neon)
+    lsl             x1, x1, #1
+.rept 2
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.4h}, [x0], x1
+    st1             {v5.4h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_8x8_neon)
+    lsl             x1, x1, #1
+.rept 4
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.8h}, [x0], x1
+    st1             {v5.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_16x16_neon)
+    lsl             x1, x1, #1
+.rept 8
+    ld1             {v0.16b}, [x2], x4
+    ld1             {v1.16b}, [x3], x5
+    ld1             {v2.16b}, [x2], x4
+    ld1             {v3.16b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl2          v5.8h, v0.16b, v1.16b
+    usubl           v6.8h, v2.8b, v3.8b
+    usubl2          v7.8h, v2.16b, v3.16b
+    st1             {v4.8h-v5.8h}, [x0], x1
+    st1             {v6.8h-v7.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_32x32_neon)
+    lsl             x1, x1, #1
+    mov             w12, #4
+.loop_sub_ps_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v1.16b}, [x2], x4
+    ld1             {v2.16b-v3.16b}, [x3], x5
+    ld1             {v4.16b-v5.16b}, [x2], x4
+    ld1             {v6.16b-v7.16b}, [x3], x5
+    usubl           v16.8h, v0.8b, v2.8b
+    usubl2          v17.8h, v0.16b, v2.16b
+    usubl           v18.8h, v1.8b, v3.8b
+    usubl2          v19.8h, v1.16b, v3.16b
+    usubl           v20.8h, v4.8b, v6.8b
+    usubl2          v21.8h, v4.16b, v6.16b
+    usubl           v22.8h, v5.8b, v7.8b
+    usubl2          v23.8h, v5.16b, v7.16b
+    st1             {v16.8h-v19.8h}, [x0], x1
+    st1             {v20.8h-v23.8h}, [x0], x1
+.endr
+    cbnz            w12, .loop_sub_ps_32
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_64x64_neon)
+    lsl             x1, x1, #1
+    sub             x1, x1, #64
+    mov             w12, #16
+.loop_sub_ps_64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v3.16b}, [x2], x4
+    ld1             {v4.16b-v7.16b}, [x3], x5
+    usubl           v16.8h, v0.8b, v4.8b
+    usubl2          v17.8h, v0.16b, v4.16b
+    usubl           v18.8h, v1.8b, v5.8b
+    usubl2          v19.8h, v1.16b, v5.16b
+    usubl           v20.8h, v2.8b, v6.8b
+    usubl2          v21.8h, v2.16b, v6.16b
+    usubl           v22.8h, v3.8b, v7.8b
+    usubl2          v23.8h, v3.16b, v7.16b
+    st1             {v16.8h-v19.8h}, [x0], #64
+    st1             {v20.8h-v23.8h}, [x0], x1
+.endr
+    cbnz            w12, .loop_sub_ps_64
+    ret
+endfunc
+
+// chroma sub_ps
+function PFX(pixel_sub_ps_4x8_neon)
+    lsl             x1, x1, #1
+.rept 4
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.4h}, [x0], x1
+    st1             {v5.4h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_8x16_neon)
+    lsl             x1, x1, #1
+.rept 8
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x3], x5
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+    st1             {v4.8h}, [x0], x1
+    st1             {v5.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_16x32_neon)
+    lsl             x1, x1, #1
+.rept 16
+    ld1             {v0.16b}, [x2], x4
+    ld1             {v1.16b}, [x3], x5
+    ld1             {v2.16b}, [x2], x4
+    ld1             {v3.16b}, [x3], x5
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl2          v5.8h, v0.16b, v1.16b
+    usubl           v6.8h, v2.8b, v3.8b
+    usubl2          v7.8h, v2.16b, v3.16b
+    st1             {v4.8h-v5.8h}, [x0], x1
+    st1             {v6.8h-v7.8h}, [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_sub_ps_32x64_neon)
+    lsl             x1, x1, #1
+    mov             w12, #8
+.loop_sub_ps_32x64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v1.16b}, [x2], x4
+    ld1             {v2.16b-v3.16b}, [x3], x5
+    ld1             {v4.16b-v5.16b}, [x2], x4
+    ld1             {v6.16b-v7.16b}, [x3], x5
+    usubl           v16.8h, v0.8b, v2.8b
+    usubl2          v17.8h, v0.16b, v2.16b
+    usubl           v18.8h, v1.8b, v3.8b
+    usubl2          v19.8h, v1.16b, v3.16b
+    usubl           v20.8h, v4.8b, v6.8b
+    usubl2          v21.8h, v4.16b, v6.16b
+    usubl           v22.8h, v5.8b, v7.8b
+    usubl2          v23.8h, v5.16b, v7.16b
+    st1             {v16.8h-v19.8h}, [x0], x1
+    st1             {v20.8h-v23.8h}, [x0], x1
+.endr
+    cbnz            w12, .loop_sub_ps_32x64
+    ret
+endfunc
+
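
pixel_sub_ps is the same subtraction with independent strides for the two pixel inputs; the "lsl x1, x1, #1" at each entry converts the int16 destination stride from elements to bytes. A scalar sketch (bx/by are illustrative block dimensions implied by each entry point, not parameters of the real function):

    #include <stdint.h>

    typedef uint8_t pixel;

    static void pixel_sub_ps_ref(int16_t *a, intptr_t dstride,
                                 const pixel *b0, const pixel *b1,
                                 intptr_t sstride0, intptr_t sstride1,
                                 int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                a[x] = (int16_t)(b0[x] - b1[x]);
            b0 += sstride0;
            b1 += sstride1;
            a += dstride;      /* element stride, hence the lsl in assembly */
        }
    }
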
+// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+function PFX(pixel_add_ps_4x4_neon)
+    lsl             x5, x5, #1
+.rept 2
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x2], x4
+    ld1             {v2.4h}, [x3], x5
+    ld1             {v3.4h}, [x3], x5
+    uxtl            v0.8h, v0.8b
+    uxtl            v1.8h, v1.8b
+    add             v4.8h, v0.8h, v2.8h
+    add             v5.8h, v1.8h, v3.8h
+    sqxtun          v4.8b, v4.8h
+    sqxtun          v5.8b, v5.8h
+    st1             {v4.s}[0], [x0], x1
+    st1             {v5.s}[0], [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_add_ps_8x8_neon)
+    lsl             x5, x5, #1
+.rept 4
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x2], x4
+    ld1             {v2.8h}, [x3], x5
+    ld1             {v3.8h}, [x3], x5
+    uxtl            v0.8h, v0.8b
+    uxtl            v1.8h, v1.8b
+    add             v4.8h, v0.8h, v2.8h
+    add             v5.8h, v1.8h, v3.8h
+    sqxtun          v4.8b, v4.8h
+    sqxtun          v5.8b, v5.8h
+    st1             {v4.8b}, [x0], x1
+    st1             {v5.8b}, [x0], x1
+.endr
+    ret
+endfunc
+
+.macro pixel_add_ps_16xN_neon h
+function PFX(pixel_add_ps_16x\h\()_neon)
+    lsl             x5, x5, #1
+    mov             w12, #\h / 8
+.loop_add_ps_16x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b}, [x2], x4
+    ld1             {v1.16b}, [x2], x4
+    ld1             {v16.8h-v17.8h}, [x3], x5
+    ld1             {v18.8h-v19.8h}, [x3], x5
+    uxtl            v4.8h, v0.8b
+    uxtl2           v5.8h, v0.16b
+    uxtl            v6.8h, v1.8b
+    uxtl2           v7.8h, v1.16b
+    add             v24.8h, v4.8h, v16.8h
+    add             v25.8h, v5.8h, v17.8h
+    add             v26.8h, v6.8h, v18.8h
+    add             v27.8h, v7.8h, v19.8h
+    sqxtun          v4.8b, v24.8h
+    sqxtun2         v4.16b, v25.8h
+    sqxtun          v5.8b, v26.8h
+    sqxtun2         v5.16b, v27.8h
+    st1             {v4.16b}, [x0], x1
+    st1             {v5.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_add_ps_16x\h
+    ret
+endfunc
+.endm
+
+pixel_add_ps_16xN_neon 16
+pixel_add_ps_16xN_neon 32
+
+.macro pixel_add_ps_32xN_neon h
+ function PFX(pixel_add_ps_32x\h\()_neon)
+    lsl             x5, x5, #1
+    mov             w12, #\h / 4
+.loop_add_ps_32x\h\():
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v0.16b-v1.16b}, [x2], x4
+    ld1             {v16.8h-v19.8h}, [x3], x5
+    uxtl            v4.8h, v0.8b
+    uxtl2           v5.8h, v0.16b
+    uxtl            v6.8h, v1.8b
+    uxtl2           v7.8h, v1.16b
+    add             v24.8h, v4.8h, v16.8h
+    add             v25.8h, v5.8h, v17.8h
+    add             v26.8h, v6.8h, v18.8h
+    add             v27.8h, v7.8h, v19.8h
+    sqxtun          v4.8b, v24.8h
+    sqxtun2         v4.16b, v25.8h
+    sqxtun          v5.8b, v26.8h
+    sqxtun2         v5.16b, v27.8h
+    st1             {v4.16b-v5.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_add_ps_32x\h
+    ret
+endfunc
+.endm
+
+pixel_add_ps_32xN_neon 32
+pixel_add_ps_32xN_neon 64
+
+function PFX(pixel_add_ps_64x64_neon)
+    lsl             x5, x5, #1
+    sub             x5, x5, #64
+    mov             w12, #32
+.loop_add_ps_64x64:
+    sub             w12, w12, #1
+.rept 2
+    ld1             {v0.16b-v3.16b}, [x2], x4
+    ld1             {v16.8h-v19.8h}, [x3], #64
+    ld1             {v20.8h-v23.8h}, [x3], x5
+    uxtl            v4.8h, v0.8b
+    uxtl2           v5.8h, v0.16b
+    uxtl            v6.8h, v1.8b
+    uxtl2           v7.8h, v1.16b
+    uxtl            v24.8h, v2.8b
+    uxtl2           v25.8h, v2.16b
+    uxtl            v26.8h, v3.8b
+    uxtl2           v27.8h, v3.16b
+    add             v0.8h, v4.8h, v16.8h
+    add             v1.8h, v5.8h, v17.8h
+    add             v2.8h, v6.8h, v18.8h
+    add             v3.8h, v7.8h, v19.8h
+    add             v4.8h, v24.8h, v20.8h
+    add             v5.8h, v25.8h, v21.8h
+    add             v6.8h, v26.8h, v22.8h
+    add             v7.8h, v27.8h, v23.8h
+    sqxtun          v0.8b, v0.8h
+    sqxtun2         v0.16b, v1.8h
+    sqxtun          v1.8b, v2.8h
+    sqxtun2         v1.16b, v3.8h
+    sqxtun          v2.8b, v4.8h
+    sqxtun2         v2.16b, v5.8h
+    sqxtun          v3.8b, v6.8h
+    sqxtun2         v3.16b, v7.8h
+    st1             {v0.16b-v3.16b}, [x0], x1
+.endr
+    cbnz            w12, .loop_add_ps_64x64
+    ret
+endfunc
+
+// Chroma add_ps
+function PFX(pixel_add_ps_4x8_neon)
+    lsl             x5, x5, #1
+.rept 4
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x2], x4
+    ld1             {v2.4h}, [x3], x5
+    ld1             {v3.4h}, [x3], x5
+    uxtl            v0.8h, v0.8b
+    uxtl            v1.8h, v1.8b
+    add             v4.8h, v0.8h, v2.8h
+    add             v5.8h, v1.8h, v3.8h
+    sqxtun          v4.8b, v4.8h
+    sqxtun          v5.8b, v5.8h
+    st1             {v4.s}[0], [x0], x1
+    st1             {v5.s}[0], [x0], x1
+.endr
+    ret
+endfunc
+
+function PFX(pixel_add_ps_8x16_neon)
+    lsl             x5, x5, #1
+.rept 8
+    ld1             {v0.8b}, [x2], x4
+    ld1             {v1.8b}, [x2], x4
+    ld1             {v2.8h}, [x3], x5
+    ld1             {v3.8h}, [x3], x5
+    uxtl            v0.8h, v0.8b
+    uxtl            v1.8h, v1.8b
+    add             v4.8h, v0.8h, v2.8h
+    add             v5.8h, v1.8h, v3.8h
+    sqxtun          v4.8b, v4.8h
+    sqxtun          v5.8b, v5.8h
+    st1             {v4.8b}, [x0], x1
+    st1             {v5.8b}, [x0], x1
+.endr
+    ret
+endfunc
+
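
pixel_add_ps reverses the split: it adds an int16 residual back onto the prediction and clamps to the 8-bit pixel range, which is exactly what the sqxtun saturating narrow provides. Scalar equivalent (bx/by again stand in for the per-entry block size):

    #include <stdint.h>

    typedef uint8_t pixel;

    static pixel clip_pixel(int v)
    {
        return (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* cf. sqxtun */
    }

    static void pixel_add_ps_ref(pixel *a, intptr_t dstride,
                                 const pixel *b0, const int16_t *b1,
                                 intptr_t sstride0, intptr_t sstride1,
                                 int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                a[x] = clip_pixel(b0[x] + b1[x]);
            b0 += sstride0;
            b1 += sstride1;
            a += dstride;
        }
    }
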
+// void scale1D_128to64(pixel *dst, const pixel *src)
+function PFX(scale1D_128to64_neon)
+.rept 2
+    ld2             {v0.16b, v1.16b}, [x1], #32
+    ld2             {v2.16b, v3.16b}, [x1], #32
+    ld2             {v4.16b, v5.16b}, [x1], #32
+    ld2             {v6.16b, v7.16b}, [x1], #32
+    urhadd          v0.16b, v0.16b, v1.16b
+    urhadd          v1.16b, v2.16b, v3.16b
+    urhadd          v2.16b, v4.16b, v5.16b
+    urhadd          v3.16b, v6.16b, v7.16b
+    st1             {v0.16b-v3.16b}, [x0], #64
+.endr
+    ret
+endfunc
+
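
scale1D_128to64 de-interleaves even and odd pixels with ld2 and averages each pair with rounding via urhadd, i.e. a 2:1 horizontal decimation:

    #include <stdint.h>

    typedef uint8_t pixel;

    static void scale1D_128to64_ref(pixel *dst, const pixel *src)
    {
        for (int i = 0; i < 64; i++)
            dst[i] = (pixel)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
    }
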
+.macro scale2D_1  v0, v1
+    uaddlp          \v0\().8h, \v0\().16b
+    uaddlp          \v1\().8h, \v1\().16b
+    add             \v0\().8h, \v0\().8h, \v1\().8h
+.endm
+
+// void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
+function PFX(scale2D_64to32_neon)
+    mov             w12, #32
+.loop_scale2D:
+    ld1             {v0.16b-v3.16b}, [x1], x2
+    sub             w12, w12, #1
+    ld1             {v4.16b-v7.16b}, [x1], x2
+    scale2D_1       v0, v4
+    scale2D_1       v1, v5
+    scale2D_1       v2, v6
+    scale2D_1       v3, v7
+    uqrshrn         v0.8b, v0.8h, #2
+    uqrshrn2        v0.16b, v1.8h, #2
+    uqrshrn         v1.8b, v2.8h, #2
+    uqrshrn2        v1.16b, v3.8h, #2
+    st1             {v0.16b-v1.16b}, [x0], #32
+    cbnz            w12, .loop_scale2D
+    ret
+endfunc
+
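
scale2D_64to32 is the two-dimensional analogue: uaddlp sums horizontal pairs, the add in scale2D_1 folds in the second row, and uqrshrn #2 divides the 2x2 sum by four with rounding:

    #include <stdint.h>

    typedef uint8_t pixel;

    static void scale2D_64to32_ref(pixel *dst, const pixel *src, intptr_t stride)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++)
            {
                const pixel *s = src + 2 * y * stride + 2 * x;
                /* rounding 2x2 box filter */
                dst[y * 32 + x] = (pixel)((s[0] + s[1] + s[stride] + s[stride + 1] + 2) >> 2);
            }
    }
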
+// void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
+function PFX(pixel_planecopy_cp_neon)
+    dup             v2.16b, w6
+    sub             x5, x5, #1
+.loop_h:
+    mov             x6, x0
+    mov             x12, x2
+    mov             x7, #0
+.loop_w:
+    ldr             q0, [x6], #16
+    ushl            v0.16b, v0.16b, v2.16b
+    str             q0, [x12], #16
+    add             x7, x7, #16
+    cmp             x7, x4
+    blt             .loop_w
+
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             x5, x5, #1
+    cbnz            x5, .loop_h
+
+// handle last row
+    mov             x5, x4
+    lsr             x5, x5, #3
+.loopW8:
+    ldr             d0, [x0], #8
+    ushl            v0.8b, v0.8b, v2.8b
+    str             d0, [x2], #8
+    sub             x4, x4, #8
+    sub             x5, x5, #1
+    cbnz            x5, .loopW8
+
+    mov             x5, #8
+    sub             x5, x5, x4
+    sub             x0, x0, x5
+    sub             x2, x2, x5
+    ldr             d0, [x0]
+    ushl            v0.8b, v0.8b, v2.8b
+    str             d0, [x2]
+    ret
+endfunc
+
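
pixel_planecopy_cp shifts every source byte left by `shift` (the ushl by the splatted w6) while copying between planes with different strides; the scalar tail above handles the final row, re-reading a possibly overlapping 8 bytes so no element is missed. Reference behaviour per the commented signature:

    #include <stdint.h>

    typedef uint8_t pixel;

    static void planecopy_cp_ref(const uint8_t *src, intptr_t srcStride,
                                 pixel *dst, intptr_t dstStride,
                                 int width, int height, int shift)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (pixel)(src[x] << shift);   /* cf. ushl v0, v0, v2 */
            src += srcStride;
            dst += dstStride;
        }
    }
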
+//******* satd *******
+.macro satd_4x4_neon
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ld1             {v1.s}[0], [x2], x3
+    ld1             {v1.s}[1], [x2], x3
+    ld1             {v2.s}[0], [x0], x1
+    ld1             {v2.s}[1], [x0], x1
+    ld1             {v3.s}[0], [x2], x3
+    ld1             {v3.s}[1], [x2], x3
+
+    usubl           v4.8h, v0.8b, v1.8b
+    usubl           v5.8h, v2.8b, v3.8b
+
+    add             v6.8h, v4.8h, v5.8h
+    sub             v7.8h, v4.8h, v5.8h
+
+    mov             v4.d[0], v6.d[1]
+    add             v0.4h, v6.4h, v4.4h
+    sub             v2.4h, v6.4h, v4.4h
+
+    mov             v5.d[0], v7.d[1]
+    add             v1.4h, v7.4h, v5.4h
+    sub             v3.4h, v7.4h, v5.4h
+
+    trn1            v4.4h, v0.4h, v1.4h
+    trn2            v5.4h, v0.4h, v1.4h
+
+    trn1            v6.4h, v2.4h, v3.4h
+    trn2            v7.4h, v2.4h, v3.4h
+
+    add             v0.4h, v4.4h, v5.4h
+    sub             v1.4h, v4.4h, v5.4h
+
+    add             v2.4h, v6.4h, v7.4h
+    sub             v3.4h, v6.4h, v7.4h
+
+    trn1            v4.2s, v0.2s, v1.2s
+    trn2            v5.2s, v0.2s, v1.2s
+
+    trn1            v6.2s, v2.2s, v3.2s
+    trn2            v7.2s, v2.2s, v3.2s
+
+    abs             v4.4h, v4.4h
+    abs             v5.4h, v5.4h
+    abs             v6.4h, v6.4h
+    abs             v7.4h, v7.4h
+
+    smax            v1.4h, v4.4h, v5.4h
+    smax            v2.4h, v6.4h, v7.4h
+
+    add             v0.4h, v1.4h, v2.4h
+    uaddlp          v0.2s, v0.4h
+    uaddlp          v0.1d, v0.2s
+.endm
+
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function PFX(pixel_satd_4x4_neon)
+    satd_4x4_neon
+    fmov            x0, d0
+    ret
+endfunc
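
satd_4x4_neon computes a 4x4 Hadamard transform of the source/reference differences and sums the absolute coefficients; the smax-of-transposed-pairs step uses the identity |a+b| + |a-b| = 2*max(|a|,|b|) to fold the final halving into the reduction. A scalar sketch of the same metric:

    #include <stdint.h>
    #include <stdlib.h>

    typedef uint8_t pixel;

    static int satd_4x4_ref(const pixel *pix1, intptr_t stride1,
                            const pixel *pix2, intptr_t stride2)
    {
        int d[4][4], t[4][4], sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];
        for (int x = 0; x < 4; x++)          /* vertical butterflies */
        {
            int s0 = d[0][x] + d[1][x], s1 = d[2][x] + d[3][x];
            int d0 = d[0][x] - d[1][x], d1 = d[2][x] - d[3][x];
            t[0][x] = s0 + s1; t[1][x] = s0 - s1;
            t[2][x] = d0 + d1; t[3][x] = d0 - d1;
        }
        for (int y = 0; y < 4; y++)          /* horizontal butterflies + abs-sum */
        {
            int s0 = t[y][0] + t[y][1], s1 = t[y][2] + t[y][3];
            int d0 = t[y][0] - t[y][1], d1 = t[y][2] - t[y][3];
            sum += abs(s0 + s1) + abs(s0 - s1) + abs(d0 + d1) + abs(d0 - d1);
        }
        return sum >> 1;   /* the halving the smax reduction performs implicitly */
    }
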
 .macro x265_satd_4x8_8x4_end_neon
     add             v0.8h, v4.8h, v6.8h
     add             v1.8h, v5.8h, v7.8h
@@ -59,7 +699,7 @@
 .endm

 .macro pixel_satd_4x8_neon
-    ld1r             {v1.2s}, [x2], x3
+    ld1r            {v1.2s}, [x2], x3
     ld1r            {v0.2s}, [x0], x1
     ld1r            {v3.2s}, [x2], x3
     ld1r            {v2.2s}, [x0], x1
@@ -82,129 +722,995 @@
     sub             v5.8h, v0.8h, v1.8h
     ld1             {v6.s}[1], [x0], x1
     usubl           v3.8h, v6.8b, v7.8b
-    add         v6.8h, v2.8h, v3.8h
-    sub         v7.8h, v2.8h, v3.8h
+    add             v6.8h, v2.8h, v3.8h
+    sub             v7.8h, v2.8h, v3.8h
     x265_satd_4x8_8x4_end_neon
 .endm

-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function x265_pixel_satd_4x8_neon
-    pixel_satd_4x8_neon
-    mov               w0, v0.s[0]
-    ret
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function PFX(pixel_satd_4x8_neon)
+    pixel_satd_4x8_neon
+    mov             w0, v0.s[0]
+    ret
+endfunc
+
+function PFX(pixel_satd_4x16_neon)
+    mov             w4, #0
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w0, w5, w4
+    ret
+endfunc
+
+function PFX(pixel_satd_4x32_neon)
+    mov             w4, #0
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+.endr
+    mov             w0, w4
+    ret
+endfunc
+
+function PFX(pixel_satd_12x16_neon)
+    mov             x4, x0
+    mov             x5, x2
+    mov             w7, #0
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    add             x0, x4, #4
+    add             x2, x5, #4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    add             x0, x4, #8
+    add             x2, x5, #8
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w0, w7, w6
+    ret
+endfunc
+
+function PFX(pixel_satd_12x32_neon)
+    mov             x4, x0
+    mov             x5, x2
+    mov             w7, #0
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    add             x0, x4, #4
+    add             x2, x5, #4
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    add             x0, x4, #8
+    add             x2, x5, #8
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    mov             w0, w7
+    ret
+endfunc
+
+function PFX(pixel_satd_8x4_neon)
+    mov             x4, x0
+    mov             x5, x2
+    satd_4x4_neon
+    add             x0, x4, #4
+    add             x2, x5, #4
+    umov            x6, v0.d[0]
+    satd_4x4_neon
+    umov            x0, v0.d[0]
+    add             x0, x0, x6
+    ret
+endfunc
+
+.macro LOAD_DIFF_8x4 v0 v1 v2 v3
+    ld1             {v0.8b}, [x0], x1
+    ld1             {v1.8b}, [x2], x3
+    ld1             {v2.8b}, [x0], x1
+    ld1             {v3.8b}, [x2], x3
+    ld1             {v4.8b}, [x0], x1
+    ld1             {v5.8b}, [x2], x3
+    ld1             {v6.8b}, [x0], x1
+    ld1             {v7.8b}, [x2], x3
+    usubl           \v0, v0.8b, v1.8b
+    usubl           \v1, v2.8b, v3.8b
+    usubl           \v2, v4.8b, v5.8b
+    usubl           \v3, v6.8b, v7.8b
+.endm
+
+.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
+    ld1             {v0.16b}, [x0], x1
+    ld1             {v1.16b}, [x2], x3
+    ld1             {v2.16b}, [x0], x1
+    ld1             {v3.16b}, [x2], x3
+    ld1             {v4.16b}, [x0], x1
+    ld1             {v5.16b}, [x2], x3
+    ld1             {v6.16b}, [x0], x1
+    ld1             {v7.16b}, [x2], x3
+    usubl           \v0, v0.8b, v1.8b
+    usubl           \v1, v2.8b, v3.8b
+    usubl           \v2, v4.8b, v5.8b
+    usubl           \v3, v6.8b, v7.8b
+    usubl2          \v4, v0.16b, v1.16b
+    usubl2          \v5, v2.16b, v3.16b
+    usubl2          \v6, v4.16b, v5.16b
+    usubl2          \v7, v6.16b, v7.16b
+.endm
+
+function PFX(satd_16x4_neon), export=0
+    LOAD_DIFF_16x4  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+    b               PFX(satd_8x4v_8x8h_neon)
+endfunc
+
+function PFX(satd_8x8_neon), export=0
+    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
+    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
+    b               PFX(satd_8x4v_8x8h_neon)
+endfunc
+
+// one vertical hadamard pass and two horizontal
+function PFX(satd_8x4v_8x8h_neon), export=0
+    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
+    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
+    trn4            v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
+    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
+    SUMSUB_ABCD     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
+    SUMSUB_ABCD     v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
+    trn4            v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
+    trn4            v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
+    ABS8            v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
+    smax            v0.8h, v0.8h, v2.8h
+    smax            v1.8h, v1.8h, v3.8h
+    smax            v2.8h, v4.8h, v6.8h
+    smax            v3.8h, v5.8h, v7.8h
+    ret
+endfunc
+
+function PFX(pixel_satd_8x8_neon)
+    mov             x10, x30
+    bl              PFX(satd_8x8_neon)
+    add             v0.8h, v0.8h, v1.8h
+    add             v1.8h, v2.8h, v3.8h
+    add             v0.8h, v0.8h, v1.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_8x12_neon)
+    mov             x4, x0
+    mov             x5, x2
+    mov             x7, #0
+    satd_4x4_neon
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+    add             x0, x4, #4
+    add             x2, x5, #4
+    satd_4x4_neon
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+.rept 2
+    sub             x0, x0, #4
+    sub             x2, x2, #4
+    mov             x4, x0
+    mov             x5, x2
+    satd_4x4_neon
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+    add             x0, x4, #4
+    add             x2, x5, #4
+    satd_4x4_neon
+    umov            x6, v0.d[0]
+    add             x7, x7, x6
+.endr
+    mov             x0, x7
+    ret
+endfunc
+
+function PFX(pixel_satd_8x16_neon)
+    mov             x10, x30
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_8x32_neon)
+    mov             x10, x30
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 3
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_8x64_neon)
+    mov             x10, x30
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 7
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_16x4_neon)
+    mov             x10, x30
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_16x8_neon)
+    mov             x10, x30
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_16x12_neon)
+    mov             x10, x30
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 2
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_16x16_neon)
+    mov             x10, x30
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 3
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+function PFX(pixel_satd_16x24_neon)
+    mov             x10, x30
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 5
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
+endfunc
+
+.macro pixel_satd_16x32_neon
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 7
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+.endm
+
+function PFX(pixel_satd_16x32_neon)
+    mov             x10, x30
+    pixel_satd_16x32_neon
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
 endfunc

-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function x265_pixel_satd_4x16_neon
-    eor             w4, w4, w4
-    pixel_satd_4x8_neon
-    mov               w5, v0.s[0]
-    add             w4, w4, w5
-    pixel_satd_4x8_neon
-    mov               w5, v0.s[0]
-    add             w0, w5, w4
-    ret
+function PFX(pixel_satd_16x64_neon)
+    mov             x10, x30
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v0.8h, v1.8h
+    add             v31.8h, v2.8h, v3.8h
+.rept 15
+    bl              PFX(satd_16x4_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w0, v0.s[0]
+    ret             x10
 endfunc

-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function x265_pixel_satd_4x32_neon
-    eor             w4, w4, w4
+function PFX(pixel_satd_24x32_neon)
+    mov             x10, x30
+    mov             x7, #0
+    mov             x4, x0
+    mov             x5, x2
+.rept 3
+    movi            v30.8h, #0
+    movi            v31.8h, #0
 .rept 4
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w4, w4, w5
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
 .endr
-    mov             w0, w4
-    ret
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
+    mov             w6, v0.s[0]
+    add             x7, x7, x6
+    add             x4, x4, #8
+    add             x5, x5, #8
+    mov             x0, x4
+    mov             x2, x5
+.endr
+    mov             x0, x7
+    ret             x10
 endfunc

-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function x265_pixel_satd_12x16_neon
+function PFX(pixel_satd_24x64_neon)
+    mov             x10, x30
+    mov             x7, #0
     mov             x4, x0
     mov             x5, x2
-    eor             w7, w7, w7
-    pixel_satd_4x8_neon
+.rept 3
+    movi            v30.8h, #0
+    movi            v31.8h, #0
+.rept 4
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
     mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
+    add             x7, x7, x6
+    add             x4, x4, #8
+    add             x5, x5, #8
+    mov             x0, x4
+    mov             x2, x5
+.endr
+    sub             x4, x4, #24
+    sub             x5, x5, #24
+    add             x0, x4, x1, lsl #5
+    add             x2, x5, x3, lsl #5
+    mov             x4, x0
+    mov             x5, x2
+.rept 3
+    movi            v30.8h, #0
+    movi            v31.8h, #0
+.rept 4
+    bl              PFX(satd_8x8_neon)
+    add             v30.8h, v30.8h, v0.8h
+    add             v31.8h, v31.8h, v1.8h
+    add             v30.8h, v30.8h, v2.8h
+    add             v31.8h, v31.8h, v3.8h
+.endr
+    add             v0.8h, v30.8h, v31.8h
+    uaddlv          s0, v0.8h
     mov             w6, v0.s[0]
-    add             w7, w7, w6
+    add             x7, x7, x6
+    add             x4, x4, #8
+    add             x5, x5, #8
+    mov             x0, x4
+    mov             x2, x5
+.endr
+    mov             x0, x7
+    ret             x10
+endfunc

-    add             x0, x4, #4
-    add             x2, x5, #4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
+.macro pixel_satd_32x8
1190
+    mov             x4, x0
1191
+    mov             x5, x2
1192
+.rept 2
1193
+    bl              PFX(satd_16x4_neon)
1194
+    add             v30.8h, v30.8h, v0.8h
1195
+    add             v31.8h, v31.8h, v1.8h
1196
+    add             v30.8h, v30.8h, v2.8h
1197
+    add             v31.8h, v31.8h, v3.8h
1198
+.endr
1199
+    add             x0, x4, #16
1200
+    add             x2, x5, #16
1201
+.rept 2
1202
+    bl              PFX(satd_16x4_neon)
1203
+    add             v30.8h, v30.8h, v0.8h
1204
+    add             v31.8h, v31.8h, v1.8h
1205
+    add             v30.8h, v30.8h, v2.8h
1206
+    add             v31.8h, v31.8h, v3.8h
1207
+.endr
1208
+.endm
1209
 
1210
-    add             x0, x4, #8
1211
-    add             x2, x5, #8
1212
-    pixel_satd_4x8_neon
1213
-    mov             w6, v0.s0
1214
-    add             w7, w7, w6
1215
-    pixel_satd_4x8_neon
1216
+.macro satd_32x16_neon
1217
+    movi            v30.8h, #0
1218
+    movi            v31.8h, #0
1219
+    pixel_satd_32x8
1220
+    sub             x0, x0, #16
1221
+    sub             x2, x2, #16
1222
+    pixel_satd_32x8
1223
+    add             v0.8h, v30.8h, v31.8h
1224
+    uaddlv          s0, v0.8h
1225
     mov             w6, v0.s0
1226
-    add             w0, w7, w6
1227
-    ret
1228
-endfunc
1229
+.endm
1230
 
1231
-// template<int w, int h>
1232
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1233
-function x265_pixel_satd_12x32_neon
1234
+.macro satd_64x16_neon
1235
+    mov             x8, x0
1236
+    mov             x9, x2
1237
+    satd_32x16_neon
1238
+    add             x7, x7, x6
1239
+    add             x0, x8, #32
1240
+    add             x2, x9, #32
1241
+    satd_32x16_neon
1242
+    add             x7, x7, x6
1243
+.endm
1244
+
1245
+function PFX(pixel_satd_32x8_neon)
1246
+    mov             x10, x30
1247
+    mov             x7, #0
1248
     mov             x4, x0
1249
     mov             x5, x2
1250
-    eor             w7, w7, w7
1251
-.rept 4
1252
-    pixel_satd_4x8_neon
1253
-    mov             w6, v0.s0
1254
-    add             w7, w7, w6
1255
+    movi            v30.8h, #0
1256
+    movi            v31.8h, #0
1257
+    pixel_satd_32x8
1258
+    add             v0.8h, v30.8h, v31.8h
1259
+    uaddlv          s0, v0.8h
1260
+    mov             w0, v0.s0
1261
+    ret             x10
1262
+endfunc
1263
+
1264
+function PFX(pixel_satd_32x16_neon)
1265
+    mov             x10, x30
1266
+    satd_32x16_neon
1267
+    mov             x0, x6
1268
+    ret             x10
1269
+endfunc
1270
+
1271
+function PFX(pixel_satd_32x24_neon)
1272
+    mov             x10, x30
1273
+    satd_32x16_neon
1274
+    movi            v30.8h, #0
1275
+    movi            v31.8h, #0
1276
+    sub             x0, x0, #16
1277
+    sub             x2, x2, #16
1278
+    pixel_satd_32x8
1279
+    add             v0.8h, v30.8h, v31.8h
1280
+    uaddlv          s0, v0.8h
1281
+    mov             w0, v0.s0
1282
+    add             x0, x0, x6
1283
+    ret             x10
1284
+endfunc
1285
+
1286
+function PFX(pixel_satd_32x32_neon)
1287
+    mov             x10, x30
1288
+    mov             x7, #0
1289
+    satd_32x16_neon
1290
+    sub             x0, x0, #16
1291
+    sub             x2, x2, #16
1292
+    add             x7, x7, x6
1293
+    satd_32x16_neon
1294
+    add             x0, x7, x6
1295
+    ret             x10
1296
+endfunc
1297
+
1298
+function PFX(pixel_satd_32x48_neon)
1299
+    mov             x10, x30
1300
+    mov             x7, #0
1301
+.rept 2
1302
+    satd_32x16_neon
1303
+    sub             x0, x0, #16
1304
+    sub             x2, x2, #16
1305
+    add             x7, x7, x6
1306
 .endr
1307
+    satd_32x16_neon
1308
+    add             x0, x7, x6
1309
+    ret             x10
1310
+endfunc
1311
 
1312
-    add             x0, x4, #4
1313
-    add             x2, x5, #4
1314
-.rept 4
1315
-    pixel_satd_4x8_neon
1316
-    mov             w6, v0.s0
1317
-    add             w7, w7, w6
1318
+function PFX(pixel_satd_32x64_neon)
1319
+    mov             x10, x30
1320
+    mov             x7, #0
1321
+.rept 3
1322
+    satd_32x16_neon
1323
+    sub             x0, x0, #16
1324
+    sub             x2, x2, #16
1325
+    add             x7, x7, x6
1326
 .endr
1327
+    satd_32x16_neon
1328
+    add             x0, x7, x6
1329
+    ret             x10
1330
+endfunc
1331
 
1332
-    add             x0, x4, #8
1333
-    add             x2, x5, #8
1334
-.rept 4
1335
-    pixel_satd_4x8_neon
1336
-    mov             w6, v0.s0
1337
-    add             w7, w7, w6
1338
+function PFX(pixel_satd_64x16_neon)
1339
+    mov             x10, x30
1340
+    mov             x7, #0
1341
+    satd_64x16_neon
1342
+    mov             x0, x7
1343
+    ret             x10
1344
+endfunc
1345
+
1346
+function PFX(pixel_satd_64x32_neon)
1347
+    mov             x10, x30
1348
+    mov             x7, #0
1349
+    satd_64x16_neon
1350
+    sub             x0, x0, #48
1351
+    sub             x2, x2, #48
1352
+    satd_64x16_neon
1353
+    mov             x0, x7
1354
+    ret             x10
1355
+endfunc
1356
+
1357
+function PFX(pixel_satd_64x48_neon)
1358
+    mov             x10, x30
1359
+    mov             x7, #0
1360
+.rept 2
1361
+    satd_64x16_neon
1362
+    sub             x0, x0, #48
1363
+    sub             x2, x2, #48
1364
 .endr
1365
+    satd_64x16_neon
1366
+    mov             x0, x7
1367
+    ret             x10
1368
+endfunc
1369
 
1370
-    mov             w0, w7
1371
+function PFX(pixel_satd_64x64_neon)
1372
+    mov             x10, x30
1373
+    mov             x7, #0
1374
+.rept 3
1375
+    satd_64x16_neon
1376
+    sub             x0, x0, #48
1377
+    sub             x2, x2, #48
1378
+.endr
1379
+    satd_64x16_neon
1380
+    mov             x0, x7
1381
+    ret             x10
1382
+endfunc
1383
+
1384
+function PFX(pixel_satd_48x64_neon)
1385
+    mov             x10, x30
1386
+    mov             x7, #0
1387
+    mov             x8, x0
1388
+    mov             x9, x2
1389
+.rept 3
1390
+    satd_32x16_neon
1391
+    sub             x0, x0, #16
1392
+    sub             x2, x2, #16
1393
+    add             x7, x7, x6
1394
+.endr
1395
+    satd_32x16_neon
1396
+    add             x7, x7, x6
1397
+
1398
+    add             x0, x8, #32
1399
+    add             x2, x9, #32
1400
+    pixel_satd_16x32_neon
1401
+    add             v0.8h, v30.8h, v31.8h
1402
+    uaddlv          s0, v0.8h
1403
+    mov             w6, v0.s0
1404
+    add             x7, x7, x6
1405
+
1406
+    movi            v30.8h, #0
1407
+    movi            v31.8h, #0
1408
+    pixel_satd_16x32_neon
1409
+    add             v0.8h, v30.8h, v31.8h
1410
+    uaddlv          s0, v0.8h
1411
+    mov             w6, v0.s0
1412
+    add             x0, x7, x6
1413
+    ret             x10
1414
+endfunc
1415
+
1416
+function PFX(sa8d_8x8_neon), export=0
1417
+    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
1418
+    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
1419
+    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
1420
+    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
1421
+    SUMSUB_ABCD     v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
1422
+    SUMSUB_ABCD     v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
1423
+    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
1424
+    trn4            v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
1425
+    SUMSUB_ABCD     v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
1426
+    SUMSUB_ABCD     v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
1427
+    trn4            v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
1428
+    trn4            v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
1429
+    SUMSUB_ABCD     v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
1430
+    SUMSUB_ABCD     v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
1431
+    trn4            v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
1432
+    trn4            v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
1433
+    ABS8            v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
1434
+    smax            v16.8h, v16.8h, v20.8h
1435
+    smax            v17.8h, v17.8h, v21.8h
1436
+    smax            v18.8h, v18.8h, v22.8h
1437
+    smax            v19.8h, v19.8h, v23.8h
1438
+    add             v0.8h, v16.8h, v17.8h
1439
+    add             v1.8h, v18.8h, v19.8h
1440
     ret
1441
 endfunc
1442
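+// The sa8d_8x8_neon helper above leaves two 8-lane partial sums in v0/v1; the
+// pixel_sa8d_* callers below reduce them with uaddlv and apply the final
+// rounding "(sum + 1) >> 1".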
 
1443
-// template<int w, int h>
1444
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1445
-function x265_pixel_satd_8x8_neon
1446
-    eor             w4, w4, w4
1447
-    mov             x6, x0
1448
-    mov             x7, x2
1449
-    pixel_satd_4x8_neon
1450
-    mov             w5, v0.s0
1451
-    add             w4, w4, w5
1452
-    add             x0, x6, #4
1453
-    add             x2, x7, #4
1454
-    pixel_satd_4x8_neon
1455
+function PFX(pixel_sa8d_8x8_neon)
1456
+    mov             x10, x30
1457
+    bl              PFX(sa8d_8x8_neon)
1458
+    add             v0.8h, v0.8h, v1.8h
1459
+    uaddlv          s0, v0.8h
1460
+    mov             w0, v0.s0
1461
+    add             w0, w0, #1
1462
+    lsr             w0, w0, #1
1463
+    ret             x10
1464
+endfunc
1465
+
1466
+function PFX(pixel_sa8d_8x16_neon)
1467
+    mov             x10, x30
1468
+    bl              PFX(sa8d_8x8_neon)
1469
+    add             v0.8h, v0.8h, v1.8h
1470
+    uaddlv          s0, v0.8h
1471
     mov             w5, v0.s0
1472
+    add             w5, w5, #1
1473
+    lsr             w5, w5, #1
1474
+    bl              PFX(sa8d_8x8_neon)
1475
+    add             v0.8h, v0.8h, v1.8h
1476
+    uaddlv          s0, v0.8h
1477
+    mov             w4, v0.s0
1478
+    add             w4, w4, #1
1479
+    lsr             w4, w4, #1
1480
+    add             w0, w4, w5
1481
+    ret             x10
1482
+endfunc
1483
+
1484
+.macro sa8d_16x16 reg
1485
+    bl              PFX(sa8d_8x8_neon)
1486
+    uaddlp          v30.4s, v0.8h
1487
+    uaddlp          v31.4s, v1.8h
1488
+    bl              PFX(sa8d_8x8_neon)
1489
+    uadalp          v30.4s, v0.8h
1490
+    uadalp          v31.4s, v1.8h
1491
+    sub             x0, x0, x1, lsl #4
1492
+    sub             x2, x2, x3, lsl #4
1493
+    add             x0, x0, #8
1494
+    add             x2, x2, #8
1495
+    bl              PFX(sa8d_8x8_neon)
1496
+    uadalp          v30.4s, v0.8h
1497
+    uadalp          v31.4s, v1.8h
1498
+    bl              PFX(sa8d_8x8_neon)
1499
+    uadalp          v30.4s, v0.8h
1500
+    uadalp          v31.4s, v1.8h
1501
+    add             v0.4s, v30.4s, v31.4s
1502
+    addv            s0, v0.4s
1503
+    mov             \reg, v0.s0
1504
+    add             \reg, \reg, #1
1505
+    lsr             \reg, \reg, #1
1506
+.endm
1507
+
1508
+function PFX(pixel_sa8d_16x16_neon)
1509
+    mov             x10, x30
1510
+    sa8d_16x16      w0
1511
+    ret             x10
1512
+endfunc
1513
+
1514
+function PFX(pixel_sa8d_16x32_neon)
1515
+    mov             x10, x30
1516
+    sa8d_16x16      w4
1517
+    sub             x0, x0, #8
1518
+    sub             x2, x2, #8
1519
+    sa8d_16x16      w5
1520
     add             w0, w4, w5
1521
+    ret             x10
1522
+endfunc
1523
+
1524
+function PFX(pixel_sa8d_32x32_neon)
1525
+    mov             x10, x30
1526
+    sa8d_16x16      w4
1527
+    sub             x0, x0, x1, lsl #4
1528
+    sub             x2, x2, x3, lsl #4
1529
+    add             x0, x0, #8
1530
+    add             x2, x2, #8
1531
+    sa8d_16x16      w5
1532
+    sub             x0, x0, #24
1533
+    sub             x2, x2, #24
1534
+    sa8d_16x16      w6
1535
+    sub             x0, x0, x1, lsl #4
1536
+    sub             x2, x2, x3, lsl #4
1537
+    add             x0, x0, #8
1538
+    add             x2, x2, #8
1539
+    sa8d_16x16      w7
1540
+    add             w4, w4, w5
1541
+    add             w6, w6, w7
1542
+    add             w0, w4, w6
1543
+    ret             x10
1544
+endfunc
1545
+
1546
+function PFX(pixel_sa8d_32x64_neon)
1547
+    mov             x10, x30
1548
+    mov             w11, #4
1549
+    mov             w9, #0
1550
+.loop_sa8d_32:
1551
+    sub             w11, w11, #1
1552
+    sa8d_16x16      w4
1553
+    sub             x0, x0, x1, lsl #4
1554
+    sub             x2, x2, x3, lsl #4
1555
+    add             x0, x0, #8
1556
+    add             x2, x2, #8
1557
+    sa8d_16x16      w5
1558
+    add             w4, w4, w5
1559
+    add             w9, w9, w4
1560
+    sub             x0, x0, #24
1561
+    sub             x2, x2, #24
1562
+    cbnz            w11, .loop_sa8d_32
1563
+    mov             w0, w9
1564
+    ret             x10
1565
+endfunc
1566
+
1567
+function PFX(pixel_sa8d_64x64_neon)
1568
+    mov             x10, x30
1569
+    mov             w11, #4
1570
+    mov             w9, #0
1571
+.loop_sa8d_64:
1572
+    sub             w11, w11, #1
1573
+    sa8d_16x16      w4
1574
+    sub             x0, x0, x1, lsl #4
1575
+    sub             x2, x2, x3, lsl #4
1576
+    add             x0, x0, #8
1577
+    add             x2, x2, #8
1578
+    sa8d_16x16      w5
1579
+    sub             x0, x0, x1, lsl #4
1580
+    sub             x2, x2, x3, lsl #4
1581
+    add             x0, x0, #8
1582
+    add             x2, x2, #8
1583
+    sa8d_16x16      w6
1584
+    sub             x0, x0, x1, lsl #4
1585
+    sub             x2, x2, x3, lsl #4
1586
+    add             x0, x0, #8
1587
+    add             x2, x2, #8
1588
+    sa8d_16x16      w7
1589
+    add             w4, w4, w5
1590
+    add             w6, w6, w7
1591
+    add             w8, w4, w6
1592
+    add             w9, w9, w8
1593
+
1594
+    sub             x0, x0, #56
1595
+    sub             x2, x2, #56
1596
+    cbnz            w11, .loop_sa8d_64
1597
+    mov             w0, w9
1598
+    ret             x10
1599
+endfunc
1600
+
1601
+/***** dequant_scaling *****/
1602
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
1603
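+// Rough scalar model of the function below (a sketch, not the upstream source;
+// clip16(x) stands for x265_clip3(-32768, 32767, x)):
+//     shift += 4;
+//     if (shift >= per)
+//         coef[n] = clip16((quantCoef[n] * deQuantCoef[n]
+//                           + (1 << (shift - per - 1))) >> (shift - per));
+//     else
+//         coef[n] = clip16(clip16(quantCoef[n] * deQuantCoef[n]) << (per - shift));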
+function PFX(dequant_scaling_neon)
1604
+    add             x5, x5, #4              // shift + 4
1605
+    lsr             x3, x3, #3              // num / 8
1606
+    cmp             x5, x4
1607
+    blt             .dequant_skip
1608
+
1609
+    mov             x12, #1
1610
+    sub             x6, x5, x4          // shift - per
1611
+    sub             x6, x6, #1          // shift - per - 1
1612
+    lsl             x6, x12, x6         // 1 << (shift - per - 1) (add)
1613
+    dup             v0.4s, w6
1614
+    sub             x7, x4, x5          // per - shift
1615
+    dup             v3.4s, w7
1616
+
1617
+.dequant_loop1:
1618
+    ld1             {v19.8h}, x0, #16 // quantCoef
1619
+    ld1             {v2.4s}, x1, #16  // deQuantCoef
1620
+    ld1             {v20.4s}, x1, #16
1621
+    sub             x3, x3, #1
1622
+    sxtl            v1.4s, v19.4h
1623
+    sxtl2           v19.4s, v19.8h
1624
+
1625
+    mul             v1.4s, v1.4s, v2.4s // quantCoef * deQuantCoef
1626
+    mul             v19.4s, v19.4s, v20.4s
1627
+    add             v1.4s, v1.4s, v0.4s // quantCoef * deQuantCoef + add
1628
+    add             v19.4s, v19.4s, v0.4s
1629
+
1630
+    sshl            v1.4s, v1.4s, v3.4s
1631
+    sshl            v19.4s, v19.4s, v3.4s
1632
+    sqxtn           v16.4h, v1.4s       // x265_clip3
1633
+    sqxtn2          v16.8h, v19.4s
1634
+    st1             {v16.8h}, x2, #16
1635
+    cbnz            x3, .dequant_loop1
1636
+    ret
1637
+
1638
+.dequant_skip:
1639
+    sub             x6, x4, x5          // per - shift
1640
+    dup             v0.8h, w6
1641
+
1642
+.dequant_loop2:
1643
+    ld1             {v19.8h}, x0, #16 // quantCoef
1644
+    ld1             {v2.4s}, x1, #16  // deQuantCoef
1645
+    ld1             {v20.4s}, x1, #16
1646
+    sub             x3, x3, #1
1647
+    sxtl            v1.4s, v19.4h
1648
+    sxtl2           v19.4s, v19.8h
1649
+
1650
+    mul             v1.4s, v1.4s, v2.4s // quantCoef * deQuantCoef
1651
+    mul             v19.4s, v19.4s, v20.4s
1652
+    sqxtn           v16.4h, v1.4s       // x265_clip3
1653
+    sqxtn2          v16.8h, v19.4s
1654
+
1655
+    sqshl           v16.8h, v16.8h, v0.8h // coefQ << (per - shift)
1656
+    st1             {v16.8h}, x2, #16
1657
+    cbnz            x3, .dequant_loop2
1658
+    ret
1659
+endfunc
1660
+
1661
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
1662
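+// Rough scalar model (a sketch): add = 1 << (shift - 1);
+//     coef[n] = (int16_t)x265_clip3(-32768, 32767,
+//                                   (quantCoef[n] * scale + add) >> shift);
+// Below, srshl by -shift performs the rounding right shift and sqxtn saturates
+// to int16_t.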
+function PFX(dequant_normal_neon)
1663
+    lsr             w2, w2, #4              // num / 16
1664
+    neg             w4, w4
1665
+    dup             v0.8h, w3
1666
+    dup             v1.4s, w4
1667
+
1668
+.dqn_loop1:
1669
+    ld1             {v2.8h, v3.8h}, x0, #32
1670
+    smull           v16.4s, v2.4h, v0.4h
1671
+    smull2          v17.4s, v2.8h, v0.8h
1672
+    smull           v18.4s, v3.4h, v0.4h
1673
+    smull2          v19.4s, v3.8h, v0.8h
1674
+
1675
+    srshl           v16.4s, v16.4s, v1.4s
1676
+    srshl           v17.4s, v17.4s, v1.4s
1677
+    srshl           v18.4s, v18.4s, v1.4s
1678
+    srshl           v19.4s, v19.4s, v1.4s
1679
+
1680
+    sqxtn           v2.4h, v16.4s
1681
+    sqxtn2          v2.8h, v17.4s
1682
+    sqxtn           v3.4h, v18.4s
1683
+    sqxtn2          v3.8h, v19.4s
1684
+
1685
+    sub             w2, w2, #1
1686
+    st1             {v2.8h, v3.8h}, x1, #32
1687
+    cbnz            w2, .dqn_loop1
1688
+    ret
1689
+endfunc
1690
+
1691
+/********* ssim ***********/
1692
+// void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
1693
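+// For each of the two side-by-side 4x4 blocks z, a sketch of the scalar sums
+// (a = pix1 pixels, b = pix2 pixels):
+//     sums[z][0] = sum(a), sums[z][1] = sum(b),
+//     sums[z][2] = sum(a*a + b*b), sums[z][3] = sum(a*b)
+// Below, v28/v29 accumulate the pixel sums, v30 the squares, v31 the products.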
+function PFX(ssim_4x4x2_core_neon)
1694
+    ld1             {v0.8b}, x0, x1
1695
+    ld1             {v1.8b}, x0, x1
1696
+    ld1             {v2.8b}, x0, x1
1697
+    ld1             {v3.8b}, x0, x1
1698
+
1699
+    ld1             {v4.8b}, x2, x3
1700
+    ld1             {v5.8b}, x2, x3
1701
+    ld1             {v6.8b}, x2, x3
1702
+    ld1             {v7.8b}, x2, x3
1703
+
1704
+    umull           v16.8h, v0.8b, v0.8b
1705
+    umull           v17.8h, v1.8b, v1.8b
1706
+    umull           v18.8h, v2.8b, v2.8b
1707
+    uaddlp          v30.4s, v16.8h
1708
+    umull           v19.8h, v3.8b, v3.8b
1709
+    umull           v20.8h, v4.8b, v4.8b
1710
+    umull           v21.8h, v5.8b, v5.8b
1711
+    uadalp          v30.4s, v17.8h
1712
+    umull           v22.8h, v6.8b, v6.8b
1713
+    umull           v23.8h, v7.8b, v7.8b
1714
+
1715
+    umull           v24.8h, v0.8b, v4.8b
1716
+    uadalp          v30.4s, v18.8h
1717
+    umull           v25.8h, v1.8b, v5.8b
1718
+    umull           v26.8h, v2.8b, v6.8b
1719
+    umull           v27.8h, v3.8b, v7.8b
1720
+    uadalp          v30.4s, v19.8h
1721
+
1722
+    uaddl           v28.8h, v0.8b, v1.8b
1723
+    uaddl           v29.8h, v4.8b, v5.8b
1724
+    uadalp          v30.4s, v20.8h
1725
+    uaddlp          v31.4s, v24.8h
1726
+
1727
+    uaddw           v28.8h, v28.8h, v2.8b
1728
+    uaddw           v29.8h, v29.8h, v6.8b
1729
+    uadalp          v30.4s, v21.8h
1730
+    uadalp          v31.4s, v25.8h
1731
+
1732
+    uaddw           v28.8h, v28.8h, v3.8b
1733
+    uaddw           v29.8h, v29.8h, v7.8b
1734
+    uadalp          v30.4s, v22.8h
1735
+    uadalp          v31.4s, v26.8h
1736
+
1737
+    uaddlp          v28.4s, v28.8h
1738
+    uaddlp          v29.4s, v29.8h
1739
+    uadalp          v30.4s, v23.8h
1740
+    uadalp          v31.4s, v27.8h
1741
+
1742
+    addp            v28.4s, v28.4s, v28.4s
1743
+    addp            v29.4s, v29.4s, v29.4s
1744
+    addp            v30.4s, v30.4s, v30.4s
1745
+    addp            v31.4s, v31.4s, v31.4s
1746
+
1747
+    st4             {v28.2s, v29.2s, v30.2s, v31.2s}, x4
1748
     ret
1749
 endfunc
1750
 
1751
 // int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
1752
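+// Scalar idea, roughly (a sketch): energy = satd(block vs. zero) - (sad >> 2),
+// i.e. the DC-free energy, and psyCost = abs(energy(source) - energy(recon)).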
-function x265_psyCost_4x4_neon
1753
+function PFX(psyCost_4x4_neon)
1754
     ld1r            {v4.2s}, x0, x1
1755
     ld1r            {v5.2s}, x0, x1
1756
     ld1             {v4.s}1, x0, x1
1757
@@ -286,7 +1792,7 @@
1758
 endfunc
1759
 
1760
 // uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
1761
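+// Rough scalar model of one coefficient (a sketch, not the upstream source):
+//     int tmp = abs(coef[i]) * quantCoeff[i];
+//     int level = (tmp + add) >> qBits;
+//     deltaU[i] = (tmp - (level << qBits)) >> (qBits - 8);
+//     numSig += (level != 0);
+//     qCoef[i] = (int16_t)x265_clip3(-32768, 32767, level * sign(coef[i]));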
-function x265_quant_neon
1762
+function PFX(quant_neon)
1763
     mov             w9, #1
1764
     lsl             w9, w9, w4
1765
     dup             v0.2s, w9
1766
@@ -341,79 +1847,597 @@
1767
     ret
1768
 endfunc
1769
 
1770
-.macro satd_4x4_neon
1771
-    ld1             {v1.s}0, x2, x3
1772
-    ld1             {v0.s}0, x0, x1
1773
-    ld1             {v3.s}0, x2, x3
1774
-    ld1             {v2.s}0, x0, x1
1775
+// uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
1776
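+// Rough scalar model of one coefficient (a sketch; the stored level is kept
+// unsigned here, signs are handled separately by the caller):
+//     uint32_t tmp = abs(coef[i]) * quantCoeff[i];
+//     int level = (int)((tmp + add) >> qBits);
+//     qCoef[i] = (int16_t)abs(x265_clip3(-32768, 32767, level * sign));
+//     numSig += (level != 0);
+// and the function returns numSig.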
+function PFX(nquant_neon)
1777
+    neg             x12, x3
1778
+    dup             v0.4s, w12             // v0 = -qbits
1779
+    dup             v1.4s, w4              // add
1780
 
1781
-    ld1             {v1.s}1, x2, x3
1782
-    ld1             {v0.s}1, x0, x1
1783
-    ld1             {v3.s}1, x2, x3
1784
-    ld1             {v2.s}1, x0, x1
1785
+    lsr             w5, w5, #2
1786
+    movi            v4.4s, #0              // v4= accumulate numsig
1787
+    mov             x4, #0
1788
+    movi            v22.4s, #0
1789
 
1790
-    usubl           v4.8h, v0.8b, v1.8b
1791
-    usubl           v5.8h, v2.8b, v3.8b
1792
+.loop_nquant:
1793
+    ld1             {v16.4h}, x0, #8
1794
+    sub             w5, w5, #1
1795
+    sxtl            v19.4s, v16.4h         // v19 = coef[blkPos]
1796
 
1797
-    add             v6.8h, v4.8h, v5.8h
1798
-    sub             v7.8h, v4.8h, v5.8h
1799
+    cmlt            v18.4s, v19.4s, #0     // v18 = sign
1800
 
1801
-    mov             v4.d0, v6.d1
1802
-    add             v0.8h, v6.8h, v4.8h
1803
-    sub             v2.8h, v6.8h, v4.8h
1804
+    abs             v19.4s, v19.4s         // v19 = level = abs(coef[blkPos])
1805
+    ld1             {v20.4s}, x1, #16    // v20 = quantCoeff[blkPos]
1806
+    mul             v19.4s, v19.4s, v20.4s // v19 = tmplevel = abs(level) * quantCoeff[blkPos];
1807
 
1808
-    mov             v5.d0, v7.d1
1809
-    add             v1.8h, v7.8h, v5.8h
1810
-    sub             v3.8h, v7.8h, v5.8h
1811
+    add             v20.4s, v19.4s, v1.4s  // v20 = tmplevel+add
1812
+    sshl            v20.4s, v20.4s, v0.4s  // v20 = level =(tmplevel+add) >> qbits
1813
 
1814
-    trn1            v4.4h, v0.4h, v1.4h
1815
-    trn2            v5.4h, v0.4h, v1.4h
1816
+    // numsig
1817
+    cmeq            v21.4s, v20.4s, v22.4s
1818
+    add             v4.4s, v4.4s, v21.4s
1819
+    add             x4, x4, #4
1820
 
1821
-    trn1            v6.4h, v2.4h, v3.4h
1822
-    trn2            v7.4h, v2.4h, v3.4h
1823
+    eor             v21.16b, v20.16b, v18.16b
1824
+    sub             v21.4s, v21.4s, v18.4s
1825
+    sqxtn           v16.4h, v21.4s
1826
+    abs             v17.4h, v16.4h
1827
+    st1             {v17.4h}, x2, #8
1828
 
1829
-    add             v0.4h, v4.4h, v5.4h
1830
-    sub             v1.4h, v4.4h, v5.4h
1831
+    cbnz            w5, .loop_nquant
1832
 
1833
-    add             v2.4h, v6.4h, v7.4h
1834
-    sub             v3.4h, v6.4h, v7.4h
1835
+    uaddlv          d4, v4.4s
1836
+    fmov            x12, d4
1837
+    add             x0, x4, x12
1838
+    ret
1839
+endfunc
1840
 
1841
-    trn1            v4.2s, v0.2s, v1.2s
1842
-    trn2            v5.2s, v0.2s, v1.2s
1843
+// void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
1844
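+// Rough scalar model (a sketch): over the whole block,
+//     ac += fenc[x] * fenc[x];                           // accumulated in v0
+//     ss += (fenc[x] - recon[x]) * (fenc[x] - recon[x]); // accumulated in v1
+// ssimDist_start/ssimDist_end (defined elsewhere) appear to zero the
+// accumulators and store them to *ac_k and *ssBlock.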
+.macro ssimDist_1  v4 v5
1845
+    sub             v20.8h, \v4\().8h, \v5\().8h
1846
+    smull           v16.4s, \v4\().4h, \v4\().4h
1847
+    smull2          v17.4s, \v4\().8h, \v4\().8h
1848
+    smull           v18.4s, v20.4h, v20.4h
1849
+    smull2          v19.4s, v20.8h, v20.8h
1850
+    add             v0.4s, v0.4s, v16.4s
1851
+    add             v0.4s, v0.4s, v17.4s
1852
+    add             v1.4s, v1.4s, v18.4s
1853
+    add             v1.4s, v1.4s, v19.4s
1854
+.endm
1855
 
1856
-    trn1            v6.2s, v2.2s, v3.2s
1857
-    trn2            v7.2s, v2.2s, v3.2s
1858
+function PFX(ssimDist4_neon)
1859
+    ssimDist_start
1860
+.rept 4
1861
+    ld1             {v4.s}0, x0, x1
1862
+    ld1             {v5.s}0, x2, x3
1863
+    uxtl            v4.8h, v4.8b
1864
+    uxtl            v5.8h, v5.8b
1865
+    sub             v2.4h, v4.4h, v5.4h
1866
+    smull           v3.4s, v4.4h, v4.4h
1867
+    smull           v2.4s, v2.4h, v2.4h
1868
+    add             v0.4s, v0.4s, v3.4s
1869
+    add             v1.4s, v1.4s, v2.4s
1870
+.endr
1871
+    ssimDist_end
1872
+    ret
1873
+endfunc
1874
 
1875
-    abs             v4.4h, v4.4h
1876
-    abs             v5.4h, v5.4h
1877
-    abs             v6.4h, v6.4h
1878
-    abs             v7.4h, v7.4h
1879
+function PFX(ssimDist8_neon)
1880
+    ssimDist_start
1881
+.rept 8
1882
+    ld1             {v4.8b}, x0, x1
1883
+    ld1             {v5.8b}, x2, x3
1884
+    uxtl            v4.8h, v4.8b
1885
+    uxtl            v5.8h, v5.8b
1886
+    ssimDist_1      v4, v5
1887
+.endr
1888
+    ssimDist_end
1889
+    ret
1890
+endfunc
1891
 
1892
-    smax            v1.4h, v4.4h, v5.4h
1893
-    smax            v2.4h, v6.4h, v7.4h
1894
+function PFX(ssimDist16_neon)
1895
+    mov w12, #16
1896
+    ssimDist_start
1897
+.loop_ssimDist16:
1898
+    sub             w12, w12, #1
1899
+    ld1             {v4.16b}, x0, x1
1900
+    ld1             {v5.16b}, x2, x3
1901
+    uxtl            v6.8h, v4.8b
1902
+    uxtl            v7.8h, v5.8b
1903
+    uxtl2           v4.8h, v4.16b
1904
+    uxtl2           v5.8h, v5.16b
1905
+    ssimDist_1      v6, v7
1906
+    ssimDist_1      v4, v5
1907
+    cbnz            w12, .loop_ssimDist16
1908
+    ssimDist_end
1909
+    ret
1910
+endfunc
1911
 
1912
-    add             v0.4h, v1.4h, v2.4h
1913
-    uaddlp          v0.2s, v0.4h
1914
-    uaddlp          v0.1d, v0.2s
1915
+function PFX(ssimDist32_neon)
1916
+    mov w12, #32
1917
+    ssimDist_start
1918
+.loop_ssimDist32:
1919
+    sub             w12, w12, #1
1920
+    ld1             {v4.16b-v5.16b}, x0, x1
1921
+    ld1             {v6.16b-v7.16b}, x2, x3
1922
+    uxtl            v21.8h, v4.8b
1923
+    uxtl            v22.8h, v6.8b
1924
+    uxtl            v23.8h, v5.8b
1925
+    uxtl            v24.8h, v7.8b
1926
+    uxtl2           v25.8h, v4.16b
1927
+    uxtl2           v26.8h, v6.16b
1928
+    uxtl2           v27.8h, v5.16b
1929
+    uxtl2           v28.8h, v7.16b
1930
+    ssimDist_1      v21, v22
1931
+    ssimDist_1      v23, v24
1932
+    ssimDist_1      v25, v26
1933
+    ssimDist_1      v27, v28
1934
+    cbnz            w12, .loop_ssimDist32
1935
+    ssimDist_end
1936
+    ret
1937
+endfunc
1938
+
1939
+function PFX(ssimDist64_neon)
1940
+    mov w12, #64
1941
+    ssimDist_start
1942
+.loop_ssimDist64:
1943
+    sub             w12, w12, #1
1944
+    ld1             {v4.16b-v7.16b}, x0, x1
1945
+    ld1             {v16.16b-v19.16b}, x2, x3
1946
+    uxtl            v21.8h, v4.8b
1947
+    uxtl            v22.8h, v16.8b
1948
+    uxtl            v23.8h, v5.8b
1949
+    uxtl            v24.8h, v17.8b
1950
+    uxtl2           v25.8h, v4.16b
1951
+    uxtl2           v26.8h, v16.16b
1952
+    uxtl2           v27.8h, v5.16b
1953
+    uxtl2           v28.8h, v17.16b
1954
+    ssimDist_1      v21, v22
1955
+    ssimDist_1      v23, v24
1956
+    ssimDist_1      v25, v26
1957
+    ssimDist_1      v27, v28
1958
+    uxtl            v21.8h, v6.8b
1959
+    uxtl            v22.8h, v18.8b
1960
+    uxtl            v23.8h, v7.8b
1961
+    uxtl            v24.8h, v19.8b
1962
+    uxtl2           v25.8h, v6.16b
1963
+    uxtl2           v26.8h, v18.16b
1964
+    uxtl2           v27.8h, v7.16b
1965
+    uxtl2           v28.8h, v19.16b
1966
+    ssimDist_1      v21, v22
1967
+    ssimDist_1      v23, v24
1968
+    ssimDist_1      v25, v26
1969
+    ssimDist_1      v27, v28
1970
+    cbnz            w12, .loop_ssimDist64
1971
+    ssimDist_end
1972
+    ret
1973
+endfunc
1974
+
1975
+// void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
1976
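+// Rough scalar model (a sketch): *z_k += src[x] * src[x] over the whole
+// blockSize x blockSize block; v0 is the accumulator, zeroed and stored by the
+// normFact_start/normFact_end macros (defined elsewhere).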
+
1977
+.macro normFact_1  v4
1978
+    smull           v16.4s, \v4\().4h, \v4\().4h
1979
+    smull2          v17.4s, \v4\().8h, \v4\().8h
1980
+    add             v0.4s, v0.4s, v16.4s
1981
+    add             v0.4s, v0.4s, v17.4s
1982
 .endm
1983
 
1984
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1985
-function x265_pixel_satd_4x4_neon
1986
-    satd_4x4_neon
1987
-    umov            x0, v0.d0
1988
+function PFX(normFact8_neon)
1989
+    normFact_start
1990
+.rept 8
1991
+    ld1             {v4.8b}, x0, x1
1992
+    uxtl            v4.8h, v4.8b
1993
+    normFact_1      v4
1994
+.endr
1995
+    normFact_end
1996
     ret
1997
 endfunc
1998
 
1999
-// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2000
-function x265_pixel_satd_8x4_neon
2001
-    mov             x4, x0
2002
-    mov             x5, x2
2003
-    satd_4x4_neon
2004
-    add             x0, x4, #4
2005
-    add             x2, x5, #4
2006
-    umov            x6, v0.d0
2007
-    satd_4x4_neon
2008
-    umov            x0, v0.d0
2009
-    add             x0, x0, x6
2010
+function PFX(normFact16_neon)
2011
+    mov w12, #16
2012
+    normFact_start
2013
+.loop_normFact16:
2014
+    sub             w12, w12, #1
2015
+    ld1             {v4.16b}, x0, x1
2016
+    uxtl            v5.8h, v4.8b
2017
+    uxtl2           v4.8h, v4.16b
2018
+    normFact_1      v5
2019
+    normFact_1      v4
2020
+    cbnz            w12, .loop_normFact16
2021
+    normFact_end
2022
+    ret
2023
+endfunc
2024
+
2025
+function PFX(normFact32_neon)
2026
+    mov w12, #32
2027
+    normFact_start
2028
+.loop_normFact32:
2029
+    sub             w12, w12, #1
2030
+    ld1             {v4.16b-v5.16b}, x0, x1
2031
+    uxtl            v6.8h, v4.8b
2032
+    uxtl2           v4.8h, v4.16b
2033
+    uxtl            v7.8h, v5.8b
2034
+    uxtl2           v5.8h, v5.16b
2035
+    normFact_1      v4
2036
+    normFact_1      v5
2037
+    normFact_1      v6
2038
+    normFact_1      v7
2039
+    cbnz            w12, .loop_normFact32
2040
+    normFact_end
2041
+    ret
2042
+endfunc
2043
+
2044
+function PFX(normFact64_neon)
2045
+    mov w12, #64
2046
+    normFact_start
2047
+.loop_normFact64:
2048
+    sub             w12, w12, #1
2049
+    ld1             {v4.16b-v7.16b}, x0, x1
2050
+    uxtl            v26.8h, v4.8b
2051
+    uxtl2           v24.8h, v4.16b
2052
+    uxtl            v27.8h, v5.8b
2053
+    uxtl2           v25.8h, v5.16b
2054
+    normFact_1      v24
2055
+    normFact_1      v25
2056
+    normFact_1      v26
2057
+    normFact_1      v27
2058
+    uxtl            v26.8h, v6.8b
2059
+    uxtl2           v24.8h, v6.16b
2060
+    uxtl            v27.8h, v7.8b
2061
+    uxtl2           v25.8h, v7.16b
2062
+    normFact_1      v24
2063
+    normFact_1      v25
2064
+    normFact_1      v26
2065
+    normFact_1      v27
2066
+    cbnz            w12, .loop_normFact64
2067
+    normFact_end
2068
+    ret
2069
+endfunc
2070
+
2071
+// void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
2072
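+// Rough scalar model of one pixel (a sketch; correction = 6 for the 8-bit
+// build, folded into w0 below):
+//     dst[x] = x265_clip(((w0 * (src[x] << correction) + round) >> shift) + offset);
+// When (w0 << correction) has at least "shift" trailing zeros the shift is
+// hoisted out of the loop, and a range check picks 16- or 32-bit arithmetic.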
+function PFX(weight_pp_neon)
2073
+    sub             x2, x2, x3
2074
+    ldr             w9, sp              // offset
2075
+    lsl             w5, w5, #6            // w0 << correction
2076
+
2077
+    // count trailing zeros in w5 and compare against shift right amount.
2078
+    rbit            w10, w5
2079
+    clz             w10, w10
2080
+    cmp             w10, w7
2081
+    b.lt            .unfoldedShift
2082
+
2083
+    // shift right only removes trailing zeros: hoist LSR out of the loop.
2084
+    lsr             w10, w5, w7           // w0 << correction >> shift
2085
+    dup             v25.16b, w10
2086
+    lsr             w6, w6, w7            // round >> shift
2087
+    add             w6, w6, w9            // round >> shift + offset
2088
+    dup             v26.8h, w6
2089
+
2090
+    // Check arithmetic range.
2091
+    mov             w11, #255
2092
+    madd            w11, w11, w10, w6
2093
+    add             w11, w11, w9
2094
+    lsr             w11, w11, #16
2095
+    cbnz            w11, .widenTo32Bit
2096
+
2097
+    // 16-bit arithmetic is enough.
2098
+.loopHpp:
2099
+    mov             x12, x3
2100
+.loopWpp:
2101
+    ldr             q0, x0, #16
2102
+    sub             x12, x12, #16
2103
+    umull           v1.8h, v0.8b, v25.8b  // val *= w0 << correction >> shift
2104
+    umull2          v2.8h, v0.16b, v25.16b
2105
+    add             v1.8h, v1.8h, v26.8h  // val += round >> shift + offset
2106
+    add             v2.8h, v2.8h, v26.8h
2107
+    sqxtun          v0.8b, v1.8h          // val = x265_clip(val)
2108
+    sqxtun2         v0.16b, v2.8h
2109
+    str             q0, x1, #16
2110
+    cbnz            x12, .loopWpp
2111
+    add             x1, x1, x2
2112
+    add             x0, x0, x2
2113
+    sub             x4, x4, #1
2114
+    cbnz            x4, .loopHpp
2115
+    ret
2116
+
2117
+    // 32-bit arithmetic is needed.
2118
+.widenTo32Bit:
2119
+.loopHpp32:
2120
+    mov             x12, x3
2121
+.loopWpp32:
2122
+    ldr             d0, x0, #8
2123
+    sub             x12, x12, #8
2124
+    uxtl            v0.8h, v0.8b
2125
+    umull           v1.4s, v0.4h, v25.4h  // val *= w0 << correction >> shift
2126
+    umull2          v2.4s, v0.8h, v25.8h
2127
+    add             v1.4s, v1.4s, v26.4s  // val += round >> shift + offset
2128
+    add             v2.4s, v2.4s, v26.4s
2129
+    sqxtn           v0.4h, v1.4s          // val = x265_clip(val)
2130
+    sqxtn2          v0.8h, v2.4s
2131
+    sqxtun          v0.8b, v0.8h
2132
+    str             d0, x1, #8
2133
+    cbnz            x12, .loopWpp32
2134
+    add             x1, x1, x2
2135
+    add             x0, x0, x2
2136
+    sub             x4, x4, #1
2137
+    cbnz            x4, .loopHpp32
2138
+    ret
2139
+
2140
+    // The shift right cannot be moved out of the loop.
2141
+.unfoldedShift:
2142
+    dup             v25.8h, w5            // w0 << correction
2143
+    dup             v26.4s, w6            // round
2144
+    neg             w7, w7                // -shift
2145
+    dup             v27.4s, w7
2146
+    dup             v29.4s, w9            // offset
2147
+.loopHppUS:
2148
+    mov             x12, x3
2149
+.loopWppUS:
2150
+    ldr             d0, x0, #8
2151
+    sub             x12, x12, #8
2152
+    uxtl            v0.8h, v0.8b
2153
+    umull           v1.4s, v0.4h, v25.4h  // val *= w0
2154
+    umull2          v2.4s, v0.8h, v25.8h
2155
+    add             v1.4s, v1.4s, v26.4s  // val += round
2156
+    add             v2.4s, v2.4s, v26.4s
2157
+    sshl            v1.4s, v1.4s, v27.4s  // val >>= shift
2158
+    sshl            v2.4s, v2.4s, v27.4s
2159
+    add             v1.4s, v1.4s, v29.4s  // val += offset
2160
+    add             v2.4s, v2.4s, v29.4s
2161
+    sqxtn           v0.4h, v1.4s          // val = x265_clip(val)
2162
+    sqxtn2          v0.8h, v2.4s
2163
+    sqxtun          v0.8b, v0.8h
2164
+    str             d0, x1, #8
2165
+    cbnz            x12, .loopWppUS
2166
+    add             x1, x1, x2
2167
+    add             x0, x0, x2
2168
+    sub             x4, x4, #1
2169
+    cbnz            x4, .loopHppUS
2170
+    ret
2171
+endfunc
2172
+
2173
+// int scanPosLast(
2174
+//     const uint16_t *scan,      // x0
2175
+//     const coeff_t *coeff,      // x1
2176
+//     uint16_t *coeffSign,       // x2
2177
+//     uint16_t *coeffFlag,       // x3
2178
+//     uint8_t *coeffNum,         // x4
2179
+//     int numSig,                // x5
2180
+//     const uint16_t* scanCG4x4, // x6
2181
+//     const int trSize)          // x7
2182
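+// Per 16-coefficient group (CG), a sketch of the outputs (hedged reading):
+//     coeffFlag[cg] = bitmask of significant positions (bit-reversed),
+//     coeffSign[cg] = packed sign bits of the nonzero coefficients,
+//     coeffNum[cg]  = popcount(coeffFlag[cg]);
+// the return value encodes the scan position of the last nonzero coefficient
+// as (CG index << 4) + offset.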
+function PFX(scanPosLast_neon)
2183
+    // convert the stride (trSize) from coefficient units to bytes (int16_t)
2184
+    add             x7, x7, x7
2185
+
2186
+    // load the scan table and narrow it to bytes
2187
+    ldp             q0, q1, x6
2188
+    xtn             v0.8b, v0.8h
2189
+    xtn2            v0.16b, v1.8h   // v0 - Zigzag scan table
2190
+
2191
+    movrel          x10, g_SPL_and_mask
2192
+    ldr             q28, x10      // v28 = mask for pmovmskb
2193
+    movi            v31.16b, #0     // v31 = {0, ..., 0}
2194
+    add             x10, x7, x7     // 2*x7
2195
+    add             x11, x10, x7    // 3*x7
2196
+    add             x9, x4, #1      // CG count
2197
+
2198
+.loop_spl:
2199
+    // position of current CG
2200
+    ldrh            w6, x0, #32
2201
+    add             x6, x1, x6, lsl #1
2202
+
2203
+    // loading current CG
2204
+    ldr             d2, x6
2205
+    ldr             d3, x6, x7
2206
+    ldr             d4, x6, x10
2207
+    ldr             d5, x6, x11
2208
+    mov             v2.d1, v3.d0
2209
+    mov             v4.d1, v5.d0
2210
+    sqxtn           v2.8b, v2.8h
2211
+    sqxtn2          v2.16b, v4.8h
2212
+
2213
+    // Zigzag
2214
+    tbl             v3.16b, {v2.16b}, v0.16b
2215
+
2216
+    // get sign
2217
+    cmhi            v5.16b, v3.16b, v31.16b   // v5 = non-zero
2218
+    cmlt            v3.16b, v3.16b, #0        // v3 = negative
2219
+
2220
+    // val - w13 = pmovmskb(v3)
2221
+    and             v3.16b, v3.16b, v28.16b
2222
+    mov             d4, v3.d1
2223
+    addv            b23, v3.8b
2224
+    addv            b24, v4.8b
2225
+    mov             v23.b1, v24.b0
2226
+    fmov            w13, s23
2227
+
2228
+    // mask - w15 = pmovmskb(v5)
2229
+    and             v5.16b, v5.16b, v28.16b
2230
+    mov             d6, v5.d1
2231
+    addv            b25, v5.8b
2232
+    addv            b26, v6.8b
2233
+    mov             v25.b1, v26.b0
2234
+    fmov            w15, s25
2235
+
2236
+    // coeffFlag = reverse_bit(w15) in 16-bit
2237
+    rbit            w12, w15
2238
+    lsr             w12, w12, #16
2239
+    fmov            s30, w12
2240
+    strh            w12, x3, #2
2241
+
2242
+    // accelerate by preparing w13 = w13 & w15
2243
+    and             w13, w13, w15
2244
+    mov             x14, xzr
2245
+.loop_spl_1:
2246
+    cbz             w15, .pext_end
2247
+    clz             w6, w15
2248
+    lsl             w13, w13, w6
2249
+    lsl             w15, w15, w6
2250
+    extr            w14, w14, w13, #31
2251
+    bfm             w15, wzr, #1, #0
2252
+    b               .loop_spl_1
2253
+.pext_end:
2254
+    strh            w14, x2, #2
2255
+
2256
+    // compute coeffNum = popcount(coeffFlag)
2257
+    cnt             v30.8b, v30.8b
2258
+    addp            v30.8b, v30.8b, v30.8b
2259
+    fmov            w6, s30
2260
+    sub             x5, x5, x6
2261
+    strb            w6, x4, #1
2262
+
2263
+    cbnz            x5, .loop_spl
2264
+
2265
+    // count trailing zeros
2266
+    rbit            w13, w12
2267
+    clz             w13, w13
2268
+    lsr             w12, w12, w13
2269
+    strh            w12, x3, #-2
2270
+
2271
+    // get last pos
2272
+    sub             x9, x4, x9
2273
+    lsl             x0, x9, #4
2274
+    eor             w13, w13, #15
2275
+    add             x0, x0, x13
2276
+    ret
2277
+endfunc
2278
+
2279
+// uint32_t costCoeffNxN(
2280
+//    uint16_t *scan,        // x0
2281
+//    coeff_t *coeff,        // x1
2282
+//    intptr_t trSize,       // x2
2283
+//    uint16_t *absCoeff,    // x3
2284
+//    uint8_t *tabSigCtx,    // x4
2285
+//    uint16_t scanFlagMask, // x5
2286
+//    uint8_t *baseCtx,      // x6
2287
+//    int offset,            // x7
2288
+//    int scanPosSigOff,     // sp
2289
+//    int subPosBase)        // sp + 8
2290
+function PFX(costCoeffNxN_neon)
2291
+    // abs(coeff)
2292
+    add             x2, x2, x2
2293
+    ld1             {v1.d}0, x1, x2
2294
+    ld1             {v1.d}1, x1, x2
2295
+    ld1             {v2.d}0, x1, x2
2296
+    ld1             {v2.d}1, x1, x2
2297
+    abs             v1.8h, v1.8h
2298
+    abs             v2.8h, v2.8h
2299
+
2300
+    // WARNING: out-of-bounds read here!
2301
+    // loading scan table
2302
+    ldr             w2, sp
2303
+    eor             w15, w2, #15
2304
+    add             x1, x0, x15, lsl #1
2305
+    ldp             q20, q21, x1
2306
+    uzp1            v20.16b, v20.16b, v21.16b
2307
+    movi            v21.16b, #15
2308
+    eor             v0.16b, v20.16b, v21.16b
2309
+
2310
+    // reorder coeff
2311
+    uzp1           v22.16b, v1.16b, v2.16b
2312
+    uzp2           v23.16b, v1.16b, v2.16b
2313
+    tbl            v24.16b, {v22.16b}, v0.16b
2314
+    tbl            v25.16b, {v23.16b}, v0.16b
2315
+    zip1           v2.16b, v24.16b, v25.16b
2316
+    zip2           v3.16b, v24.16b, v25.16b
2317
+
2318
+    // loading tabSigCtx (+offset)
2319
+    ldr             q1, x4
2320
+    tbl             v1.16b, {v1.16b}, v0.16b
2321
+    dup             v4.16b, w7
2322
+    movi            v5.16b, #0
2323
+    tbl             v4.16b, {v4.16b}, v5.16b
2324
+    add             v1.16b, v1.16b, v4.16b
2325
+
2326
+    // register mapping
2327
+    // x0 - sum
2328
+    // x1 - entropyStateBits
2329
+    // v1 - sigCtx
2330
+    // {v3,v2} - abs(coeff)
2331
+    // x2 - scanPosSigOff
2332
+    // x3 - absCoeff
2333
+    // x4 - numNonZero
2334
+    // x5 - scanFlagMask
2335
+    // x6 - baseCtx
2336
+    mov             x0, #0
2337
+    movrel          x1, PFX_C(entropyStateBits)
2338
+    mov             x4, #0
2339
+    mov             x11, #0
2340
+    movi            v31.16b, #0
2341
+    cbz             x2, .idx_zero
2342
+.loop_ccnn:
2343
+//   {
2344
+//        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
2345
+//        ctxSig = cnt & posZeroMask;
2346
+//        const uint32_t mstate = baseCtx[ctxSig];
2347
+//        const uint32_t mps = mstate & 1;
2348
+//        const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
2349
+//        uint32_t nextState = (stateBits >> 24) + mps;
2350
+//        if ((mstate ^ sig) == 1)
2351
+//            nextState = sig;
2352
+//        baseCtx[ctxSig] = (uint8_t)nextState;
2353
+//        sum += stateBits;
2354
+//    }
2355
+//    absCoeff[numNonZero] = tmpCoeff[blkPos];
2356
+//    numNonZero += sig;
2357
+//    scanPosSigOff--;
2358
+
2359
+    add             x13, x3, x4, lsl #1
2360
+    sub             x2, x2, #1
2361
+    str             h2, x13             // absCoeff[numNonZero] = tmpCoeff[blkPos]
2362
+    fmov            w14, s1               // x14 = ctxSig
2363
+    uxtb            w14, w14
2364
+    ubfx            w11, w5, #0, #1       // x11 = sig
2365
+    lsr             x5, x5, #1
2366
+    add             x4, x4, x11           // numNonZero += sig
2367
+    ext             v1.16b, v1.16b, v31.16b, #1
2368
+    ext             v2.16b, v2.16b, v3.16b, #2
2369
+    ext             v3.16b, v3.16b, v31.16b, #2
2370
+    ldrb            w9, x6, x14         // mstate = baseCtx[ctxSig]
2371
+    and             w10, w9, #1           // mps = mstate & 1
2372
+    eor             w9, w9, w11           // x9 = mstate ^ sig
2373
+    add             x12, x1, x9, lsl #2
2374
+    ldr             w13, x12
2375
+    add             w0, w0, w13           // sum += x265_entropyStateBits[mstate ^ sig]
2376
+    ldrb            w13, x12, #3
2377
+    add             w10, w10, w13         // nextState = (stateBits >> 24) + mps
2378
+    cmp             w9, #1
2379
+    csel            w10, w11, w10, eq
2380
+    strb            w10, x6, x14
2381
+    cbnz            x2, .loop_ccnn
2382
+.idx_zero:
2383
+
2384
+    add             x13, x3, x4, lsl #1
2385
+    add             x4, x4, x15
2386
+    str             h2, x13              // absCoeff[numNonZero] = tmpCoeff[blkPos]
2387
+
2388
+    ldr             x9, sp, #8           // subPosBase
2389
+    uxth            w9, w9
2390
+    cmp             w9, #0
2391
+    cset            x2, eq
2392
+    add             x4, x4, x2
2393
+    cbz             x4, .exit_ccnn
2394
+
2395
+    sub             w2, w2, #1
2396
+    uxtb            w2, w2
2397
+    fmov            w3, s1
2398
+    and             w2, w2, w3
2399
+
2400
+    ldrb            w3, x6, x2         // mstate = baseCtx[ctxSig]
2401
+    eor             w4, w5, w3            // x5 = mstate ^ sig
2402
+    and             w3, w3, #1            // mps = mstate & 1
2403
+    add             x1, x1, x4, lsl #2
2404
+    ldr             w11, x1
2405
+    ldrb            w12, x1, #3
2406
+    add             w0, w0, w11           // sum += x265_entropyStateBits[mstate ^ sig]
2407
+    add             w3, w3, w12           // nextState = (stateBits >> 24) + mps
2408
+    cmp             w4, #1
2409
+    csel            w3, w5, w3, eq
2410
+    strb            w3, x6, x2
2411
+.exit_ccnn:
2412
+    ubfx            w0, w0, #0, #24
2413
     ret
2414
 endfunc
2415
+
2416
+const g_SPL_and_mask, align=8
2417
+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
2418
+endconst
2419
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S Added
516
 
1
@@ -0,0 +1,514 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains macros written using the NEON instruction set
26
+// that are also used by the SVE2 functions.
27
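+//
+// Rough scalar model of what the SAD_* macros compute (a sketch):
+//     int sad = 0;
+//     for (int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2)
+//         for (int x = 0; x < w; x++)
+//             sad += abs(pix1[x] - pix2[x]);
+// uabal/uabal2 do the absolute-difference-and-accumulate eight lanes at a
+// time, and the SAD_END_* macros reduce the vector accumulators to a scalar.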
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+#ifdef __APPLE__
33
+.section __RODATA,__rodata
34
+#else
35
+.section .rodata
36
+#endif
37
+
38
+.align 4
39
+
40
+.macro SAD_START_4 f
41
+    ld1             {v0.s}0, x0, x1
42
+    ld1             {v0.s}1, x0, x1
43
+    ld1             {v1.s}0, x2, x3
44
+    ld1             {v1.s}1, x2, x3
45
+    \f              v16.8h, v0.8b, v1.8b
46
+.endm
47
+
48
+.macro SAD_4 h
49
+.rept \h / 2 - 1
50
+    SAD_START_4 uabal
51
+.endr
52
+.endm
53
+
54
+.macro SAD_START_8 f
55
+    ld1             {v0.8b}, x0, x1
56
+    ld1             {v1.8b}, x2, x3
57
+    ld1             {v2.8b}, x0, x1
58
+    ld1             {v3.8b}, x2, x3
59
+    \f              v16.8h, v0.8b, v1.8b
60
+    \f              v17.8h, v2.8b, v3.8b
61
+.endm
62
+
63
+.macro SAD_8 h
64
+.rept \h / 2 - 1
65
+    SAD_START_8 uabal
66
+.endr
67
+.endm
68
+
69
+.macro SAD_START_16 f
70
+    ld1             {v0.16b}, x0, x1
71
+    ld1             {v1.16b}, x2, x3
72
+    ld1             {v2.16b}, x0, x1
73
+    ld1             {v3.16b}, x2, x3
74
+    \f              v16.8h, v0.8b, v1.8b
75
+    \f\()2          v17.8h, v0.16b, v1.16b
76
+    uabal           v16.8h, v2.8b, v3.8b
77
+    uabal2          v17.8h, v2.16b, v3.16b
78
+.endm
79
+
80
+.macro SAD_16 h
81
+.rept \h / 2 - 1
82
+    SAD_START_16 uabal
83
+.endr
84
+.endm
85
+
86
+.macro SAD_START_32
87
+    movi            v16.16b, #0
88
+    movi            v17.16b, #0
89
+    movi            v18.16b, #0
90
+    movi            v19.16b, #0
91
+.endm
92
+
93
+.macro SAD_32
94
+    ld1             {v0.16b-v1.16b}, x0, x1
95
+    ld1             {v2.16b-v3.16b}, x2, x3
96
+    ld1             {v4.16b-v5.16b}, x0, x1
97
+    ld1             {v6.16b-v7.16b}, x2, x3
98
+    uabal           v16.8h, v0.8b, v2.8b
99
+    uabal2          v17.8h, v0.16b, v2.16b
100
+    uabal           v18.8h, v1.8b, v3.8b
101
+    uabal2          v19.8h, v1.16b, v3.16b
102
+    uabal           v16.8h, v4.8b, v6.8b
103
+    uabal2          v17.8h, v4.16b, v6.16b
104
+    uabal           v18.8h, v5.8b, v7.8b
105
+    uabal2          v19.8h, v5.16b, v7.16b
106
+.endm
107
+
108
+.macro SAD_END_32
109
+    add             v16.8h, v16.8h, v17.8h
110
+    add             v17.8h, v18.8h, v19.8h
111
+    add             v16.8h, v16.8h, v17.8h
112
+    uaddlv          s0, v16.8h
113
+    fmov            w0, s0
114
+    ret
115
+.endm
116
+
117
+.macro SAD_START_64
118
+    movi            v16.16b, #0
119
+    movi            v17.16b, #0
120
+    movi            v18.16b, #0
121
+    movi            v19.16b, #0
122
+    movi            v20.16b, #0
123
+    movi            v21.16b, #0
124
+    movi            v22.16b, #0
125
+    movi            v23.16b, #0
126
+.endm
127
+
128
+.macro SAD_64
129
+    ld1             {v0.16b-v3.16b}, x0, x1
130
+    ld1             {v4.16b-v7.16b}, x2, x3
131
+    ld1             {v24.16b-v27.16b}, x0, x1
132
+    ld1             {v28.16b-v31.16b}, x2, x3
133
+    uabal           v16.8h, v0.8b, v4.8b
134
+    uabal2          v17.8h, v0.16b, v4.16b
135
+    uabal           v18.8h, v1.8b, v5.8b
136
+    uabal2          v19.8h, v1.16b, v5.16b
137
+    uabal           v20.8h, v2.8b, v6.8b
138
+    uabal2          v21.8h, v2.16b, v6.16b
139
+    uabal           v22.8h, v3.8b, v7.8b
140
+    uabal2          v23.8h, v3.16b, v7.16b
141
+
142
+    uabal           v16.8h, v24.8b, v28.8b
143
+    uabal2          v17.8h, v24.16b, v28.16b
144
+    uabal           v18.8h, v25.8b, v29.8b
145
+    uabal2          v19.8h, v25.16b, v29.16b
146
+    uabal           v20.8h, v26.8b, v30.8b
147
+    uabal2          v21.8h, v26.16b, v30.16b
148
+    uabal           v22.8h, v27.8b, v31.8b
149
+    uabal2          v23.8h, v27.16b, v31.16b
150
+.endm
151
+
152
+.macro SAD_END_64
153
+    add             v16.8h, v16.8h, v17.8h
154
+    add             v17.8h, v18.8h, v19.8h
155
+    add             v16.8h, v16.8h, v17.8h
156
+    uaddlp          v16.4s, v16.8h
157
+    add             v18.8h, v20.8h, v21.8h
158
+    add             v19.8h, v22.8h, v23.8h
159
+    add             v17.8h, v18.8h, v19.8h
160
+    uaddlp          v17.4s, v17.8h
161
+    add             v16.4s, v16.4s, v17.4s
162
+    uaddlv          d0, v16.4s
163
+    fmov            x0, d0
164
+    ret
165
+.endm
166
+
167
+.macro SAD_START_12
168
+    movrel          x12, sad12_mask
169
+    ld1             {v31.16b}, x12
170
+    movi            v16.16b, #0
171
+    movi            v17.16b, #0
172
+.endm
173
+
174
+.macro SAD_12
175
+    ld1             {v0.16b}, x0, x1
176
+    and             v0.16b, v0.16b, v31.16b
177
+    ld1             {v1.16b}, x2, x3
178
+    and             v1.16b, v1.16b, v31.16b
179
+    ld1             {v2.16b}, x0, x1
180
+    and             v2.16b, v2.16b, v31.16b
181
+    ld1             {v3.16b}, x2, x3
182
+    and             v3.16b, v3.16b, v31.16b
183
+    uabal           v16.8h, v0.8b, v1.8b
184
+    uabal2          v17.8h, v0.16b, v1.16b
185
+    uabal           v16.8h, v2.8b, v3.8b
186
+    uabal2          v17.8h, v2.16b, v3.16b
187
+.endm
188
+
189
+.macro SAD_END_12
190
+    add             v16.8h, v16.8h, v17.8h
191
+    uaddlv          s0, v16.8h
192
+    fmov            w0, s0
193
+    ret
194
+.endm
195
+
196
+.macro SAD_START_24
197
+    movi            v16.16b, #0
198
+    movi            v17.16b, #0
199
+    movi            v18.16b, #0
200
+    sub             x1, x1, #16
201
+    sub             x3, x3, #16
202
+.endm
203
+
204
+.macro SAD_24
205
+    ld1             {v0.16b}, x0, #16
206
+    ld1             {v1.8b}, x0, x1
207
+    ld1             {v2.16b}, x2, #16
208
+    ld1             {v3.8b}, x2, x3
209
+    ld1             {v4.16b}, x0, #16
210
+    ld1             {v5.8b}, x0, x1
211
+    ld1             {v6.16b}, x2, #16
212
+    ld1             {v7.8b}, x2, x3
213
+    uabal           v16.8h, v0.8b, v2.8b
214
+    uabal2          v17.8h, v0.16b, v2.16b
215
+    uabal           v18.8h, v1.8b, v3.8b
216
+    uabal           v16.8h, v4.8b, v6.8b
217
+    uabal2          v17.8h, v4.16b, v6.16b
218
+    uabal           v18.8h, v5.8b, v7.8b
219
+.endm
220
+
221
+.macro SAD_END_24
222
+    add             v16.8h, v16.8h, v17.8h
223
+    add             v16.8h, v16.8h, v18.8h
224
+    uaddlv          s0, v16.8h
225
+    fmov            w0, s0
226
+    ret
227
+.endm
228
+
229
+.macro SAD_START_48
230
+    movi            v16.16b, #0
231
+    movi            v17.16b, #0
232
+    movi            v18.16b, #0
233
+    movi            v19.16b, #0
234
+    movi            v20.16b, #0
235
+    movi            v21.16b, #0
236
+.endm
237
+
238
+.macro SAD_48
239
+    ld1             {v0.16b-v2.16b}, [x0], x1
+    ld1             {v4.16b-v6.16b}, [x2], x3
+    ld1             {v24.16b-v26.16b}, [x0], x1
+    ld1             {v28.16b-v30.16b}, [x2], x3
243
+    uabal           v16.8h, v0.8b, v4.8b
244
+    uabal2          v17.8h, v0.16b, v4.16b
245
+    uabal           v18.8h, v1.8b, v5.8b
246
+    uabal2          v19.8h, v1.16b, v5.16b
247
+    uabal           v20.8h, v2.8b, v6.8b
248
+    uabal2          v21.8h, v2.16b, v6.16b
249
+
250
+    uabal           v16.8h, v24.8b, v28.8b
251
+    uabal2          v17.8h, v24.16b, v28.16b
252
+    uabal           v18.8h, v25.8b, v29.8b
253
+    uabal2          v19.8h, v25.16b, v29.16b
254
+    uabal           v20.8h, v26.8b, v30.8b
255
+    uabal2          v21.8h, v26.16b, v30.16b
256
+.endm
257
+
258
+.macro SAD_END_48
259
+    add             v16.8h, v16.8h, v17.8h
260
+    add             v17.8h, v18.8h, v19.8h
261
+    add             v16.8h, v16.8h, v17.8h
262
+    uaddlv          s0, v16.8h
263
+    fmov            w0, s0
264
+    add             v18.8h, v20.8h, v21.8h
265
+    uaddlv          s1, v18.8h
266
+    fmov            w1, s1
267
+    add             w0, w0, w1
268
+    ret
269
+.endm
270
+
271
+.macro SAD_X_START_4 h, x, f
272
+    ld1             {v0.s}[0], [x0], x9
+    ld1             {v0.s}[1], [x0], x9
+    ld1             {v1.s}[0], [x1], x5
+    ld1             {v1.s}[1], [x1], x5
+    ld1             {v2.s}[0], [x2], x5
+    ld1             {v2.s}[1], [x2], x5
+    ld1             {v3.s}[0], [x3], x5
+    ld1             {v3.s}[1], [x3], x5
280
+    \f              v16.8h, v0.8b, v1.8b
281
+    \f              v17.8h, v0.8b, v2.8b
282
+    \f              v18.8h, v0.8b, v3.8b
283
+.if \x == 4
284
+    ld1             {v4.s}[0], [x4], x5
+    ld1             {v4.s}[1], [x4], x5
286
+    \f              v19.8h, v0.8b, v4.8b
287
+.endif
288
+.endm
289
+
290
+.macro SAD_X_4 h, x
291
+.rept \h/2 - 1
292
+    SAD_X_START_4 \h, \x, uabal
293
+.endr
294
+.endm
295
+
296
+.macro SAD_X_END_4 x
297
+    uaddlv          s0, v16.8h
298
+    uaddlv          s1, v17.8h
299
+    uaddlv          s2, v18.8h
300
+    stp             s0, s1, [x6]
+.if \x == 3
+    str             s2, [x6, #8]
+.elseif \x == 4
+    uaddlv          s3, v19.8h
+    stp             s2, s3, [x6, #8]
306
+.endif
307
+    ret
308
+.endm
309
+
310
+.macro SAD_X_START_8 h, x, f
311
+    ld1             {v0.8b}, [x0], x9
+    ld1             {v1.8b}, [x1], x5
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
315
+    \f              v16.8h, v0.8b, v1.8b
316
+    \f              v17.8h, v0.8b, v2.8b
317
+    \f              v18.8h, v0.8b, v3.8b
318
+.if \x == 4
319
+    ld1             {v4.8b}, [x4], x5
320
+    \f              v19.8h, v0.8b, v4.8b
321
+.endif
322
+.endm
323
+
324
+.macro SAD_X_8 h x
325
+.rept \h - 1
326
+    SAD_X_START_8 \h, \x, uabal
327
+.endr
328
+.endm
329
+
330
+.macro SAD_X_END_8 x
331
+    SAD_X_END_4 \x
332
+.endm
333
+
334
+.macro SAD_X_START_12 h, x, f
335
+    ld1             {v0.16b}, [x0], x9
+    and             v0.16b, v0.16b, v31.16b
+    ld1             {v1.16b}, [x1], x5
+    and             v1.16b, v1.16b, v31.16b
+    ld1             {v2.16b}, [x2], x5
+    and             v2.16b, v2.16b, v31.16b
+    ld1             {v3.16b}, [x3], x5
+    and             v3.16b, v3.16b, v31.16b
343
+    \f              v16.8h, v1.8b, v0.8b
344
+    \f\()2          v20.8h, v1.16b, v0.16b
345
+    \f              v17.8h, v2.8b, v0.8b
346
+    \f\()2          v21.8h, v2.16b, v0.16b
347
+    \f              v18.8h, v3.8b, v0.8b
348
+    \f\()2          v22.8h, v3.16b, v0.16b
349
+.if \x == 4
350
+    ld1             {v4.16b}, [x4], x5
+    and             v4.16b, v4.16b, v31.16b
352
+    \f              v19.8h, v4.8b, v0.8b
353
+    \f\()2          v23.8h, v4.16b, v0.16b
354
+.endif
355
+.endm
356
+
357
+.macro SAD_X_12 h x
358
+.rept \h - 1
359
+    SAD_X_START_12 \h, \x, uabal
360
+.endr
361
+.endm
362
+
363
+.macro SAD_X_END_12 x
364
+    SAD_X_END_16 \x
365
+.endm
366
+
367
+.macro SAD_X_START_16 h, x, f
368
+    ld1             {v0.16b}, [x0], x9
+    ld1             {v1.16b}, [x1], x5
+    ld1             {v2.16b}, [x2], x5
+    ld1             {v3.16b}, [x3], x5
372
+    \f              v16.8h, v1.8b, v0.8b
373
+    \f\()2          v20.8h, v1.16b, v0.16b
374
+    \f              v17.8h, v2.8b, v0.8b
375
+    \f\()2          v21.8h, v2.16b, v0.16b
376
+    \f              v18.8h, v3.8b, v0.8b
377
+    \f\()2          v22.8h, v3.16b, v0.16b
378
+.if \x == 4
379
+    ld1             {v4.16b}, [x4], x5
380
+    \f              v19.8h, v4.8b, v0.8b
381
+    \f\()2          v23.8h, v4.16b, v0.16b
382
+.endif
383
+.endm
384
+
385
+.macro SAD_X_16 h x
386
+.rept \h - 1
387
+    SAD_X_START_16 \h, \x, uabal
388
+.endr
389
+.endm
390
+
391
+.macro SAD_X_END_16 x
392
+    add             v16.8h, v16.8h, v20.8h
393
+    add             v17.8h, v17.8h, v21.8h
394
+    add             v18.8h, v18.8h, v22.8h
395
+.if \x == 4
396
+    add             v19.8h, v19.8h, v23.8h
397
+.endif
398
+
399
+    SAD_X_END_4 \x
400
+.endm
401
+
402
+.macro SAD_X_START_24 x
403
+    SAD_X_START_32 \x
404
+    sub             x5, x5, #16
405
+    sub             x9, x9, #16
406
+.endm
407
+
408
+.macro SAD_X_24 base v1 v2
409
+    ld1             {v0.16b}, [\base], #16
+    ld1             {v1.8b}, [\base], x5
411
+    uabal           \v1\().8h, v0.8b, v6.8b
412
+    uabal           \v1\().8h, v1.8b, v7.8b
413
+    uabal2          \v2\().8h, v0.16b, v6.16b
414
+.endm
415
+
416
+.macro SAD_X_END_24 x
417
+    SAD_X_END_16 \x
418
+.endm
419
+
420
+.macro SAD_X_START_32 x
421
+    movi v16.16b, #0
422
+    movi v17.16b, #0
423
+    movi v18.16b, #0
424
+    movi v20.16b, #0
425
+    movi v21.16b, #0
426
+    movi v22.16b, #0
427
+.if \x == 4
428
+    movi v19.16b, #0
429
+    movi v23.16b, #0
430
+.endif
431
+.endm
432
+
433
+.macro SAD_X_32 base v1 v2
434
+    ld1             {v0.16b-v1.16b}, [\base], x5
435
+    uabal           \v1\().8h, v0.8b, v6.8b
436
+    uabal           \v1\().8h, v1.8b, v7.8b
437
+    uabal2          \v2\().8h, v0.16b, v6.16b
438
+    uabal2          \v2\().8h, v1.16b, v7.16b
439
+.endm
440
+
441
+.macro SAD_X_END_32 x
442
+    SAD_X_END_16 \x
443
+.endm
444
+
445
+.macro SAD_X_START_48 x
446
+    SAD_X_START_32 \x
447
+.endm
448
+
449
+.macro SAD_X_48 x1 v1 v2
450
+    ld1             {v0.16b-v2.16b}, [\x1], x5
451
+    uabal           \v1\().8h, v0.8b, v4.8b
452
+    uabal           \v1\().8h, v1.8b, v5.8b
453
+    uabal           \v1\().8h, v2.8b, v6.8b
454
+    uabal2          \v2\().8h, v0.16b, v4.16b
455
+    uabal2          \v2\().8h, v1.16b, v5.16b
456
+    uabal2          \v2\().8h, v2.16b, v6.16b
457
+.endm
458
+
459
+.macro SAD_X_END_48 x
460
+    SAD_X_END_64 \x
461
+.endm
462
+
463
+.macro SAD_X_START_64 x
464
+    SAD_X_START_32 \x
465
+.endm
466
+
467
+.macro SAD_X_64 x1 v1 v2
468
+    ld1             {v0.16b-v3.16b}, [\x1], x5
469
+    uabal           \v1\().8h, v0.8b, v4.8b
470
+    uabal           \v1\().8h, v1.8b, v5.8b
471
+    uabal           \v1\().8h, v2.8b, v6.8b
472
+    uabal           \v1\().8h, v3.8b, v7.8b
473
+    uabal2          \v2\().8h, v0.16b, v4.16b
474
+    uabal2          \v2\().8h, v1.16b, v5.16b
475
+    uabal2          \v2\().8h, v2.16b, v6.16b
476
+    uabal2          \v2\().8h, v3.16b, v7.16b
477
+.endm
478
+
479
+.macro SAD_X_END_64 x
480
+    uaddlp          v16.4s, v16.8h
481
+    uaddlp          v17.4s, v17.8h
482
+    uaddlp          v18.4s, v18.8h
483
+    uaddlp          v20.4s, v20.8h
484
+    uaddlp          v21.4s, v21.8h
485
+    uaddlp          v22.4s, v22.8h
486
+    add             v16.4s, v16.4s, v20.4s
487
+    add             v17.4s, v17.4s, v21.4s
488
+    add             v18.4s, v18.4s, v22.4s
489
+    trn2            v20.2d, v16.2d, v16.2d
490
+    trn2            v21.2d, v17.2d, v17.2d
491
+    trn2            v22.2d, v18.2d, v18.2d
492
+    add             v16.2s, v16.2s, v20.2s
493
+    add             v17.2s, v17.2s, v21.2s
494
+    add             v18.2s, v18.2s, v22.2s
495
+    uaddlp          v16.1d, v16.2s
496
+    uaddlp          v17.1d, v17.2s
497
+    uaddlp          v18.1d, v18.2s
498
+    stp             s16, s17, [x6], #8
499
+.if \x == 3
500
+    str             s18, [x6]
501
+.elseif \x == 4
502
+    uaddlp          v19.4s, v19.8h
503
+    uaddlp          v23.4s, v23.8h
504
+    add             v19.4s, v19.4s, v23.4s
505
+    trn2            v23.2d, v19.2d, v19.2d
506
+    add             v19.2s, v19.2s, v23.2s
507
+    uaddlp          v19.1d, v19.2s
508
+    stp             s18, s19, [x6]
509
+.endif
510
+    ret
511
+.endm
512
+
513
+const sad12_mask, align=8
514
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
515
+endconst
516
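For readers following the macros above, a minimal C sketch of what the 12-wide SAD path computes: the SAD_12 macros load a full 16-byte vector and AND it with sad12_mask (twelve 0xFF bytes, four 0x00) instead of doing a partial load. The function name sad12_ref is illustrative, not part of x265:

    #include <stdint.h>
    #include <stdlib.h>

    static int sad12_ref(const uint8_t *pix1, intptr_t stride1,
                         const uint8_t *pix2, intptr_t stride2, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < 12; x++)   /* lanes 12..15 are masked to 0 */
                sum += abs(pix1[x] - pix2[x]);
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }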
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S Added
513
 
1
@@ -0,0 +1,511 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "sad-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.macro SAD_SVE2_16 h
41
+    mov             z16.d, #0
42
+    ptrue           p0.h, vl16
43
+.rept \h
44
+    ld1b            {z0.h}, p0/z, [x0]
+    ld1b            {z2.h}, p0/z, [x2]
46
+    add             x0, x0, x1
47
+    add             x2, x2, x3
48
+    uaba            z16.h, z0.h, z2.h
49
+.endr
50
+    uaddv           d0, p0, z16.h
51
+    fmov            w0, s0
52
+    ret
53
+.endm
54
+
55
+.macro SAD_SVE2_32 h
56
+    mov             z16.d, #0
+    ptrue           p0.b, vl32
57
+.rept \h
58
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x2]
60
+    add             x0, x0, x1
61
+    add             x2, x2, x3
62
+    uabalb          z16.h, z0.b, z4.b
63
+    uabalt          z16.h, z0.b, z4.b
64
+.endr
65
+    uaddv           d0, p0, z16.h
66
+    fmov            w0, s0
67
+    ret
68
+.endm
69
+
70
+.macro SAD_SVE2_64 h
71
+    cmp             x9, #48
72
+    bgt             .vl_gt_48_pixel_sad_64x\h
73
+    mov             z16.d, #0
74
+    mov             z17.d, #0
75
+    mov             z18.d, #0
76
+    mov             z19.d, #0
+    mov             z24.d, #0
77
+    ptrue           p0.b, vl32
78
+.rept \h
79
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z4.b}, p0/z, [x2]
+    ld1b            {z5.b}, p0/z, [x2, #1, mul vl]
83
+    add             x0, x0, x1
84
+    add             x2, x2, x3
85
+    uabalb          z16.h, z0.b, z4.b
86
+    uabalt          z17.h, z0.b, z4.b
87
+    uabalb          z18.h, z1.b, z5.b
88
+    uabalt          z19.h, z1.b, z5.b
89
+.endr
90
+    add             z16.h, z16.h, z17.h
91
+    add             z17.h, z18.h, z19.h
92
+    add             z16.h, z16.h, z17.h
93
+    uadalp          z24.s, p0/m, z16.h
94
+    uaddv           d5, p0, z24.s
95
+    fmov            x0, d5
96
+    ret
97
+.vl_gt_48_pixel_sad_64x\h\():
98
+    mov             z16.d, #0
99
+    mov             z17.d, #0
100
+    mov             z24.d, #0
101
+    ptrue           p0.b, vl64
102
+.rept \h
103
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z4.b}, p0/z, [x2]
105
+    add             x0, x0, x1
106
+    add             x2, x2, x3
107
+    uabalb          z16.h, z0.b, z4.b
108
+    uabalt          z17.h, z0.b, z4.b
109
+.endr
110
+    add             z16.h, z16.h, z17.h
111
+    uadalp          z24.s, p0/m, z16.h
112
+    uaddv           d5, p0, z24.s
113
+    fmov            x0, d5
114
+    ret
115
+.endm
116
+
117
+.macro SAD_SVE2_24 h
118
+    mov             z16.d, #0
119
+    mov             x10, #24
120
+    mov             x11, #0
121
+    whilelt         p0.b, x11, x10
122
+.rept \h
123
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z8.b}, p0/z, [x2]
125
+    add             x0, x0, x1
126
+    add             x2, x2, x3
127
+    uabalb          z16.h, z0.b, z8.b
128
+    uabalt          z16.h, z0.b, z8.b
129
+.endr
130
+    uaddv           d5, p0, z16.h
131
+    fmov            w0, s5
132
+    ret
133
+.endm
134
+
135
+.macro SAD_SVE2_48 h
136
+    cmp             x9, #48
137
+    bgt             .vl_gt_48_pixel_sad_48x\h
138
+    mov             z16.d, #0
139
+    mov             z17.d, #0
140
+    mov             z18.d, #0
141
+    mov             z19.d, #0
142
+    ptrue           p0.b, vl32
143
+    ptrue           p1.b, vl16
144
+.rept \h
145
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
+    ld1b            {z8.b}, p0/z, [x2]
+    ld1b            {z9.b}, p1/z, [x2, #1, mul vl]
149
+    add             x0, x0, x1
150
+    add             x2, x2, x3
151
+    uabalb          z16.h, z0.b, z8.b
152
+    uabalt          z17.h, z0.b, z8.b
153
+    uabalb          z18.h, z1.b, z9.b
154
+    uabalt          z19.h, z1.b, z9.b
155
+.endr
156
+    add             z16.h, z16.h, z17.h
157
+    add             z17.h, z18.h, z19.h
158
+    add             z16.h, z16.h, z17.h
159
+    uaddv           d5, p0, z16.h
160
+    fmov            w0, s5
161
+    ret
162
+.vl_gt_48_pixel_sad_48x\h\():
163
+    mov             z16.d, #0
164
+    mov             z17.d, #0
165
+    mov             x10, #48
166
+    mov             x11, #0
167
+    whilelt         p0.b, x11, x10
168
+.rept \h
169
+    ld1b            {z0.b}, p0/z, [x0]
+    ld1b            {z8.b}, p0/z, [x2]
171
+    add             x0, x0, x1
172
+    add             x2, x2, x3
173
+    uabalb          z16.h, z0.b, z8.b
174
+    uabalt          z17.h, z0.b, z8.b
175
+.endr
176
+    add             z16.h, z16.h, z17.h
177
+    uaddv           d5, p0, z16.h
178
+    fmov            w0, s5
179
+    ret
180
+.endm
181
+
182
+// Fully unrolled.
183
+.macro SAD_FUNC_SVE2 w, h
184
+function PFX(pixel_sad_\w\()x\h\()_sve2)
185
+    rdvl            x9, #1
186
+    cmp             x9, #16
187
+    bgt             .vl_gt_16_pixel_sad_\w\()x\h
188
+    SAD_START_\w uabdl
189
+    SAD_\w \h
190
+.if \w > 4
191
+    add             v16.8h, v16.8h, v17.8h
192
+.endif
193
+    uaddlv          s0, v16.8h
194
+    fmov            w0, s0
195
+    ret
196
+.vl_gt_16_pixel_sad_\w\()x\h\():
197
+.if \w == 4 || \w == 8 || \w == 12
198
+    SAD_START_\w uabdl
199
+    SAD_\w \h
200
+.if \w > 4
201
+    add             v16.8h, v16.8h, v17.8h
202
+.endif
203
+    uaddlv          s0, v16.8h
204
+    fmov            w0, s0
205
+    ret
206
+.else
207
+    SAD_SVE2_\w \h
208
+.endif
209
+endfunc
210
+.endm
211
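The rdvl/cmp/bgt prologue above dispatches on the runtime SVE vector length: rdvl x9, #1 returns the VL in bytes, and when it is 16 (128-bit, no wider than NEON) the shared NEON macros are used. A hedged C equivalent using the ACLE intrinsic svcntb(), assuming an AArch64 toolchain with SVE support:

    #include <arm_sve.h>

    /* Mirrors "rdvl x9, #1; cmp x9, #16; bgt ...": true when the SVE
       vector length offers no extra width over 128-bit NEON. */
    static int use_neon_width_path(void)
    {
        return svcntb() <= 16;
    }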
+
212
+// Loop unrolled 4.
213
+.macro SAD_FUNC_LOOP_SVE2 w, h
214
+function PFX(pixel_sad_\w\()x\h\()_sve2)
215
+    rdvl            x9, #1
216
+    cmp             x9, #16
217
+    bgt             .vl_gt_16_pixel_sad_loop_\w\()x\h
218
+    SAD_START_\w
219
+
220
+    mov             w9, #\h/8
221
+.loop_sve2_\w\()x\h:
222
+    sub             w9, w9, #1
223
+.rept 4
224
+    SAD_\w
225
+.endr
226
+    cbnz            w9, .loop_sve2_\w\()x\h
227
+
228
+    SAD_END_\w
229
+
230
+.vl_gt_16_pixel_sad_loop_\w\()x\h\():
231
+.if \w == 4 || \w == 8 || \w == 12
232
+    SAD_START_\w
233
+
234
+    mov             w9, #\h/8
235
+.loop_sve2_loop_\w\()x\h:
236
+    sub             w9, w9, #1
237
+.rept 4
238
+    SAD_\w
239
+.endr
240
+    cbnz            w9, .loop_sve2_loop_\w\()x\h
241
+
242
+    SAD_END_\w
243
+.else
244
+    SAD_SVE2_\w \h
245
+.endif
246
+endfunc
247
+.endm
248
+
249
+SAD_FUNC_SVE2  4,  4
250
+SAD_FUNC_SVE2  4,  8
251
+SAD_FUNC_SVE2  4,  16
252
+SAD_FUNC_SVE2  8,  4
253
+SAD_FUNC_SVE2  8,  8
254
+SAD_FUNC_SVE2  8,  16
255
+SAD_FUNC_SVE2  8,  32
256
+SAD_FUNC_SVE2  16, 4
257
+SAD_FUNC_SVE2  16, 8
258
+SAD_FUNC_SVE2  16, 12
259
+SAD_FUNC_SVE2  16, 16
260
+SAD_FUNC_SVE2  16, 32
261
+SAD_FUNC_SVE2  16, 64
262
+
263
+SAD_FUNC_LOOP_SVE2  32, 8
264
+SAD_FUNC_LOOP_SVE2  32, 16
265
+SAD_FUNC_LOOP_SVE2  32, 24
266
+SAD_FUNC_LOOP_SVE2  32, 32
267
+SAD_FUNC_LOOP_SVE2  32, 64
268
+SAD_FUNC_LOOP_SVE2  64, 16
269
+SAD_FUNC_LOOP_SVE2  64, 32
270
+SAD_FUNC_LOOP_SVE2  64, 48
271
+SAD_FUNC_LOOP_SVE2  64, 64
272
+SAD_FUNC_LOOP_SVE2  12, 16
273
+SAD_FUNC_LOOP_SVE2  24, 32
274
+SAD_FUNC_LOOP_SVE2  48, 64
275
+
276
+// SAD_X3 and SAD_X4 code start
277
+
278
+.macro SAD_X_SVE2_24_INNER_GT_16 base z
279
+    ld1b            {z4.b}, p0/z, [\base]
280
+    add             \base, \base, x5
281
+    uabalb          \z\().h, z4.b, z0.b
282
+    uabalt          \z\().h, z4.b, z0.b
283
+.endm
284
+
285
+.macro SAD_X_SVE2_24 h x
286
+    mov             z20.d, #0
287
+    mov             z21.d, #0
288
+    mov             z22.d, #0
289
+    mov             z23.d, #0
290
+    mov             x10, #24
291
+    mov             x11, #0
292
+    whilelt         p0.b, x11, x10
293
+.rept \h
294
+    ld1b            {z0.b}, p0/z, [x0]
295
+    add             x0, x0, x9
296
+    SAD_X_SVE2_24_INNER_GT_16 x1, z20
297
+    SAD_X_SVE2_24_INNER_GT_16 x2, z21
298
+    SAD_X_SVE2_24_INNER_GT_16 x3, z22
299
+.if \x == 4
300
+    SAD_X_SVE2_24_INNER_GT_16 x4, z23
301
+.endif
302
+.endr
303
+    uaddv           d0, p0, z20.h
+    uaddv           d1, p0, z21.h
+    uaddv           d2, p0, z22.h
+    stp             s0, s1, [x6]
+.if \x == 3
+    str             s2, [x6, #8]
+.elseif \x == 4
+    uaddv           d3, p0, z23.h
+    stp             s2, s3, [x6, #8]
314
+.endif
315
+    ret
316
+.endm
317
+
318
+.macro SAD_X_SVE2_32_INNER_GT_16 base z
319
+    ld1b            {z4.b}, p0/z, [\base]
320
+    add             \base, \base, x5
321
+    uabalb          \z\().h, z4.b, z0.b
322
+    uabalt          \z\().h, z4.b, z0.b
323
+.endm
324
+
325
+.macro SAD_X_SVE2_32 h x
326
+    mov             z20.d, #0
327
+    mov             z21.d, #0
328
+    mov             z22.d, #0
329
+    mov             z23.d, #0
330
+    ptrue           p0.b, vl32
331
+.rept \h
332
+    ld1b            {z0.b}, p0/z, [x0]
333
+    add             x0, x0, x9
334
+    SAD_X_SVE2_32_INNER_GT_16 x1, z20
335
+    SAD_X_SVE2_32_INNER_GT_16 x2, z21
336
+    SAD_X_SVE2_32_INNER_GT_16 x3, z22
337
+.if \x == 4
338
+    SAD_X_SVE2_32_INNER_GT_16 x4, z23
339
+.endif
340
+.endr
341
+    uaddv           d0, p0, z20.h
342
+    uaddv           d1, p0, z21.h
343
+    uaddv           d2, p0, z22.h
344
+    stp             s0, s1, [x6]
+.if \x == 3
+    str             s2, [x6, #8]
+.elseif \x == 4
+    uaddv           d3, p0, z23.h
+    stp             s2, s3, [x6, #8]
350
+.endif
351
+    ret
352
+.endm
353
+
354
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
356
+.macro SAD_X_FUNC_SVE2 x, w, h
357
+function PFX(sad_x\x\()_\w\()x\h\()_sve2)
358
+    mov             x9, #FENC_STRIDE
359
+
360
+// Make function arguments for x == 3 look like x == 4.
361
+.if \x == 3
362
+    mov             x6, x5
363
+    mov             x5, x4
364
+.endif
365
+    rdvl            x11, #1
366
+    cmp             x11, #16
367
+    bgt             .vl_gt_16_sad_x\x\()_\w\()x\h
368
+.if \w == 12
369
+    movrel          x12, sad12_mask
370
+    ld1             {v31.16b}, [x12]
371
+.endif
372
+
373
+    SAD_X_START_\w \h, \x, uabdl
374
+    SAD_X_\w \h, \x
375
+    SAD_X_END_\w \x
376
+.vl_gt_16_sad_x\x\()_\w\()x\h\():
377
+.if \w == 24 || \w == 32
378
+    SAD_X_SVE2_\w \h, \x
379
+.else
380
+.if \w == 12
381
+    movrel          x12, sad12_mask
382
+    ld1             {v31.16b}, [x12]
383
+.endif
384
+
385
+    SAD_X_START_\w \h, \x, uabdl
386
+    SAD_X_\w \h, \x
387
+    SAD_X_END_\w \x
388
+.endif
389
+endfunc
390
+.endm
391
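As a reference for the prototypes in the comment above, a plain C model of the x4 variant: one fenc block (rows FENC_STRIDE apart; 64 in x265) is scored against four reference candidates that share a stride. Sketch only; sad_x4_ref and its loop structure are illustrative, not x265 code:

    #include <stdint.h>

    #define FENC_STRIDE 64

    static void sad_x4_ref(int w, int h, const uint8_t *fenc,
                           const uint8_t *pix[4], intptr_t stride,
                           int32_t scores[4])
    {
        for (int i = 0; i < 4; i++)
        {
            int sum = 0;
            for (int y = 0; y < h; y++)
                for (int x = 0; x < w; x++)
                {
                    int d = fenc[y * FENC_STRIDE + x] - pix[i][y * stride + x];
                    sum += d < 0 ? -d : d;   /* absolute difference */
                }
            scores[i] = sum;
        }
    }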
+
392
+.macro SAD_X_LOOP_SVE2 x, w, h
393
+function PFX(sad_x\x\()_\w\()x\h\()_sve2)
394
+    mov             x9, #FENC_STRIDE
395
+
396
+// Make function arguments for x == 3 look like x == 4.
397
+.if \x == 3
398
+    mov             x6, x5
399
+    mov             x5, x4
400
+.endif
401
+    rdvl            x11, #1
402
+    cmp             x11, #16
403
+    bgt             .vl_gt_16_sad_x_loop_\x\()_\w\()x\h
404
+    SAD_X_START_\w \x
405
+    mov             w12, #\h/4
406
+.loop_sad_sve2_x\x\()_\w\()x\h:
407
+    sub             w12, w12, #1
408
+ .rept 4
409
+  .if \w == 24
410
+    ld1             {v6.16b}, [x0], #16
+    ld1             {v7.8b}, [x0], x9
+  .elseif \w == 32
+    ld1             {v6.16b-v7.16b}, [x0], x9
+  .elseif \w == 48
+    ld1             {v4.16b-v6.16b}, [x0], x9
+  .elseif \w == 64
+    ld1             {v4.16b-v7.16b}, [x0], x9
418
+  .endif
419
+    SAD_X_\w x1, v16, v20
420
+    SAD_X_\w x2, v17, v21
421
+    SAD_X_\w x3, v18, v22
422
+  .if \x == 4
423
+    SAD_X_\w x4, v19, v23
424
+  .endif
425
+ .endr
426
+    cbnz            w12, .loop_sad_sve2_x\x\()_\w\()x\h
427
+    SAD_X_END_\w \x
428
+.vl_gt_16_sad_x_loop_\x\()_\w\()x\h\():
429
+.if \w == 24 || \w == 32
430
+    SAD_X_SVE2_\w \h, \x
431
+    ret
432
+.else
433
+    SAD_X_START_\w \x
434
+    mov             w12, #\h/4
435
+.loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
436
+    sub             w12, w12, #1
437
+ .rept 4
438
+  .if \w == 24
439
+    ld1             {v6.16b}, [x0], #16
+    ld1             {v7.8b}, [x0], x9
+  .elseif \w == 32
+    ld1             {v6.16b-v7.16b}, [x0], x9
+  .elseif \w == 48
+    ld1             {v4.16b-v6.16b}, [x0], x9
+  .elseif \w == 64
+    ld1             {v4.16b-v7.16b}, [x0], x9
447
+  .endif
448
+    SAD_X_\w x1, v16, v20
449
+    SAD_X_\w x2, v17, v21
450
+    SAD_X_\w x3, v18, v22
451
+  .if \x == 4
452
+    SAD_X_\w x4, v19, v23
453
+  .endif
454
+ .endr
455
+    cbnz            w12, .loop_sad_sve2_gt_16_x\x\()_\w\()x\h
456
+    SAD_X_END_\w \x
457
+.endif
458
+endfunc
459
+.endm
460
+
461
+
462
+SAD_X_FUNC_SVE2  3, 4,  4
463
+SAD_X_FUNC_SVE2  3, 4,  8
464
+SAD_X_FUNC_SVE2  3, 4,  16
465
+SAD_X_FUNC_SVE2  3, 8,  4
466
+SAD_X_FUNC_SVE2  3, 8,  8
467
+SAD_X_FUNC_SVE2  3, 8,  16
468
+SAD_X_FUNC_SVE2  3, 8,  32
469
+SAD_X_FUNC_SVE2  3, 12, 16
470
+SAD_X_FUNC_SVE2  3, 16, 4
471
+SAD_X_FUNC_SVE2  3, 16, 8
472
+SAD_X_FUNC_SVE2  3, 16, 12
473
+SAD_X_FUNC_SVE2  3, 16, 16
474
+SAD_X_FUNC_SVE2  3, 16, 32
475
+SAD_X_FUNC_SVE2  3, 16, 64
476
+SAD_X_LOOP_SVE2  3, 24, 32
477
+SAD_X_LOOP_SVE2  3, 32, 8
478
+SAD_X_LOOP_SVE2  3, 32, 16
479
+SAD_X_LOOP_SVE2  3, 32, 24
480
+SAD_X_LOOP_SVE2  3, 32, 32
481
+SAD_X_LOOP_SVE2  3, 32, 64
482
+SAD_X_LOOP_SVE2  3, 48, 64
483
+SAD_X_LOOP_SVE2  3, 64, 16
484
+SAD_X_LOOP_SVE2  3, 64, 32
485
+SAD_X_LOOP_SVE2  3, 64, 48
486
+SAD_X_LOOP_SVE2  3, 64, 64
487
+
488
+SAD_X_FUNC_SVE2  4, 4,  4
489
+SAD_X_FUNC_SVE2  4, 4,  8
490
+SAD_X_FUNC_SVE2  4, 4,  16
491
+SAD_X_FUNC_SVE2  4, 8,  4
492
+SAD_X_FUNC_SVE2  4, 8,  8
493
+SAD_X_FUNC_SVE2  4, 8,  16
494
+SAD_X_FUNC_SVE2  4, 8,  32
495
+SAD_X_FUNC_SVE2  4, 12, 16
496
+SAD_X_FUNC_SVE2  4, 16, 4
497
+SAD_X_FUNC_SVE2  4, 16, 8
498
+SAD_X_FUNC_SVE2  4, 16, 12
499
+SAD_X_FUNC_SVE2  4, 16, 16
500
+SAD_X_FUNC_SVE2  4, 16, 32
501
+SAD_X_FUNC_SVE2  4, 16, 64
502
+SAD_X_LOOP_SVE2  4, 24, 32
503
+SAD_X_LOOP_SVE2  4, 32, 8
504
+SAD_X_LOOP_SVE2  4, 32, 16
505
+SAD_X_LOOP_SVE2  4, 32, 24
506
+SAD_X_LOOP_SVE2  4, 32, 32
507
+SAD_X_LOOP_SVE2  4, 32, 64
508
+SAD_X_LOOP_SVE2  4, 48, 64
509
+SAD_X_LOOP_SVE2  4, 64, 16
510
+SAD_X_LOOP_SVE2  4, 64, 32
511
+SAD_X_LOOP_SVE2  4, 64, 48
512
+SAD_X_LOOP_SVE2  4, 64, 64
513
x265_3.5.tar.gz/source/common/aarch64/sad-a.S -> x265_3.6.tar.gz/source/common/aarch64/sad-a.S Changed
256
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -22,84 +23,186 @@
12
  *****************************************************************************/
13
 
14
 #include "asm.S"
15
+#include "sad-a-common.S"
16
 
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
 .section .rodata
21
+#endif
22
 
23
 .align 4
24
 
25
 .text
26
 
27
-.macro SAD_X_START_8 x
28
-    ld1             {v0.8b}, [x0], x9
-.if \x == 3
-    ld1             {v1.8b}, [x1], x4
-    ld1             {v2.8b}, [x2], x4
-    ld1             {v3.8b}, [x3], x4
-.elseif \x == 4
-    ld1             {v1.8b}, [x1], x5
-    ld1             {v2.8b}, [x2], x5
-    ld1             {v3.8b}, [x3], x5
-    ld1             {v4.8b}, [x4], x5
38
-.endif
39
-    uabdl           v16.8h, v0.8b, v1.8b
40
-    uabdl           v17.8h, v0.8b, v2.8b
41
-    uabdl           v18.8h, v0.8b, v3.8b
42
-.if \x == 4
43
-    uabdl           v19.8h, v0.8b, v4.8b
44
+// Fully unrolled.
45
+.macro SAD_FUNC w, h
46
+function PFX(pixel_sad_\w\()x\h\()_neon)
47
+    SAD_START_\w uabdl
48
+    SAD_\w \h
49
+.if \w > 4
50
+    add             v16.8h, v16.8h, v17.8h
51
 .endif
52
+    uaddlv          s0, v16.8h
53
+    fmov            w0, s0
54
+    ret
55
+endfunc
56
+.endm
57
+
58
+// Loop unrolled 4.
59
+.macro SAD_FUNC_LOOP w, h
60
+function PFX(pixel_sad_\w\()x\h\()_neon)
61
+    SAD_START_\w
62
+
63
+    mov             w9, #\h/8
64
+.loop_\w\()x\h:
65
+    sub             w9, w9, #1
66
+.rept 4
67
+    SAD_\w
68
+.endr
69
+    cbnz            w9, .loop_\w\()x\h
70
+
71
+    SAD_END_\w
72
+endfunc
73
 .endm
74
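The SAD_FUNC_LOOP shape above iterates h/8 times with a .rept 4 body, and each SAD_<w> macro consumes two rows, so one iteration covers eight rows. An illustrative C rendering of that control flow (not x265 code):

    #include <stdint.h>
    #include <stdlib.h>

    static int sad_loop_ref(int w, int h, const uint8_t *p1, intptr_t s1,
                            const uint8_t *p2, intptr_t s2)
    {
        int sum = 0;
        for (int i = h / 8; i != 0; i--)    /* mov w9, #h/8 ... cbnz */
            for (int r = 0; r < 8; r++)     /* .rept 4, two rows per SAD_<w> */
            {
                for (int x = 0; x < w; x++)
                    sum += abs(p1[x] - p2[x]);
                p1 += s1;
                p2 += s2;
            }
        return sum;
    }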
 
75
-.macro SAD_X_8 x
76
-    ld1             {v0.8b}, [x0], x9
77
+SAD_FUNC  4,  4
78
+SAD_FUNC  4,  8
79
+SAD_FUNC  4,  16
80
+SAD_FUNC  8,  4
81
+SAD_FUNC  8,  8
82
+SAD_FUNC  8,  16
83
+SAD_FUNC  8,  32
84
+SAD_FUNC  16, 4
85
+SAD_FUNC  16, 8
86
+SAD_FUNC  16, 12
87
+SAD_FUNC  16, 16
88
+SAD_FUNC  16, 32
89
+SAD_FUNC  16, 64
90
+
91
+SAD_FUNC_LOOP  32, 8
92
+SAD_FUNC_LOOP  32, 16
93
+SAD_FUNC_LOOP  32, 24
94
+SAD_FUNC_LOOP  32, 32
95
+SAD_FUNC_LOOP  32, 64
96
+SAD_FUNC_LOOP  64, 16
97
+SAD_FUNC_LOOP  64, 32
98
+SAD_FUNC_LOOP  64, 48
99
+SAD_FUNC_LOOP  64, 64
100
+SAD_FUNC_LOOP  12, 16
101
+SAD_FUNC_LOOP  24, 32
102
+SAD_FUNC_LOOP  48, 64
103
+
104
+// SAD_X3 and SAD_X4 code start
105
+
106
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
108
+.macro SAD_X_FUNC x, w, h
109
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
110
+    mov             x9, #FENC_STRIDE
111
+
112
+// Make function arguments for x == 3 look like x == 4.
113
 .if \x == 3
114
-    ld1             {v1.8b}, [x1], x4
-    ld1             {v2.8b}, [x2], x4
-    ld1             {v3.8b}, [x3], x4
-.elseif \x == 4
-    ld1             {v1.8b}, [x1], x5
-    ld1             {v2.8b}, [x2], x5
-    ld1             {v3.8b}, [x3], x5
-    ld1             {v4.8b}, [x4], x5
122
+    mov             x6, x5
123
+    mov             x5, x4
124
 .endif
125
-    uabal           v16.8h, v0.8b, v1.8b
126
-    uabal           v17.8h, v0.8b, v2.8b
127
-    uabal           v18.8h, v0.8b, v3.8b
128
-.if \x == 4
129
-    uabal           v19.8h, v0.8b, v4.8b
130
+
131
+.if \w == 12
132
+    movrel          x12, sad12_mask
133
+    ld1             {v31.16b}, [x12]
134
 .endif
135
+
136
+    SAD_X_START_\w \h, \x, uabdl
137
+    SAD_X_\w \h, \x
138
+    SAD_X_END_\w \x
139
+endfunc
140
 .endm
141
 
142
-.macro SAD_X_8xN x, h
143
-function x265_sad_x\x\()_8x\h\()_neon
144
+.macro SAD_X_LOOP x, w, h
145
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
146
     mov             x9, #FENC_STRIDE
147
-    SAD_X_START_8 \x
148
-.rept \h - 1
149
-    SAD_X_8 \x
150
-.endr
151
-    uaddlv          s0, v16.8h
152
-    uaddlv          s1, v17.8h
153
-    uaddlv          s2, v18.8h
154
-.if \x == 4
155
-    uaddlv          s3, v19.8h
156
-.endif
157
 
158
+// Make function arguments for x == 3 look like x == 4.
159
 .if \x == 3
160
-    stp             s0, s1, [x5]
-    str             s2, [x5, #8]
-.elseif \x == 4
-    stp             s0, s1, [x6]
-    stp             s2, s3, [x6, #8]
165
+    mov             x6, x5
166
+    mov             x5, x4
167
 .endif
168
-    ret
169
+    SAD_X_START_\w \x
170
+    mov             w12, #\h/4
171
+.loop_sad_x\x\()_\w\()x\h:
172
+    sub             w12, w12, #1
173
+ .rept 4
174
+  .if \w == 24
175
+    ld1             {v6.16b}, [x0], #16
+    ld1             {v7.8b}, [x0], x9
+  .elseif \w == 32
+    ld1             {v6.16b-v7.16b}, [x0], x9
+  .elseif \w == 48
+    ld1             {v4.16b-v6.16b}, [x0], x9
+  .elseif \w == 64
+    ld1             {v4.16b-v7.16b}, [x0], x9
183
+  .endif
184
+    SAD_X_\w x1, v16, v20
185
+    SAD_X_\w x2, v17, v21
186
+    SAD_X_\w x3, v18, v22
187
+  .if \x == 4
188
+    SAD_X_\w x4, v19, v23
189
+  .endif
190
+ .endr
191
+    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
192
+    SAD_X_END_\w \x
193
 endfunc
194
 .endm
195
 
196
-SAD_X_8xN 3 4
197
-SAD_X_8xN 3 8
198
-SAD_X_8xN 3 16
199
-SAD_X_8xN 3 32
200
 
201
-SAD_X_8xN 4 4
202
-SAD_X_8xN 4 8
203
-SAD_X_8xN 4 16
204
-SAD_X_8xN 4 32
205
+SAD_X_FUNC  3, 4,  4
206
+SAD_X_FUNC  3, 4,  8
207
+SAD_X_FUNC  3, 4,  16
208
+SAD_X_FUNC  3, 8,  4
209
+SAD_X_FUNC  3, 8,  8
210
+SAD_X_FUNC  3, 8,  16
211
+SAD_X_FUNC  3, 8,  32
212
+SAD_X_FUNC  3, 12, 16
213
+SAD_X_FUNC  3, 16, 4
214
+SAD_X_FUNC  3, 16, 8
215
+SAD_X_FUNC  3, 16, 12
216
+SAD_X_FUNC  3, 16, 16
217
+SAD_X_FUNC  3, 16, 32
218
+SAD_X_FUNC  3, 16, 64
219
+SAD_X_LOOP  3, 24, 32
220
+SAD_X_LOOP  3, 32, 8
221
+SAD_X_LOOP  3, 32, 16
222
+SAD_X_LOOP  3, 32, 24
223
+SAD_X_LOOP  3, 32, 32
224
+SAD_X_LOOP  3, 32, 64
225
+SAD_X_LOOP  3, 48, 64
226
+SAD_X_LOOP  3, 64, 16
227
+SAD_X_LOOP  3, 64, 32
228
+SAD_X_LOOP  3, 64, 48
229
+SAD_X_LOOP  3, 64, 64
230
+
231
+SAD_X_FUNC  4, 4,  4
232
+SAD_X_FUNC  4, 4,  8
233
+SAD_X_FUNC  4, 4,  16
234
+SAD_X_FUNC  4, 8,  4
235
+SAD_X_FUNC  4, 8,  8
236
+SAD_X_FUNC  4, 8,  16
237
+SAD_X_FUNC  4, 8,  32
238
+SAD_X_FUNC  4, 12, 16
239
+SAD_X_FUNC  4, 16, 4
240
+SAD_X_FUNC  4, 16, 8
241
+SAD_X_FUNC  4, 16, 12
242
+SAD_X_FUNC  4, 16, 16
243
+SAD_X_FUNC  4, 16, 32
244
+SAD_X_FUNC  4, 16, 64
245
+SAD_X_LOOP  4, 24, 32
246
+SAD_X_LOOP  4, 32, 8
247
+SAD_X_LOOP  4, 32, 16
248
+SAD_X_LOOP  4, 32, 24
249
+SAD_X_LOOP  4, 32, 32
250
+SAD_X_LOOP  4, 32, 64
251
+SAD_X_LOOP  4, 48, 64
252
+SAD_X_LOOP  4, 64, 16
253
+SAD_X_LOOP  4, 64, 32
254
+SAD_X_LOOP  4, 64, 48
255
+SAD_X_LOOP  4, 64, 64
256
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+.macro ret_v0_w0
33
+    trn2            v1.2d, v0.2d, v0.2d
34
+    add             v0.2s, v0.2s, v1.2s
35
+    addp            v0.2s, v0.2s, v0.2s
36
+    fmov            w0, s0
37
+    ret
38
+.endm
39
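A scalar model of what ret_v0_w0 returns, with v0 viewed as four 32-bit lanes; trn2 folds the high 64-bit half onto the low half and addp collapses the remaining pair:

    #include <stdint.h>

    static uint32_t ret_v0_w0_model(const uint32_t v0[4])
    {
        uint32_t lane0 = v0[0] + v0[2];   /* trn2 + add v0.2s */
        uint32_t lane1 = v0[1] + v0[3];
        return lane0 + lane1;             /* addp v0.2s; fmov w0, s0 */
    }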
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S Added
80
 
1
@@ -0,0 +1,78 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+function PFX(pixel_sse_pp_4x4_sve)
40
+    ptrue           p0.s, vl4
41
+    ld1b            {z0.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
43
+    add             x0, x0, x1
44
+    add             x2, x2, x3
45
+    sub             z0.s, p0/m, z0.s, z17.s
46
+    mul             z0.s, p0/m, z0.s, z0.s
47
+.rept 3
48
+    ld1b            {z16.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
50
+    add             x0, x0, x1
51
+    add             x2, x2, x3
52
+    sub             z16.s, p0/m, z16.s, z17.s
53
+    mla             z0.s, p0/m, z16.s, z16.s
54
+.endr
55
+    uaddv           d0, p0, z0.s
56
+    fmov            w0, s0
57
+    ret
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_sve)
61
+    ptrue           p0.s, vl4
62
+    ld1b            {z0.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
64
+    add             x0, x0, x1
65
+    add             x2, x2, x3
66
+    sub             z0.s, p0/m, z0.s, z17.s
67
+    mul             z0.s, p0/m, z0.s, z0.s
68
+.rept 7
69
+    ld1b            {z16.s}, p0/z, [x0]
+    ld1b            {z17.s}, p0/z, [x2]
71
+    add             x0, x0, x1
72
+    add             x2, x2, x3
73
+    sub             z16.s, p0/m, z16.s, z17.s
74
+    mla             z0.s, p0/m, z16.s, z16.s
75
+.endr
76
+    uaddv           d0, p0, z0.s
77
+    fmov            w0, s0
78
+    ret
79
+endfunc
80
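For reference, the pixel_sse_pp kernels in this file compute the sum of squared differences between two 8-bit blocks; a minimal C sketch (function name illustrative):

    #include <stdint.h>

    static uint32_t sse_pp_ref(int w, int h,
                               const uint8_t *pix1, intptr_t stride1,
                               const uint8_t *pix2, intptr_t stride2)
    {
        uint32_t sum = 0;
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
            {
                int d = pix1[x] - pix2[x];
                sum += (uint32_t)(d * d);
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }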
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S Added
889
 
1
@@ -0,0 +1,887 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "ssd-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sse_pp_32x32_sve2)
41
+    rdvl            x9, #1
42
+    cmp             x9, #16
43
+    bgt             .vl_gt_16_pixel_sse_pp_32x32
44
+    mov             w12, #8
45
+    movi            v0.16b, #0
46
+    movi            v1.16b, #0
47
+.loop_sse_pp_32_sve2:
48
+    sub             w12, w12, #1
49
+.rept 4
50
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
52
+    usubl           v2.8h, v16.8b, v18.8b
53
+    usubl2          v3.8h, v16.16b, v18.16b
54
+    usubl           v4.8h, v17.8b, v19.8b
55
+    usubl2          v5.8h, v17.16b, v19.16b
56
+    smlal           v0.4s, v2.4h, v2.4h
57
+    smlal2          v1.4s, v2.8h, v2.8h
58
+    smlal           v0.4s, v3.4h, v3.4h
59
+    smlal2          v1.4s, v3.8h, v3.8h
60
+    smlal           v0.4s, v4.4h, v4.4h
61
+    smlal2          v1.4s, v4.8h, v4.8h
62
+    smlal           v0.4s, v5.4h, v5.4h
63
+    smlal2          v1.4s, v5.8h, v5.8h
64
+.endr
65
+    cbnz            w12, .loop_sse_pp_32_sve2
66
+    add             v0.4s, v0.4s, v1.4s
67
+    ret_v0_w0
68
+.vl_gt_16_pixel_sse_pp_32x32:
69
+    ptrue           p0.b, vl32
70
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
72
+    add             x0, x0, x1
73
+    add             x2, x2, x3
74
+    usublb          z1.h, z16.b, z18.b
75
+    usublt          z2.h, z16.b, z18.b
76
+    smullb          z0.s, z1.h, z1.h
77
+    smlalt          z0.s, z1.h, z1.h
78
+    smlalb          z0.s, z2.h, z2.h
79
+    smlalt          z0.s, z2.h, z2.h
80
+.rept 31
81
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
83
+    add             x0, x0, x1
84
+    add             x2, x2, x3
85
+    usublb          z1.h, z16.b, z18.b
86
+    usublt          z2.h, z16.b, z18.b
87
+    smullb          z0.s, z1.h, z1.h
88
+    smlalt          z0.s, z1.h, z1.h
89
+    smlalb          z0.s, z2.h, z2.h
90
+    smlalt          z0.s, z2.h, z2.h
91
+.endr
92
+    uaddv           d3, p0, z0.s
93
+    fmov            w0, s3
94
+    ret
95
+endfunc
96
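The usublb/usublt plus smlalb/smlalt sequences above split each byte vector into even ("bottom") and odd ("top") lanes, widen, and square-accumulate without separate unpack steps. A scalar sketch of one 32-byte row; the per-lane grouping differs from the hardware registers, but the accumulated totals match:

    #include <stdint.h>

    static void sse_row_bt_model(const uint8_t a[32], const uint8_t b[32],
                                 uint32_t *acc_b, uint32_t *acc_t)
    {
        for (int i = 0; i < 32; i += 2)
        {
            int16_t db = (int16_t)(a[i] - b[i]);          /* usublb: even lanes */
            int16_t dt = (int16_t)(a[i + 1] - b[i + 1]);  /* usublt: odd lanes  */
            *acc_b += (uint32_t)(db * db);                /* smlalb/smlalt      */
            *acc_t += (uint32_t)(dt * dt);
        }
    }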
+
97
+function PFX(pixel_sse_pp_32x64_sve2)
98
+    rdvl            x9, #1
99
+    cmp             x9, #16
100
+    bgt             .vl_gt_16_pixel_sse_pp_32x64
101
+    ptrue           p0.b, vl16
102
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
106
+    add             x0, x0, x1
107
+    add             x2, x2, x3
108
+    usublb          z1.h, z16.b, z18.b
109
+    usublt          z2.h, z16.b, z18.b
110
+    usublb          z3.h, z17.b, z19.b
111
+    usublt          z4.h, z17.b, z19.b
112
+    smullb          z20.s, z1.h, z1.h
113
+    smullt          z21.s, z1.h, z1.h
114
+    smlalb          z20.s, z2.h, z2.h
115
+    smlalt          z21.s, z2.h, z2.h
116
+    smlalb          z20.s, z3.h, z3.h
117
+    smlalt          z21.s, z3.h, z3.h
118
+    smlalb          z20.s, z4.h, z4.h
119
+    smlalt          z21.s, z4.h, z4.h
120
+.rept 63
121
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
125
+    add             x0, x0, x1
126
+    add             x2, x2, x3
127
+    usublb          z1.h, z16.b, z18.b
128
+    usublt          z2.h, z16.b, z18.b
129
+    usublb          z3.h, z17.b, z19.b
130
+    usublt          z4.h, z17.b, z19.b
131
+    smlalb          z20.s, z1.h, z1.h
132
+    smlalt          z21.s, z1.h, z1.h
133
+    smlalb          z20.s, z2.h, z2.h
134
+    smlalt          z21.s, z2.h, z2.h
135
+    smlalb          z20.s, z3.h, z3.h
136
+    smlalt          z21.s, z3.h, z3.h
137
+    smlalb          z20.s, z4.h, z4.h
138
+    smlalt          z21.s, z4.h, z4.h
139
+.endr
140
+    uaddv           d3, p0, z20.s
141
+    fmov            w0, s3
142
+    uaddv           d4, p0, z21.s
143
+    fmov            w1, s4
144
+    add             w0, w0, w1
145
+    ret
146
+.vl_gt_16_pixel_sse_pp_32x64:
147
+    ptrue           p0.b, vl32
148
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
150
+    add             x0, x0, x1
151
+    add             x2, x2, x3
152
+    usublb          z1.h, z16.b, z18.b
153
+    usublt          z2.h, z16.b, z18.b
154
+    smullb          z20.s, z1.h, z1.h
155
+    smullt          z21.s, z1.h, z1.h
156
+    smlalb          z20.s, z2.h, z2.h
157
+    smlalt          z21.s, z2.h, z2.h
158
+.rept 63
159
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
161
+    add             x0, x0, x1
162
+    add             x2, x2, x3
163
+    usublb          z1.h, z16.b, z18.b
164
+    usublt          z2.h, z16.b, z18.b
165
+    smlalb          z20.s, z1.h, z1.h
166
+    smlalt          z21.s, z1.h, z1.h
167
+    smlalb          z20.s, z2.h, z2.h
168
+    smlalt          z21.s, z2.h, z2.h
169
+.endr
170
+    uaddv           d3, p0, z20.s
171
+    fmov            w0, s3
172
+    uaddv           d4, p0, z21.s
173
+    fmov            w1, s4
174
+    add             w0, w0, w1
175
+    ret
176
+endfunc
177
+
178
+function PFX(pixel_sse_pp_64x64_sve2)
179
+    rdvl            x9, #1
180
+    cmp             x9, #16
181
+    bgt             .vl_gt_16_pixel_sse_pp_64x64
182
+    mov             w12, #16
183
+    movi            v0.16b, #0
184
+    movi            v1.16b, #0
185
+
186
+.loop_sse_pp_64_sve2:
187
+    sub             w12, w12, #1
188
+.rept 4
189
+    ld1             {v16.16b-v19.16b}, [x0], x1
+    ld1             {v20.16b-v23.16b}, [x2], x3
191
+
192
+    usubl           v2.8h, v16.8b, v20.8b
193
+    usubl2          v3.8h, v16.16b, v20.16b
194
+    usubl           v4.8h, v17.8b, v21.8b
195
+    usubl2          v5.8h, v17.16b, v21.16b
196
+    smlal           v0.4s, v2.4h, v2.4h
197
+    smlal2          v1.4s, v2.8h, v2.8h
198
+    smlal           v0.4s, v3.4h, v3.4h
199
+    smlal2          v1.4s, v3.8h, v3.8h
200
+    smlal           v0.4s, v4.4h, v4.4h
201
+    smlal2          v1.4s, v4.8h, v4.8h
202
+    smlal           v0.4s, v5.4h, v5.4h
203
+    smlal2          v1.4s, v5.8h, v5.8h
204
+
205
+    usubl           v2.8h, v18.8b, v22.8b
206
+    usubl2          v3.8h, v18.16b, v22.16b
207
+    usubl           v4.8h, v19.8b, v23.8b
208
+    usubl2          v5.8h, v19.16b, v23.16b
209
+    smlal           v0.4s, v2.4h, v2.4h
210
+    smlal2          v1.4s, v2.8h, v2.8h
211
+    smlal           v0.4s, v3.4h, v3.4h
212
+    smlal2          v1.4s, v3.8h, v3.8h
213
+    smlal           v0.4s, v4.4h, v4.4h
214
+    smlal2          v1.4s, v4.8h, v4.8h
215
+    smlal           v0.4s, v5.4h, v5.4h
216
+    smlal2          v1.4s, v5.8h, v5.8h
217
+.endr
218
+    cbnz            w12, .loop_sse_pp_64_sve2
219
+    add             v0.4s, v0.4s, v1.4s
220
+    ret_v0_w0
221
+.vl_gt_16_pixel_sse_pp_64x64:
222
+    cmp             x9, #48
223
+    bgt             .vl_gt_48_pixel_sse_pp_64x64
224
+    ptrue           p0.b, vl32
225
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z20.b}, p0/z, [x2]
+    ld1b            {z21.b}, p0/z, [x2, #1, mul vl]
229
+    add             x0, x0, x1
230
+    add             x2, x2, x3
231
+    usublb          z1.h, z16.b, z20.b
232
+    usublt          z2.h, z16.b, z20.b
233
+    usublb          z3.h, z17.b, z21.b
234
+    usublt          z4.h, z17.b, z21.b
235
+    smullb          z24.s, z1.h, z1.h
236
+    smullt          z25.s, z1.h, z1.h
237
+    smlalb          z24.s, z2.h, z2.h
238
+    smlalt          z25.s, z2.h, z2.h
239
+    smlalb          z24.s, z3.h, z3.h
240
+    smlalt          z25.s, z3.h, z3.h
241
+    smlalb          z24.s, z4.h, z4.h
242
+    smlalt          z25.s, z4.h, z4.h
243
+.rept 63
244
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z20.b}, p0/z, [x2]
+    ld1b            {z21.b}, p0/z, [x2, #1, mul vl]
248
+    add             x0, x0, x1
249
+    add             x2, x2, x3
250
+    usublb          z1.h, z16.b, z20.b
251
+    usublt          z2.h, z16.b, z20.b
252
+    usublb          z3.h, z17.b, z21.b
253
+    usublt          z4.h, z17.b, z21.b
254
+    smlalb          z24.s, z1.h, z1.h
255
+    smlalt          z25.s, z1.h, z1.h
256
+    smlalb          z24.s, z2.h, z2.h
257
+    smlalt          z25.s, z2.h, z2.h
258
+    smlalb          z24.s, z3.h, z3.h
259
+    smlalt          z25.s, z3.h, z3.h
260
+    smlalb          z24.s, z4.h, z4.h
261
+    smlalt          z25.s, z4.h, z4.h
262
+.endr
263
+    uaddv           d3, p0, z24.s
264
+    fmov            w0, s3
265
+    uaddv           d4, p0, z25.s
266
+    fmov            w1, s4
267
+    add             w0, w0, w1
268
+    ret
269
+.vl_gt_48_pixel_sse_pp_64x64:
270
+    ptrue           p0.b, vl64
271
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z20.b}, p0/z, [x2]
273
+    add             x0, x0, x1
274
+    add             x2, x2, x3
275
+    usublb          z1.h, z16.b, z20.b
276
+    usublt          z2.h, z16.b, z20.b
277
+    smullb          z24.s, z1.h, z1.h
278
+    smullt          z25.s, z1.h, z1.h
279
+    smlalb          z24.s, z2.h, z2.h
280
+    smlalt          z25.s, z2.h, z2.h
281
+.rept 63
282
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z20.b}, p0/z, [x2]
284
+    add             x0, x0, x1
285
+    add             x2, x2, x3
286
+    usublb          z1.h, z16.b, z20.b
287
+    usublt          z2.h, z16.b, z20.b
288
+    smlalb          z24.s, z1.h, z1.h
289
+    smlalt          z25.s, z1.h, z1.h
290
+    smlalb          z24.s, z2.h, z2.h
291
+    smlalt          z25.s, z2.h, z2.h
292
+.endr
293
+    uaddv           d3, p0, z24.s
294
+    fmov            w0, s3
295
+    uaddv           d4, p0, z25.s
296
+    fmov            w1, s4
297
+    add             w0, w0, w1
298
+    ret
299
+endfunc
300
+
301
+function PFX(pixel_sse_ss_4x4_sve2)
302
+    ptrue           p0.b, vl8
303
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x2]
305
+    add             x0, x0, x1, lsl #1
306
+    add             x2, x2, x3, lsl #1
307
+    sub             z1.h, z16.h, z17.h
308
+    smullb          z3.s, z1.h, z1.h
309
+    smullt          z4.s, z1.h, z1.h
310
+.rept 3
311
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x2]
313
+    add             x0, x0, x1, lsl #1
314
+    add             x2, x2, x3, lsl #1
315
+    sub             z1.h, z16.h, z17.h
316
+    smlalb          z3.s, z1.h, z1.h
317
+    smlalt          z4.s, z1.h, z1.h
318
+.endr
319
+    uaddv           d3, p0, z3.s
320
+    fmov            w0, s3
321
+    uaddv           d4, p0, z4.s
322
+    fmov            w1, s4
323
+    add             w0, w0, w1
324
+    ret
325
+endfunc
326
+
327
+function PFX(pixel_sse_ss_8x8_sve2)
328
+    ptrue           p0.b, vl16
329
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x2]
331
+    add             x0, x0, x1, lsl #1
332
+    add             x2, x2, x3, lsl #1
333
+    sub             z1.h, z16.h, z17.h
334
+    smullb          z3.s, z1.h, z1.h
335
+    smullt          z4.s, z1.h, z1.h
336
+.rept 7
337
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x2]
339
+    add             x0, x0, x1, lsl #1
340
+    add             x2, x2, x3, lsl #1
341
+    sub             z1.h, z16.h, z17.h
342
+    smlalb          z3.s, z1.h, z1.h
343
+    smlalt          z4.s, z1.h, z1.h
344
+.endr
345
+    uaddv           d3, p0, z3.s
346
+    fmov            w0, s3
347
+    uaddv           d4, p0, z4.s
348
+    fmov            w1, s4
349
+    add             w0, w0, w1
350
+    ret
351
+endfunc
352
+
353
+function PFX(pixel_sse_ss_16x16_sve2)
354
+    rdvl            x9, #1
355
+    cmp             x9, #16
356
+    bgt             .vl_gt_16_pixel_sse_ss_16x16
357
+    ptrue           p0.b, vl16
358
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
362
+    add             x0, x0, x1, lsl #1
363
+    add             x2, x2, x3, lsl #1
364
+    sub             z1.h, z16.h, z18.h
365
+    sub             z2.h, z17.h, z19.h
366
+    smullb          z3.s, z1.h, z1.h
367
+    smullt          z4.s, z1.h, z1.h
368
+    smlalb          z3.s, z2.h, z2.h
369
+    smlalt          z4.s, z2.h, z2.h
370
+.rept 15
371
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x2]
+    ld1b            {z19.b}, p0/z, [x2, #1, mul vl]
375
+    add             x0, x0, x1, lsl #1
376
+    add             x2, x2, x3, lsl #1
377
+    sub             z1.h, z16.h, z18.h
378
+    sub             z2.h, z17.h, z19.h
379
+    smlalb          z3.s, z1.h, z1.h
380
+    smlalt          z4.s, z1.h, z1.h
381
+    smlalb          z3.s, z2.h, z2.h
382
+    smlalt          z4.s, z2.h, z2.h
383
+.endr
384
+    uaddv           d3, p0, z3.s
385
+    fmov            w0, s3
386
+    uaddv           d4, p0, z4.s
387
+    fmov            w1, s4
388
+    add             w0, w0, w1
389
+    ret
390
+.vl_gt_16_pixel_sse_ss_16x16:
391
+    ptrue           p0.b, vl32
392
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
394
+    add             x0, x0, x1, lsl #1
395
+    add             x2, x2, x3, lsl #1
396
+    sub             z1.h, z16.h, z18.h
397
+    smullb          z3.s, z1.h, z1.h
398
+    smullt          z4.s, z1.h, z1.h
399
+.rept 15
400
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z18.b}, p0/z, [x2]
402
+    add             x0, x0, x1, lsl #1
403
+    add             x2, x2, x3, lsl #1
404
+    sub             z1.h, z16.h, z18.h
405
+    smlalb          z3.s, z1.h, z1.h
406
+    smlalt          z4.s, z1.h, z1.h
407
+.endr
408
+    uaddv           d3, p0, z3.s
409
+    fmov            w0, s3
410
+    uaddv           d4, p0, z4.s
411
+    fmov            w1, s4
412
+    add             w0, w0, w1
413
+    ret
414
+endfunc
415
+
416
+function PFX(pixel_sse_ss_32x32_sve2)
417
+    rdvl            x9, #1
418
+    cmp             x9, #16
419
+    bgt             .vl_gt_16_pixel_sse_ss_32x32
420
+    ptrue           p0.b, vl16
421
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x0, #2, mul vl]
+    ld1b            {z19.b}, p0/z, [x0, #3, mul vl]
+    ld1b            {z20.b}, p0/z, [x2]
+    ld1b            {z21.b}, p0/z, [x2, #1, mul vl]
+    ld1b            {z22.b}, p0/z, [x2, #2, mul vl]
+    ld1b            {z23.b}, p0/z, [x2, #3, mul vl]
429
+    add             x0, x0, x1, lsl #1
430
+    add             x2, x2, x3, lsl #1
431
+    sub             z1.h, z16.h, z20.h
432
+    sub             z2.h, z17.h, z21.h
433
+    sub             z3.h, z18.h, z22.h
434
+    sub             z4.h, z19.h, z23.h
435
+    smullb          z5.s, z1.h, z1.h
436
+    smullt          z6.s, z1.h, z1.h
437
+    smlalb          z5.s, z2.h, z2.h
438
+    smlalt          z6.s, z2.h, z2.h
439
+    smlalb          z5.s, z3.h, z3.h
440
+    smlalt          z6.s, z3.h, z3.h
441
+    smlalb          z5.s, z4.h, z4.h
442
+    smlalt          z6.s, z4.h, z4.h
443
+.rept 31
444
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z18.b}, p0/z, [x0, #2, mul vl]
+    ld1b            {z19.b}, p0/z, [x0, #3, mul vl]
+    ld1b            {z20.b}, p0/z, [x2]
+    ld1b            {z21.b}, p0/z, [x2, #1, mul vl]
+    ld1b            {z22.b}, p0/z, [x2, #2, mul vl]
+    ld1b            {z23.b}, p0/z, [x2, #3, mul vl]
452
+    add             x0, x0, x1, lsl #1
453
+    add             x2, x2, x3, lsl #1
454
+    sub             z1.h, z16.h, z20.h
455
+    sub             z2.h, z17.h, z21.h
456
+    sub             z3.h, z18.h, z22.h
457
+    sub             z4.h, z19.h, z23.h
458
+    smlalb          z5.s, z1.h, z1.h
459
+    smlalt          z6.s, z1.h, z1.h
460
+    smlalb          z5.s, z2.h, z2.h
461
+    smlalt          z6.s, z2.h, z2.h
462
+    smlalb          z5.s, z3.h, z3.h
463
+    smlalt          z6.s, z3.h, z3.h
464
+    smlalb          z5.s, z4.h, z4.h
465
+    smlalt          z6.s, z4.h, z4.h
466
+.endr
467
+    uaddv           d3, p0, z5.s
468
+    fmov            w0, s3
469
+    uaddv           d4, p0, z6.s
470
+    fmov            w1, s4
471
+    add             w0, w0, w1
472
+    ret
473
+.vl_gt_16_pixel_sse_ss_32x32:
474
+    cmp             x9, #48
475
+    bgt             .vl_gt_48_pixel_sse_ss_32x32
476
+    ptrue           p0.b, vl32
477
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z20.b}, p0/z, [x2]
+    ld1b            {z21.b}, p0/z, [x2, #1, mul vl]
481
+    add             x0, x0, x1, lsl #1
482
+    add             x2, x2, x3, lsl #1
483
+    sub             z1.h, z16.h, z20.h
484
+    sub             z2.h, z17.h, z21.h
485
+    smullb          z5.s, z1.h, z1.h
486
+    smullt          z6.s, z1.h, z1.h
487
+    smlalb          z5.s, z2.h, z2.h
488
+    smlalt          z6.s, z2.h, z2.h
489
+.rept 31
490
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
+    ld1b            {z20.b}, p0/z, [x2]
+    ld1b            {z21.b}, p0/z, [x2, #1, mul vl]
494
+    add             x0, x0, x1, lsl #1
495
+    add             x2, x2, x3, lsl #1
496
+    sub             z1.h, z16.h, z20.h
497
+    sub             z2.h, z17.h, z21.h
498
+    smlalb          z5.s, z1.h, z1.h
499
+    smlalt          z6.s, z1.h, z1.h
500
+    smlalb          z5.s, z2.h, z2.h
501
+    smlalt          z6.s, z2.h, z2.h
502
+.endr
503
+    uaddv           d3, p0, z5.s
504
+    fmov            w0, s3
505
+    uaddv           d4, p0, z6.s
506
+    fmov            w1, s4
507
+    add             w0, w0, w1
508
+    ret
509
+.vl_gt_48_pixel_sse_ss_32x32:
510
+    ptrue           p0.b, vl64
511
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z20.b}, p0/z, [x2]
513
+    add             x0, x0, x1, lsl #1
514
+    add             x2, x2, x3, lsl #1
515
+    sub             z1.h, z16.h, z20.h
516
+    smullb          z5.s, z1.h, z1.h
517
+    smullt          z6.s, z1.h, z1.h
518
+.rept 31
519
+    ld1b            {z16.b}, p0/z, [x0]
+    ld1b            {z20.b}, p0/z, [x2]
521
+    add             x0, x0, x1, lsl #1
522
+    add             x2, x2, x3, lsl #1
523
+    sub             z1.h, z16.h, z20.h
524
+    smlalb          z5.s, z1.h, z1.h
525
+    smlalt          z6.s, z1.h, z1.h
526
+.endr
527
+    uaddv           d3, p0, z5.s
528
+    fmov            w0, s3
529
+    uaddv           d4, p0, z6.s
530
+    fmov            w1, s4
531
+    add             w0, w0, w1
532
+    ret
533
+endfunc
534
+
535
+function PFX(pixel_sse_ss_64x64_sve2)
536
+    rdvl            x9, #1
537
+    cmp             x9, #16
538
+    bgt             .vl_gt_16_pixel_sse_ss_64x64
539
+    ptrue           p0.b, vl16
540
+    ld1b            {z24.b}, p0/z, x0
541
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
542
+    ld1b            {z26.b}, p0/z, x0, #2, mul vl
543
+    ld1b            {z27.b}, p0/z, x0, #3, mul vl
544
+    ld1b            {z28.b}, p0/z, x2
545
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
546
+    ld1b            {z30.b}, p0/z, x2, #2, mul vl
547
+    ld1b            {z31.b}, p0/z, x2, #3, mul vl
548
+    sub             z0.h, z24.h, z28.h
549
+    sub             z1.h, z25.h, z29.h
550
+    sub             z2.h, z26.h, z30.h
551
+    sub             z3.h, z27.h, z31.h
552
+    smullb          z5.s, z0.h, z0.h
553
+    smullt          z6.s, z0.h, z0.h
554
+    smlalb          z5.s, z1.h, z1.h
555
+    smlalt          z6.s, z1.h, z1.h
556
+    smlalb          z5.s, z2.h, z2.h
557
+    smlalt          z6.s, z2.h, z2.h
558
+    smlalb          z5.s, z3.h, z3.h
559
+    smlalt          z6.s, z3.h, z3.h
560
+    ld1b            {z24.b}, p0/z, x0, #4, mul vl
561
+    ld1b            {z25.b}, p0/z, x0, #5, mul vl
562
+    ld1b            {z26.b}, p0/z, x0, #6, mul vl
563
+    ld1b            {z27.b}, p0/z, x0, #7, mul vl
564
+    ld1b            {z28.b}, p0/z, x2, #4, mul vl
565
+    ld1b            {z29.b}, p0/z, x2, #5, mul vl
566
+    ld1b            {z30.b}, p0/z, x2, #6, mul vl
567
+    ld1b            {z31.b}, p0/z, x2, #7, mul vl
568
+    sub             z0.h, z24.h, z28.h
569
+    sub             z1.h, z25.h, z29.h
570
+    sub             z2.h, z26.h, z30.h
571
+    sub             z3.h, z27.h, z31.h
572
+    smlalb          z5.s, z0.h, z0.h
573
+    smlalt          z6.s, z0.h, z0.h
574
+    smlalb          z5.s, z1.h, z1.h
575
+    smlalt          z6.s, z1.h, z1.h
576
+    smlalb          z5.s, z2.h, z2.h
577
+    smlalt          z6.s, z2.h, z2.h
578
+    smlalb          z5.s, z3.h, z3.h
579
+    smlalt          z6.s, z3.h, z3.h
580
+    add             x0, x0, x1, lsl #1
581
+    add             x2, x2, x3, lsl #1
582
+.rept 63
583
+    ld1b            {z24.b}, p0/z, x0
584
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
585
+    ld1b            {z26.b}, p0/z, x0, #2, mul vl
586
+    ld1b            {z27.b}, p0/z, x0, #3, mul vl
587
+    ld1b            {z28.b}, p0/z, x2
588
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
589
+    ld1b            {z30.b}, p0/z, x2, #2, mul vl
590
+    ld1b            {z31.b}, p0/z, x2, #3, mul vl
591
+    sub             z0.h, z24.h, z28.h
592
+    sub             z1.h, z25.h, z29.h
593
+    sub             z2.h, z26.h, z30.h
594
+    sub             z3.h, z27.h, z31.h
595
+    smlalb          z5.s, z0.h, z0.h
596
+    smlalt          z6.s, z0.h, z0.h
597
+    smlalb          z5.s, z1.h, z1.h
598
+    smlalt          z6.s, z1.h, z1.h
599
+    smlalb          z5.s, z2.h, z2.h
600
+    smlalt          z6.s, z2.h, z2.h
601
+    smlalb          z5.s, z3.h, z3.h
602
+    smlalt          z6.s, z3.h, z3.h
603
+    ld1b            {z24.b}, p0/z, x0, #4, mul vl
604
+    ld1b            {z25.b}, p0/z, x0, #5, mul vl
605
+    ld1b            {z26.b}, p0/z, x0, #6, mul vl
606
+    ld1b            {z27.b}, p0/z, x0, #7, mul vl
607
+    ld1b            {z28.b}, p0/z, x2, #4, mul vl
608
+    ld1b            {z29.b}, p0/z, x2, #5, mul vl
609
+    ld1b            {z30.b}, p0/z, x2, #6, mul vl
610
+    ld1b            {z31.b}, p0/z, x2, #7, mul vl
611
+    sub             z0.h, z24.h, z28.h
612
+    sub             z1.h, z25.h, z29.h
613
+    sub             z2.h, z26.h, z30.h
614
+    sub             z3.h, z27.h, z31.h
615
+    smlalb          z5.s, z0.h, z0.h
616
+    smlalt          z6.s, z0.h, z0.h
617
+    smlalb          z5.s, z1.h, z1.h
618
+    smlalt          z6.s, z1.h, z1.h
619
+    smlalb          z5.s, z2.h, z2.h
620
+    smlalt          z6.s, z2.h, z2.h
621
+    smlalb          z5.s, z3.h, z3.h
622
+    smlalt          z6.s, z3.h, z3.h
623
+    add             x0, x0, x1, lsl #1
624
+    add             x2, x2, x3, lsl #1
625
+.endr
626
+    uaddv           d3, p0, z5.s
627
+    fmov            w0, s3
628
+    uaddv           d4, p0, z6.s
629
+    fmov            w1, s4
630
+    add             w0, w0, w1
631
+    ret
632
+.vl_gt_16_pixel_sse_ss_64x64:
633
+    cmp             x9, #48
634
+    bgt             .vl_gt_48_pixel_sse_ss_64x64
635
+    ptrue           p0.b, vl32
636
+    ld1b            {z24.b}, p0/z, x0
637
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
638
+    ld1b            {z28.b}, p0/z, x2
639
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
640
+    sub             z0.h, z24.h, z28.h
641
+    sub             z1.h, z25.h, z29.h
642
+    smullb          z5.s, z0.h, z0.h
643
+    smullt          z6.s, z0.h, z0.h
644
+    smlalb          z5.s, z1.h, z1.h
645
+    smlalt          z6.s, z1.h, z1.h
646
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
647
+    ld1b            {z25.b}, p0/z, x0, #2, mul vl
648
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
649
+    ld1b            {z29.b}, p0/z, x2, #2, mul vl
650
+    sub             z0.h, z24.h, z28.h
651
+    sub             z1.h, z25.h, z29.h
652
+    smlalb          z5.s, z0.h, z0.h
653
+    smlalt          z6.s, z0.h, z0.h
654
+    smlalb          z5.s, z1.h, z1.h
655
+    smlalt          z6.s, z1.h, z1.h
656
+    add             x0, x0, x1, lsl #1
657
+    add             x2, x2, x3, lsl #1
658
+.rept 63
659
+    ld1b            {z24.b}, p0/z, x0
660
+    ld1b            {z25.b}, p0/z, x0, #1, mul vl
661
+    ld1b            {z28.b}, p0/z, x2
662
+    ld1b            {z29.b}, p0/z, x2, #1, mul vl
663
+    sub             z0.h, z24.h, z28.h
664
+    sub             z1.h, z25.h, z29.h
665
+    smlalb          z5.s, z0.h, z0.h
666
+    smlalt          z6.s, z0.h, z0.h
667
+    smlalb          z5.s, z1.h, z1.h
668
+    smlalt          z6.s, z1.h, z1.h
669
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
670
+    ld1b            {z25.b}, p0/z, x0, #2, mul vl
671
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
672
+    ld1b            {z29.b}, p0/z, x2, #2, mul vl
673
+    sub             z0.h, z24.h, z28.h
674
+    sub             z1.h, z25.h, z29.h
675
+    smlalb          z5.s, z0.h, z0.h
676
+    smlalt          z6.s, z0.h, z0.h
677
+    smlalb          z5.s, z1.h, z1.h
678
+    smlalt          z6.s, z1.h, z1.h
679
+    add             x0, x0, x1, lsl #1
680
+    add             x2, x2, x3, lsl #1
681
+.endr
682
+    uaddv           d3, p0, z5.s
683
+    fmov            w0, s3
684
+    uaddv           d4, p0, z6.s
685
+    fmov            w1, s4
686
+    add             w0, w0, w1
687
+    ret
688
+.vl_gt_48_pixel_sse_ss_64x64:
689
+    cmp             x9, #112
690
+    bgt             .vl_gt_112_pixel_sse_ss_64x64
691
+    ptrue           p0.b, vl64
692
+    ld1b            {z24.b}, p0/z, x0
693
+    ld1b            {z28.b}, p0/z, x2
694
+    sub             z0.h, z24.h, z28.h
695
+    smullb          z5.s, z0.h, z0.h
696
+    smullt          z6.s, z0.h, z0.h
697
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
698
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
699
+    sub             z0.h, z24.h, z28.h
700
+    smlalb          z5.s, z0.h, z0.h
701
+    smlalt          z6.s, z0.h, z0.h
702
+    add             x0, x0, x1, lsl #1
703
+    add             x2, x2, x3, lsl #1
704
+.rept 63
705
+    ld1b            {z24.b}, p0/z, x0
706
+    ld1b            {z28.b}, p0/z, x2
707
+    sub             z0.h, z24.h, z28.h
708
+    smlalb          z5.s, z0.h, z0.h
709
+    smlalt          z6.s, z0.h, z0.h
710
+    ld1b            {z24.b}, p0/z, x0, #1, mul vl
711
+    ld1b            {z28.b}, p0/z, x2, #1, mul vl
712
+    sub             z0.h, z24.h, z28.h
713
+    smlalb          z5.s, z0.h, z0.h
714
+    smlalt          z6.s, z0.h, z0.h
715
+    add             x0, x0, x1, lsl #1
716
+    add             x2, x2, x3, lsl #1
717
+.endr
718
+    uaddv           d3, p0, z5.s
719
+    fmov            w0, s3
720
+    uaddv           d4, p0, z6.s
721
+    fmov            w1, s4
722
+    add             w0, w0, w1
723
+    ret
724
+.vl_gt_112_pixel_sse_ss_64x64:
725
+    ptrue           p0.b, vl128
726
+    ld1b            {z24.b}, p0/z, x0
727
+    ld1b            {z28.b}, p0/z, x2
728
+    sub             z0.h, z24.h, z28.h
729
+    smullb          z5.s, z0.h, z0.h
730
+    smullt          z6.s, z0.h, z0.h
731
+    add             x0, x0, x1, lsl #1
732
+    add             x2, x2, x3, lsl #1
733
+.rept 63
734
+    ld1b            {z24.b}, p0/z, x0
735
+    ld1b            {z28.b}, p0/z, x2
736
+    sub             z0.h, z24.h, z28.h
737
+    smlalb          z5.s, z0.h, z0.h
738
+    smlalt          z6.s, z0.h, z0.h
739
+    add             x0, x0, x1, lsl #1
740
+    add             x2, x2, x3, lsl #1
741
+.endr
742
+    uaddv           d3, p0, z5.s
743
+    fmov            w0, s3
744
+    uaddv           d4, p0, z6.s
745
+    fmov            w1, s4
746
+    add             w0, w0, w1
747
+    ret
748
+endfunc
749
+
750
+function PFX(pixel_ssd_s_4x4_sve2)
751
+    ptrue           p0.b, vl8
752
+    ld1b            {z16.b}, p0/z, x0
753
+    add             x0, x0, x1, lsl #1
754
+    smullb          z0.s, z16.h, z16.h
755
+    smlalt          z0.s, z16.h, z16.h
756
+.rept 3
757
+    ld1b            {z16.b}, p0/z, x0
758
+    add             x0, x0, x1, lsl #1
759
+    smlalb          z0.s, z16.h, z16.h
760
+    smlalt          z0.s, z16.h, z16.h
761
+.endr
762
+    uaddv           d3, p0, z0.s
763
+    fmov            w0, s3
764
+    ret
765
+endfunc
766
+
767
+function PFX(pixel_ssd_s_8x8_sve2)
768
+    ptrue           p0.b, vl16
769
+    ld1b            {z16.b}, p0/z, x0
770
+    add             x0, x0, x1, lsl #1
771
+    smullb          z0.s, z16.h, z16.h
772
+    smlalt          z0.s, z16.h, z16.h
773
+.rept 7
774
+    ld1b            {z16.b}, p0/z, x0
775
+    add             x0, x0, x1, lsl #1
776
+    smlalb          z0.s, z16.h, z16.h
777
+    smlalt          z0.s, z16.h, z16.h
778
+.endr
779
+    uaddv           d3, p0, z0.s
780
+    fmov            w0, s3
781
+    ret
782
+endfunc
783
+
784
+function PFX(pixel_ssd_s_16x16_sve2)
785
+    rdvl            x9, #1
786
+    cmp             x9, #16
787
+    bgt             .vl_gt_16_pixel_ssd_s_16x16
788
+    add             x1, x1, x1
789
+    mov             w12, #4
790
+    movi            v0.16b, #0
791
+    movi            v1.16b, #0
792
+.loop_ssd_s_16_sve2:
793
+    sub             w12, w12, #1
794
+.rept 2
795
+    ld1             {v4.16b,v5.16b}, x0, x1
796
+    ld1             {v6.16b,v7.16b}, x0, x1
797
+    smlal           v0.4s, v4.4h, v4.4h
798
+    smlal2          v1.4s, v4.8h, v4.8h
799
+    smlal           v0.4s, v5.4h, v5.4h
800
+    smlal2          v1.4s, v5.8h, v5.8h
801
+    smlal           v0.4s, v6.4h, v6.4h
802
+    smlal2          v1.4s, v6.8h, v6.8h
803
+    smlal           v0.4s, v7.4h, v7.4h
804
+    smlal2          v1.4s, v7.8h, v7.8h
805
+.endr
806
+    cbnz            w12, .loop_ssd_s_16_sve2
807
+    add             v0.4s, v0.4s, v1.4s
808
+    ret_v0_w0
809
+.vl_gt_16_pixel_ssd_s_16x16:
810
+    ptrue           p0.b, vl32
811
+    ld1b            {z16.b}, p0/z, x0
812
+    add             x0, x0, x1, lsl #1
813
+    smullb          z0.s, z16.h, z16.h
814
+    smlalt          z0.s, z16.h, z16.h
815
+.rept 15
816
+    ld1b            {z16.b}, p0/z, x0
817
+    add             x0, x0, x1, lsl #1
818
+    smlalb          z0.s, z16.h, z16.h
819
+    smlalt          z0.s, z16.h, z16.h
820
+.endr
821
+    uaddv           d3, p0, z0.s
822
+    fmov            w0, s3
823
+    ret
824
+endfunc
825
+
826
+function PFX(pixel_ssd_s_32x32_sve2)
827
+    rdvl            x9, #1
828
+    cmp             x9, #16
829
+    bgt             .vl_gt_16_pixel_ssd_s_32x32
830
+    add             x1, x1, x1
831
+    mov             w12, #8
832
+    movi            v0.16b, #0
833
+    movi            v1.16b, #0
834
+.loop_ssd_s_32:
835
+    sub             w12, w12, #1
836
+.rept 4
837
+    ld1             {v4.16b-v7.16b}, x0, x1
838
+    smlal           v0.4s, v4.4h, v4.4h
839
+    smlal2          v1.4s, v4.8h, v4.8h
840
+    smlal           v0.4s, v5.4h, v5.4h
841
+    smlal2          v1.4s, v5.8h, v5.8h
842
+    smlal           v0.4s, v6.4h, v6.4h
843
+    smlal2          v1.4s, v6.8h, v6.8h
844
+    smlal           v0.4s, v7.4h, v7.4h
845
+    smlal2          v1.4s, v7.8h, v7.8h
846
+.endr
847
+    cbnz            w12, .loop_ssd_s_32
848
+    add             v0.4s, v0.4s, v1.4s
849
+    ret_v0_w0
850
+.vl_gt_16_pixel_ssd_s_32x32:
851
+    cmp             x9, #48
852
+    bgt             .vl_gt_48_pixel_ssd_s_32x32
853
+    ptrue           p0.b, vl32
854
+    ld1b            {z16.b}, p0/z, x0
855
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
856
+    add             x0, x0, x1, lsl #1
857
+    smullb          z0.s, z16.h, z16.h
858
+    smlalt          z0.s, z16.h, z16.h
859
+    smlalb          z0.s, z17.h, z17.h
860
+    smlalt          z0.s, z17.h, z17.h
861
+.rept 31
862
+    ld1b            {z16.b}, p0/z, x0
863
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
864
+    add             x0, x0, x1, lsl #1
865
+    smlalb          z0.s, z16.h, z16.h
866
+    smlalt          z0.s, z16.h, z16.h
867
+    smlalb          z0.s, z17.h, z17.h
868
+    smlalt          z0.s, z17.h, z17.h
869
+.endr
870
+    uaddv           d3, p0, z0.s
871
+    fmov            w0, s3
872
+    ret
873
+.vl_gt_48_pixel_ssd_s_32x32:
874
+    ptrue           p0.b, vl64
875
+    ld1b            {z16.b}, p0/z, x0
876
+    add             x0, x0, x1, lsl #1
877
+    smullb          z0.s, z16.h, z16.h
878
+    smlalt          z0.s, z16.h, z16.h
879
+.rept 31
880
+    ld1b            {z16.b}, p0/z, x0
881
+    add             x0, x0, x1, lsl #1
882
+    smlalb          z0.s, z16.h, z16.h
883
+    smlalt          z0.s, z16.h, z16.h
884
+.endr
885
+    uaddv           d3, p0, z0.s
886
+    fmov            w0, s3
887
+    ret
888
+endfunc
889
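A note on the SVE2 kernels above: each size variant reads the runtime vector length (rdvl x9, #1 yields the VL in bytes) and branches to a path predicated for 16-, 32-, 64- or 128-byte vectors, but every path accumulates the same quantity. The smullb/smullt and smlalb/smlalt pairs square-accumulate the even and odd 16-bit lanes into z5/z6, and uaddv folds each accumulator to a scalar. A plain C++ sketch of the value pixel_sse_ss_32x32 computes; the function name and the stride-in-elements convention are illustrative assumptions, not the x265 API:

    #include <cstdint>
    #include <cstddef>

    // Reference sum of squared differences over a 32x32 block of int16_t
    // residuals; the assembly splits this across two accumulators (z5/z6)
    // and reduces them with uaddv at the end.
    static uint32_t sse_ss_32x32_ref(const int16_t* a, ptrdiff_t strideA,
                                     const int16_t* b, ptrdiff_t strideB)
    {
        uint32_t sum = 0;
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                int d = a[x] - b[x];       // sub z1.h, z16.h, z20.h
                sum += (uint32_t)(d * d);  // smlalb/smlalt into z5.s/z6.s
            }
            a += strideA;                  // add x0, x0, x1, lsl #1 (bytes)
            b += strideB;
        }
        return sum;
    }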
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S Added

@@ -0,0 +1,476 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Sebastian Pop <spop@amazon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "ssd-a-common.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+function PFX(pixel_sse_pp_4x4_neon)
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    ld1             {v18.s}[0], [x0], x1
+    ld1             {v19.s}[0], [x2], x3
+    ld1             {v20.s}[0], [x0], x1
+    ld1             {v21.s}[0], [x2], x3
+    ld1             {v22.s}[0], [x0], x1
+    ld1             {v23.s}[0], [x2], x3
+
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl           v2.8h, v18.8b, v19.8b
+    usubl           v3.8h, v20.8b, v21.8b
+    usubl           v4.8h, v22.8b, v23.8b
+
+    smull           v0.4s, v1.4h, v1.4h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal           v0.4s, v4.4h, v4.4h
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_4x8_neon)
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.s}[0], [x0], x1
+    ld1             {v17.s}[0], [x2], x3
+    smull           v0.4s, v1.4h, v1.4h
+.rept 6
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.s}[0], [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    ld1             {v17.s}[0], [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_8x8_neon)
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+
+.rept 6
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_8x16_neon)
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+
+.rept 14
+    usubl           v1.8h, v16.8b, v17.8b
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.8b}, [x2], x3
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ret_v0_w0
+endfunc
+
+.macro sse_pp_16xN h
+function PFX(pixel_sse_pp_16x\h\()_neon)
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    smull           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+.rept \h - 2
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    ld1             {v16.16b}, [x0], x1
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    ld1             {v17.16b}, [x2], x3
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+.endr
+    usubl           v1.8h, v16.8b, v17.8b
+    usubl2          v2.8h, v16.16b, v17.16b
+    smlal           v0.4s, v1.4h, v1.4h
+    smlal2          v0.4s, v1.8h, v1.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v0.4s, v2.8h, v2.8h
+    ret_v0_w0
+endfunc
+.endm
+
+sse_pp_16xN 16
+sse_pp_16xN 32
+
+function PFX(pixel_sse_pp_32x32_neon)
+    mov             w12, #8
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_32
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_32x64_neon)
+    mov             w12, #16
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_pp_32x64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b,v17.16b}, [x0], x1
+    ld1             {v18.16b,v19.16b}, [x2], x3
+    usubl           v2.8h, v16.8b, v18.8b
+    usubl2          v3.8h, v16.16b, v18.16b
+    usubl           v4.8h, v17.8b, v19.8b
+    usubl2          v5.8h, v17.16b, v19.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_32x64
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_pp_64x64_neon)
+    mov             w12, #16
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+
+.loop_sse_pp_64:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v19.16b}, [x0], x1
+    ld1             {v20.16b-v23.16b}, [x2], x3
+
+    usubl           v2.8h, v16.8b, v20.8b
+    usubl2          v3.8h, v16.16b, v20.16b
+    usubl           v4.8h, v17.8b, v21.8b
+    usubl2          v5.8h, v17.16b, v21.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+
+    usubl           v2.8h, v18.8b, v22.8b
+    usubl2          v3.8h, v18.16b, v22.16b
+    usubl           v4.8h, v19.8b, v23.8b
+    usubl2          v5.8h, v19.16b, v23.16b
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_pp_64
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_ss_4x4_neon)
+    add             x1, x1, x1
+    add             x3, x3, x3
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    sub             v2.4h, v16.4h, v17.4h
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    smull           v0.4s, v2.4h, v2.4h
+    sub             v2.4h, v16.4h, v17.4h
+    ld1             {v16.8b}, [x0], x1
+    ld1             {v17.8b}, [x2], x3
+    smlal           v0.4s, v2.4h, v2.4h
+    sub             v2.4h, v16.4h, v17.4h
+    ld1             {v16.8b}, [x0], x1
+    smlal           v0.4s, v2.4h, v2.4h
+    ld1             {v17.8b}, [x2], x3
+    sub             v2.4h, v16.4h, v17.4h
+    smlal           v0.4s, v2.4h, v2.4h
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_ss_8x8_neon)
+    add             x1, x1, x1
+    add             x3, x3, x3
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    sub             v2.8h, v16.8h, v17.8h
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    smull           v0.4s, v2.4h, v2.4h
+    smull2          v1.4s, v2.8h, v2.8h
+    sub             v2.8h, v16.8h, v17.8h
+.rept 6
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x2], x3
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    sub             v2.8h, v16.8h, v17.8h
+.endr
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_ss_16x16_neon)
+    add             x1, x1, x1
+    add             x3, x3, x3
+    mov             w12, #4
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_ss_16:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b, v17.16b}, [x0], x1
+    ld1             {v18.16b, v19.16b}, [x2], x3
+    sub             v2.8h, v16.8h, v18.8h
+    sub             v3.8h, v17.8h, v19.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+.endr
+    cbnz            w12, .loop_sse_ss_16
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_ss_32x32_neon)
+    add             x1, x1, x1
+    add             x3, x3, x3
+
+    mov             w12, #8
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_ss_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v16.16b-v19.16b}, [x0], x1
+    ld1             {v20.16b-v23.16b}, [x2], x3
+    sub             v2.8h, v16.8h, v20.8h
+    sub             v3.8h, v17.8h, v21.8h
+    sub             v4.8h, v18.8h, v22.8h
+    sub             v5.8h, v19.8h, v23.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_ss_32
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_sse_ss_64x64_neon)
+    add             x1, x1, x1
+    add             x3, x3, x3
+    sub             x1, x1, #64
+    sub             x3, x3, #64
+
+    mov             w12, #32
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_sse_ss_64:
+    sub             w12, w12, #1
+.rept 2
+    ld1             {v16.16b-v19.16b}, [x0], #64
+    ld1             {v20.16b-v23.16b}, [x2], #64
+    sub             v2.8h, v16.8h, v20.8h
+    sub             v3.8h, v17.8h, v21.8h
+    sub             v4.8h, v18.8h, v22.8h
+    sub             v5.8h, v19.8h, v23.8h
+    ld1             {v16.16b-v19.16b}, [x0], x1
+    ld1             {v20.16b-v23.16b}, [x2], x3
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+    sub             v2.8h, v16.8h, v20.8h
+    sub             v3.8h, v17.8h, v21.8h
+    sub             v4.8h, v18.8h, v22.8h
+    sub             v5.8h, v19.8h, v23.8h
+    smlal           v0.4s, v2.4h, v2.4h
+    smlal2          v1.4s, v2.8h, v2.8h
+    smlal           v0.4s, v3.4h, v3.4h
+    smlal2          v1.4s, v3.8h, v3.8h
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    cbnz            w12, .loop_sse_ss_64
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_ssd_s_4x4_neon)
+    add             x1, x1, x1
+    ld1             {v4.8b}, [x0], x1
+    ld1             {v5.8b}, [x0], x1
+    ld1             {v6.8b}, [x0], x1
+    ld1             {v7.8b}, [x0]
+    smull           v0.4s, v4.4h, v4.4h
+    smull           v1.4s, v5.4h, v5.4h
+    smlal           v0.4s, v6.4h, v6.4h
+    smlal           v1.4s, v7.4h, v7.4h
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_ssd_s_8x8_neon)
+    add             x1, x1, x1
+    ld1             {v4.16b}, [x0], x1
+    ld1             {v5.16b}, [x0], x1
+    smull           v0.4s, v4.4h, v4.4h
+    smull2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.rept 3
+    ld1             {v4.16b}, [x0], x1
+    ld1             {v5.16b}, [x0], x1
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+.endr
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_ssd_s_16x16_neon)
+    add             x1, x1, x1
+    mov             w12, #4
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_ssd_s_16:
+    sub             w12, w12, #1
+.rept 2
+    ld1             {v4.16b,v5.16b}, [x0], x1
+    ld1             {v6.16b,v7.16b}, [x0], x1
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+    smlal           v0.4s, v6.4h, v6.4h
+    smlal2          v1.4s, v6.8h, v6.8h
+    smlal           v0.4s, v7.4h, v7.4h
+    smlal2          v1.4s, v7.8h, v7.8h
+.endr
+    cbnz            w12, .loop_ssd_s_16
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
+
+function PFX(pixel_ssd_s_32x32_neon)
+    add             x1, x1, x1
+    mov             w12, #8
+    movi            v0.16b, #0
+    movi            v1.16b, #0
+.loop_ssd_s_32:
+    sub             w12, w12, #1
+.rept 4
+    ld1             {v4.16b-v7.16b}, [x0], x1
+    smlal           v0.4s, v4.4h, v4.4h
+    smlal2          v1.4s, v4.8h, v4.8h
+    smlal           v0.4s, v5.4h, v5.4h
+    smlal2          v1.4s, v5.8h, v5.8h
+    smlal           v0.4s, v6.4h, v6.4h
+    smlal2          v1.4s, v6.8h, v6.8h
+    smlal           v0.4s, v7.4h, v7.4h
+    smlal2          v1.4s, v7.8h, v7.8h
+.endr
+    cbnz            w12, .loop_ssd_s_32
+    add             v0.4s, v0.4s, v1.4s
+    ret_v0_w0
+endfunc
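For reference, the pp-variants in this file operate on 8-bit pixels: usubl/usubl2 widen the unsigned byte difference to signed 16 bits before the smlal/smlal2 square-accumulate, whereas the ss-variants subtract int16_t residuals directly (which is why they double the stride first). A hedged scalar model of the pp path, with illustrative names rather than the x265 primitive signature:

    #include <cstdint>
    #include <cstddef>

    // Scalar model of pixel_sse_pp_WxH_neon for 8-bit pixel builds.
    static uint32_t sse_pp_ref(const uint8_t* a, ptrdiff_t strideA,
                               const uint8_t* b, ptrdiff_t strideB,
                               int width, int height)
    {
        uint32_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int d = (int)a[x] - (int)b[x]; // usubl v1.8h, v16.8b, v17.8b
                sum += (uint32_t)(d * d);      // smlal/smlal2 into v0.4s/v1.4s
            }
            a += strideA;
            b += strideB;
        }
        return sum;
    }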
x265_3.5.tar.gz/source/common/common.h -> x265_3.6.tar.gz/source/common/common.h Changed

@@ -130,7 +130,6 @@
 typedef uint64_t pixel4;
 typedef int64_t  ssum2_t;
 #define SHIFT_TO_BITPLANE 9
-#define HISTOGRAM_BINS 1024
 #else
 typedef uint8_t  pixel;
 typedef uint16_t sum_t;
@@ -138,7 +137,6 @@
 typedef uint32_t pixel4;
 typedef int32_t  ssum2_t; // Signed sum
 #define SHIFT_TO_BITPLANE 7
-#define HISTOGRAM_BINS 256
 #endif // if HIGH_BIT_DEPTH
 
 #if X265_DEPTH < 10
@@ -162,6 +160,8 @@
 
 #define MIN_QPSCALE     0.21249999999999999
 #define MAX_MAX_QPSCALE 615.46574234477100
+#define FRAME_BRIGHTNESS_THRESHOLD  50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
+#define FRAME_EDGE_THRESHOLD  10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
 
 
 template<typename T>
@@ -340,6 +340,9 @@
 #define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
 
 #define MAX_NUM_DYN_REFINE          (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
+#define X265_BYTE 8
+
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
 
 namespace X265_NS {
 
@@ -434,6 +437,14 @@
 #define  x265_unlink(fileName) unlink(fileName)
 #define  x265_rename(oldName, newName) rename(oldName, newName)
 #endif
+/* Close a file */
+#define  x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
+    if (fread(val, size, readSize, fileOffset) != readSize)\
+    {\
+        x265_log(NULL, X265_LOG_ERROR, errorMessage); \
+        return; \
+    }
 int      x265_exp2fix8(double x);
 
 double   x265_ssim2dB(double ssim);
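Usage note on the macros just added: x265_fread expands to an if-statement that logs and then executes a bare return;, so it is only usable inside functions returning void, and x265_fclose nulls the handle after closing. An illustrative (hypothetical) caller:

    #include <cstdio>

    // Hypothetical loader built on the macros above. Because x265_fread
    // returns from the enclosing function on a short read, the function must
    // return void -- and note that the early return skips the close below.
    static void loadOffsets(const char* fileName, double offsets[8])
    {
        FILE* f = fopen(fileName, "rb");
        if (f == NULL)
            return;
        x265_fread(offsets, sizeof(double), (size_t)8, f, "short read on offsets file\n");
        x265_fclose(f);
    }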
x265_3.5.tar.gz/source/common/cpu.cpp -> x265_3.6.tar.gz/source/common/cpu.cpp Changed

@@ -7,6 +7,8 @@
 *          Steve Borho <steve@borho.org>
 *          Hongbin Liu <liuhongbin1@huawei.com>
 *          Yimeng Su <yimeng.su@huawei.com>
+ *          Josh Dekker <josh@itanimul.li>
+ *          Jean-Baptiste Kempf <jb@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -105,6 +107,14 @@
     { "NEON",            X265_CPU_NEON },
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
 
+#elif X265_ARCH_ARM64
+    { "NEON",            X265_CPU_NEON },
+#if defined(HAVE_SVE)
+    { "SVE",            X265_CPU_SVE },
+#endif
+#if defined(HAVE_SVE2)
+    { "SVE2",            X265_CPU_SVE2 },
+#endif
 #elif X265_ARCH_POWER8
     { "Altivec",         X265_CPU_ALTIVEC },
 
@@ -369,12 +379,30 @@
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
-#elif X265_ARCH_ARM64
-    flags |= X265_CPU_NEON;
 #endif // if HAVE_ARMV6
     return flags;
 }
 
+#elif X265_ARCH_ARM64
+
+uint32_t cpu_detect(bool benableavx512)
+{
+    int flags = 0;
+
+    #if defined(HAVE_SVE2)
+         flags |= X265_CPU_SVE2;
+         flags |= X265_CPU_SVE;
+         flags |= X265_CPU_NEON;
+    #elif defined(HAVE_SVE)
+         flags |= X265_CPU_SVE;
+         flags |= X265_CPU_NEON;
+    #elif HAVE_NEON
+         flags |= X265_CPU_NEON;
+    #endif
+
+    return flags;
+}
+
 #elif X265_ARCH_POWER8
 
 uint32_t cpu_detect(bool benableavx512)
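The new ARM64 cpu_detect() derives everything from compile-time HAVE_* macros, so an SVE2 build reports NEON, SVE and SVE2 together. Downstream, that mask gates which optimized primitive tables get installed; a minimal sketch of the gating pattern (bit values and helper behavior are illustrative, not the real x265.h constants):

    #include <cstdint>
    #include <cstdio>

    enum { CPU_NEON = 1u << 0, CPU_SVE = 1u << 1, CPU_SVE2 = 1u << 2 };

    // Walk the mask from generic to specific so the SVE/SVE2 tables can
    // override the NEON entries they improve on.
    static void setupPrimitives(uint32_t cpuMask)
    {
        if (cpuMask & CPU_NEON) puts("install NEON kernels");
        if (cpuMask & CPU_SVE)  puts("override with SVE kernels");
        if (cpuMask & CPU_SVE2) puts("override with SVE2 kernels");
    }

    int main()
    {
        setupPrimitives(CPU_NEON | CPU_SVE | CPU_SVE2); // what an SVE2 build reports
    }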
x265_3.5.tar.gz/source/common/frame.cpp -> x265_3.6.tar.gz/source/common/frame.cpp Changed

@@ -64,12 +64,40 @@
     m_edgeBitPlane = NULL;
     m_edgeBitPic = NULL;
     m_isInsideWindow = 0;
+
+    // mcstf
+    m_isSubSampled = NULL;
+    m_mcstf = NULL;
+    m_refPicCnt[0] = 0;
+    m_refPicCnt[1] = 0;
+    m_nextMCSTF = NULL;
+    m_prevMCSTF = NULL;
+
+    m_tempLayer = 0;
+    m_sameLayerRefPic = false;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
+
+    if (m_param->bEnableTemporalFilter)
+    {
+        m_mcstf = new TemporalFilter;
+        m_mcstf->init(param);
+
+        m_fencPicSubsampled2 = new PicYuv;
+        m_fencPicSubsampled4 = new PicYuv;
+
+        if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
+            return false;
+        if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
+            return false;
+
+        CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
+    }
+
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (param->bCTUInfo)
@@ -151,6 +179,22 @@
     return false;
 }
 
+bool Frame::createSubSample()
+{
+
+    m_fencPicSubsampled2 = new PicYuv;
+    m_fencPicSubsampled4 = new PicYuv;
+
+    if (!m_fencPicSubsampled2->createScaledPicYUV(m_param, 2))
+        return false;
+    if (!m_fencPicSubsampled4->createScaledPicYUV(m_param, 4))
+        return false;
+    CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
+    return true;
+fail:
+    return false;
+}
+
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 {
     m_encData = new FrameData;
@@ -207,6 +251,26 @@
         m_fencPic = NULL;
     }
 
+    if (m_param->bEnableTemporalFilter)
+    {
+
+        if (m_fencPicSubsampled2)
+        {
+            m_fencPicSubsampled2->destroy();
+            delete m_fencPicSubsampled2;
+            m_fencPicSubsampled2 = NULL;
+        }
+
+        if (m_fencPicSubsampled4)
+        {
+            m_fencPicSubsampled4->destroy();
+            delete m_fencPicSubsampled4;
+            m_fencPicSubsampled4 = NULL;
+        }
+        delete m_mcstf;
+        X265_FREE(m_isSubSampled);
+    }
+
     if (m_reconPic)
     {
         m_reconPic->destroy();
@@ -267,7 +331,8 @@
         X265_FREE(m_addOnPrevChange);
         m_addOnPrevChange = NULL;
     }
-    m_lowres.destroy();
+
+    m_lowres.destroy(m_param);
    X265_FREE(m_rcData);
 
     if (m_param->bDynamicRefine)
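createSubSample() mixes early returns with x265's goto-based allocation idiom: CHECKED_MALLOC_ZERO is expected to jump to the function-local fail: label when allocation fails (an assumption based on its use here; the macro itself lives in common.h). A reduced stand-alone sketch of that pattern:

    #include <cstdio>
    #include <cstdlib>

    // Reduced model of the CHECKED_MALLOC_ZERO pattern: allocate zeroed
    // storage or jump to the function-local fail label.
    #define CHECKED_CALLOC_ZERO(var, type, count) \
        { (var) = (type*)calloc((count), sizeof(type)); \
          if (!(var)) { fprintf(stderr, "malloc failed\n"); goto fail; } }

    static bool createFlags(int** isSubSampled)
    {
        CHECKED_CALLOC_ZERO(*isSubSampled, int, 1);
        return true;
    fail:
        return false;
    }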
x265_3.5.tar.gz/source/common/frame.h -> x265_3.6.tar.gz/source/common/frame.h Changed

@@ -28,6 +28,7 @@
 #include "common.h"
 #include "lowres.h"
 #include "threading.h"
+#include "temporalfilter.h"
 
 namespace X265_NS {
 // private namespace
@@ -70,6 +71,7 @@
     double   count[4];
     double   offset[4];
     double   bufferFillFinal;
+    int64_t  currentSatd;
 };
 
 class Frame
@@ -83,8 +85,12 @@
 
     /* Data associated with x265_picture */
     PicYuv*                m_fencPic;
+    PicYuv*                m_fencPicSubsampled2;
+    PicYuv*                m_fencPicSubsampled4;
+
     int                    m_poc;
     int                    m_encodeOrder;
+    int                    m_gopOffset;
     int64_t                m_pts;                // user provided presentation time stamp
     int64_t                m_reorderedPts;
     int64_t                m_dts;
@@ -132,6 +138,13 @@
     bool                   m_classifyFrame;
     int                    m_fieldNum;
 
+    /*MCSTF*/
+    TemporalFilter*        m_mcstf;
+    int                    m_refPicCnt[2];
+    Frame*                 m_nextMCSTF;           // PicList doubly linked list pointers
+    Frame*                 m_prevMCSTF;
+    int*                   m_isSubSampled;
+
     /* aq-mode 4 : Gaussian, edge and theta frames for edge information */
     pixel*                 m_edgePic;
     pixel*                 m_gaussianPic;
@@ -143,9 +156,15 @@
 
     int                    m_isInsideWindow;
 
+    /*Frame's temporal layer info*/
+    uint8_t                m_tempLayer;
+    int8_t                 m_gopId;
+    bool                   m_sameLayerRefPic;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
+    bool createSubSample();
     bool allocEncodeData(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();
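m_nextMCSTF/m_prevMCSTF thread each Frame into a second doubly linked list, alongside the regular PicList links, so the temporal filter can keep its own window of reference frames. A generic sketch of that kind of dual-link insertion (Node and pushBack are illustrative, not the PicList API):

    // Minimal doubly linked push-back over dedicated next/prev pointers,
    // mirroring how a Frame would be appended to the MCSTF list.
    struct Node
    {
        Node* next = nullptr;
        Node* prev = nullptr;
    };

    static void pushBack(Node*& head, Node*& tail, Node* n)
    {
        n->prev = tail;
        n->next = nullptr;
        if (tail)
            tail->next = n;
        else
            head = n;
        tail = n;
    }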
x265_3.5.tar.gz/source/common/framedata.cpp -> x265_3.6.tar.gz/source/common/framedata.cpp Changed

@@ -62,7 +62,7 @@
     }
     else
         return false;
-    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
+    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame + 1);
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
     reinit(sps);
 
x265_3.5.tar.gz/source/common/lowres.cpp -> x265_3.6.tar.gz/source/common/lowres.cpp Changed

@@ -28,6 +28,28 @@
 
 using namespace X265_NS;
 
+/*
+ * Down Sample input picture
+ */
+static
+void frame_lowres_core(const pixel* src0, pixel* dst0,
+    intptr_t src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int y = 0; y < height; y++)
+    {
+        const pixel* src1 = src0 + src_stride;
+        for (int x = 0; x < width; x++)
+        {
+            // slower than naive bilinear, but matches asm
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
+            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
+#undef FILTER
+        }
+        src0 += src_stride * 2;
+        dst0 += dst_stride;
+    }
+}
+
 bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
 {
     aqPartWidth = partWidth;
@@ -73,7 +95,7 @@
 
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
-    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
+    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
     {
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
@@ -190,13 +212,45 @@
         }
     }
 
+    if (param->bHistBasedSceneCut)
+    {
+        quarterSampleLowResWidth = widthFullRes / 4;
+        quarterSampleLowResHeight = heightFullRes / 4;
+        quarterSampleLowResOriginX = 16;
+        quarterSampleLowResOriginY = 16;
+        quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
+
+        size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
+        /* allocate quarter sampled lowres buffers */
+        CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
+
+        // Allocate memory for Histograms
+        picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
+        picHistogram[0] = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
+        for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
+            picHistogram[wd] = picHistogram[0] + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
+        }
+
+        for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
+        {
+            for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
+            {
+                picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex] = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
+                picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0] = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
+                for (uint32_t wd = 1; wd < 3; wd++) {
+                    picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][wd] = picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0] + wd * HISTOGRAM_NUMBER_OF_BINS;
+                }
+            }
+        }
+    }
+
     return true;
 
 fail:
     return false;
 }
 
-void Lowres::destroy()
+void Lowres::destroy(x265_param* param)
 {
     X265_FREE(buffer[0]);
     if(bEnableHME)
@@ -234,7 +288,8 @@
     X265_FREE(invQscaleFactor8x8);
     X265_FREE(edgeInclined);
     X265_FREE(qpAqMotionOffset);
-    X265_FREE(blockVariance);
+    if (param->bDynamicRefine || param->bEnableFades)
+        X265_FREE(blockVariance);
     if (maxAQDepth > 0)
     {
         for (uint32_t d = 0; d < 4; d++)
@@ -254,6 +309,29 @@
 
         delete pAQLayer;
     }
+
+    // Histograms
+    if (param->bHistBasedSceneCut)
+    {
+        for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
+        {
+            if (picHistogram[segmentInFrameWidthIdx])
+            {
+                for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
+                {
+                    if (picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx])
+                        X265_FREE(picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx][0]);
+                    X265_FREE(picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx]);
+                }
+            }
+        }
+        if (picHistogram)
+            X265_FREE(picHistogram[0]);
+        X265_FREE(picHistogram);
+
+        X265_FREE(quarterSampleLowResBuffer);
+
+    }
 }
 // (re) initialize lowres state
 void Lowres::init(PicYuv *origPic, int poc)
@@ -266,10 +344,6 @@
     indB = 0;
     memset(costEst, -1, sizeof(costEst));
    memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
-    interPCostPercDiff = 0.0;
-    intraCostPercDiff = 0.0;
-    m_bIsMaxThres = false;
-    m_bIsHardScenecut = false;
 
     if (qpAqOffset && invQscaleFactor)
         memset(costEstAq, -1, sizeof(costEstAq));
@@ -314,4 +388,16 @@
     }
 
     fpelPlane[0] = lowresPlane[0];
+
+    if (origPic->m_param->bHistBasedSceneCut)
+    {
+        // Quarter Sampled Input Picture Formation
+        // TO DO: Replace with ASM function
+        frame_lowres_core(
+            lowresPlane[0],
+            quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
+            lumaStride,
+            quarterSampleLowResStrideY,
+            widthFullRes / 4, heightFullRes / 4);
+    }
 }
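The FILTER macro in frame_lowres_core is a 2x2 average with intermediate rounding: two horizontal round-to-nearest averages followed by a rounded average of the pair, which can round up where a single (a+b+c+d+2)>>2 would not. A standalone check of that arithmetic:

    #include <cassert>

    // Same rounding as the FILTER macro above.
    static int filter2x2(int a, int b, int c, int d)
    {
        return (((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1;
    }

    int main()
    {
        assert(filter2x2(10, 11, 12, 13) == 12); // plain mean is 11.5
        assert(filter2x2(0, 0, 0, 1) == 1);      // double rounding pulls up
        assert(filter2x2(255, 255, 255, 255) == 255);
    }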
x265_3.5.tar.gz/source/common/lowres.h -> x265_3.6.tar.gz/source/common/lowres.h Changed

@@ -32,6 +32,10 @@
 namespace X265_NS {
 // private namespace
 
+#define HISTOGRAM_NUMBER_OF_BINS         256
+#define NUMBER_OF_SEGMENTS_IN_WIDTH      4
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT     4
+
 struct ReferencePlanes
 {
     ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
@@ -171,6 +175,7 @@
 
     int    frameNum;         // Presentation frame number
     int    sliceType;        // Slice type decided by lookahead
+    int    sliceTypeReq;     // Slice type required as per the QP file
     int    width;            // width of lowres frame in pixels
     int    lines;            // height of lowres frame in pixel lines
     int    leadingBframes;   // number of leading B frames for P or I
@@ -214,13 +219,13 @@
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
     double*   qpAqMotionOffset;
-    int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    int*      invQscaleFactor;    // qScale values for qp Aq Offsets
     int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
     uint32_t* blockVariance;
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
     double    frameVariance;
-    int* edgeInclined;
+    int*      edgeInclined;
 
 
     /* cutree intermediate data */
@@ -230,18 +235,30 @@
     uint32_t heightFullRes;
     uint32_t m_maxCUSize;
     uint32_t m_qgSize;
-
+
     uint16_t* propagateCost;
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
+
     /* For hist-based scenecut */
-    bool   m_bIsMaxThres;
-    double interPCostPercDiff;
-    double intraCostPercDiff;
-    bool   m_bIsHardScenecut;
+    int          quarterSampleLowResWidth;     // width of 1/4 lowres frame in pixels
+    int          quarterSampleLowResHeight;    // height of 1/4 lowres frame in pixels
+    int          quarterSampleLowResStrideY;
+    int          quarterSampleLowResOriginX;
+    int          quarterSampleLowResOriginY;
+    pixel       *quarterSampleLowResBuffer;
+    bool         bHistScenecutAnalyzed;
+
+    uint16_t     picAvgVariance;
+    uint16_t     picAvgVarianceCb;
+    uint16_t     picAvgVarianceCr;
+
+    uint32_t ****picHistogram;
+    uint64_t     averageIntensityPerSegment[NUMBER_OF_SEGMENTS_IN_WIDTH][NUMBER_OF_SEGMENTS_IN_HEIGHT][3];
+    uint8_t      averageIntensity[3];
 
     bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
-    void destroy();
+    void destroy(x265_param* param);
     void init(PicYuv *origPic, int poc);
 };
 }
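picHistogram is laid out as [segment-x][segment-y][plane][bin] over the 4x4 segment grid, with 256 bins per plane. A small sketch of the binning index math (assumes 8-bit samples and dimensions divisible by the grid; purely illustrative):

    #include <cstdint>

    enum { SEGS_W = 4, SEGS_H = 4, BINS = 256 };

    // Increment the histogram bin for one sample; mirrors the
    // picHistogram[wd][ht][plane][bin] layout declared above.
    static void binSample(uint32_t hist[SEGS_W][SEGS_H][3][BINS],
                          int x, int y, int width, int height,
                          int plane, uint8_t value)
    {
        int sx = x / (width / SEGS_W);
        int sy = y / (height / SEGS_H);
        hist[sx][sy][plane][value]++;
    }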
x265_3.5.tar.gz/source/common/mv.h -> x265_3.6.tar.gz/source/common/mv.h Changed

@@ -105,6 +105,8 @@
     {
         return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y;
     }
+
+    void set(int32_t _x, int32_t _y) { x = _x; y = _y; }
 };
 }
 
x265_3.5.tar.gz/source/common/param.cpp -> x265_3.6.tar.gz/source/common/param.cpp Changed

@@ -145,6 +145,8 @@
     param->bAnnexB = 1;
     param->bRepeatHeaders = 0;
     param->bEnableAccessUnitDelimiters = 0;
+    param->bEnableEndOfBitstream = 0;
+    param->bEnableEndOfSequence = 0;
     param->bEmitHRDSEI = 0;
     param->bEmitInfoSEI = 1;
     param->bEmitHDRSEI = 0; /*Deprecated*/
@@ -163,12 +165,12 @@
     param->keyframeMax = 250;
     param->gopLookahead = 0;
     param->bOpenGOP = 1;
+   param->craNal = 0;
     param->bframes = 4;
     param->lookaheadDepth = 20;
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
-    param->edgeTransitionThreshold = 0.03;
     param->bHistBasedSceneCut = 0;
     param->lookaheadSlices = 8;
     param->lookaheadThreads = 0;
@@ -179,12 +181,20 @@
     param->bEnableHRDConcatFlag = 0;
     param->bEnableFades = 0;
     param->bEnableSceneCutAwareQp = 0;
-    param->fwdScenecutWindow = 500;
-    param->fwdRefQpDelta = 5;
-    param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
-    param->bwdScenecutWindow = 100;
-    param->bwdRefQpDelta = -1;
-    param->bwdNonRefQpDelta = -1;
+    param->fwdMaxScenecutWindow = 1200;
+    param->bwdMaxScenecutWindow = 600;
+    for (int i = 0; i < 6; i++)
+    {
+        int deltas[6] = { 5, 4, 3, 2, 1, 0 };
+
+        param->fwdScenecutWindow[i] = 200;
+        param->fwdRefQpDelta[i] = deltas[i];
+        param->fwdNonRefQpDelta[i] = param->fwdRefQpDelta[i] + (SLICE_TYPE_DELTA * param->fwdRefQpDelta[i]);
+
+        param->bwdScenecutWindow[i] = 100;
+        param->bwdRefQpDelta[i] = -1;
+        param->bwdNonRefQpDelta[i] = -1;
+    }
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -278,7 +288,10 @@
     param->rc.rfConstantMin = 0;
     param->rc.bStatRead = 0;
     param->rc.bStatWrite = 0;
+    param->rc.dataShareMode = X265_SHARE_MODE_FILE;
     param->rc.statFileName = NULL;
+    param->rc.sharedMemName = NULL;
+    param->rc.bEncFocusedFramesOnly = 0;
     param->rc.complexityBlur = 20;
     param->rc.qblur = 0.5;
     param->rc.zoneCount = 0;
@@ -321,6 +334,7 @@
     param->maxLuma = PIXEL_MAX;
     param->log2MaxPocLsb = 8;
     param->maxSlices = 1;
+    param->videoSignalTypePreset = NULL;
 
     /*Conformance window*/
     param->confWinRightOffset = 0;
@@ -373,10 +387,17 @@
     param->bEnableSvtHevc = 0;
     param->svtHevcParam = NULL;
 
+    /* MCSTF */
+    param->bEnableTemporalFilter = 0;
+    param->temporalFilterStrength = 0.95;
+
 #ifdef SVT_HEVC
     param->svtHevcParam = svtParam;
     svt_param_default(param);
 #endif
+    /* Film grain characteristics model filename */
+    param->filmGrain = NULL;
+    param->bEnableSBRC = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -666,6 +687,46 @@
 #define atof(str) x265_atof(str, bError)
 #define atobool(str) (x265_atobool(str, bError))
 
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
+{
+    bool bError = false;
+    char nameBuf[64];
+    if (!name)
+        return X265_PARAM_BAD_NAME;
+    // skip -- prefix if provided
+    if (name[0] == '-' && name[1] == '-')
+        name += 2;
+    // s/_/-/g
+    if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
+    {
+        char *c;
+        strcpy(nameBuf, name);
+        while ((c = strchr(nameBuf, '_')) != 0)
+            *c = '-';
+        name = nameBuf;
+    }
+    if (!value)
+        value = "true";
+    else if (value[0] == '=')
+        value++;
+#define OPT(STR) else if (!strcmp(name, STR))
+    if (0);
+    OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
+    OPT("masking-strength") bError = parseMaskingStrength(p, value);
+    else
+        return X265_PARAM_BAD_NAME;
+#undef OPT
+    return bError ? X265_PARAM_BAD_VALUE : 0;
+}
+
+
+/* internal versions of string-to-int with additional error checking */
+#undef atoi
+#undef atof
+#define atoi(str) x265_atoi(str, bError)
+#define atof(str) x265_atof(str, bError)
+#define atobool(str) (x265_atobool(str, bError))
+
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
 {
     bool bError = false;
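x265_scenecut_aware_qp_param_parse() hands --masking-strength off to parseMaskingStrength(); the inline parsing deleted in the next hunk shows the value format it replaces. A reduced sketch of the forward-direction case, "window,refQpDelta,nonRefQpDelta" (illustrative only, not the upstream helper):

    #include <cstdio>

    // Parse "window,refQpDelta,nonRefQpDelta"; returns true on error, matching
    // the bError convention above. Non-positive fields keep the defaults.
    static bool parseFwdMasking(const char* value, int& window,
                                double& refQpDelta, double& nonRefQpDelta)
    {
        int w; double r, n;
        if (sscanf(value, "%d,%lf,%lf", &w, &r, &n) != 3)
            return true; // "Specify all the necessary offsets for masking-strength"
        if (w > 0) window = w;
        if (r > 0) refQpDelta = r;
        if (n > 0) nonRefQpDelta = n;
        return false;
    }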
@@ -949,10 +1010,9 @@
        {
            bError = false;
            p->scenecutThreshold = atoi(value);
-           p->bHistBasedSceneCut = 0;
        }
     }
-    OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
+    OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
     OPT("keyint") p->keyframeMax = atoi(value);
     OPT("min-keyint") p->keyframeMin = atoi(value);
     OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
@@ -1184,6 +1244,7 @@
         int pass = x265_clip3(0, 3, atoi(value));
         p->rc.bStatWrite = pass & 1;
         p->rc.bStatRead = pass & 2;
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
     }
     OPT("stats") p->rc.statFileName = strdup(value);
     OPT("scaling-list") p->scalingLists = strdup(value);
@@ -1216,21 +1277,7 @@
         OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
         OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
         OPT("scenecut-bias") p->scenecutBias = atof(value);
-        OPT("hist-scenecut")
-        {
-            p->bHistBasedSceneCut = atobool(value);
-            if (bError)
-            {
-                bError = false;
-                p->bHistBasedSceneCut = 0;
-            }
-            if (p->bHistBasedSceneCut)
-            {
-                bError = false;
-                p->scenecutThreshold = 0;
-            }
-        }
-        OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
+        OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
         OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
@@ -1238,6 +1285,7 @@
         OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
         OPT("aq-motion") p->bAQMotion = atobool(value);
         OPT("dynamic-rd") p->dynamicRd = atof(value);
+       OPT("cra-nal") p->craNal = atobool(value);
         OPT("analysis-reuse-level")
         {
             p->analysisReuseLevel = atoi(value);
@@ -1348,71 +1396,7 @@
         }
         OPT("fades") p->bEnableFades = atobool(value);
         OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
-        OPT("masking-strength")
-        {
-            int window1;
-            double refQpDelta1, nonRefQpDelta1;
-
-            if (p->bEnableSceneCutAwareQp == FORWARD)
-            {
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
-                {
-                    if (window1 > 0)
-                        p->fwdScenecutWindow = window1;
-                    if (refQpDelta1 > 0)
-                        p->fwdRefQpDelta = refQpDelta1;
-                    if (nonRefQpDelta1 > 0)
-                        p->fwdNonRefQpDelta = nonRefQpDelta1;
-                }
-                else
-                {
-                    x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
-                    bError = true;
-                }
-            }
-            else if (p->bEnableSceneCutAwareQp == BACKWARD)
-            {
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
-                {
-                    if (window1 > 0)
-                        p->bwdScenecutWindow = window1;
-                    if (refQpDelta1 > 0)
-                        p->bwdRefQpDelta = refQpDelta1;
-                    if (nonRefQpDelta1 > 0)
-                        p->bwdNonRefQpDelta = nonRefQpDelta1;
-                }
-                else
-                {
-                    x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
-                    bError = true;
-                }
-            }
-            else if (p->bEnableSceneCutAwareQp == BI_DIRECTIONAL)
-            {
-                int window2;
-                double refQpDelta2, nonRefQpDelta2;
-                if (6 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1, &window2, &refQpDelta2, &nonRefQpDelta2))
-                {
-                    if (window1 > 0)
-                        p->fwdScenecutWindow = window1;
-                    if (refQpDelta1 > 0)
-                        p->fwdRefQpDelta = refQpDelta1;
-                    if (nonRefQpDelta1 > 0)
-                        p->fwdNonRefQpDelta = nonRefQpDelta1;
-                    if (window2 > 0)
-                        p->bwdScenecutWindow = window2;
-                    if (refQpDelta2 > 0)
-                        p->bwdRefQpDelta = refQpDelta2;
-                    if (nonRefQpDelta2 > 0)
-                        p->bwdNonRefQpDelta = nonRefQpDelta2;
247
-                }
248
-                else
249
-                {
250
-                    x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
251
-                    bError = true;
252
-                }
253
-            }
254
-        }
255
+        OPT("masking-strength") bError |= parseMaskingStrength(p, value);
256
         OPT("field") p->bField = atobool( value );
257
         OPT("cll") p->bEmitCLL = atobool(value);
258
         OPT("frame-dup") p->bEnableFrameDuplication = atobool(value);
259
@@ -1446,6 +1430,13 @@
260
         OPT("vbv-live-multi-pass") p->bliveVBV2pass = atobool(value);
261
         OPT("min-vbv-fullness") p->minVbvFullness = atof(value);
262
         OPT("max-vbv-fullness") p->maxVbvFullness = atof(value);
263
+        OPT("video-signal-type-preset") p->videoSignalTypePreset = strdup(value);
264
+        OPT("eob") p->bEnableEndOfBitstream = atobool(value);
265
+        OPT("eos") p->bEnableEndOfSequence = atobool(value);
266
+        /* Film grain characteristics model filename */
267
+        OPT("film-grain") p->filmGrain = (char* )value;
268
+        OPT("mcstf") p->bEnableTemporalFilter = atobool(value);
269
+        OPT("sbrc") p->bEnableSBRC = atobool(value);
270
         else
271
             return X265_PARAM_BAD_NAME;
272
     }
273
@@ -1761,8 +1752,6 @@
274
           "scenecutThreshold must be greater than 0");
275
     CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
276
             "scenecut-bias must be between 0 and 100");
277
-    CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 < param->edgeTransitionThreshold,
278
-            "hist-threshold must be between 0.0 and 1.0");
279
     CHECK(param->radl < 0 || param->radl > param->bframes,
280
           "radl must be between 0 and bframes");
281
     CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
282
@@ -1824,15 +1813,15 @@
283
         "Invalid refine-ctu-distortion value, must be either 0 or 1");
284
     CHECK(param->maxAUSizeFactor < 0.5 || param->maxAUSizeFactor > 1.0,
285
         "Supported factor for controlling max AU size is from 0.5 to 1");
286
-    CHECK((param->dolbyProfile != 0) && (param->dolbyProfile != 50) && (param->dolbyProfile != 81) && (param->dolbyProfile != 82),
287
-        "Unsupported Dolby Vision profile, only profile 5, profile 8.1 and profile 8.2 enabled");
288
+    CHECK((param->dolbyProfile != 0) && (param->dolbyProfile != 50) && (param->dolbyProfile != 81) && (param->dolbyProfile != 82) && (param->dolbyProfile != 84),
289
+        "Unsupported Dolby Vision profile, only profile 5, profile 8.1, profile 8.2 and profile 8.4 enabled");
290
     CHECK(param->dupThreshold < 1 || 99 < param->dupThreshold,
291
         "Invalid frame-duplication threshold. Value must be between 1 and 99.");
292
     if (param->dolbyProfile)
293
     {
294
         CHECK((param->rc.vbvMaxBitrate <= 0 || param->rc.vbvBufferSize <= 0), "Dolby Vision requires VBV settings to enable HRD.\n");
295
-        CHECK((param->internalBitDepth != 10), "Dolby Vision profile - 5, profile - 8.1 and profile - 8.2 is Main10 only\n");
296
-        CHECK((param->internalCsp != X265_CSP_I420), "Dolby Vision profile - 5, profile - 8.1 and profile - 8.2 requires YCbCr 4:2:0 color space\n");
297
+        CHECK((param->internalBitDepth != 10), "Dolby Vision profile - 5, profile - 8.1, profile - 8.2 and profile - 8.4 are Main10 only\n");
298
+        CHECK((param->internalCsp != X265_CSP_I420), "Dolby Vision profile - 5, profile - 8.1, profile - 8.2 and profile - 8.4 requires YCbCr 4:2:0 color space\n");
299
         if (param->dolbyProfile == 81)
300
             CHECK(!(param->masteringDisplayColorVolume), "Dolby Vision profile - 8.1 requires Mastering display color volume information\n");
301
     }
302
@@ -1854,19 +1843,22 @@
303
         {
304
             CHECK(param->bEnableSceneCutAwareQp < 0 || param->bEnableSceneCutAwareQp > 3,
305
             "Invalid masking direction. Value must be between 0 and 3(inclusive)");
306
-            CHECK(param->fwdScenecutWindow < 0 || param->fwdScenecutWindow > 1000,
307
-            "Invalid forward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
308
-            CHECK(param->fwdRefQpDelta < 0 || param->fwdRefQpDelta > 10,
309
-            "Invalid fwdRefQpDelta value. Value must be between 0 and 10 (inclusive)");
310
-            CHECK(param->fwdNonRefQpDelta < 0 || param->fwdNonRefQpDelta > 10,
311
-            "Invalid fwdNonRefQpDelta value. Value must be between 0 and 10 (inclusive)");
312
-
313
-            CHECK(param->bwdScenecutWindow < 0 || param->bwdScenecutWindow > 1000,
314
-                "Invalid backward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
315
-            CHECK(param->bwdRefQpDelta < -1 || param->bwdRefQpDelta > 10,
316
-                "Invalid bwdRefQpDelta value. Value must be between 0 and 10 (inclusive)");
317
-            CHECK(param->bwdNonRefQpDelta < -1 || param->bwdNonRefQpDelta > 10,
318
-                "Invalid bwdNonRefQpDelta value. Value must be between 0 and 10 (inclusive)");
319
+            for (int i = 0; i < 6; i++)
320
+            {
321
+                CHECK(param->fwdScenecutWindow[i] < 0 || param->fwdScenecutWindow[i] > 1000,
322
+                    "Invalid forward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
323
+                CHECK(param->fwdRefQpDelta[i] < 0 || param->fwdRefQpDelta[i] > 20,
324
+                    "Invalid fwdRefQpDelta value. Value must be between 0 and 20 (inclusive)");
325
+                CHECK(param->fwdNonRefQpDelta[i] < 0 || param->fwdNonRefQpDelta[i] > 20,
326
+                    "Invalid fwdNonRefQpDelta value. Value must be between 0 and 20 (inclusive)");
327
+
328
+                CHECK(param->bwdScenecutWindow[i] < 0 || param->bwdScenecutWindow[i] > 1000,
329
+                    "Invalid backward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
330
+                CHECK(param->bwdRefQpDelta[i] < -1 || param->bwdRefQpDelta[i] > 20,
331
+                    "Invalid bwdRefQpDelta value. Value must be between 0 and 20 (inclusive)");
332
+                CHECK(param->bwdNonRefQpDelta[i] < -1 || param->bwdNonRefQpDelta[i] > 20,
333
+                    "Invalid bwdNonRefQpDelta value. Value must be between 0 and 20 (inclusive)");
334
+            }
335
         }
336
     }
337
     if (param->bEnableHME)
338
@@ -1898,6 +1890,11 @@
339
         param->bSingleSeiNal = 0;
340
         x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
341
     }
342
+    if (param->bEnableTemporalFilter && (param->frameNumThreads > 1))
343
+    {
344
+        param->bEnableTemporalFilter = 0;
345
+        x265_log(param, X265_LOG_WARNING, "MCSTF can be enabled with frame thread = 1 only. Disabling MCSTF\n");
346
+    }
347
     CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
348
     CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
349
     CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
350
@@ -1910,6 +1907,7 @@
351
             x265_log(param, X265_LOG_WARNING, "Live VBV enabled without VBV settings.Disabling live VBV in 2 pass\n");
352
         }
353
     }
354
+    CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. It must be one of the X265_DATA_SHARE_MODES enum values\n" );
355
     return check_failed;
356
 }
357
 
358
@@ -1970,8 +1968,8 @@
359
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias  : %d / %d / %d / %.2lf \n",
360
                  param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
361
     else if (param->bHistBasedSceneCut && param->keyframeMax != INT_MAX) 
362
-        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / edge threshold  : %d / %d / %d / %.2lf\n",
363
-                 param->keyframeMin, param->keyframeMax, param->bHistBasedSceneCut, param->edgeTransitionThreshold);
364
+        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut  : %d / %d / %d\n",
365
+                 param->keyframeMin, param->keyframeMax, param->bHistBasedSceneCut);
366
     else if (param->keyframeMax == INT_MAX)
367
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : disabled\n");
368
 
369
@@ -2089,6 +2087,8 @@
370
         bufSize += strlen(p->numaPools);
371
     if (p->masteringDisplayColorVolume)
372
         bufSize += strlen(p->masteringDisplayColorVolume);
373
+    if (p->videoSignalTypePreset)
374
+        bufSize += strlen(p->videoSignalTypePreset);
375
 
376
     buf = s = X265_MALLOC(char, bufSize);
377
     if (!buf)
378
@@ -2126,10 +2126,12 @@
379
     BOOL(p->bRepeatHeaders, "repeat-headers");
380
     BOOL(p->bAnnexB, "annexb");
381
     BOOL(p->bEnableAccessUnitDelimiters, "aud");
382
+    BOOL(p->bEnableEndOfBitstream, "eob");
383
+    BOOL(p->bEnableEndOfSequence, "eos");
384
     BOOL(p->bEmitHRDSEI, "hrd");
385
     BOOL(p->bEmitInfoSEI, "info");
386
     s += sprintf(s, " hash=%d", p->decodedPictureHashSEI);
387
-    BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
388
+    s += sprintf(s, " temporal-layers=%d", p->bEnableTemporalSubLayers);
389
     BOOL(p->bOpenGOP, "open-gop");
390
     s += sprintf(s, " min-keyint=%d", p->keyframeMin);
391
     s += sprintf(s, " keyint=%d", p->keyframeMax);
392
@@ -2141,7 +2143,7 @@
393
     s += sprintf(s, " rc-lookahead=%d", p->lookaheadDepth);
394
     s += sprintf(s, " lookahead-slices=%d", p->lookaheadSlices);
395
     s += sprintf(s, " scenecut=%d", p->scenecutThreshold);
396
-    s += sprintf(s, " hist-scenecut=%d", p->bHistBasedSceneCut);
397
+    BOOL(p->bHistBasedSceneCut, "hist-scenecut");
398
     s += sprintf(s, " radl=%d", p->radl);
399
     BOOL(p->bEnableHRDConcatFlag, "splice");
400
     BOOL(p->bIntraRefresh, "intra-refresh");
401
@@ -2295,7 +2297,6 @@
402
     BOOL(p->bOptRefListLengthPPS, "opt-ref-list-length-pps");
403
     BOOL(p->bMultiPassOptRPS, "multi-pass-opt-rps");
404
     s += sprintf(s, " scenecut-bias=%.2f", p->scenecutBias);
405
-    s += sprintf(s, " hist-threshold=%.2f", p->edgeTransitionThreshold);
406
     BOOL(p->bOptCUDeltaQP, "opt-cu-delta-qp");
407
     BOOL(p->bAQMotion, "aq-motion");
408
     BOOL(p->bEmitHDR10SEI, "hdr10");
409
@@ -2328,10 +2329,14 @@
410
     s += sprintf(s, " qp-adaptation-range=%.2f", p->rc.qpAdaptationRange);
411
     s += sprintf(s, " scenecut-aware-qp=%d", p->bEnableSceneCutAwareQp);
412
     if (p->bEnableSceneCutAwareQp)
413
-        s += sprintf(s, " fwd-scenecut-window=%d fwd-ref-qp-delta=%f fwd-nonref-qp-delta=%f bwd-scenecut-window=%d bwd-ref-qp-delta=%f bwd-nonref-qp-delta=%f", p->fwdScenecutWindow, p->fwdRefQpDelta, p->fwdNonRefQpDelta, p->bwdScenecutWindow, p->bwdRefQpDelta, p->bwdNonRefQpDelta);
414
+        s += sprintf(s, " fwd-scenecut-window=%d fwd-ref-qp-delta=%f fwd-nonref-qp-delta=%f bwd-scenecut-window=%d bwd-ref-qp-delta=%f bwd-nonref-qp-delta=%f", p->fwdMaxScenecutWindow, p->fwdRefQpDelta0, p->fwdNonRefQpDelta0, p->bwdMaxScenecutWindow, p->bwdRefQpDelta0, p->bwdNonRefQpDelta0);
415
     s += sprintf(s, "conformance-window-offsets right=%d bottom=%d", p->confWinRightOffset, p->confWinBottomOffset);
416
     s += sprintf(s, " decoder-max-rate=%d", p->decoderVbvMaxRate);
417
     BOOL(p->bliveVBV2pass, "vbv-live-multi-pass");
418
+    if (p->filmGrain)
419
+        s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename
420
+    BOOL(p->bEnableTemporalFilter, "mcstf");
421
+    BOOL(p->bEnableSBRC, "sbrc");
422
 #undef BOOL
423
     return buf;
424
 }
425
@@ -2406,6 +2411,151 @@
426
     return false;
427
 }
428
 
429
+bool parseMaskingStrength(x265_param* p, const char* value)
430
+{
431
+    bool bError = false;
432
+    int window1[6];
433
+    double refQpDelta1[6], nonRefQpDelta1[6];
434
+    if (p->bEnableSceneCutAwareQp == FORWARD)
435
+    {
436
+        if (3 == sscanf(value, "%d,%lf,%lf", &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0]))
437
+        {
438
+            if (window1[0] > 0)
439
+                p->fwdMaxScenecutWindow = window1[0];
440
+            if (refQpDelta1[0] > 0)
441
+                p->fwdRefQpDelta[0] = refQpDelta1[0];
442
+            if (nonRefQpDelta1[0] > 0)
443
+                p->fwdNonRefQpDelta[0] = nonRefQpDelta1[0];
444
+
445
+            p->fwdScenecutWindow[0] = p->fwdMaxScenecutWindow / 6;
446
+            for (int i = 1; i < 6; i++)
447
+            {
448
+                p->fwdScenecutWindow[i] = p->fwdMaxScenecutWindow / 6;
449
+                p->fwdRefQpDelta[i] = p->fwdRefQpDelta[i - 1] - (0.15 * p->fwdRefQpDelta[i - 1]);
450
+                p->fwdNonRefQpDelta[i] = p->fwdNonRefQpDelta[i - 1] - (0.15 * p->fwdNonRefQpDelta[i - 1]);
451
+            }
452
+        }
453
+        else if (18 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
454
+            , &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window1[1], &refQpDelta1[1], &nonRefQpDelta1[1]
455
+            , &window1[2], &refQpDelta1[2], &nonRefQpDelta1[2], &window1[3], &refQpDelta1[3], &nonRefQpDelta1[3]
456
+            , &window1[4], &refQpDelta1[4], &nonRefQpDelta1[4], &window1[5], &refQpDelta1[5], &nonRefQpDelta1[5]))
457
+        {
458
+            p->fwdMaxScenecutWindow = 0;
459
+            for (int i = 0; i < 6; i++)
460
+            {
461
+                p->fwdScenecutWindow[i] = window1[i];
462
+                p->fwdRefQpDelta[i] = refQpDelta1[i];
463
+                p->fwdNonRefQpDelta[i] = nonRefQpDelta1[i];
464
+                p->fwdMaxScenecutWindow += p->fwdScenecutWindow[i];
465
+            }
466
+        }
467
+        else
468
+        {
469
+            x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
470
+            bError = true;
471
+        }
472
+    }
473
+    else if (p->bEnableSceneCutAwareQp == BACKWARD)
474
+    {
475
+        if (3 == sscanf(value, "%d,%lf,%lf", &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0]))
476
+        {
477
+            if (window1[0] > 0)
478
+                p->bwdMaxScenecutWindow = window1[0];
479
+            if (refQpDelta1[0] > 0)
480
+                p->bwdRefQpDelta[0] = refQpDelta1[0];
481
+            if (nonRefQpDelta1[0] > 0)
482
+                p->bwdNonRefQpDelta[0] = nonRefQpDelta1[0];
483
+
484
+            p->bwdScenecutWindow[0] = p->bwdMaxScenecutWindow / 6;
485
+            for (int i = 1; i < 6; i++)
486
+            {
487
+                p->bwdScenecutWindow[i] = p->bwdMaxScenecutWindow / 6;
488
+                p->bwdRefQpDelta[i] = p->bwdRefQpDelta[i - 1] - (0.15 * p->bwdRefQpDelta[i - 1]);
489
+                p->bwdNonRefQpDelta[i] = p->bwdNonRefQpDelta[i - 1] - (0.15 * p->bwdNonRefQpDelta[i - 1]);
490
+            }
491
+        }
492
+        else if (18 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
493
+            , &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window1[1], &refQpDelta1[1], &nonRefQpDelta1[1]
494
+            , &window1[2], &refQpDelta1[2], &nonRefQpDelta1[2], &window1[3], &refQpDelta1[3], &nonRefQpDelta1[3]
495
+            , &window1[4], &refQpDelta1[4], &nonRefQpDelta1[4], &window1[5], &refQpDelta1[5], &nonRefQpDelta1[5]))
496
+        {
497
+            p->bwdMaxScenecutWindow = 0;
498
+            for (int i = 0; i < 6; i++)
499
+            {
500
+                p->bwdScenecutWindow[i] = window1[i];
501
+                p->bwdRefQpDelta[i] = refQpDelta1[i];
502
+                p->bwdNonRefQpDelta[i] = nonRefQpDelta1[i];
503
+                p->bwdMaxScenecutWindow += p->bwdScenecutWindow[i];
504
+            }
505
+        }
506
+        else
507
+        {
508
+            x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
509
+            bError = true;
510
+        }
511
+    }
512
+    else if (p->bEnableSceneCutAwareQp == BI_DIRECTIONAL)
513
+    {
514
+        int window2[6];
515
+        double refQpDelta2[6], nonRefQpDelta2[6];
516
+        if (6 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf", &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window2[0], &refQpDelta2[0], &nonRefQpDelta2[0]))
517
+        {
518
+            if (window1[0] > 0)
519
+                p->fwdMaxScenecutWindow = window1[0];
520
+            if (refQpDelta1[0] > 0)
521
+                p->fwdRefQpDelta[0] = refQpDelta1[0];
522
+            if (nonRefQpDelta1[0] > 0)
523
+                p->fwdNonRefQpDelta[0] = nonRefQpDelta1[0];
524
+            if (window2[0] > 0)
525
+                p->bwdMaxScenecutWindow = window2[0];
526
+            if (refQpDelta2[0] > 0)
527
+                p->bwdRefQpDelta[0] = refQpDelta2[0];
528
+            if (nonRefQpDelta2[0] > 0)
529
+                p->bwdNonRefQpDelta[0] = nonRefQpDelta2[0];
530
+
531
+            p->fwdScenecutWindow[0] = p->fwdMaxScenecutWindow / 6;
532
+            p->bwdScenecutWindow[0] = p->bwdMaxScenecutWindow / 6;
533
+            for (int i = 1; i < 6; i++)
534
+            {
535
+                p->fwdScenecutWindow[i] = p->fwdMaxScenecutWindow / 6;
536
+                p->bwdScenecutWindow[i] = p->bwdMaxScenecutWindow / 6;
537
+                p->fwdRefQpDelta[i] = p->fwdRefQpDelta[i - 1] - (0.15 * p->fwdRefQpDelta[i - 1]);
538
+                p->fwdNonRefQpDelta[i] = p->fwdNonRefQpDelta[i - 1] - (0.15 * p->fwdNonRefQpDelta[i - 1]);
539
+                p->bwdRefQpDelta[i] = p->bwdRefQpDelta[i - 1] - (0.15 * p->bwdRefQpDelta[i - 1]);
540
+                p->bwdNonRefQpDelta[i] = p->bwdNonRefQpDelta[i - 1] - (0.15 * p->bwdNonRefQpDelta[i - 1]);
541
+            }
542
+        }
543
+        else if (36 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
544
+            , &window1[0], &refQpDelta1[0], &nonRefQpDelta1[0], &window1[1], &refQpDelta1[1], &nonRefQpDelta1[1]
545
+            , &window1[2], &refQpDelta1[2], &nonRefQpDelta1[2], &window1[3], &refQpDelta1[3], &nonRefQpDelta1[3]
546
+            , &window1[4], &refQpDelta1[4], &nonRefQpDelta1[4], &window1[5], &refQpDelta1[5], &nonRefQpDelta1[5]
547
+            , &window2[0], &refQpDelta2[0], &nonRefQpDelta2[0], &window2[1], &refQpDelta2[1], &nonRefQpDelta2[1]
548
+            , &window2[2], &refQpDelta2[2], &nonRefQpDelta2[2], &window2[3], &refQpDelta2[3], &nonRefQpDelta2[3]
549
+            , &window2[4], &refQpDelta2[4], &nonRefQpDelta2[4], &window2[5], &refQpDelta2[5], &nonRefQpDelta2[5]))
550
+        {
551
+            p->fwdMaxScenecutWindow = 0;
552
+            p->bwdMaxScenecutWindow = 0;
553
+            for (int i = 0; i < 6; i++)
554
+            {
555
+                p->fwdScenecutWindow[i] = window1[i];
556
+                p->fwdRefQpDelta[i] = refQpDelta1[i];
557
+                p->fwdNonRefQpDelta[i] = nonRefQpDelta1[i];
558
+                p->bwdScenecutWindow[i] = window2[i];
559
+                p->bwdRefQpDelta[i] = refQpDelta2[i];
560
+                p->bwdNonRefQpDelta[i] = nonRefQpDelta2[i];
561
+                p->fwdMaxScenecutWindow += p->fwdScenecutWindow[i];
562
+                p->bwdMaxScenecutWindow += p->bwdScenecutWindow[i];
563
+            }
564
+        }
565
+        else
566
+        {
567
+            x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
568
+            bError = true;
569
+        }
570
+    }
571
+    return bError;
572
+}
573
+
574
 void x265_copy_params(x265_param* dst, x265_param* src)
575
 {
576
     dst->cpuid = src->cpuid;
577
@@ -2440,10 +2590,13 @@
578
     dst->bRepeatHeaders = src->bRepeatHeaders;
579
     dst->bAnnexB = src->bAnnexB;
580
     dst->bEnableAccessUnitDelimiters = src->bEnableAccessUnitDelimiters;
581
+    dst->bEnableEndOfBitstream = src->bEnableEndOfBitstream;
582
+    dst->bEnableEndOfSequence = src->bEnableEndOfSequence;
583
     dst->bEmitInfoSEI = src->bEmitInfoSEI;
584
     dst->decodedPictureHashSEI = src->decodedPictureHashSEI;
585
     dst->bEnableTemporalSubLayers = src->bEnableTemporalSubLayers;
586
     dst->bOpenGOP = src->bOpenGOP;
587
+   dst->craNal = src->craNal;
588
     dst->keyframeMax = src->keyframeMax;
589
     dst->keyframeMin = src->keyframeMin;
590
     dst->bframes = src->bframes;
591
@@ -2541,8 +2694,11 @@
592
     dst->rc.rfConstantMin = src->rc.rfConstantMin;
593
     dst->rc.bStatWrite = src->rc.bStatWrite;
594
     dst->rc.bStatRead = src->rc.bStatRead;
595
+    dst->rc.dataShareMode = src->rc.dataShareMode;
596
     if (src->rc.statFileName) dst->rc.statFileName=strdup(src->rc.statFileName);
597
     else dst->rc.statFileName = NULL;
598
+    if (src->rc.sharedMemName) dst->rc.sharedMemName = strdup(src->rc.sharedMemName);
599
+    else dst->rc.sharedMemName = NULL;
600
     dst->rc.qblur = src->rc.qblur;
601
     dst->rc.complexityBlur = src->rc.complexityBlur;
602
     dst->rc.bEnableSlowFirstPass = src->rc.bEnableSlowFirstPass;
603
@@ -2550,6 +2706,7 @@
604
     dst->rc.zonefileCount = src->rc.zonefileCount;
605
     dst->reconfigWindowSize = src->reconfigWindowSize;
606
     dst->bResetZoneConfig = src->bResetZoneConfig;
607
+    dst->bNoResetZoneConfig = src->bNoResetZoneConfig;
608
     dst->decoderVbvMaxRate = src->decoderVbvMaxRate;
609
 
610
     if (src->rc.zonefileCount && src->rc.zones && src->bResetZoneConfig)
611
@@ -2557,6 +2714,7 @@
612
         for (int i = 0; i < src->rc.zonefileCount; i++)
613
         {
614
             dst->rc.zones[i].startFrame = src->rc.zones[i].startFrame;
615
+            dst->rc.zones[0].keyframeMax = src->rc.zones[0].keyframeMax;
616
             memcpy(dst->rc.zones[i].zoneParam, src->rc.zones[i].zoneParam, sizeof(x265_param));
617
         }
618
     }
619
@@ -2621,7 +2779,6 @@
620
     dst->bOptRefListLengthPPS = src->bOptRefListLengthPPS;
621
     dst->bMultiPassOptRPS = src->bMultiPassOptRPS;
622
     dst->scenecutBias = src->scenecutBias;
623
-    dst->edgeTransitionThreshold = src->edgeTransitionThreshold;
624
     dst->gopLookahead = src->lookaheadDepth;
625
     dst->bOptCUDeltaQP = src->bOptCUDeltaQP;
626
     dst->analysisMultiPassDistortion = src->analysisMultiPassDistortion;
627
@@ -2682,20 +2839,33 @@
628
     dst->bEnableSvtHevc = src->bEnableSvtHevc;
629
     dst->bEnableFades = src->bEnableFades;
630
     dst->bEnableSceneCutAwareQp = src->bEnableSceneCutAwareQp;
631
-    dst->fwdScenecutWindow = src->fwdScenecutWindow;
632
-    dst->fwdRefQpDelta = src->fwdRefQpDelta;
633
-    dst->fwdNonRefQpDelta = src->fwdNonRefQpDelta;
634
-    dst->bwdScenecutWindow = src->bwdScenecutWindow;
635
-    dst->bwdRefQpDelta = src->bwdRefQpDelta;
636
-    dst->bwdNonRefQpDelta = src->bwdNonRefQpDelta;
637
+    dst->fwdMaxScenecutWindow = src->fwdMaxScenecutWindow;
638
+    dst->bwdMaxScenecutWindow = src->bwdMaxScenecutWindow;
639
+    for (int i = 0; i < 6; i++)
640
+    {
641
+        dst->fwdScenecutWindow[i] = src->fwdScenecutWindow[i];
642
+        dst->fwdRefQpDelta[i] = src->fwdRefQpDelta[i];
643
+        dst->fwdNonRefQpDelta[i] = src->fwdNonRefQpDelta[i];
644
+        dst->bwdScenecutWindow[i] = src->bwdScenecutWindow[i];
645
+        dst->bwdRefQpDelta[i] = src->bwdRefQpDelta[i];
646
+        dst->bwdNonRefQpDelta[i] = src->bwdNonRefQpDelta[i];
647
+    }
648
     dst->bField = src->bField;
649
-
650
+    dst->bEnableTemporalFilter = src->bEnableTemporalFilter;
651
+    dst->temporalFilterStrength = src->temporalFilterStrength;
652
     dst->confWinRightOffset = src->confWinRightOffset;
653
     dst->confWinBottomOffset = src->confWinBottomOffset;
654
     dst->bliveVBV2pass = src->bliveVBV2pass;
655
+
656
+    if (src->videoSignalTypePreset) dst->videoSignalTypePreset = strdup(src->videoSignalTypePreset);
657
+    else dst->videoSignalTypePreset = NULL;
658
 #ifdef SVT_HEVC
659
     memcpy(dst->svtHevcParam, src->svtHevcParam, sizeof(EB_H265_ENC_CONFIGURATION));
660
 #endif
661
+    /* Film grain */
662
+    if (src->filmGrain)
663
+        dst->filmGrain = src->filmGrain;
664
+    dst->bEnableSBRC = src->bEnableSBRC;
665
 }
666
 
667
 #ifdef SVT_HEVC
668
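
For reference, the new scenecut-aware-qp entry point added above accepts the --masking-strength offsets either as one compact triple ("<window>,<refQpDelta>,<nonRefQpDelta>", which parseMaskingStrength() spreads across six internal windows with a 15% decay per window) or as the full 18-value per-window list. A minimal caller sketch, assuming the entry point is exported through x265.h like the existing zone parser (otherwise it is only reachable via the internal param.h); the offset values are arbitrary:

    #include "x265.h"
    #include <cstdio>

    int main()
    {
        x265_param* p = x265_param_alloc();
        x265_param_default(p);

        /* 1 = FORWARD masking; then hand the offsets to the dedicated parser */
        x265_scenecut_aware_qp_param_parse(p, "scenecut-aware-qp", "1");
        if (x265_scenecut_aware_qp_param_parse(p, "masking-strength", "500,5,4"))
            fprintf(stderr, "masking-strength rejected\n");

        x265_param_free(p);
        return 0;
    }
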
x265_3.5.tar.gz/source/common/param.h -> x265_3.6.tar.gz/source/common/param.h Changed
17
 
1
@@ -38,6 +38,7 @@
2
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
3
 bool  parseLambdaFile(x265_param *param);
4
 void x265_copy_params(x265_param* dst, x265_param* src);
5
+bool parseMaskingStrength(x265_param* p, const char* value);
6
 
7
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
8
 static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
9
@@ -52,6 +53,7 @@
10
 int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
11
 int x265_param_apply_profile(x265_param *, const char *profile);
12
 int x265_param_parse(x265_param *p, const char *name, const char *value);
13
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
14
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
15
 #define PARAM_NS X265_NS
16
 #endif
17
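
The header now exposes both parsers side by side; the new 3.6 switches themselves go through the ordinary x265_param_parse() path. A short configuration sketch (option strings match the OPT() table in param.cpp above; the film-grain filename is a placeholder):

    #include "x265.h"

    static void enable_v36_features(x265_param* p)
    {
        x265_param_parse(p, "sbrc", "1");               /* segment based ratecontrol */
        x265_param_parse(p, "mcstf", "1");              /* motion-compensated temporal filter */
        x265_param_parse(p, "hist-scenecut", "1");      /* histogram based scene change detection */
        x265_param_parse(p, "temporal-layers", "3");    /* now parsed as an integer layer count */
        x265_param_parse(p, "film-grain", "grain.fgc"); /* placeholder filename */
    }
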
x265_3.5.tar.gz/source/common/piclist.cpp -> x265_3.6.tar.gz/source/common/piclist.cpp Changed
134
 
1
@@ -45,6 +45,25 @@
2
     m_count++;
3
 }
4
 
5
+void PicList::pushFrontMCSTF(Frame& curFrame)
6
+{
7
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
8
+    curFrame.m_nextMCSTF = m_start;
9
+    curFrame.m_prevMCSTF = NULL;
10
+
11
+    if (m_count)
12
+    {
13
+        m_start->m_prevMCSTF = &curFrame;
14
+        m_start = &curFrame;
15
+    }
16
+    else
17
+    {
18
+        m_start = m_end = &curFrame;
19
+    }
20
+    m_count++;
21
+
22
+}
23
+
24
 void PicList::pushBack(Frame& curFrame)
25
 {
26
     X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
27
@@ -63,6 +82,24 @@
28
     m_count++;
29
 }
30
 
31
+void PicList::pushBackMCSTF(Frame& curFrame)
32
+{
33
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
34
+    curFrame.m_nextMCSTF = NULL;
35
+    curFrame.m_prevMCSTF = m_end;
36
+
37
+    if (m_count)
38
+    {
39
+        m_end->m_nextMCSTF = &curFrame;
40
+        m_end = &curFrame;
41
+    }
42
+    else
43
+    {
44
+        m_start = m_end = &curFrame;
45
+    }
46
+    m_count++;
47
+}
48
+
49
 Frame *PicList::popFront()
50
 {
51
     if (m_start)
52
@@ -94,6 +131,14 @@
53
     return curFrame;
54
 }
55
 
56
+Frame* PicList::getPOCMCSTF(int poc)
57
+{
58
+    Frame *curFrame = m_start;
59
+    while (curFrame && curFrame->m_poc != poc)
60
+        curFrame = curFrame->m_nextMCSTF;
61
+    return curFrame;
62
+}
63
+
64
 Frame *PicList::popBack()
65
 {
66
     if (m_end)
67
@@ -117,6 +162,29 @@
68
         return NULL;
69
 }
70
 
71
+Frame *PicList::popBackMCSTF()
72
+{
73
+    if (m_end)
74
+    {
75
+        Frame* temp = m_end;
76
+        m_count--;
77
+
78
+        if (m_count)
79
+        {
80
+            m_end = m_end->m_prevMCSTF;
81
+            m_end->m_nextMCSTF = NULL;
82
+        }
83
+        else
84
+        {
85
+            m_start = m_end = NULL;
86
+        }
87
+        temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
88
+        return temp;
89
+    }
90
+    else
91
+        return NULL;
92
+}
93
+
94
 Frame* PicList::getCurFrame(void)
95
 {
96
     Frame *curFrame = m_start;
97
@@ -158,3 +226,36 @@
98
 
99
     curFrame.m_next = curFrame.m_prev = NULL;
100
 }
101
+
102
+void PicList::removeMCSTF(Frame& curFrame)
103
+{
104
+#if _DEBUG
105
+    Frame *tmp = m_start;
106
+    while (tmp && tmp != &curFrame)
107
+    {
108
+        tmp = tmp->m_nextMCSTF;
109
+    }
110
+
111
+    X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
112
+#endif
113
+
114
+    m_count--;
115
+    if (m_count)
116
+    {
117
+        if (m_start == &curFrame)
118
+            m_start = curFrame.m_nextMCSTF;
119
+        if (m_end == &curFrame)
120
+            m_end = curFrame.m_prevMCSTF;
121
+
122
+        if (curFrame.m_nextMCSTF)
123
+            curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
124
+        if (curFrame.m_prevMCSTF)
125
+            curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
126
+    }
127
+    else
128
+    {
129
+        m_start = m_end = NULL;
130
+    }
131
+
132
+    curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
133
+}
134
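
The MCSTF variants above reuse the PicList container but chain frames through a second pair of links (m_nextMCSTF/m_prevMCSTF), so one Frame can sit in the ordinary DPB list and an OrigPicBuffer list at the same time. A stand-alone sketch of the same push-front pattern with a stub node type (Frame and PicList internals are not shown here):

    struct Node { Node* next = nullptr; Node* prev = nullptr; };

    struct MiniList
    {
        Node* head = nullptr;
        Node* tail = nullptr;
        int   count = 0;

        void pushFront(Node& n)
        {
            n.next = head;          // new node points at the old head
            n.prev = nullptr;       // it becomes the first element
            if (count)
            {
                head->prev = &n;    // the step the X265_CHECK above guards
                head = &n;
            }
            else
                head = tail = &n;   // empty list: both ends are the node
            count++;
        }
    };
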
x265_3.5.tar.gz/source/common/piclist.h -> x265_3.6.tar.gz/source/common/piclist.h Changed
33
 
1
@@ -49,24 +49,31 @@
2
 
3
     /** Push picture to end of the list */
4
     void pushBack(Frame& pic);
5
+    void pushBackMCSTF(Frame& pic);
6
 
7
     /** Push picture to beginning of the list */
8
     void pushFront(Frame& pic);
9
+    void pushFrontMCSTF(Frame& pic);
10
 
11
     /** Pop picture from end of the list */
12
     Frame* popBack();
13
+    Frame* popBackMCSTF();
14
 
15
     /** Pop picture from beginning of the list */
16
     Frame* popFront();
17
 
18
     /** Find frame with specified POC */
19
     Frame* getPOC(int poc);
20
+    /* Find next MCSTF frame with specified POC */
21
+    Frame* getPOCMCSTF(int poc);
22
 
23
     /** Get the current Frame from the list **/
24
     Frame* getCurFrame(void);
25
 
26
     /** Remove picture from list */
27
     void remove(Frame& pic);
28
+    /* Remove MCSTF picture from list */
29
+    void removeMCSTF(Frame& pic);
30
 
31
     Frame* first()        { return m_start;   }
32
 
33
x265_3.5.tar.gz/source/common/picyuv.cpp -> x265_3.6.tar.gz/source/common/picyuv.cpp Changed
60
 
1
@@ -125,6 +125,58 @@
2
     return false;
3
 }
4
 
5
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
6
+void PicYuv::copyFromFrame(PicYuv* source)
7
+{
8
+    uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
9
+
10
+    int maxHeight = numCuInHeight * m_param->maxCUSize;
11
+    memcpy(m_picBuf[0], source->m_picBuf[0], sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
12
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
13
+
14
+    if (m_picCsp != X265_CSP_I400)
15
+    {
16
+        memcpy(m_picBuf[1], source->m_picBuf[1], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
17
+        memcpy(m_picBuf[2], source->m_picBuf[2], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
18
+
19
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
20
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
21
+    }
22
+    else
23
+    {
24
+        m_picBuf[1] = m_picBuf[2] = NULL;
25
+        m_picOrg[1] = m_picOrg[2] = NULL;
26
+    }
27
+}
28
+
29
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
30
+{
31
+    m_param = param;
32
+    m_picWidth = m_param->sourceWidth / scaleFactor;
33
+    m_picHeight = m_param->sourceHeight / scaleFactor;
34
+
35
+    m_picCsp = m_param->internalCsp;
36
+    m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
37
+    m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
38
+
39
+    uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
40
+    uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
41
+
42
+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
43
+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
44
+    m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
45
+
46
+    int maxHeight = numCuInHeight * param->maxCUSize;
47
+    CHECKED_MALLOC_ZERO(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
48
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
49
+    m_picBuf[1] = m_picBuf[2] = NULL;
50
+    m_picOrg[1] = m_picOrg[2] = NULL;
51
+    return true;
52
+
53
+fail:
54
+    return false;
55
+}
56
+
57
 int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
58
 {
59
     m_picWidth = picWidth;
60
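
The scaled buffer created above keeps the same geometry rules as a full-size PicYuv: dimensions are divided by the scale factor, padded up to whole CTUs, and wrapped in a 128-pixel luma search margin on every side. Reproducing the allocation arithmetic for a 1920x1080 source at scaleFactor 2 (numbers taken from the diff; maxCUSize assumed to be 64):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint32_t maxCUSize = 64, lumaMargin = 128, scaleFactor = 2;
        const uint32_t w = 1920 / scaleFactor, h = 1080 / scaleFactor;

        uint32_t numCuInWidth  = (w + maxCUSize - 1) / maxCUSize;   // round up to CTUs
        uint32_t numCuInHeight = (h + maxCUSize - 1) / maxCUSize;
        uint32_t stride    = numCuInWidth * maxCUSize + (lumaMargin << 1);
        uint32_t maxHeight = numCuInHeight * maxCUSize;

        printf("stride=%u pixels, alloc=%u pixels\n",
               stride, stride * (maxHeight + lumaMargin * 2));
        return 0;
    }
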
x265_3.5.tar.gz/source/common/picyuv.h -> x265_3.6.tar.gz/source/common/picyuv.h Changed
15
 
1
@@ -78,11 +78,13 @@
2
     PicYuv();
3
 
4
     bool  create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
5
+    bool  createScaledPicYUV(x265_param* param, uint8_t scaleFactor);
6
     bool  createOffsets(const SPS& sps);
7
     void  destroy();
8
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
9
 
10
     void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
11
+    void  copyFromFrame(PicYuv* source);
12
 
13
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
14
 
15
x265_3.5.tar.gz/source/common/pixel.cpp -> x265_3.6.tar.gz/source/common/pixel.cpp Changed
51
 
1
@@ -266,7 +266,7 @@
2
 {
3
     int satd = 0;
4
 
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
6
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
7
     pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
8
 #endif
9
 
10
@@ -284,7 +284,7 @@
11
 {
12
     int satd = 0;
13
 
14
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
15
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
16
     pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
17
 #endif
18
 
19
@@ -627,6 +627,23 @@
20
     }
21
 }
22
 
23
+static
24
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
25
+{
26
+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
27
+    {
28
+        const pixel *inRow = src0;
29
+        const pixel *inRowBelow = src0 + src_stride;
30
+        pixel *target = dst0;
31
+        for (int x = 0; x < width; x++)
32
+        {
33
+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;
34
+            inRow += 2;
35
+            inRowBelow += 2;
36
+        }
37
+    }
38
+}
39
+
40
 /* structural similarity metric */
41
 static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
42
 {
43
@@ -1355,5 +1372,7 @@
44
     p.cu[BLOCK_16x16].normFact = normFact_c;
45
     p.cu[BLOCK_32x32].normFact = normFact_c;
46
     p.cu[BLOCK_64x64].normFact = normFact_c;
47
+    /* SubSample Luma*/
48
+    p.frameSubSampleLuma = frame_subsample_luma;
49
 }
50
 }
51
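
frame_subsample_luma() above halves the luma plane with a 2x2 box filter, rounding twice: once per column pair and once across the two column averages. The same arithmetic in isolation:

    #include <cstdint>
    #include <cstdio>

    /* a b      left and right columns are averaged with rounding first,
       c d  ->  then the two results are averaged with rounding again */
    static uint8_t box2x2(int a, int b, int c, int d)
    {
        return (uint8_t)((((a + c + 1) >> 1) + ((b + d + 1) >> 1) + 1) >> 1);
    }

    int main()
    {
        printf("%d\n", box2x2(10, 11, 12, 13)); /* prints 12 */
        return 0;
    }
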
x265_3.5.tar.gz/source/common/ppc/intrapred_altivec.cpp -> x265_3.6.tar.gz/source/common/ppc/intrapred_altivec.cpp Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include <assert.h>
3
 #include <math.h>
4
 #include <cmath>
5
-#include <linux/types.h>
6
+#include <sys/types.h>
7
 #include <stdlib.h>
8
 #include <stdio.h>
9
 #include <stdint.h>
10
x265_3.5.tar.gz/source/common/primitives.h -> x265_3.6.tar.gz/source/common/primitives.h Changed
28
 
1
@@ -232,6 +232,8 @@
2
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
3
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
4
 typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
5
+/* SubSampling Luma */
6
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
 /* Function pointers to optimized encoder primitives. Each pointer can reference
8
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
9
 struct EncoderPrimitives
10
@@ -353,6 +355,8 @@
11
 
12
     downscale_t           frameInitLowres;
13
     downscale_t           frameInitLowerRes;
14
+    /* Sub Sample Luma */
15
+    downscaleluma_t        frameSubSampleLuma;
16
     cutree_propagate_cost propagateCost;
17
     cutree_fix8_unpack    fix8Unpack;
18
     cutree_fix8_pack      fix8Pack;
19
@@ -488,7 +492,7 @@
20
 
21
 #if ENABLE_ASSEMBLY && X265_ARCH_ARM64
22
 extern "C" {
23
-#include "aarch64/pixel-util.h"
24
+#include "aarch64/fun-decls.h"
25
 }
26
 #endif
27
 
28
x265_3.6.tar.gz/source/common/ringmem.cpp Added
359
 
1
@@ -0,0 +1,357 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#include "ringmem.h"
26
+
27
+#ifndef _WIN32
28
+#include <sys/mman.h>
29
+#endif ////< _WIN32
30
+
31
+#ifdef _WIN32
32
+#define X265_SHARED_MEM_NAME                    "Local\\_x265_shr_mem_"
33
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "_x265_semW_"
34
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "_x265_semR_"
35
+#else /* POSIX / pthreads */
36
+#define X265_SHARED_MEM_NAME                    "/tmp/_x265_shr_mem_"
37
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "/tmp/_x265_semW_"
38
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "/tmp/_x265_semR_"
39
+#endif
40
+
41
+#define RINGMEM_ALLIGNMENT                       64
42
+
43
+namespace X265_NS {
44
+    RingMem::RingMem() 
45
+        : m_initialized(false)
46
+        , m_protectRW(false)
47
+        , m_itemSize(0)
48
+        , m_itemCnt(0)
49
+        , m_dataPool(NULL)
50
+        , m_shrMem(NULL)
51
+#ifdef _WIN32
52
+        , m_handle(NULL)
53
+#else //_WIN32
54
+        , m_filepath(NULL)
55
+#endif //_WIN32
56
+        , m_writeSem(NULL)
57
+        , m_readSem(NULL)
58
+    {
59
+    }
60
+
61
+
62
+    RingMem::~RingMem()
63
+    {
64
+    }
65
+
66
+    bool RingMem::skipRead(int32_t cnt) {
67
+        if (!m_initialized)
68
+        {
69
+            return false;
70
+        }
71
+
72
+        if (m_protectRW)
73
+        {
74
+            for (int i = 0; i < cnt; i++)
75
+            {
76
+                m_readSem->take();
77
+            }
78
+        }
79
+        
80
+        ATOMIC_ADD(&m_shrMem->m_read, cnt);
81
+
82
+        if (m_protectRW)
83
+        {
84
+            m_writeSem->give(cnt);
85
+        }
86
+
87
+        return true;
88
+    }
89
+
90
+    bool RingMem::skipWrite(int32_t cnt) {
91
+        if (!m_initialized)
92
+        {
93
+            return false;
94
+        }
95
+
96
+        if (m_protectRW)
97
+        {
98
+            for (int i = 0; i < cnt; i++)
99
+            {
100
+                m_writeSem->take();
101
+            }
102
+        }
103
+
104
+        ATOMIC_ADD(&m_shrMem->m_write, cnt);
105
+
106
+        if (m_protectRW)
107
+        {
108
+            m_readSem->give(cnt);
109
+        }
110
+
111
+        return true;
112
+    }
113
+
114
+    ///< initialize
115
+    bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
116
+    {
117
+        ///< check parameters
118
+        if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
119
+        {
120
+            ///< invalid parameters 
121
+            return false;
122
+        }
123
+
124
+        if (!m_initialized)
125
+        {
126
+            ///< formatting names
127
+            char nameBuf[MAX_SHR_NAME_LEN] = { 0 };
128
+
129
+            ///< shared memory name
130
+            snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
131
+
132
+            ///< create or open shared memory
133
+            bool newCreated = false;
134
+
135
+            ///< calculate the size of the shared memory
136
+            int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
137
+
138
+#ifdef _WIN32
139
+            HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
140
+            if (!h)
141
+            {
142
+                h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
143
+
144
+                if (!h)
145
+                {
146
+                    return false;
147
+                }
148
+
149
+                newCreated = true;
150
+            }
151
+
152
+            void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
153
+
154
+            ///< should not close the handle here, otherwise the OpenFileMapping would fail
155
+            //CloseHandle(h);
156
+            m_handle = h;
157
+
158
+            if (!pool)
159
+            {
160
+                return false;
161
+            }
162
+
163
+#else /* POSIX / pthreads */
164
+            mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
165
+            int flag = O_RDWR;
166
+            int shrfd = -1;
167
+            if ((shrfd = open(nameBuf, flag, mode)) < 0)
168
+            {
169
+                flag |= O_CREAT;
170
+                
171
+                shrfd = open(nameBuf, flag, mode);
172
+                if (shrfd < 0)
173
+                {
174
+                    return false;
175
+                }
176
+                newCreated = true;
177
+
178
+                lseek(shrfd, shrMemSize - 1, SEEK_SET);
179
+
180
+                if (-1 == write(shrfd, "\0", 1))
181
+                {
182
+                    close(shrfd);
183
+                    return false;
184
+                }
185
+
186
+                if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
187
+                {
188
+                    close(shrfd);
189
+                    return false;
190
+                }
191
+            }
192
+
193
+            void *pool = mmap(0,
194
+                shrMemSize,
195
+                PROT_READ | PROT_WRITE,
196
+                MAP_SHARED,
197
+                shrfd,
198
+                0);
199
+
200
+            close(shrfd);
201
+            if (pool == MAP_FAILED)
202
+            {               
203
+                return false;
204
+            }
205
+
206
+            m_filepath = strdup(nameBuf);
207
+#endif ///< _WIN32
208
+
209
+            if (newCreated)
210
+            {
211
+                memset(pool, 0, shrMemSize);
212
+            }
213
+            
214
+            m_shrMem = reinterpret_cast<ShrMemCtrl *>(pool);
215
+            m_dataPool = reinterpret_cast<uint8_t *>(pool) + sizeof(ShrMemCtrl);
216
+            m_itemSize = itemSize;
217
+            m_itemCnt = itemCnt;
218
+            m_initialized = true;
219
+
220
+            if (protectRW)
221
+            {
222
+                m_protectRW = true;
223
+                m_writeSem = new NamedSemaphore();
224
+                if (!m_writeSem)
225
+                {
226
+                    release();
227
+                    return false;
228
+                }
229
+
230
+                ///< shared memory name
231
+                snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SEMAPHORE_RINGMEM_WRITER_NAME, name);
232
+                if (!m_writeSem->create(nameBuf, m_itemCnt, m_itemCnt))
233
+                {
234
+                    release();
235
+                    return false;
236
+                }
237
+
238
+                m_readSem = new NamedSemaphore();
239
+                if (!m_readSem)
240
+                {
241
+                    release();
242
+                    return false;
243
+                }
244
+
245
+                ///< shared memory name
246
+                snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SEMAPHORE_RINGMEM_READER_NAME, name);
247
+                if (!m_readSem->create(nameBuf, 0, m_itemCnt))
248
+                {
249
+                    release();
250
+                    return false;
251
+                }
252
+            }
253
+        }
254
+
255
+        return true;
256
+    }
257
+    ///< finalize
258
+    void RingMem::release()
259
+    {
260
+        if (m_initialized)
261
+        {
262
+            m_initialized = false;
263
+
264
+            if (m_shrMem)
265
+            {
266
+#ifdef _WIN32
267
+                UnmapViewOfFile(m_shrMem);
268
+                CloseHandle(m_handle);
269
+                m_handle = NULL;
270
+#else /* POSIX / pthreads */
271
+                int32_t shrMemSize = (m_itemSize * m_itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
272
+                munmap(m_shrMem, shrMemSize);
273
+                unlink(m_filepath);
274
+                free(m_filepath);
275
+                m_filepath = NULL;
276
+#endif ///< _WIN32
277
+                m_shrMem = NULL;
278
+                m_dataPool = NULL;
279
+                m_itemSize = 0;
280
+                m_itemCnt = 0;
281
+            }
282
+            
283
+            if (m_protectRW)
284
+            {
285
+                m_protectRW = false;
286
+                if (m_writeSem)
287
+                {
288
+                    m_writeSem->release();
289
+
290
+                    delete m_writeSem;
291
+                    m_writeSem = NULL;
292
+                }
293
+
294
+                if (m_readSem)
295
+                {
296
+                    m_readSem->release();
297
+
298
+                    delete m_readSem;
299
+                    m_readSem = NULL;
300
+                }
301
+            }
302
+
303
+        }
304
+    }
305
+
306
+    ///< data read
307
+    bool RingMem::readNext(void* dst, fnRWSharedData callback)
308
+    {
309
+        if (!m_initialized || !callback || !dst)
310
+        {
311
+            return false;
312
+        }
313
+
314
+        if (m_protectRW)
315
+        {
316
+            if (!m_readSem->take())
317
+            {
318
+                return false;
319
+            }
320
+        }
321
+
322
+        int32_t index = ATOMIC_ADD(&m_shrMem->m_read, 1) % m_itemCnt;
323
+        (*callback)(dst, reinterpret_cast<uint8_t *>(m_dataPool) + index * m_itemSize, m_itemSize);
324
+
325
+        if (m_protectRW)
326
+        {
327
+            m_writeSem->give(1);
328
+        }
329
+
330
+        return true;
331
+    }
332
+    ///< data write
333
+    bool RingMem::writeData(void *data, fnRWSharedData callback)
334
+    {
335
+        if (!m_initialized || !data || !callback)
336
+        {
337
+            return false;
338
+        }
339
+
340
+        if (m_protectRW)
341
+        {
342
+            if (!m_writeSem->take())
343
+            {
344
+                return false;
345
+            }
346
+        }
347
+
348
+        int32_t index = ATOMIC_ADD(&m_shrMem->m_write, 1) % m_itemCnt;
349
+        (*callback)(reinterpret_cast<uint8_t *>(m_dataPool) + index * m_itemSize, data, m_itemSize);
350
+
351
+        if (m_protectRW)
352
+        {
353
+            m_readSem->give(1);
354
+        }
355
+
356
+        return true;
357
+    }
358
+}
359
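
The new RingMem class gives the two processes of a shared-memory 2-pass run a fixed-size ring of records, with optional named semaphores serializing producer and consumer. A usage sketch built only from the calls shown above (the payload struct and the "demo" key are illustrative):

    #include <cstring>
    #include "ringmem.h"
    using namespace X265_NS;

    struct Payload { int32_t frame; double qp; };

    /* callback runs while the ring holds the slot for us */
    static void copyItem(void* dst, void* src, int32_t size)
    {
        memcpy(dst, src, size);
    }

    int main()
    {
        RingMem ring;
        /* writer and reader processes both init() with the same name;
           protectRW = true enables the named-semaphore handshake */
        if (!ring.init(sizeof(Payload), 16, "demo", true))
            return 1;

        Payload out = { 42, 31.5 };
        ring.writeData(&out, copyItem);

        Payload in;
        ring.readNext(&in, copyItem);

        ring.release();
        return 0;
    }
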
x265_3.6.tar.gz/source/common/ringmem.h Added
92
 
1
@@ -0,0 +1,90 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_RINGMEM_H
26
+#define X265_RINGMEM_H
27
+
28
+#include "common.h"
29
+#include "threading.h"
30
+
31
+#if _MSC_VER
32
+#define snprintf _snprintf
33
+#define strdup _strdup
34
+#endif
35
+
36
+namespace X265_NS {
37
+
38
+#define MAX_SHR_NAME_LEN                         256
39
+
40
+    class RingMem {
41
+    public:
42
+        RingMem();
43
+        ~RingMem();
44
+
45
+        bool skipRead(int32_t cnt);
46
+
47
+        bool skipWrite(int32_t cnt);
48
+
49
+        ///< initialize
50
+        ///< protectRW: whether to use semaphores to protect the write and read operations.
51
+        bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
52
+        ///< finalize
53
+        void release();
54
+
55
+        typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
56
+
57
+        ///< data read
58
+        bool readNext(void* dst, fnRWSharedData callback);
59
+        ///< data write
60
+        bool writeData(void *data, fnRWSharedData callback);
61
+
62
+    private:        
63
+        bool    m_initialized;
64
+        bool    m_protectRW;
65
+
66
+        int32_t m_itemSize;
67
+        int32_t m_itemCnt;
68
+        ///< data pool
69
+        void   *m_dataPool;
70
+        typedef struct {
71
+            ///< index to write
72
+            int32_t m_write;
73
+            ///< index to read
74
+            int32_t m_read;
75
+            
76
+        }ShrMemCtrl;
77
+
78
+        ShrMemCtrl *m_shrMem;
79
+#ifdef _WIN32
80
+        void       *m_handle;
81
+#else // _WIN32
82
+        char       *m_filepath;
83
+#endif // _WIN32
84
+
85
+        ///< Semaphores
86
+        NamedSemaphore *m_writeSem;
87
+        NamedSemaphore *m_readSem;
88
+    };
89
+};
90
+
91
+#endif // ifndef X265_RINGMEM_H
92
x265_3.5.tar.gz/source/common/slice.h -> x265_3.6.tar.gz/source/common/slice.h Changed
35
 
1
@@ -156,9 +156,9 @@
2
     HRDInfo          hrdParameters;
3
     ProfileTierLevel ptl;
4
     uint32_t         maxTempSubLayers;
5
-    uint32_t         numReorderPics;
6
-    uint32_t         maxDecPicBuffering;
7
-    uint32_t         maxLatencyIncrease;
8
+    uint32_t         numReorderPics[MAX_T_LAYERS];
9
+    uint32_t         maxDecPicBuffering[MAX_T_LAYERS];
10
+    uint32_t         maxLatencyIncrease[MAX_T_LAYERS];
11
 };
12
 
13
 struct Window
14
@@ -235,9 +235,9 @@
15
     uint32_t maxAMPDepth;
16
 
17
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
18
-    uint32_t maxDecPicBuffering; // these are dups of VPS values
19
-    uint32_t maxLatencyIncrease;
20
-    int      numReorderPics;
21
+    uint32_t maxDecPicBuffering[MAX_T_LAYERS]; // these are dups of VPS values
22
+    uint32_t maxLatencyIncrease[MAX_T_LAYERS];
23
+    int      numReorderPics[MAX_T_LAYERS];
24
 
25
     RPS      spsrps[MAX_NUM_SHORT_TERM_RPS];
26
     int      spsrpsNum;
27
@@ -363,6 +363,7 @@
28
     int         m_iNumRPSInSPS;
29
     const x265_param *m_param;
30
     int         m_fieldNum;
31
+    Frame*      m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
32
 
33
     Slice()
34
     {
35
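
The OrigPicBuffer added below keeps original (unencoded) pictures alive for the temporal filter: setOrigPicList() collects the neighbours inside the filter window into m_mcstfRefFrameList, and recycleOrigPicList() releases a picture once its reference count drops to zero. The window walk, reduced to its bounds logic (the range value here is chosen arbitrarily):

    #include <cstdio>

    int main()
    {
        const int poc = 8, range = 2, frameCnt = 10;
        for (int iterPOC = poc - range; iterPOC <= poc + range; iterPOC++)
        {
            if (iterPOC == poc)
                continue;          /* skip the frame being filtered */
            if (iterPOC < 0)
                continue;          /* clamp at the start of the sequence */
            if (iterPOC >= frameCnt)
                break;             /* clamp at the end */
            printf("candidate reference POC %d\n", iterPOC);
        }
        return 0;
    }
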
x265_3.6.tar.gz/source/common/temporalfilter.cpp Added
1019
 
1
@@ -0,0 +1,1017 @@
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+#include "common.h"
25
+#include "temporalfilter.h"
26
+#include "primitives.h"
27
+
28
+#include "frame.h"
29
+#include "slice.h"
30
+#include "framedata.h"
31
+#include "analysis.h"
32
+
33
+using namespace X265_NS;
34
+
35
+void OrigPicBuffer::addPicture(Frame* inFrame)
36
+{
37
+    m_mcstfPicList.pushFrontMCSTF(*inFrame);
38
+}
39
+
40
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
41
+{
42
+    m_mcstfOrigPicFreeList.pushFrontMCSTF(*inFrame);
43
+}
44
+
45
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
46
+{
47
+    m_mcstfOrigPicList.pushFrontMCSTF(*inFrame);
48
+}
49
+
50
+OrigPicBuffer::~OrigPicBuffer()
51
+{
52
+    while (!m_mcstfOrigPicList.empty())
53
+    {
54
+        Frame* curFrame = m_mcstfOrigPicList.popBackMCSTF();
55
+        curFrame->destroy();
56
+        delete curFrame;
57
+    }
58
+
59
+    while (!m_mcstfOrigPicFreeList.empty())
60
+    {
61
+        Frame* curFrame = m_mcstfOrigPicFreeList.popBackMCSTF();
62
+        curFrame->destroy();
63
+        delete curFrame;
64
+    }
65
+}
66
+
67
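+// Fills slice->m_mcstfRefFrameList[1][] with the frames whose POCs fall inside
+// the MCSTF window [poc - range, poc + range], skipping the current POC and
+// clamping to [0, frameCnt); each hit also drops a reference count.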
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
68
+{
69
+    Slice* slice = inFrame->m_encData->m_slice;
70
+    uint8_t j = 0;
71
+    for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
72
+        iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
73
+    {
74
+        if (iterPOC != inFrame->m_poc)
75
+        {
76
+            if (iterPOC < 0)
77
+                continue;
78
+            if (iterPOC >= frameCnt)
79
+                break;
80
+
81
+            Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
82
+            X265_CHECK(iterFrame, "Reference frame not found in OPB");
83
+            if (iterFrame != NULL)
84
+            {
85
+                slice->m_mcstfRefFrameList[1][j] = iterFrame;
86
+                iterFrame->m_refPicCnt[1]--;
87
+            }
88
+
89
+            iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
90
+            if (iterFrame != NULL)
91
+            {
92
+
93
+                slice->m_mcstfRefFrameList[1][j] = iterFrame;
94
+
95
+                iterFrame->m_refPicCnt[1]--;
96
+                Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
97
+                X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
98
+                cFrame->m_refPicCnt[1]--;
99
+            }
100
+            j++;
101
+        }
102
+    }
103
+}
104
+
105
+void OrigPicBuffer::recycleOrigPicList()
106
+{
107
+    Frame *iterFrame = m_mcstfPicList.first();
108
+
109
+    while (iterFrame)
110
+    {
111
+        Frame *curFrame = iterFrame;
112
+        iterFrame = iterFrame->m_nextMCSTF;
113
+        if (!curFrame->m_refPicCnt[1])
114
+        {
115
+            m_mcstfPicList.removeMCSTF(*curFrame);
116
+            iterFrame = m_mcstfPicList.first();
117
+        }
118
+    }
119
+
120
+    iterFrame = m_mcstfOrigPicList.first();
121
+
122
+    while (iterFrame)
123
+    {
124
+        Frame *curFrame = iterFrame;
125
+        iterFrame = iterFrame->m_nextMCSTF;
126
+        if (!curFrame->m_refPicCnt[1])
127
+        {
128
+            m_mcstfOrigPicList.removeMCSTF(*curFrame);
129
+            *curFrame->m_isSubSampled = false;
130
+            m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
131
+            iterFrame = m_mcstfOrigPicList.first();
132
+        }
133
+    }
134
+}
135
+
136
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
137
+{
138
+    m_mcstfOrigPicFreeList.pushBack(*inFrame);
139
+}
140
+
141
+TemporalFilter::TemporalFilter()
142
+{
143
+    m_sourceWidth = 0;
144
+    m_sourceHeight = 0,
145
+    m_QP = 0;
146
+    m_sliceTypeConfig = 3;
147
+    m_numRef = 0;
148
+    m_useSADinME = 1;
149
+
150
+    m_range = 2;
151
+    m_chromaFactor = 0.55;
152
+    m_sigmaMultiplier = 9.0;
153
+    m_sigmaZeroPoint = 10.0;
154
+    m_motionVectorFactor = 16;
155
+}
156
+
157
+void TemporalFilter::init(const x265_param* param)
158
+{
159
+    m_param = param;
160
+    m_bitDepth = param->internalBitDepth;
161
+    m_sourceWidth = param->sourceWidth;
162
+    m_sourceHeight = param->sourceHeight;
163
+    m_internalCsp = param->internalCsp;
164
+    m_numComponents = (m_internalCsp != X265_CSP_I400) ? MAX_NUM_COMPONENT : 1;
165
+
166
+    m_metld = new MotionEstimatorTLD;
167
+
168
+    predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
169
+}
170
+
171
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
172
+{
173
+    CHECKED_MALLOC_ZERO(refFrame->mvs, MV, sizeof(MV)* ((m_sourceWidth ) / 4) * ((m_sourceHeight ) / 4));
174
+    refFrame->mvsStride = m_sourceWidth / 4;
175
+    CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
176
+    refFrame->mvsStride0 = m_sourceWidth / 16;
177
+    CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
178
+    refFrame->mvsStride1 = m_sourceWidth / 16;
179
+    CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, sizeof(MV)* ((m_sourceWidth ) / 16)*((m_sourceHeight ) / 16));
180
+    refFrame->mvsStride2 = m_sourceWidth / 16;
181
+
182
+    CHECKED_MALLOC_ZERO(refFrame->noise, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
183
+    CHECKED_MALLOC_ZERO(refFrame->error, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
184
+
185
+    refFrame->slicetype = X265_TYPE_AUTO;
186
+
187
+    refFrame->compensatedPic = new PicYuv;
188
+    refFrame->compensatedPic->create(param, true);
189
+
190
+    return 1;
191
+fail:
192
+    return 0;
193
+}
194
+
195
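+// SAD cost of a bs x bs block at motion (dx, dy) in 1/16-pel units: full-pel
+// motion (low 4 bits of dx/dy clear) goes straight to the assembly SAD
+// primitive; fractional motion first interpolates with taps 1..6 of the
+// 8-tap table, with an early-out once the error exceeds 'besterror'.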
+int TemporalFilter::motionErrorLumaSAD(
196
+    PicYuv *orig,
197
+    PicYuv *buffer,
198
+    int x,
199
+    int y,
200
+    int dx,
201
+    int dy,
202
+    int bs,
203
+    int besterror)
204
+{
205
+
206
+    pixel* origOrigin = orig->m_picOrg[0];
207
+    intptr_t origStride = orig->m_stride;
208
+    pixel *buffOrigin = buffer->m_picOrg[0];
209
+    intptr_t buffStride = buffer->m_stride;
210
+    int error = 0;// dx * 10 + dy * 10;
211
+    if (((dx | dy) & 0xF) == 0)
212
+    {
213
+        dx /= m_motionVectorFactor;
214
+        dy /= m_motionVectorFactor;
215
+
216
+        const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);
217
+#if 0
218
+        const pixel* origRowStart = origOrigin + y *origStride + x;
219
+
220
+        for (int y1 = 0; y1 < bs; y1++)
221
+        {
222
+            for (int x1 = 0; x1 < bs; x1++)
223
+            {
224
+                int diff = origRowStart[x1] - bufferRowStart[x1];
225
+                error += abs(diff);
226
+            }
227
+
228
+            origRowStart += origStride;
229
+            bufferRowStart += buffStride;
230
+        }
231
+#else
232
+        int partEnum = partitionFromSizes(bs, bs);
233
+        /* copy PU block into cache */
234
+        primitives.pu[partEnum].copy_pp(predPUYuv.m_buf[0], FENC_STRIDE, bufferRowStart, buffStride);
235
+
236
+        error = m_metld->me.bufSAD(predPUYuv.m_buf[0], FENC_STRIDE);
237
+#endif
238
+        if (error > besterror)
239
+        {
240
+            return error;
241
+        }
242
+    }
243
+    else
244
+    {
245
+        const int *xFilter = s_interpolationFilter[dx & 0xF];
246
+        const int *yFilter = s_interpolationFilter[dy & 0xF];
247
+        int tempArray[64 + 8][64];
248
+
249
+        int iSum, iBase;
250
+        for (int y1 = 1; y1 < bs + 7; y1++)
251
+        {
252
+            const int yOffset = y + y1 + (dy >> 4) - 3;
253
+            const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
254
+            for (int x1 = 0; x1 < bs; x1++)
255
+            {
256
+                iSum = 0;
257
+                iBase = x + x1 + (dx >> 4) - 3;
258
+                const pixel *rowStart = sourceRow + iBase;
259
+
260
+                iSum += xFilter[1] * rowStart[1];
261
+                iSum += xFilter[2] * rowStart[2];
262
+                iSum += xFilter[3] * rowStart[3];
263
+                iSum += xFilter[4] * rowStart[4];
264
+                iSum += xFilter[5] * rowStart[5];
265
+                iSum += xFilter[6] * rowStart[6];
266
+
267
+                tempArray[y1][x1] = iSum;
268
+            }
269
+        }
270
+
271
+        const pixel maxSampleValue = (1 << m_bitDepth) - 1;
272
+        for (int y1 = 0; y1 < bs; y1++)
273
+        {
274
+            const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
275
+            for (int x1 = 0; x1 < bs; x1++)
276
+            {
277
+                iSum = 0;
278
+                iSum += yFilter[1] * tempArray[y1 + 1][x1];
279
+                iSum += yFilter[2] * tempArray[y1 + 2][x1];
280
+                iSum += yFilter[3] * tempArray[y1 + 3][x1];
281
+                iSum += yFilter[4] * tempArray[y1 + 4][x1];
282
+                iSum += yFilter[5] * tempArray[y1 + 5][x1];
283
+                iSum += yFilter[6] * tempArray[y1 + 6][x1];
284
+
285
+                iSum = (iSum + (1 << 11)) >> 12;
286
+                iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);
287
+
288
+                error += abs(iSum - origRow[x + x1]);
289
+            }
290
+            if (error > besterror)
291
+            {
292
+                return error;
293
+            }
294
+        }
295
+    }
296
+    return error;
297
+}
298
+
299
+int TemporalFilter::motionErrorLumaSSD(
300
+    PicYuv *orig,
301
+    PicYuv *buffer,
302
+    int x,
303
+    int y,
304
+    int dx,
305
+    int dy,
306
+    int bs,
307
+    int besterror)
308
+{
309
+
310
+    pixel* origOrigin = orig->m_picOrg[0];
311
+    intptr_t origStride = orig->m_stride;
312
+    pixel *buffOrigin = buffer->m_picOrg[0];
313
+    intptr_t buffStride = buffer->m_stride;
314
+    int error = 0;// dx * 10 + dy * 10;
315
+    if (((dx | dy) & 0xF) == 0)
316
+    {
317
+        dx /= m_motionVectorFactor;
318
+        dy /= m_motionVectorFactor;
319
+
320
+        const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);
321
+#if 0
322
+        const pixel* origRowStart = origOrigin + y * origStride + x;
323
+
324
+        for (int y1 = 0; y1 < bs; y1++)
325
+        {
326
+            for (int x1 = 0; x1 < bs; x1++)
327
+            {
328
+                int diff = origRowStart[x1] - bufferRowStart[x1];
329
+                error += diff * diff;
330
+            }
331
+
332
+            origRowStart += origStride;
333
+            bufferRowStart += buffStride;
334
+        }
335
+#else
336
+        int partEnum = partitionFromSizes(bs, bs);
337
+        /* copy PU block into cache */
338
+        primitives.pu[partEnum].copy_pp(predPUYuv.m_buf[0], FENC_STRIDE, bufferRowStart, buffStride);
339
+
340
+        error = (int)primitives.cu[partEnum].sse_pp(m_metld->me.fencPUYuv.m_buf[0], FENC_STRIDE, predPUYuv.m_buf[0], FENC_STRIDE);
341
+
342
+#endif
343
+        if (error > besterror)
344
+        {
345
+            return error;
346
+        }
347
+    }
348
+    else
349
+    {
350
+        const int *xFilter = s_interpolationFilter[dx & 0xF];
351
+        const int *yFilter = s_interpolationFilter[dy & 0xF];
352
+        int tempArray[64 + 8][64];
353
+
354
+        int iSum, iBase;
355
+        for (int y1 = 1; y1 < bs + 7; y1++)
356
+        {
357
+            const int yOffset = y + y1 + (dy >> 4) - 3;
358
+            const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
359
+            for (int x1 = 0; x1 < bs; x1++)
360
+            {
361
+                iSum = 0;
362
+                iBase = x + x1 + (dx >> 4) - 3;
363
+                const pixel *rowStart = sourceRow + iBase;
364
+
365
+                iSum += xFilter[1] * rowStart[1];
366
+                iSum += xFilter[2] * rowStart[2];
367
+                iSum += xFilter[3] * rowStart[3];
368
+                iSum += xFilter[4] * rowStart[4];
369
+                iSum += xFilter[5] * rowStart[5];
370
+                iSum += xFilter[6] * rowStart[6];
371
+
372
+                tempArray[y1][x1] = iSum;
373
+            }
374
+        }
375
+
376
+        const pixel maxSampleValue = (1 << m_bitDepth) - 1;
377
+        for (int y1 = 0; y1 < bs; y1++)
378
+        {
379
+            const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
380
+            for (int x1 = 0; x1 < bs; x1++)
381
+            {
382
+                iSum = 0;
383
+                iSum += yFilter[1] * tempArray[y1 + 1][x1];
384
+                iSum += yFilter[2] * tempArray[y1 + 2][x1];
385
+                iSum += yFilter[3] * tempArray[y1 + 3][x1];
386
+                iSum += yFilter[4] * tempArray[y1 + 4][x1];
387
+                iSum += yFilter[5] * tempArray[y1 + 5][x1];
388
+                iSum += yFilter[6] * tempArray[y1 + 6][x1];
389
+
390
+                iSum = (iSum + (1 << 11)) >> 12;
391
+                iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);
392
+
393
+                error += (iSum - origRow[x + x1]) * (iSum - origRow[x + x1]);
394
+            }
395
+            if (error > besterror)
396
+            {
397
+                return error;
398
+            }
399
+        }
400
+    }
401
+    return error;
402
+}
403
+
404
+void TemporalFilter::applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output)
405
+{
406
+    static const int lumaBlockSize = 8;
407
+    int srcStride = 0;
408
+    int dstStride = 0;
409
+    int csx = 0, csy = 0;
410
+    for (int c = 0; c < m_numComponents; c++)
411
+    {
412
+        const pixel maxValue = (1 << X265_DEPTH) - 1;
413
+
414
+        const pixel *pSrcImage = input->m_picOrg[c];
415
+        pixel *pDstImage = output->m_picOrg[c];
416
+
417
+        if (!c)
418
+        {
419
+            srcStride = (int)input->m_stride;
420
+            dstStride = (int)output->m_stride;
421
+        }
422
+        else
423
+        {
424
+            srcStride = (int)input->m_strideC;
425
+            dstStride = (int)output->m_strideC;
426
+            csx = CHROMA_H_SHIFT(m_internalCsp);
427
+            csy = CHROMA_V_SHIFT(m_internalCsp);
428
+        }
429
+        const int blockSizeX = lumaBlockSize >> csx;
430
+        const int blockSizeY = lumaBlockSize >> csy;
431
+        const int height = input->m_picHeight >> csy;
432
+        const int width = input->m_picWidth >> csx;
433
+
434
+        for (int y = 0, blockNumY = 0; y + blockSizeY <= height; y += blockSizeY, blockNumY++)
435
+        {
436
+            for (int x = 0, blockNumX = 0; x + blockSizeX <= width; x += blockSizeX, blockNumX++)
437
+            {
438
+                int mvIdx = blockNumY * mvsStride + blockNumX;
439
+                const MV &mv = mvs[mvIdx];
440
+                const int dx = mv.x >> csx;
441
+                const int dy = mv.y >> csy;
442
+                const int xInt = mv.x >> (4 + csx);
443
+                const int yInt = mv.y >> (4 + csy);
444
+
445
+                const int *xFilter = s_interpolationFilter[dx & 0xf];
446
+                const int *yFilter = s_interpolationFilter[dy & 0xf]; // will add 6 bit.
447
+                const int numFilterTaps = 7;
448
+                const int centreTapOffset = 3;
449
+
450
+                int tempArray[lumaBlockSize + numFilterTaps][lumaBlockSize];
451
+
452
+                for (int by = 1; by < blockSizeY + numFilterTaps; by++)
453
+                {
454
+                    const int yOffset = y + by + yInt - centreTapOffset;
455
+                    const pixel *sourceRow = pSrcImage + yOffset * srcStride;
456
+                    for (int bx = 0; bx < blockSizeX; bx++)
457
+                    {
458
+                        int iBase = x + bx + xInt - centreTapOffset;
459
+                        const pixel *rowStart = sourceRow + iBase;
460
+
461
+                        int iSum = 0;
462
+                        iSum += xFilter[1] * rowStart[1];
463
+                        iSum += xFilter[2] * rowStart[2];
464
+                        iSum += xFilter[3] * rowStart[3];
465
+                        iSum += xFilter[4] * rowStart[4];
466
+                        iSum += xFilter[5] * rowStart[5];
467
+                        iSum += xFilter[6] * rowStart[6];
468
+
469
+                        tempArray[by][bx] = iSum;
470
+                    }
471
+                }
472
+
473
+                pixel *pDstRow = pDstImage + y * dstStride;
474
+                for (int by = 0; by < blockSizeY; by++, pDstRow += dstStride)
475
+                {
476
+                    pixel *pDstPel = pDstRow + x;
477
+                    for (int bx = 0; bx < blockSizeX; bx++, pDstPel++)
478
+                    {
479
+                        int iSum = 0;
480
+
481
+                        iSum += yFilter[1] * tempArray[by + 1][bx];
482
+                        iSum += yFilter[2] * tempArray[by + 2][bx];
483
+                        iSum += yFilter[3] * tempArray[by + 3][bx];
484
+                        iSum += yFilter[4] * tempArray[by + 4][bx];
485
+                        iSum += yFilter[5] * tempArray[by + 5][bx];
486
+                        iSum += yFilter[6] * tempArray[by + 6][bx];
487
+
488
+                        iSum = (iSum + (1 << 11)) >> 12;
489
+                        iSum = iSum < 0 ? 0 : (iSum > maxValue ? maxValue : iSum);
490
+                        *pDstPel = (pixel)iSum;
491
+                    }
492
+                }
493
+            }
494
+        }
495
+    }
496
+}
497
+
498
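+// Core MCSTF step: blends each source sample with its motion-compensated
+// references. Per-block noise/error estimates modulate the per-reference
+// weight, and s_refStrengths[refStrengthRow][index] scales it by how complete
+// the reference window is and by the |POC offset| of each reference.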
+void TemporalFilter::bilateralFilter(Frame* frame,
499
+    TemporalFilterRefPicInfo* m_mcstfRefList,
500
+    double overallStrength)
501
+{
502
+
503
+    const int numRefs = frame->m_mcstf->m_numRef;
504
+
505
+    for (int i = 0; i < numRefs; i++)
506
+    {
507
+        TemporalFilterRefPicInfo *ref = &m_mcstfRefList[i];
508
+        applyMotion(m_mcstfRefList[i].mvs, m_mcstfRefList[i].mvsStride, m_mcstfRefList[i].picBuffer, ref->compensatedPic);
509
+    }
510
+
511
+    int refStrengthRow = 2;
512
+    if (numRefs == m_range * 2)
513
+    {
514
+        refStrengthRow = 0;
515
+    }
516
+    else if (numRefs == m_range)
517
+    {
518
+        refStrengthRow = 1;
519
+    }
520
+
521
+    const double lumaSigmaSq = (m_QP - m_sigmaZeroPoint) * (m_QP - m_sigmaZeroPoint) * m_sigmaMultiplier;
522
+    const double chromaSigmaSq = 30 * 30;
523
+
524
+    PicYuv* orgPic = frame->m_fencPic;
525
+
526
+    for (int c = 0; c < m_numComponents; c++)
527
+    {
528
+        int height, width;
529
+        pixel *srcPelRow = NULL;
530
+        intptr_t srcStride, correctedPicsStride = 0;
531
+
532
+        if (!c)
533
+        {
534
+            height = orgPic->m_picHeight;
535
+            width = orgPic->m_picWidth;
536
+            srcPelRow = orgPic->m_picOrg[c];
537
+            srcStride = orgPic->m_stride;
538
+        }
539
+        else
540
+        {
541
+            int csx = CHROMA_H_SHIFT(m_internalCsp);
542
+            int csy = CHROMA_V_SHIFT(m_internalCsp);
543
+
544
+            height = orgPic->m_picHeight >> csy;
545
+            width = orgPic->m_picWidth >> csx;
546
+            srcPelRow = orgPic->m_picOrg[c];
547
+            srcStride = (int)orgPic->m_strideC;
548
+        }
549
+
550
+        const double sigmaSq = (!c)  ? lumaSigmaSq : chromaSigmaSq;
551
+        const double weightScaling = overallStrength * ( (!c) ? 0.4 : m_chromaFactor);
552
+
553
+        const double maxSampleValue = (1 << m_bitDepth) - 1;
554
+        const double bitDepthDiffWeighting = 1024.0 / (maxSampleValue + 1);
555
+
556
+        const int blkSize = (!c) ? 8 : 4;
557
+
558
+        for (int y = 0; y < height; y++, srcPelRow += srcStride)
559
+        {
560
+            pixel *srcPel = srcPelRow;
561
+
562
+            for (int x = 0; x < width; x++, srcPel++)
563
+            {
564
+                const int orgVal = (int)*srcPel;
565
+                double temporalWeightSum = 1.0;
566
+                double newVal = (double)orgVal;
567
+
568
+                if ((y % blkSize == 0) && (x % blkSize == 0))
569
+                {
570
+                    for (int i = 0; i < numRefs; i++)
571
+                    {
572
+                        TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefList[i];
573
+
574
+                        if (!c)
575
+                            correctedPicsStride = refPicInfo->compensatedPic->m_stride;
576
+                        else
577
+                            correctedPicsStride = refPicInfo->compensatedPic->m_strideC;
578
+
579
+                        double variance = 0, diffsum = 0;
580
+                        for (int y1 = 0; y1 < blkSize - 1; y1++)
581
+                        {
582
+                            for (int x1 = 0; x1 < blkSize - 1; x1++)
583
+                            {
584
+                                int pix = *(srcPel + x1);
585
+                                int pixR = *(srcPel + x1 + 1);
586
+                                int pixD = *(srcPel + x1 + srcStride);
587
+
588
+                                int ref = *(refPicInfo->compensatedPic->m_picOrg[c] + ((y + y1) * correctedPicsStride + x + x1));
590
+                                int refR = *(refPicInfo->compensatedPic->m_picOrg[c] + ((y + y1) * correctedPicsStride + x + x1 + 1));
591
+                                int refD = *(refPicInfo->compensatedPic->m_picOrg[c] + ((y + y1 + 1) * correctedPicsStride + x + x1));
591
+
592
+                                int diff = pix - ref;
593
+                                int diffR = pixR - refR;
594
+                                int diffD = pixD - refD;
595
+
596
+                                variance += diff * diff;
597
+                                diffsum += (diffR - diff) * (diffR - diff);
598
+                                diffsum += (diffD - diff) * (diffD - diff);
599
+                            }
600
+                        }
601
+
602
+                        refPicInfo->noise[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)] = (int)round((300 * variance + 50) / (10 * diffsum + 50));
603
+                    }
604
+                }
605
+
606
+                double minError = 9999999;
607
+                for (int i = 0; i < numRefs; i++)
608
+                {
609
+                    TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefList[i];
610
+                    minError = X265_MIN(minError, (double)refPicInfo->error[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)]);
611
+                }
612
+
613
+                for (int i = 0; i < numRefs; i++)
614
+                {
615
+                    TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefList[i];
616
+
617
+                    const int error = refPicInfo->error[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)];
618
+                    const int noise = refPicInfo->noise[(y / blkSize) * refPicInfo->mvsStride + (x / blkSize)];
619
+
620
+                    const pixel *pCorrectedPelPtr = refPicInfo->compensatedPic->m_picOrg[c] + (y * correctedPicsStride + x);
621
+                    const int refVal = (int)*pCorrectedPelPtr;
622
+                    double diff = (double)(refVal - orgVal);
623
+                    diff *= bitDepthDiffWeighting;
624
+                    double diffSq = diff * diff;
625
+
626
+                    const int index = X265_MIN(3, std::abs(refPicInfo->origOffset) - 1);
627
+                    double ww = 1, sw = 1;
628
+                    ww *= (noise < 25) ? 1 : 1.2;
629
+                    sw *= (noise < 25) ? 1.3 : 0.8;
630
+                    ww *= (error < 50) ? 1.2 : ((error > 100) ? 0.8 : 1);
631
+                    sw *= (error < 50) ? 1.3 : 1;
632
+                    ww *= ((minError + 1) / (error + 1));
633
+                    const double weight = weightScaling * s_refStrengths[refStrengthRow][index] * ww * exp(-diffSq / (2 * sw * sigmaSq));
634
+
635
+                    newVal += weight * refVal;
636
+                    temporalWeightSum += weight;
637
+                }
638
+                newVal /= temporalWeightSum;
639
+                double sampleVal = round(newVal);
640
+                sampleVal = (sampleVal < 0 ? 0 : (sampleVal > maxSampleValue ? maxSampleValue : sampleVal));
641
+                *srcPel = (pixel)sampleVal;
642
+            }
643
+        }
644
+    }
645
+}
646
+
647
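+// Block ME for one pyramid level: candidates are seeded from the co-located
+// and neighbouring MVs of the previous (half-resolution) level, scaled by
+// 'factor', then refined by a +/-range full-pel raster search plus the
+// above/left spatial predictors.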
+void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
648
+    MV *previous, uint32_t prevMvStride, int factor)
649
+{
650
+
651
+    int range = 5;
652
+
653
+
654
+    const int stepSize = blockSize;
655
+
656
+    const int origWidth = orig->m_picWidth;
657
+    const int origHeight = orig->m_picHeight;
658
+
659
+    int error;
660
+
661
+    for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)
662
+    {
663
+        for (int blockX = 0; blockX + blockSize <= origWidth; blockX += stepSize)
664
+        {
665
+            const intptr_t pelOffset = blockY * orig->m_stride + blockX;
666
+            m_metld->me.setSourcePU(orig->m_picOrg[0], orig->m_stride, pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
667
+
668
+
669
+            MV best(0, 0);
670
+            int leastError = INT_MAX;
671
+
672
+            if (previous == NULL)
673
+            {
674
+                range = 8;
675
+            }
676
+            else
677
+            {
678
+
679
+                for (int py = -1; py <= 1; py++)
680
+                {
681
+                    int testy = blockY / (2 * blockSize) + py;
682
+
683
+                    for (int px = -1; px <= 1; px++)
684
+                    {
685
+
686
+                        int testx = blockX / (2 * blockSize) + px;
687
+                        if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize)))
688
+                        {
689
+                            int mvIdx = testy * prevMvStride + testx;
690
+                            MV old = previous[mvIdx];
691
+
692
+                            if (m_useSADinME)
693
+                                error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
694
+                            else
695
+                                error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
696
+
697
+                            if (error < leastError)
698
+                            {
699
+                                best.set(old.x * factor, old.y * factor);
700
+                                leastError = error;
701
+                            }
702
+                        }
703
+                    }
704
+                }
705
+
706
+                if (m_useSADinME)
707
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
708
+                else
709
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
710
+
711
+                if (error < leastError)
712
+                {
713
+                    best.set(0, 0);
714
+                    leastError = error;
715
+                }
716
+
717
+            }
718
+
719
+            MV prevBest = best;
720
+            for (int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2++)
721
+            {
722
+                for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
723
+                {
724
+                    if (m_useSADinME)
725
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
726
+                    else
727
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
728
+                    if (error < leastError)
729
+                    {
730
+                        best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);
731
+                        leastError = error;
732
+                    }
733
+                }
734
+            }
735
+
736
+            if (blockY > 0)
737
+            {
738
+                int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);
739
+                MV aboveMV = mvs[idx];
740
+
741
+                if (m_useSADinME)
742
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
743
+                else
744
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
745
+
746
+                if (error < leastError)
747
+                {
748
+                    best.set(aboveMV.x, aboveMV.y);
749
+                    leastError = error;
750
+                }
751
+            }
752
+
753
+            if (blockX > 0)
754
+            {
755
+                int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);
756
+                MV leftMV = mvs[idx];
757
+
758
+                if (m_useSADinME)
759
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
760
+                else
761
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
762
+
763
+                if (error < leastError)
764
+                {
765
+                    best.set(leftMV.x, leftMV.y);
766
+                    leastError = error;
767
+                }
768
+            }
769
+
770
+            // calculate average
771
+            double avg = 0.0;
772
+            for (int x1 = 0; x1 < blockSize; x1++)
773
+            {
774
+                for (int y1 = 0; y1 < blockSize; y1++)
775
+                {
776
+                    avg = avg + *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
777
+                }
778
+            }
779
+            avg = avg / (blockSize * blockSize);
780
+
781
+            // calculate variance
782
+            double variance = 0;
783
+            for (int x1 = 0; x1 < blockSize; x1++)
784
+            {
785
+                for (int y1 = 0; y1 < blockSize; y1++)
786
+                {
787
+                    int pix = *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
788
+                    variance = variance + (pix - avg) * (pix - avg);
789
+                }
790
+            }
791
+
792
+            leastError = (int)(20 * ((leastError + 5.0) / (variance + 5.0)) + (leastError / (blockSize * blockSize)) / 50);
793
+
794
+            int mvIdx = (blockY / stepSize) * mvStride + (blockX / stepSize);
795
+            mvs[mvIdx] = best;
796
+        }
797
+    }
798
+}
799
+
800
+
801
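+// Same search as motionEstimationLuma, with two extra refinement passes in
+// quarter-pel (step 4 on the 1/16-pel grid) and 1/16-pel steps; it also
+// records the per-block normalized minimum error consumed as the 'error' map.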
+void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
802
+    MV *previous, uint32_t prevMvStride, int factor, int* minError)
803
+{
804
+
805
+    int range = 0;
806
+
807
+
808
+    const int stepSize = blockSize;
809
+
810
+    const int origWidth = orig->m_picWidth;
811
+    const int origHeight = orig->m_picHeight;
812
+
813
+    int error;
814
+
815
+    for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)
816
+    {
817
+        for (int blockX = 0; blockX + blockSize <= origWidth; blockX += stepSize)
818
+        {
819
+
820
+            const intptr_t pelOffset = blockY * orig->m_stride + blockX;
821
+            m_metld->me.setSourcePU(orig->m_picOrg[0], orig->m_stride, pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
822
+
823
+            MV best(0, 0);
824
+            int leastError = INT_MAX;
825
+
826
+            if (previous == NULL)
827
+            {
828
+                range = 8;
829
+            }
830
+            else
831
+            {
832
+
833
+                for (int py = -1; py <= 1; py++)
834
+                {
835
+                    int testy = blockY / (2 * blockSize) + py;
836
+
837
+                    for (int px = -1; px <= 1; px++)
838
+                    {
839
+
840
+                        int testx = blockX / (2 * blockSize) + px;
841
+                        if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize)))
842
+                        {
843
+                            int mvIdx = testy * prevMvStride + testx;
844
+                            MV old = previous[mvIdx];
845
+
846
+                            if (m_useSADinME)
847
+                                error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
848
+                            else
849
+                                error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
850
+
851
+                            if (error < leastError)
852
+                            {
853
+                                best.set(old.x * factor, old.y * factor);
854
+                                leastError = error;
855
+                            }
856
+                        }
857
+                    }
858
+                }
859
+
860
+                if (m_useSADinME)
861
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
862
+                else
863
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
864
+
865
+                if (error < leastError)
866
+                {
867
+                    best.set(0, 0);
868
+                    leastError = error;
869
+                }
870
+
871
+            }
872
+
873
+            MV prevBest = best;
874
+            for (int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2++)
875
+            {
876
+                for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
877
+                {
878
+                    if (m_useSADinME)
879
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
880
+                    else
881
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
882
+
883
+                    if (error < leastError)
884
+                    {
885
+                        best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);
886
+                        leastError = error;
887
+                    }
888
+                }
889
+            }
890
+
891
+            prevBest = best;
892
+            int doubleRange = 3 * 4;
893
+            for (int y2 = prevBest.y - doubleRange; y2 <= prevBest.y + doubleRange; y2 += 4)
894
+            {
895
+                for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2 += 4)
896
+                {
897
+                    if (m_useSADinME)
898
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
899
+                    else
900
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
901
+
902
+                    if (error < leastError)
903
+                    {
904
+                        best.set(x2, y2);
905
+                        leastError = error;
906
+                    }
907
+                }
908
+            }
909
+
910
+            prevBest = best;
911
+            doubleRange = 3;
912
+            for (int y2 = prevBest.y - doubleRange; y2 <= prevBest.y + doubleRange; y2++)
913
+            {
914
+                for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2++)
915
+                {
916
+                    if (m_useSADinME)
917
+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
918
+                    else
919
+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
920
+
921
+                    if (error < leastError)
922
+                    {
923
+                        best.set(x2, y2);
924
+                        leastError = error;
925
+                    }
926
+                }
927
+            }
928
+
929
+
930
+            if (blockY > 0)
931
+            {
932
+                int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);
933
+                MV aboveMV = mvs[idx];
934
+
935
+                if (m_useSADinME)
936
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
937
+                else
938
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
939
+
940
+                if (error < leastError)
941
+                {
942
+                    best.set(aboveMV.x, aboveMV.y);
943
+                    leastError = error;
944
+                }
945
+            }
946
+
947
+            if (blockX > 0)
948
+            {
949
+                int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);
950
+                MV leftMV = mvs[idx];
951
+
952
+                if (m_useSADinME)
953
+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
954
+                else
955
+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
956
+
957
+                if (error < leastError)
958
+                {
959
+                    best.set(leftMV.x, leftMV.y);
960
+                    leastError = error;
961
+                }
962
+            }
963
+
964
+            // calculate average
965
+            double avg = 0.0;
966
+            for (int x1 = 0; x1 < blockSize; x1++)
967
+            {
968
+                for (int y1 = 0; y1 < blockSize; y1++)
969
+                {
970
+                    avg = avg + *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
971
+                }
972
+            }
973
+            avg = avg / (blockSize * blockSize);
974
+
975
+            // calculate variance
976
+            double variance = 0;
977
+            for (int x1 = 0; x1 < blockSize; x1++)
978
+            {
979
+                for (int y1 = 0; y1 < blockSize; y1++)
980
+                {
981
+                    int pix = *(orig->m_picOrg[0] + (blockX + x1 + orig->m_stride * (blockY + y1)));
982
+                    variance = variance + (pix - avg) * (pix - avg);
983
+                }
984
+            }
985
+
986
+            leastError = (int)(20 * ((leastError + 5.0) / (variance + 5.0)) + (leastError / (blockSize * blockSize)) / 50);
987
+
988
+            int mvIdx = (blockY / stepSize) * mvStride + (blockX / stepSize);
989
+            mvs[mvIdx] = best;
990
+            minError[mvIdx] = leastError;
991
+        }
992
+    }
993
+}
994
+
995
+void TemporalFilter::destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame)
996
+{
997
+    if (curFrame)
998
+    {
999
+        if (curFrame->compensatedPic)
1000
+        {
1001
+            curFrame->compensatedPic->destroy();
1002
+            delete curFrame->compensatedPic;
1003
+        }
1004
+
1005
+        if (curFrame->mvs)
1006
+            X265_FREE(curFrame->mvs);
1007
+        if (curFrame->mvs0)
1008
+            X265_FREE(curFrame->mvs0);
1009
+        if (curFrame->mvs1)
1010
+            X265_FREE(curFrame->mvs1);
1011
+        if (curFrame->mvs2)
1012
+            X265_FREE(curFrame->mvs2);
1013
+        if (curFrame->noise)
1014
+            X265_FREE(curFrame->noise);
1015
+        if (curFrame->error)
1016
+            X265_FREE(curFrame->error);
1017
+    }
1018
+}
1019
x265_3.6.tar.gz/source/common/temporalfilter.h Added
187
 
1
@@ -0,0 +1,185 @@
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+
25
+#ifndef X265_TEMPORAL_FILTER_H
26
+#define X265_TEMPORAL_FILTER_H
27
+
28
+#include "x265.h"
29
+#include "picyuv.h"
30
+#include "mv.h"
31
+#include "piclist.h"
32
+#include "yuv.h"
33
+#include "motion.h"
34
+
35
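+// 16-phase, 8-tap (6 non-zero taps) luma interpolation table with 6-bit
+// coefficients summing to 64; the row index is the 1/16-pel fractional offset.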
+const int s_interpolationFilter[16][8] =
36
+{
37
+    {   0,   0,   0,  64,   0,   0,   0,   0 },   //0
38
+    {   0,   1,  -3,  64,   4,  -2,   0,   0 },   //1 -->-->
39
+    {   0,   1,  -6,  62,   9,  -3,   1,   0 },   //2 -->
40
+    {   0,   2,  -8,  60,  14,  -5,   1,   0 },   //3 -->-->
41
+    {   0,   2,  -9,  57,  19,  -7,   2,   0 },   //4
42
+    {   0,   3, -10,  53,  24,  -8,   2,   0 },   //5 -->-->
43
+    {   0,   3, -11,  50,  29,  -9,   2,   0 },   //6 -->
44
+    {   0,   3, -11,  44,  35, -10,   3,   0 },   //7 -->-->
45
+    {   0,   1,  -7,  38,  38,  -7,   1,   0 },   //8
46
+    {   0,   3, -10,  35,  44, -11,   3,   0 },   //9 -->-->
47
+    {   0,   2,  -9,  29,  50, -11,   3,   0 },   //10-->
48
+    {   0,   2,  -8,  24,  53, -10,   3,   0 },   //11-->-->
49
+    {   0,   2,  -7,  19,  57,  -9,   2,   0 },   //12
50
+    {   0,   1,  -5,  14,  60,  -8,   2,   0 },   //13-->-->
51
+    {   0,   1,  -3,   9,  62,  -6,   1,   0 },   //14-->
52
+    {   0,   0,  -2,   4,  64,  -3,   1,   0 }    //15-->-->
53
+};
54
+
55
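+// Filter strength per |POC offset| (columns 1..4): row 0 applies when the
+// full window of m_range * 2 references is available, row 1 with m_range,
+// and row 2 otherwise.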
+const double s_refStrengths[3][4] =
56
+{ // abs(POC offset)
57
+  //  1,    2     3     4
58
+  {0.85, 0.57, 0.41, 0.33},  // m_range * 2
59
+  {1.13, 0.97, 0.81, 0.57},  // m_range
60
+  {0.30, 0.30, 0.30, 0.30}   // otherwise
61
+};
62
+
63
+namespace X265_NS {
64
+    class OrigPicBuffer
65
+    {
66
+    public:
67
+        PicList    m_mcstfPicList;
68
+        PicList    m_mcstfOrigPicFreeList;
69
+        PicList    m_mcstfOrigPicList;
70
+
71
+        ~OrigPicBuffer();
72
+        void addPicture(Frame*);
73
+        void addEncPicture(Frame*);
74
+        void setOrigPicList(Frame*, int);
75
+        void recycleOrigPicList();
76
+        void addPictureToFreelist(Frame*);
77
+        void addEncPictureToPicList(Frame*);
78
+    };
79
+
80
+    struct MotionEstimatorTLD
81
+    {
82
+        MotionEstimate  me;
83
+
84
+        MotionEstimatorTLD()
85
+        {
86
+            me.init(X265_CSP_I400);
87
+            me.setQP(X265_LOOKAHEAD_QP);
88
+        }
89
+
90
+        ~MotionEstimatorTLD() {}
91
+    };
92
+
93
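+    // Per-reference ME state: mvs0/mvs1/mvs2 and their strides appear to hold
+    // the MV fields of the coarse-to-fine search levels and mvs the final
+    // field; 'error' and 'noise' are the per-block maps read by bilateralFilter().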
+    struct TemporalFilterRefPicInfo
94
+    {
95
+        PicYuv*    picBuffer;
96
+        PicYuv*    picBufferSubSampled2;
97
+        PicYuv*    picBufferSubSampled4;
98
+        MV*        mvs;
99
+        MV*        mvs0;
100
+        MV*        mvs1;
101
+        MV*        mvs2;
102
+        uint32_t   mvsStride;
103
+        uint32_t   mvsStride0;
104
+        uint32_t   mvsStride1;
105
+        uint32_t   mvsStride2;
106
+        int*       error;
107
+        int*       noise;
108
+
109
+        int16_t    origOffset;
110
+        bool       isFilteredFrame;
111
+        PicYuv*    compensatedPic;
112
+
113
+        int*       isSubsampled;
114
+
115
+        int        slicetype;
116
+    };
117
+
118
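+    // MCSTF filter driver: callers init() it once, then are expected to run
+    // the hierarchical motionEstimationLuma*() passes and bilateralFilter()
+    // per frame over the window kept in OrigPicBuffer (see temporalfilter.cpp).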
+    class TemporalFilter
119
+    {
120
+    public:
121
+        TemporalFilter();
122
+        ~TemporalFilter() {}
123
+
124
+        void init(const x265_param* param);
125
+
126
+        //private:
127
+            // Private static member variables
128
+        const x265_param *m_param;
129
+        int32_t  m_bitDepth;
130
+        int m_range;
131
+        uint8_t m_numRef;
132
+        double m_chromaFactor;
133
+        double m_sigmaMultiplier;
134
+        double m_sigmaZeroPoint;
135
+        int m_motionVectorFactor;
136
+        int m_padding;
137
+
138
+        // Private member variables
139
+
140
+        int m_sourceWidth;
141
+        int m_sourceHeight;
142
+        int m_QP;
143
+
144
+        int m_internalCsp;
145
+        int m_numComponents;
146
+        uint8_t m_sliceTypeConfig;
147
+
148
+        MotionEstimatorTLD* m_metld;
149
+        Yuv  predPUYuv;
150
+        int m_useSADinME;
151
+
152
+        int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
153
+
154
+        void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
155
+
156
+        void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
157
+            MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
158
+
159
+        void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
160
+            MV *previous, uint32_t prevMvStride, int factor, int* minError);
161
+
162
+        int motionErrorLumaSSD(PicYuv *orig,
163
+            PicYuv *buffer,
164
+            int x,
165
+            int y,
166
+            int dx,
167
+            int dy,
168
+            int bs,
169
+            int besterror = 8 * 8 * 1024 * 1024);
170
+
171
+        int motionErrorLumaSAD(PicYuv *orig,
172
+            PicYuv *buffer,
173
+            int x,
174
+            int y,
175
+            int dx,
176
+            int dy,
177
+            int bs,
178
+            int besterror = 8 * 8 * 1024 * 1024);
179
+
180
+        void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
181
+
182
+        void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
183
+
184
+    };
185
+}
186
+#endif
187
x265_3.5.tar.gz/source/common/threading.h -> x265_3.6.tar.gz/source/common/threading.h Changed
340
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Steve Borho <steve@borho.org>
4
  *          Min Chen <chenm003@163.com>
5
+ *          liwei <liwei@multicorewareinc.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -253,6 +254,47 @@
10
     int                m_val;
11
 };
12
 
13
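+// Win32 named-semaphore wrapper (CreateSemaphoreA / ReleaseSemaphore /
+// WaitForSingleObject); a non-NULL name makes the semaphore visible across
+// processes, which the shared-memory ring buffer relies on.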
+class NamedSemaphore
14
+{
15
+public:
16
+    NamedSemaphore() : m_sem(NULL)
17
+    {
18
+    }
19
+
20
+    ~NamedSemaphore()
21
+    {
22
+    }
23
+
24
+    bool create(const char* name, const int initcnt, const int maxcnt)
25
+    {
26
+        if(!m_sem)
27
+        {
28
+            m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
29
+        }
30
+        return m_sem != NULL;
31
+    }
32
+
33
+    bool give(const int32_t cnt)
34
+    {
35
+        return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
36
+    }
37
+
38
+    bool take(const uint32_t time_out = INFINITE)
39
+    {
40
+        int32_t rt = WaitForSingleObject(m_sem, time_out);
41
+        return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
42
+    }
43
+
44
+    void release()
45
+    {
46
+        CloseHandle(m_sem);
47
+        m_sem = NULL;
48
+    }
49
+
50
+private:
51
+    HANDLE m_sem;
52
+};
53
+
54
 #else /* POSIX / pthreads */
55
 
56
 typedef pthread_t ThreadHandle;
57
@@ -459,6 +501,282 @@
58
     int             m_val;
59
 };
60
 
61
+#define TIMEOUT_INFINITE 0xFFFFFFFF
62
+
63
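+// POSIX counterpart built on sem_open(); named semaphores are used because
+// unnamed ones typically cannot be shared across processes. On macOS, where
+// sem_timedwait() is unavailable, a process-shared pthread mutex/condition
+// pair emulates the counting semaphore instead.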
+class NamedSemaphore
64
+{
65
+public:
66
+    NamedSemaphore() 
67
+        : m_sem(NULL)
68
+#ifndef __APPLE__
69
+        , m_name(NULL)
70
+#endif //__APPLE__
71
+    {
72
+    }
73
+
74
+    ~NamedSemaphore()
75
+    {
76
+    }
77
+
78
+    bool create(const char* name, const int initcnt, const int maxcnt)
79
+    {
80
+        bool ret = false;
81
+
82
+        if (initcnt >= maxcnt)
83
+        {
84
+            return false;
85
+        }
86
+
87
+#ifdef __APPLE__
88
+        do
89
+        {
90
+            int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
91
+
92
+            m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
93
+            if (!m_sem)
94
+            {
95
+                break;
96
+            }
97
+
98
+            if (pthread_mutexattr_init(&m_sem->mutexAttr))
99
+            {
100
+                break;
101
+            }
102
+
103
+            if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
104
+            {
105
+                break;
106
+            }
107
+
108
+            if (pthread_condattr_init(&m_sem->condAttr))
109
+            {
110
+                break;
111
+            }
112
+
113
+            if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
114
+            {
115
+                break;
116
+            }
117
+
118
+            if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
119
+            {
120
+                break;
121
+            }
122
+
123
+            if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
124
+            {
125
+                break;
126
+            }
127
+
128
+            m_sem->curCnt = initcnt;
129
+            m_sem->maxCnt = maxcnt;
130
+
131
+            ret = true;
132
+        } while (0);
133
+        
134
+        if (!ret)
135
+        {
136
+            release();
137
+        }
138
+
139
+#else  //__APPLE__
140
+        m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
141
+        if (m_sem != SEM_FAILED) 
142
+        {
143
+            m_name = strdup(name);
144
+            ret = true;
145
+        }
146
+        else 
147
+        {
148
+            if (EEXIST == errno) 
149
+            {
150
+                m_sem = sem_open(name, 0);
151
+                if (m_sem != SEM_FAILED) 
152
+                {
153
+                    m_name = strdup(name);
154
+                    ret = true;
155
+                }
156
+            }
157
+        }
158
+#endif //__APPLE__
159
+
160
+        return ret;
161
+    }
162
+
163
+    bool give(const int32_t cnt)
164
+    {
165
+        if (!m_sem)
166
+        {
167
+            return false;
168
+        }
169
+
170
+#ifdef __APPLE__
171
+        if (pthread_mutex_lock(&m_sem->mutex))
172
+        {
173
+            return false;
174
+        }
175
+
176
+        int oldCnt = m_sem->curCnt;
177
+        m_sem->curCnt += cnt;
178
+        if (m_sem->curCnt > m_sem->maxCnt)
179
+        {
180
+            m_sem->curCnt = m_sem->maxCnt;
181
+        }
182
+
183
+        bool ret = true;
184
+        if (!oldCnt)
185
+        {
186
+            ret = 0 == pthread_cond_broadcast(&m_sem->cond);
187
+        }
188
+
189
+        if (pthread_mutex_unlock(&m_sem->mutex))
190
+        {
191
+            return false;
192
+        }
193
+
194
+        return ret;
195
+#else //__APPLE__
196
+        int ret = 0;
197
+        int32_t curCnt = cnt;
198
+        while (curCnt-- && !ret) {
199
+            ret = sem_post(m_sem);
200
+        }
201
+
202
+        return 0 == ret;
203
+#endif //_APPLE__
204
+    }
205
+
206
+    bool take(const uint32_t time_out = TIMEOUT_INFINITE)
207
+    {
208
+        if (!m_sem)
209
+        {
210
+            return false;
211
+        }
212
+
213
+#ifdef __APPLE__
214
+
215
+        if (pthread_mutex_lock(&m_sem->mutex))
216
+        {
217
+            return false;
218
+        }
219
+
220
+        bool ret = true;
221
+        if (TIMEOUT_INFINITE == time_out) 
222
+        {
223
+            if (!m_sem->curCnt)
224
+            {
225
+                if (pthread_cond_wait(&m_sem->cond, &m_sem->mutex))
226
+                {
227
+                    ret = false;
228
+                } 
229
+            }
230
+
231
+            if (m_sem->curCnt && ret)
232
+            {
233
+                m_sem->curCnt--;
234
+            }
235
+        }
236
+        else
237
+        {
238
+            if (0 == time_out)
239
+            {
240
+                if (m_sem->curCnt)
241
+                {
242
+                    m_sem->curCnt--;
243
+                }
244
+                else
245
+                {
246
+                    ret = false;
247
+                }
248
+            }
249
+            else
250
+            {
251
+                if (!m_sem->curCnt)
252
+                {
253
+                    struct timespec ts;
254
+                    ts.tv_sec = time_out / 1000L;
255
+                    ts.tv_nsec = (time_out * 1000000L) - ts.tv_sec * 1000 * 1000 * 1000;
256
+
257
+                    if (pthread_cond_timedwait(&m_sem->cond, &m_sem->mutex, &ts))
258
+                    {
259
+                        ret = false;
260
+                    }
261
+                }
262
+
263
+                if (m_sem->curCnt && ret)
264
+                {
265
+                    m_sem->curCnt--;
266
+                }
267
+            }
268
+        }
269
+
270
+        if (pthread_mutex_unlock(&m_sem->mutex))
271
+        {
272
+            return false;
273
+        }
274
+
275
+        return ret;
276
+#else //__APPLE__
277
+        if (TIMEOUT_INFINITE == time_out) 
278
+        {
279
+            return 0 == sem_wait(m_sem);
280
+        }
281
+        else 
282
+        {
283
+            if (0 == time_out)
284
+            {
285
+                return 0 == sem_trywait(m_sem);
286
+            }
287
+            else
288
+            {
289
+                struct timespec ts;
290
+                ts.tv_sec = time_out / 1000L;
291
+                ts.tv_nsec = (time_out * 1000000L) - ts.tv_sec * 1000 * 1000 * 1000;
292
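+                // note: sem_timedwait() expects an absolute CLOCK_REALTIME
+                // deadline, while 'ts' here is a relative duration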
+                return 0 == sem_timedwait(m_sem, &ts);
293
+            }
294
+        }
295
+#endif //_APPLE__
296
+    }
297
+
298
+    void release()
299
+    {
300
+        if (m_sem)
301
+        {
302
+#ifdef __APPLE__
303
+            pthread_condattr_destroy(&m_sem->condAttr);
304
+            pthread_mutexattr_destroy(&m_sem->mutexAttr);
305
+            pthread_mutex_destroy(&m_sem->mutex);
306
+            pthread_cond_destroy(&m_sem->cond);
307
+            free(m_sem);
308
+            m_sem = NULL;
309
+#else //__APPLE__
310
+            sem_close(m_sem);
311
+            sem_unlink(m_name);
312
+            m_sem = NULL;
313
+            free(m_name);
314
+            m_name = NULL;
315
+#endif //__APPLE__
316
+        }
317
+    }
318
+
319
+private:
320
+#ifdef __APPLE__
321
+    typedef struct
322
+    {
323
+        pthread_mutex_t     mutex;
324
+        pthread_cond_t      cond;
325
+        pthread_mutexattr_t mutexAttr;
326
+        pthread_condattr_t  condAttr;
327
+        uint32_t            curCnt;
328
+        uint32_t            maxCnt;
329
+    }mac_sem_t;
330
+    mac_sem_t *m_sem;
331
+#else // __APPLE__
332
+    sem_t *m_sem;
333
+    char  *m_name;
334
+#endif // __APPLE_
335
+};
336
+
337
 #endif // ifdef _WIN32
338
 
339
 class ScopedLock
340
x265_3.5.tar.gz/source/common/threadpool.cpp -> x265_3.6.tar.gz/source/common/threadpool.cpp Changed
10
 
1
@@ -301,7 +301,7 @@
2
     /* limit threads based on param->numaPools
3
      * For windows because threads can't be allocated to live across sockets
4
      * changing the default behavior to be per-socket pools -- FIXME */
5
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
7
     if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
8
     {
9
          char poolString[50] = "";
10
x265_3.5.tar.gz/source/common/version.cpp -> x265_3.6.tar.gz/source/common/version.cpp Changed
10
 
1
@@ -71,7 +71,7 @@
2
 #define ONOS    "Unk-OS"
3
 #endif
4
 
5
-#if X86_64
6
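+// _LP64/_WIN64 cover 64-bit targets generically (including aarch64), where
+// the old X86_64 test mislabelled ARM64 builds as "32 bit"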
+#if defined(_LP64) || defined(_WIN64)
7
 #define BITS    "64 bit"
8
 #else
9
 #define BITS    "32 bit"
10
x265_3.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/x86/asm-primitives.cpp Changed
85
 
1
@@ -1091,6 +1091,7 @@
2
 
3
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
4
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
5
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
6
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it 
7
         //p.planecopy_sp = PFX(downShift_16_sse2);
8
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
9
@@ -1121,6 +1122,7 @@
10
     {
11
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
12
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
13
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
14
 
15
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
16
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
17
@@ -1462,6 +1464,7 @@
18
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
19
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
20
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
21
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
22
     }
23
     if (cpuMask & X265_CPU_XOP)
24
     {
25
@@ -1473,6 +1476,7 @@
26
         LUMA_VAR(xop);
27
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
28
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
29
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
30
     }
31
     if (cpuMask & X265_CPU_AVX2)
32
     {
33
@@ -2301,6 +2305,9 @@
34
 
35
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
36
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
37
+
38
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
39
+
40
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
41
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
42
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
43
@@ -3300,6 +3307,7 @@
44
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
45
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
46
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
47
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
48
 
49
         ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
50
         ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
51
@@ -3424,6 +3432,8 @@
52
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
53
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
54
 
55
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
56
+
57
         ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
58
         ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
59
         ASSIGN2(p.pu[LUMA_8x16].convert_p2s, filterPixelToShort_8x16_ssse3);
60
@@ -3691,6 +3701,7 @@
61
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
62
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
63
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
64
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
65
     }
66
     if (cpuMask & X265_CPU_XOP)
67
     {
68
@@ -3702,6 +3713,7 @@
69
         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
70
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
71
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
72
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
73
 
74
     }
75
 #if X86_64
76
@@ -4684,6 +4696,8 @@
77
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
78
         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
79
 
80
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
81
+
82
         if (cpuMask & X265_CPU_BMI2)
83
         {
84
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
85
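frameSubSampleLuma is a new entry in the EncoderPrimitives function-pointer table; each cpuMask tier above overwrites it with the widest implementation the host supports (SSE2 → SSSE3 → AVX → XOP → AVX2). A reduced sketch of that layering pattern, with stand-in flag values and type names rather than the real x265 ones:

    #include <cstdint>

    typedef void (*subsample_t)(const uint8_t* src, uint8_t* dst,
                                intptr_t srcStride, intptr_t dstStride, int w, int h);

    struct Primitives { subsample_t frameSubSampleLuma; };

    // Later, wider ISA levels overwrite earlier ones, exactly the
    // layering used by the setup functions above (illustrative only).
    void setupSubsample(Primitives& p, uint32_t cpuMask,
                        subsample_t c, subsample_t sse2, subsample_t avx2)
    {
        enum { CPU_SSE2 = 1 << 0, CPU_AVX2 = 1 << 1 };  // stand-in flags
        p.frameSubSampleLuma = c;                        // portable fallback
        if (cpuMask & CPU_SSE2) p.frameSubSampleLuma = sse2;
        if (cpuMask & CPU_AVX2) p.frameSubSampleLuma = avx2;
    }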
x265_3.5.tar.gz/source/common/x86/const-a.asm -> x265_3.6.tar.gz/source/common/x86/const-a.asm Changed
10
 
1
@@ -100,7 +100,7 @@
2
 const pw_2000,              times 16 dw 0x2000
3
 const pw_8000,              times  8 dw 0x8000
4
 const pw_3fff,              times 16 dw 0x3fff
5
-const pw_32_0,              times  4 dw 32,
6
+const pw_32_0,              times  4 dw 32
7
                             times  4 dw 0
8
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
9
 
10
x265_3.5.tar.gz/source/common/x86/h-ipfilter8.asm -> x265_3.6.tar.gz/source/common/x86/h-ipfilter8.asm Changed
20
 
1
@@ -125,6 +125,9 @@
2
 ALIGN 32
3
 interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
4
 
5
+ALIGN 32
6
+const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
7
+
8
 SECTION .text
9
 
10
 cextern pw_1
11
@@ -1459,8 +1462,6 @@
12
 
13
     RET
14
 
15
-ALIGN 32
16
-const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
17
 
18
 %macro FILTER_H4_w6 3
19
     movu        %1, [srcq - 1]
20
x265_3.5.tar.gz/source/common/x86/mc-a2.asm -> x265_3.6.tar.gz/source/common/x86/mc-a2.asm Changed
264
 
1
@@ -992,6 +992,262 @@
2
 FRAME_INIT_LOWRES
3
 %endif
4
 
5
+%macro SUBSAMPLEFILT8x4 7
6
+    mova      %3, [r0+%7]
7
+    mova      %4, [r0+r2+%7]
8
+    pavgb     %3, %4
9
+    pavgb     %4, [r0+r2*2+%7]
10
+    PALIGNR   %1, %3, 1, m6
11
+    PALIGNR   %2, %4, 1, m6
12
+%if cpuflag(xop)
13
+    pavgb     %1, %3
14
+    pavgb     %2, %4
15
+%else
16
+    pavgb     %1, %3
17
+    pavgb     %2, %4
18
+    psrlw     %5, %1, 8
19
+    psrlw     %6, %2, 8
20
+    pand      %1, m7
21
+    pand      %2, m7
22
+%endif
23
+%endmacro
24
+
25
+%macro SUBSAMPLEFILT32x4U 1
26
+    movu      m1, [r0+r2]
27
+    pavgb     m0, m1, [r0]
28
+    movu      m3, [r0+r2+1]
29
+    pavgb     m2, m3, [r0+1]
30
+    pavgb     m1, [r0+r2*2]
31
+    pavgb     m3, [r0+r2*2+1]
32
+    pavgb     m0, m2
33
+    pavgb     m1, m3
34
+
35
+    movu      m3, [r0+r2+mmsize]
36
+    pavgb     m2, m3, [r0+mmsize]
37
+    movu      m5, [r0+r2+1+mmsize]
38
+    pavgb     m4, m5, [r0+1+mmsize]
39
+    pavgb     m2, m4
40
+
41
+    pshufb    m0, m7
42
+    pshufb    m2, m7
43
+    punpcklqdq m0, m0, m2
44
+    vpermq    m0, m0, q3120
45
+    movu    [%1], m0
46
+%endmacro
47
+
48
+%macro SUBSAMPLEFILT16x2 3
49
+    mova      m3, [r0+%3+mmsize]
50
+    mova      m2, [r0+%3]
51
+    pavgb     m3, [r0+%3+r2+mmsize]
52
+    pavgb     m2, [r0+%3+r2]
53
+    PALIGNR   %1, m3, 1, m6
54
+    pavgb     %1, m3
55
+    PALIGNR   m3, m2, 1, m6
56
+    pavgb     m3, m2
57
+%if cpuflag(xop)
58
+    vpperm    m3, m3, %1, m6
59
+%else
60
+    pand      m3, m7
61
+    pand      %1, m7
62
+    packuswb  m3, %1
63
+%endif
64
+    mova    [%2], m3
65
+    mova      %1, m2
66
+%endmacro
67
+
68
+%macro SUBSAMPLEFILT8x2U 2
69
+    mova      m2, [r0+%2]
70
+    pavgb     m2, [r0+%2+r2]
71
+    mova      m0, [r0+%2+1]
72
+    pavgb     m0, [r0+%2+r2+1]
73
+    pavgb     m1, m3
74
+    pavgb     m0, m2
75
+    pand      m1, m7
76
+    pand      m0, m7
77
+    packuswb  m0, m1
78
+    mova    [%1], m0
79
+%endmacro
80
+
81
+%macro SUBSAMPLEFILT8xU 2
82
+    mova      m3, [r0+%2+8]
83
+    mova      m2, [r0+%2]
84
+    pavgw     m3, [r0+%2+r2+8]
85
+    pavgw     m2, [r0+%2+r2]
86
+    movu      m1, [r0+%2+10]
87
+    movu      m0, [r0+%2+2]
88
+    pavgw     m1, [r0+%2+r2+10]
89
+    pavgw     m0, [r0+%2+r2+2]
90
+    pavgw     m1, m3
91
+    pavgw     m0, m2
92
+    psrld     m3, m1, 16
93
+    pand      m1, m7
94
+    pand      m0, m7
95
+    packssdw  m0, m1
96
+    movu    [%1], m0
97
+%endmacro
98
+
99
+%macro SUBSAMPLEFILT8xA 3
100
+    movu      m3, [r0+%3+mmsize]
101
+    movu      m2, [r0+%3]
102
+    pavgw     m3, [r0+%3+r2+mmsize]
103
+    pavgw     m2, [r0+%3+r2]
104
+    PALIGNR   %1, m3, 2, m6
105
+    pavgw     %1, m3
106
+    PALIGNR   m3, m2, 2, m6
107
+    pavgw     m3, m2
108
+%if cpuflag(xop)
109
+    vpperm    m3, m3, %1, m6
110
+%else
111
+    pand      m3, m7
112
+    pand      %1, m7
113
+    packssdw  m3, %1
114
+%endif
115
+%if cpuflag(avx2)
116
+    vpermq     m3, m3, q3120
117
+%endif
118
+    movu    [%2], m3
119
+    movu      %1, m2
120
+%endmacro
121
+
122
+;-----------------------------------------------------------------------------
123
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
124
+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
125
+;-----------------------------------------------------------------------------
126
+
127
+%macro FRAME_SUBSAMPLE_LUMA 0
128
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
129
+%if HIGH_BIT_DEPTH
130
+    shl   dword r3m, 1
131
+    FIX_STRIDES r2
132
+    shl   dword r4m, 1
133
+%endif
134
+%if mmsize >= 16
135
+    add   dword r4m, mmsize-1
136
+    and   dword r4m, ~(mmsize-1)
137
+%endif
138
+    ; src += 2*(height-1)*stride + 2*width
139
+    mov      r6d, r5m
140
+    dec      r6d
141
+    imul     r6d, r2d
142
+    add      r6d, r4m
143
+    lea       r0, [r0+r6*2]
144
+    ; dst += (height-1)*stride + width
145
+    mov      r6d, r5m
146
+    dec      r6d
147
+    imul     r6d, r3m
148
+    add      r6d, r4m
149
+    add       r1, r6
150
+    ; gap = stride - width
151
+    mov      r6d, r3m
152
+    sub      r6d, r4m
153
+    PUSH      r6
154
+    %define dst_gap [rsp+gprsize]
155
+    mov      r6d, r2d
156
+    sub      r6d, r4m
157
+    shl      r6d, 1
158
+    PUSH      r6
159
+    %define src_gap [rsp]
160
+%if HIGH_BIT_DEPTH
161
+%if cpuflag(xop)
162
+    mova      m6, [deinterleave_shuf32a]
163
+    mova      m7, [deinterleave_shuf32b]
164
+%else
165
+    pcmpeqw   m7, m7
166
+    psrld     m7, 16
167
+%endif
168
+.vloop:
169
+    mov      r6d, r4m
170
+%ifnidn cpuname, mmx2
171
+    movu      m0, [r0]
172
+    movu      m1, [r0+r2]
173
+    pavgw     m0, m1
174
+    pavgw     m1, [r0+r2*2]
175
+%endif
176
+.hloop:
177
+    sub       r0, mmsize*2
178
+    sub       r1, mmsize
179
+%ifidn cpuname, mmx2
180
+    SUBSAMPLEFILT8xU r1, 0
181
+%else
182
+    SUBSAMPLEFILT8xA m0, r1, 0
183
+%endif
184
+    sub      r6d, mmsize
185
+    jg .hloop
186
+%else ; !HIGH_BIT_DEPTH
187
+%if cpuflag(avx2)
188
+    mova      m7, [deinterleave_shuf]
189
+%elif cpuflag(xop)
190
+    mova      m6, [deinterleave_shuf32a]
191
+    mova      m7, [deinterleave_shuf32b]
192
+%else
193
+    pcmpeqb   m7, m7
194
+    psrlw     m7, 8
195
+%endif
196
+.vloop:
197
+    mov      r6d, r4m
198
+%ifnidn cpuname, mmx2
199
+%if mmsize <= 16
200
+    mova      m0, [r0]
201
+    mova      m1, [r0+r2]
202
+    pavgb     m0, m1
203
+    pavgb     m1, [r0+r2*2]
204
+%endif
205
+%endif
206
+.hloop:
207
+    sub       r0, mmsize*2
208
+    sub       r1, mmsize
209
+%if mmsize==32
210
+    SUBSAMPLEFILT32x4U r1
211
+%elifdef m8
212
+    SUBSAMPLEFILT8x4   m0, m1, m2, m3, m10, m11, mmsize
213
+    mova      m8, m0
214
+    mova      m9, m1
215
+    SUBSAMPLEFILT8x4   m2, m3, m0, m1, m4, m5, 0
216
+%if cpuflag(xop)
217
+    vpperm    m4, m2, m8, m7
218
+    vpperm    m2, m2, m8, m6
219
+%else
220
+    packuswb  m2, m8
221
+%endif
222
+    mova    [r1], m2
223
+%elifidn cpuname, mmx2
224
+    SUBSAMPLEFILT8x2U  r1, 0
225
+%else
226
+    SUBSAMPLEFILT16x2  m0, r1, 0
227
+%endif
228
+    sub      r6d, mmsize
229
+    jg .hloop
230
+%endif ; HIGH_BIT_DEPTH
231
+.skip:
232
+    mov       r3, dst_gap
233
+    sub       r0, src_gap
234
+    sub       r1, r3
235
+    dec    dword r5m
236
+    jg .vloop
237
+    ADD      rsp, 2*gprsize
238
+    emms
239
+    RET
240
+%endmacro ; FRAME_SUBSAMPLE_LUMA
241
+
242
+INIT_MMX mmx2
243
+FRAME_SUBSAMPLE_LUMA
244
+%if ARCH_X86_64 == 0
245
+INIT_MMX cache32, mmx2
246
+FRAME_SUBSAMPLE_LUMA
247
+%endif
248
+INIT_XMM sse2
249
+FRAME_SUBSAMPLE_LUMA
250
+INIT_XMM ssse3
251
+FRAME_SUBSAMPLE_LUMA
252
+INIT_XMM avx
253
+FRAME_SUBSAMPLE_LUMA
254
+INIT_XMM xop
255
+FRAME_SUBSAMPLE_LUMA
256
+%if ARCH_X86_64 == 1
257
+INIT_YMM avx2
258
+FRAME_SUBSAMPLE_LUMA
259
+%endif
260
+
261
 ;-----------------------------------------------------------------------------
262
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
263
 ;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
264
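The FRAME_SUBSAMPLE_LUMA macro added above halves a luma plane in both dimensions for the MCSTF lookahead; each output pixel is built from two pavgb/pavgw rounding averages (vertical, then horizontal), not an exact four-tap mean. A plain-C sketch of the 8-bit semantics; the function name is mine, not the PFX symbol:

    #include <stdint.h>
    #include <stddef.h>

    /* Rounding halve, the same operation pavgb performs. */
    static inline uint8_t avg_round(uint8_t a, uint8_t b)
    { return (uint8_t)((a + b + 1) >> 1); }

    static void subsample_luma_ref(const uint8_t* src, uint8_t* dst,
                                   ptrdiff_t srcStride, ptrdiff_t dstStride,
                                   int dstWidth, int dstHeight)
    {
        for (int y = 0; y < dstHeight; y++, src += 2 * srcStride, dst += dstStride)
            for (int x = 0; x < dstWidth; x++)
            {
                const uint8_t* s = src + 2 * x;
                uint8_t c0 = avg_round(s[0], s[srcStride]);     /* vertical average, column 0 */
                uint8_t c1 = avg_round(s[1], s[srcStride + 1]); /* vertical average, column 1 */
                dst[x] = avg_round(c0, c1);                     /* horizontal average */
            }
    }

The double rounding can differ from a true (p00+p01+p10+p11+2)>>2 by one code value, which is acceptable for a lookahead-only downscale.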
x265_3.5.tar.gz/source/common/x86/mc.h -> x265_3.6.tar.gz/source/common/x86/mc.h Changed
19
 
1
@@ -36,6 +36,17 @@
2
 
3
 #undef LOWRES
4
 
5
+#define SUBSAMPLELUMA(cpu) \
6
+    void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
+SUBSAMPLELUMA(mmx2)
8
+SUBSAMPLELUMA(sse2)
9
+SUBSAMPLELUMA(ssse3)
10
+SUBSAMPLELUMA(avx)
11
+SUBSAMPLELUMA(avx2)
12
+SUBSAMPLELUMA(xop)
13
+
14
+#undef SUBSAMPLELUMA
15
+
16
 #define PROPAGATE_COST(cpu) \
17
     void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
18
                                               const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
19
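The SUBSAMPLELUMA(cpu) block above stamps out one prototype per ISA via token pasting, then undefines the macro. The same pattern, reduced to a standalone illustration (my_kernel is a hypothetical name):

    #define DECLARE_FN(cpu) void my_kernel_##cpu(int n);
    DECLARE_FN(sse2)   // expands to: void my_kernel_sse2(int n);
    DECLARE_FN(avx2)   // expands to: void my_kernel_avx2(int n);
    #undef DECLARE_FN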
x265_3.5.tar.gz/source/common/x86/x86inc.asm -> x265_3.6.tar.gz/source/common/x86/x86inc.asm Changed
96
 
1
@@ -401,16 +401,6 @@
2
     %endif
3
 %endmacro
4
 
5
-%macro DEFINE_ARGS_INTERNAL 3+
6
-    %ifnum %2
7
-        DEFINE_ARGS %3
8
-    %elif %1 == 4
9
-        DEFINE_ARGS %2
10
-    %elif %1 > 4
11
-        DEFINE_ARGS %2, %3
12
-    %endif
13
-%endmacro
14
-
15
 %if WIN64 ; Windows x64 ;=================================================
16
 
17
 DECLARE_REG 0,  rcx
18
@@ -429,7 +419,7 @@
19
 DECLARE_REG 13, R12, 112
20
 DECLARE_REG 14, R13, 120
21
 
22
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
23
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
24
     %assign num_args %1
25
     %assign regs_used %2
26
     ASSERT regs_used >= num_args
27
@@ -441,7 +431,15 @@
28
         WIN64_SPILL_XMM %3
29
     %endif
30
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
31
-    DEFINE_ARGS_INTERNAL %0, %4, %5
32
+    %if %0 > 4
33
+         %ifnum %4
34
+             DEFINE_ARGS %5
35
+         %else
36
+             DEFINE_ARGS %4, %5
37
+         %endif
38
+     %elifnnum %4
39
+         DEFINE_ARGS %4
40
+     %endif
41
 %endmacro
42
 
43
 %macro WIN64_PUSH_XMM 0
44
@@ -537,7 +535,7 @@
45
 DECLARE_REG 13, R12, 64
46
 DECLARE_REG 14, R13, 72
47
 
48
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
49
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
50
     %assign num_args %1
51
     %assign regs_used %2
52
     %assign xmm_regs_used %3
53
@@ -547,7 +545,15 @@
54
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
55
     ALLOC_STACK %4
56
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
57
-    DEFINE_ARGS_INTERNAL %0, %4, %5
58
+    %if %0 > 4
59
+         %ifnum %4
60
+             DEFINE_ARGS %5
61
+         %else
62
+             DEFINE_ARGS %4, %5
63
+         %endif
64
+     %elifnnum %4
65
+         DEFINE_ARGS %4
66
+     %endif
67
 %endmacro
68
 
69
 %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
70
@@ -588,7 +594,7 @@
71
 
72
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
73
 
74
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
75
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
76
     %assign num_args %1
77
     %assign regs_used %2
78
     ASSERT regs_used >= num_args
79
@@ -603,7 +609,15 @@
80
     PUSH_IF_USED 3, 4, 5, 6
81
     ALLOC_STACK %4
82
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
83
-    DEFINE_ARGS_INTERNAL %0, %4, %5
84
+    %if %0 > 4
85
+         %ifnum %4
86
+             DEFINE_ARGS %5
87
+         %else
88
+             DEFINE_ARGS %4, %5
89
+         %endif
90
+     %elifnnum %4
91
+         DEFINE_ARGS %4
92
+     %endif
93
 %endmacro
94
 
95
 %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
96
x265_3.5.tar.gz/source/common/x86/x86util.asm -> x265_3.6.tar.gz/source/common/x86/x86util.asm Changed
13
 
1
@@ -578,8 +578,10 @@
2
     %elif %1==2
3
         %if mmsize==8
4
             SBUTTERFLY dq, %3, %4, %5
5
-        %else
6
+        %elif %0==6
7
             TRANS q, ORDER, %3, %4, %5, %6
8
+        %else
9
+            TRANS q, ORDER, %3, %4, %5
10
         %endif
11
     %elif %1==4
12
         SBUTTERFLY qdq, %3, %4, %5
13
x265_3.5.tar.gz/source/encoder/analysis.cpp -> x265_3.6.tar.gz/source/encoder/analysis.cpp Changed
10
 
1
@@ -3645,7 +3645,7 @@
2
             qp += distortionData->offset[ctu.m_cuAddr];
3
     }
4
 
5
-    if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
6
+    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
7
     {
8
         int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
9
         if (ctu.m_slice->m_sliceType == I_SLICE)
10
x265_3.5.tar.gz/source/encoder/api.cpp -> x265_3.6.tar.gz/source/encoder/api.cpp Changed
50
 
1
@@ -208,7 +208,6 @@
2
     memcpy(zoneParam, param, sizeof(x265_param));
3
     for (int i = 0; i < param->rc.zonefileCount; i++)
4
     {
5
-        param->rc.zones[i].startFrame = -1;
6
         encoder->configureZone(zoneParam, param->rc.zones[i].zoneParam);
7
     }
8
 
9
@@ -608,6 +607,14 @@
10
     if (numEncoded < 0)
11
         encoder->m_aborted = true;
12
 
13
+    if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
14
+    {
15
+        Bitstream bs;
16
+        encoder->getEndNalUnits(encoder->m_nalList, bs);
17
+        *pp_nal = &encoder->m_nalList.m_nal[0];
18
+        if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
19
+    }
20
+
21
     return numEncoded;
22
 }
23
 
24
@@ -1042,6 +1049,7 @@
25
     &PARAM_NS::x265_param_free,
26
     &PARAM_NS::x265_param_default,
27
     &PARAM_NS::x265_param_parse,
28
+    &PARAM_NS::x265_scenecut_aware_qp_param_parse,
29
     &PARAM_NS::x265_param_apply_profile,
30
     &PARAM_NS::x265_param_default_preset,
31
     &x265_picture_alloc,
32
@@ -1288,6 +1296,8 @@
33
             if (param->csvLogLevel)
34
             {
35
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
36
+                if (!!param->bEnableTemporalSubLayers)
37
+                    fprintf(csvfp, "Temporal Sub Layer ID, ");
38
                 if (param->csvLogLevel >= 2)
39
                     fprintf(csvfp, "I/P cost ratio, ");
40
                 if (param->rc.rateControlMode == X265_RC_CRF)
41
@@ -1401,6 +1411,8 @@
42
     const x265_frame_stats* frameStats = &pic->frameData;
43
     fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
44
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
45
+    if (!!param->bEnableTemporalSubLayers)
46
+        fprintf(param->csvfpt, "%d,", frameStats->tLayer);
47
     if (param->csvLogLevel >= 2)
48
         fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
49
     if (param->rc.rateControlMode == X265_RC_CRF)
50
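With the api.cpp change above, once the encoder is fully drained (no delayed pictures, nothing encoded) and end-of-sequence/end-of-bitstream NALs were requested, x265_encoder_encode() hands the trailing EOS/EOB units back through pp_nal even on a zero return. A hedged sketch of the flush loop a caller would use against the public API:

    #include <x265.h>

    /* Drain the encoder at end of input; enc is an initialized x265_encoder*.
     * Keep calling with pic_in == NULL: positive returns carry frames, and
     * the final call (return 0) can still surface trailing EOS/EOB NALs. */
    static void flush_encoder(x265_encoder* enc)
    {
        x265_nal* nal = NULL;
        uint32_t count = 0;
        int ret;
        do {
            ret = x265_encoder_encode(enc, &nal, &count, NULL, NULL);
            for (uint32_t i = 0; i < count; i++)
            {
                /* write nal[i].payload, nal[i].sizeBytes to the output */
            }
        } while (ret > 0);
    }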
x265_3.5.tar.gz/source/encoder/dpb.cpp -> x265_3.6.tar.gz/source/encoder/dpb.cpp Changed
258
 
1
@@ -70,10 +70,18 @@
2
     {
3
         Frame *curFrame = iterFrame;
4
         iterFrame = iterFrame->m_next;
5
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
6
+        bool isMCSTFReferenced = false;
7
+
8
+        if (curFrame->m_param->bEnableTemporalFilter)
9
+            isMCSTFReferenced = !!(curFrame->m_refPicCnt[1]);
10
+
11
+        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
12
         {
13
             curFrame->m_bChromaExtended = false;
14
 
15
+            if (curFrame->m_param->bEnableTemporalFilter)
16
+                *curFrame->m_isSubSampled = false;
17
+
18
             // Reset column counter
19
             X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
20
             X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
21
@@ -142,12 +150,13 @@
22
     {
23
         newFrame->m_encData->m_bHasReferences = false;
24
 
25
+        newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
26
         // Adjust NAL type for unreferenced B frames (change from _R "referenced"
27
         // to _N "non-referenced" NAL unit type)
28
         switch (slice->m_nalUnitType)
29
         {
30
         case NAL_UNIT_CODED_SLICE_TRAIL_R:
31
-            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
32
+            slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
33
             break;
34
         case NAL_UNIT_CODED_SLICE_RADL_R:
35
             slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
36
@@ -168,13 +177,94 @@
37
 
38
     m_picList.pushFront(*newFrame);
39
 
40
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
41
+    {
42
+        switch (slice->m_nalUnitType)
43
+        {
44
+        case NAL_UNIT_CODED_SLICE_TRAIL_R:
45
+            slice->m_nalUnitType =  NAL_UNIT_CODED_SLICE_TRAIL_N;
46
+            break;
47
+        case NAL_UNIT_CODED_SLICE_RADL_R:
48
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
49
+            break;
50
+        case NAL_UNIT_CODED_SLICE_RASL_R:
51
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
52
+            break;
53
+        default:
54
+            break;
55
+        }
56
+    }
57
     // Do decoding refresh marking if any
58
     decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
59
 
60
-    computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
61
-
62
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
63
+    bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
64
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
65
-    applyReferencePictureSet(&slice->m_rps, pocCurr);
66
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
67
+
68
+
69
+    if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
70
+        && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N     // Check if not a leading picture
71
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
72
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
73
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
74
+        )
75
+    {
76
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
77
+        {
78
+            if (getTemporalLayerNonReferenceFlag())
79
+            {
80
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
81
+            }
82
+            else
83
+            {
84
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
85
+            }
86
+        }
87
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
88
+        {
89
+            bool isSTSA = true;
90
+            int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
91
+            for (int ii = id; (ii < x265_gop_ra_length[newFrame->m_gopId] && isSTSA == true); ii++)
92
+            {
93
+                int tempIdRef = x265_gop_ra[newFrame->m_gopId][ii].layer;
94
+                if (tempIdRef == newFrame->m_tempLayer)
95
+                {
96
+                    for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
97
+                    {
98
+                        if (slice->m_rps.bUsed[jj])
99
+                        {
100
+                            int refPoc = x265_gop_ra[newFrame->m_gopId][ii].poc_offset + slice->m_rps.deltaPOC[jj];
101
+                            int kk = 0;
102
+                            for (kk = 0; kk < x265_gop_ra_length[newFrame->m_gopId]; kk++)
103
+                            {
104
+                                if (x265_gop_ra[newFrame->m_gopId][kk].poc_offset == refPoc)
105
+                                {
106
+                                    break;
107
+                                }
108
+                            }
109
+                            if (x265_gop_ra[newFrame->m_gopId][kk].layer >= newFrame->m_tempLayer)
110
+                            {
111
+                                isSTSA = false;
112
+                                break;
113
+                            }
114
+                        }
115
+                    }
116
+                }
117
+            }
118
+            if (isSTSA == true)
119
+            {
120
+                if (getTemporalLayerNonReferenceFlag())
121
+                {
122
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
123
+                }
124
+                else
125
+                {
126
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
127
+                }
128
+            }
129
+        }
130
+    }
131
 
132
     if (slice->m_sliceType != I_SLICE)
133
         slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
134
@@ -218,7 +308,7 @@
135
     }
136
 }
137
 
138
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
139
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
140
 {
141
     unsigned int poci = 0, numNeg = 0, numPos = 0;
142
 
143
@@ -228,7 +318,7 @@
144
     {
145
         if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
146
         {
147
-            if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
148
+            if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
149
             {
150
                     rps->poc[poci] = iterPic->m_poc;
151
                     rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
152
@@ -247,6 +337,18 @@
153
     rps->sortDeltaPOC();
154
 }
155
 
156
+bool DPB::getTemporalLayerNonReferenceFlag()
157
+{
158
+    Frame* curFrame = m_picList.first();
159
+    if (curFrame->m_encData->m_bHasReferences)
160
+    {
161
+        curFrame->m_sameLayerRefPic = true;
162
+        return false;
163
+    }
164
+    else
165
+        return true;
166
+}
167
+
168
 /* Marking reference pictures when an IDR/CRA is encountered. */
169
 void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
170
 {
171
@@ -296,7 +398,7 @@
172
 }
173
 
174
 /** Function for applying picture marking based on the Reference Picture Set */
175
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
176
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
177
 {
178
     // loop through all pictures in the reference picture buffer
179
     Frame* iterFrame = m_picList.first();
180
@@ -317,9 +419,68 @@
181
             }
182
             if (!referenced)
183
                 iterFrame->m_encData->m_bHasReferences = false;
184
+
185
+            if (m_bTemporalSublayer)
186
+            {
187
+                //check that pictures of higher temporal layers are not used
188
+                assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
189
+
190
+                //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
191
+                if (isTSAPicture)
192
+                {
193
+                    assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
194
+                }
195
+                //check that pictures marked as temporal layer non-reference pictures are not used for reference
196
+                if (iterFrame->m_tempLayer == tempId)
197
+                {
198
+                    assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
199
+                }
200
+            }
201
+        }
202
+        iterFrame = iterFrame->m_next;
203
+    }
204
+}
205
+
206
+bool DPB::isTemporalLayerSwitchingPoint(int curPoc, int tempId)
207
+{
208
+    // loop through all pictures in the reference picture buffer
209
+    Frame* iterFrame = m_picList.first();
210
+    while (iterFrame)
211
+    {
212
+        if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences)
213
+        {
214
+            if (iterFrame->m_tempLayer >= tempId)
215
+            {
216
+                return false;
217
+            }
218
+        }
219
+        iterFrame = iterFrame->m_next;
220
+    }
221
+    return true;
222
+}
223
+
224
+bool DPB::isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId)
225
+{
226
+    // loop through all pictures in the reference picture buffer
227
+    Frame* iterFrame = m_picList.first();
228
+    while (iterFrame)
229
+    {
230
+        if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences)
231
+        {
232
+            for (int i = 0; i < rps->numberOfPositivePictures + rps->numberOfNegativePictures; i++)
233
+            {
234
+                if ((iterFrame->m_poc == curPoc + rps->deltaPOC[i]) && rps->bUsed[i])
235
+                {
236
+                    if (iterFrame->m_tempLayer >= tempId)
237
+                    {
238
+                        return false;
239
+                    }
240
+                }
241
+            }
242
         }
243
         iterFrame = iterFrame->m_next;
244
     }
245
+    return true;
246
 }
247
 
248
 /* deciding the nal_unit_type */
249
@@ -328,7 +489,7 @@
250
     if (!curPOC)
251
         return NAL_UNIT_CODED_SLICE_IDR_N_LP;
252
     if (bIsKeyFrame)
253
-        return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
254
+        return (m_bOpenGOP || m_craNal) ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
255
     if (m_pocCRA && curPOC < m_pocCRA)
256
         // All leading pictures are being marked as TFD pictures here since
257
         // current encoder uses all reference pictures while encoding leading
258
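The new DPB logic marks a frame TSA when no frame of the same or higher temporal layer is still referenced, letting a decoder switch up to a higher layer at that point; STSA is the weaker, step-wise variant checked against the RPS. A compact model of the TSA test, mirroring isTemporalLayerSwitchingPoint above with simplified stand-in types:

    #include <vector>

    struct RefFrame { int poc; int tempLayer; bool hasReferences; };

    // A picture at temporal layer `tempId` is a valid switching point
    // when nothing at layer >= tempId is still held for reference.
    bool isSwitchingPoint(const std::vector<RefFrame>& dpb, int curPoc, int tempId)
    {
        for (const RefFrame& f : dpb)
            if (f.poc != curPoc && f.hasReferences && f.tempLayer >= tempId)
                return false;
        return true;
    }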
x265_3.5.tar.gz/source/encoder/dpb.h -> x265_3.6.tar.gz/source/encoder/dpb.h Changed
35
 
1
@@ -40,6 +40,7 @@
2
     int                m_lastIDR;
3
     int                m_pocCRA;
4
     int                m_bOpenGOP;
5
+    int                m_craNal;
6
     int                m_bhasLeadingPicture;
7
     bool               m_bRefreshPending;
8
     bool               m_bTemporalSublayer;
9
@@ -66,7 +67,8 @@
10
         m_bRefreshPending = false;
11
         m_frameDataFreeList = NULL;
12
         m_bOpenGOP = param->bOpenGOP;
13
-        m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
14
+        m_craNal = param->craNal;
15
+        m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
16
     }
17
 
18
     ~DPB();
19
@@ -77,10 +79,13 @@
20
 
21
 protected:
22
 
23
-    void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
24
+    void computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
25
 
26
-    void applyReferencePictureSet(RPS *rps, int curPoc);
27
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
28
+    bool getTemporalLayerNonReferenceFlag();
29
     void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
30
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
31
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
32
 
33
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
34
 };
35
x265_3.5.tar.gz/source/encoder/encoder.cpp -> x265_3.6.tar.gz/source/encoder/encoder.cpp Changed
1237
 
1
@@ -72,7 +72,40 @@
2
 {
3
     { 1, 1, 1, 1, 1, 5, 1,  2, 2, 2, 50 },
4
     { 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
5
-    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 }
6
+    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 },
7
+    { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
8
+};
9
+
10
+typedef struct
11
+{
12
+    int bEnableVideoSignalTypePresentFlag;
13
+    int bEnableColorDescriptionPresentFlag;
14
+    int bEnableChromaLocInfoPresentFlag;
15
+    int colorPrimaries;
16
+    int transferCharacteristics;
17
+    int matrixCoeffs;
18
+    int bEnableVideoFullRangeFlag;
19
+    int chromaSampleLocTypeTopField;
20
+    int chromaSampleLocTypeBottomField;
21
+    const char* systemId;
22
+}VideoSignalTypePresets;
23
+
24
+VideoSignalTypePresets vstPresets[] =
25
+{
26
+    {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
27
+    {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
28
+    {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
29
+    {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
30
+    {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
31
+    {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
32
+    {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
33
+    {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
34
+    {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
35
+    {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
36
+    {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
37
+    {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
38
+    {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
39
+    {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
40
 };
41
 }
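vstPresets pairs a --video-signal-type-preset system ID with the VUI colour fields it implies; selection is presumably a simple string match over the table. A hedged sketch of that lookup, with the struct trimmed to three fields and my own names (kPresets, findPreset):

    #include <cstring>

    struct VstPreset { int colorPrimaries, transferCharacteristics, matrixCoeffs; const char* systemId; };

    static const VstPreset kPresets[] = {
        { 1,  1,  1, "BT709_YCC" },
        { 9, 16,  9, "BT2100_PQ_YCC" },   // values as in the table above
    };

    const VstPreset* findPreset(const char* name)
    {
        for (const VstPreset& p : kPresets)
            if (!strcmp(p.systemId, name))
                return &p;
        return nullptr;   // unknown preset
    }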
42
 
43
@@ -109,6 +142,7 @@
44
     m_threadPool = NULL;
45
     m_analysisFileIn = NULL;
46
     m_analysisFileOut = NULL;
47
+    m_filmGrainIn = NULL;
48
     m_naluFile = NULL;
49
     m_offsetEmergency = NULL;
50
     m_iFrameNum = 0;
51
@@ -134,12 +168,8 @@
52
     m_prevTonemapPayload.payload = NULL;
53
     m_startPoint = 0;
54
     m_saveCTUSize = 0;
55
-    m_edgePic = NULL;
56
-    m_edgeHistThreshold = 0;
57
-    m_chromaHistThreshold = 0.0;
58
-    m_scaledEdgeThreshold = 0.0;
59
-    m_scaledChromaThreshold = 0.0;
60
     m_zoneIndex = 0;
61
+    m_origPicBuffer = 0;
62
 }
63
 
64
 inline char *strcatFilename(const char *input, const char *suffix)
65
@@ -216,34 +246,6 @@
66
         }
67
     }
68
 
69
-    if (m_param->bHistBasedSceneCut)
70
-    {
71
-        m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
72
-        uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
73
-        m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
74
-        m_edgeHistThreshold = m_param->edgeTransitionThreshold;
75
-        m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
76
-        m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
77
-        m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
78
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
79
-        {
80
-            int size = m_param->sourceWidth * m_param->sourceHeight;
81
-            int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
82
-            int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
83
-            int widthC = m_param->sourceWidth >> hshift;
84
-            int heightC = m_param->sourceHeight >> vshift;
85
-
86
-            m_inputPic[0] = X265_MALLOC(pixel, size);
87
-            if (m_param->internalCsp != X265_CSP_I400)
88
-            {
89
-                for (int j = 1; j < 3; j++)
90
-                {
91
-                    m_inputPic[j] = X265_MALLOC(pixel, widthC * heightC);
92
-                }
93
-            }
94
-        }
95
-    }
96
-
97
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
98
     if (rows == 1 || cols < 3)
99
     {
100
@@ -357,6 +359,10 @@
101
             lookAheadThreadPool[i].start();
102
     m_lookahead->m_numPools = pools;
103
     m_dpb = new DPB(m_param);
104
+
105
+    if (m_param->bEnableTemporalFilter)
106
+        m_origPicBuffer = new OrigPicBuffer();
107
+
108
     m_rateControl = new RateControl(*m_param, this);
109
     if (!m_param->bResetZoneConfig)
110
     {
111
@@ -518,6 +524,15 @@
112
             }
113
         }
114
     }
115
+    if (m_param->filmGrain)
116
+    {
117
+        m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
118
+        if (!m_filmGrainIn)
119
+        {
120
+            x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
121
+        }
122
+    }
123
+
124
     m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
125
     m_aborted |= parseLambdaFile(m_param);
126
 
127
@@ -879,26 +894,6 @@
128
         }
129
     }
130
 
131
-    if (m_param->bHistBasedSceneCut)
132
-    {
133
-        if (m_edgePic != NULL)
134
-        {
135
-            X265_FREE_ZERO(m_edgePic);
136
-        }
137
-
138
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
139
-        {
140
-            X265_FREE_ZERO(m_inputPic[0]);
141
-            if (m_param->internalCsp != X265_CSP_I400)
142
-            {
143
-                for (int i = 1; i < 3; i++)
144
-                {
145
-                    X265_FREE_ZERO(m_inputPic[i]);
146
-                }
147
-            }
148
-        }
149
-    }
150
-
151
     for (int i = 0; i < m_param->frameNumThreads; i++)
152
     {
153
         if (m_frameEncoder[i])
154
@@ -924,6 +919,10 @@
155
         delete[] zoneReadCount;
156
         delete[] zoneWriteCount;
157
     }
158
+
159
+    if (m_param->bEnableTemporalFilter)
160
+        delete m_origPicBuffer;
161
+
162
     if (m_rateControl)
163
     {
164
         m_rateControl->destroy();
165
@@ -963,6 +962,8 @@
166
      }
167
     if (m_naluFile)
168
         fclose(m_naluFile);
169
+    if (m_filmGrainIn)
170
+        x265_fclose(m_filmGrainIn);
171
 
172
 #ifdef SVT_HEVC
173
     X265_FREE(m_svtAppData);
174
@@ -974,6 +975,7 @@
175
         /* release string arguments that were strdup'd */
176
         free((char*)m_param->rc.lambdaFileName);
177
         free((char*)m_param->rc.statFileName);
178
+        free((char*)m_param->rc.sharedMemName);
179
         free((char*)m_param->analysisReuseFileName);
180
         free((char*)m_param->scalingLists);
181
         free((char*)m_param->csvfn);
182
@@ -982,6 +984,7 @@
183
         free((char*)m_param->toneMapFile);
184
         free((char*)m_param->analysisSave);
185
         free((char*)m_param->analysisLoad);
186
+        free((char*)m_param->videoSignalTypePreset);
187
         PARAM_NS::x265_param_free(m_param);
188
     }
189
 }
190
@@ -1358,215 +1361,90 @@
191
     dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
192
 }
193
 
194
-bool Encoder::computeHistograms(x265_picture *pic)
195
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
196
 {
197
-    pixel *src = NULL, *planeV = NULL, *planeU = NULL;
198
-    uint32_t widthC, heightC;
199
-    int hshift, vshift;
200
-
201
-    hshift = CHROMA_H_SHIFT(pic->colorSpace);
202
-    vshift = CHROMA_V_SHIFT(pic->colorSpace);
203
-    widthC = pic->width >> hshift;
204
-    heightC = pic->height >> vshift;
205
-
206
-    if (pic->bitDepth == X265_DEPTH)
207
+    uint8_t newSliceType = 0;
208
+    switch (curSliceType)
209
     {
210
-        src = (pixel*)pic->planes[0];
211
-        if (m_param->internalCsp != X265_CSP_I400)
212
-        {
213
-            planeU = (pixel*)pic->planes[1];
214
-            planeV = (pixel*)pic->planes[2];
215
-        }
216
-    }
217
-    else if (pic->bitDepth == 8 && X265_DEPTH > 8)
218
-    {
219
-        int shift = (X265_DEPTH - 8);
220
-        uint8_t *yChar, *uChar, *vChar;
221
-
222
-        yChar = (uint8_t*)pic->planes[0];
223
-        primitives.planecopy_cp(yChar, pic->stride[0] / sizeof(*yChar), m_inputPic[0], pic->stride[0] / sizeof(*yChar), pic->width, pic->height, shift);
224
-        src = m_inputPic[0];
225
-        if (m_param->internalCsp != X265_CSP_I400)
226
-        {
227
-            uChar = (uint8_t*)pic->planes[1];
228
-            vChar = (uint8_t*)pic->planes[2];
229
-            primitives.planecopy_cp(uChar, pic->stride[1] / sizeof(*uChar), m_inputPic[1], pic->stride[1] / sizeof(*uChar), widthC, heightC, shift);
230
-            primitives.planecopy_cp(vChar, pic->stride[2] / sizeof(*vChar), m_inputPic[2], pic->stride[2] / sizeof(*vChar), widthC, heightC, shift);
231
-            planeU = m_inputPic[1];
232
-            planeV = m_inputPic[2];
233
-        }
234
-    }
235
-    else
236
-    {
237
-        uint16_t *yShort, *uShort, *vShort;
238
-        /* mask off bits that are supposed to be zero */
239
-        uint16_t mask = (1 << X265_DEPTH) - 1;
240
-        int shift = abs(pic->bitDepth - X265_DEPTH);
241
-
242
-        yShort = (uint16_t*)pic->planes[0];
243
-        uShort = (uint16_t*)pic->planes[1];
244
-        vShort = (uint16_t*)pic->planes[2];
245
-
246
-        if (pic->bitDepth > X265_DEPTH)
247
-        {
248
-            /* shift right and mask pixels to final size */
249
-            primitives.planecopy_sp(yShort, pic->stride[0] / sizeof(*yShort), m_inputPic[0], pic->stride[0] / sizeof(*yShort), pic->width, pic->height, shift, mask);
250
-            if (m_param->internalCsp != X265_CSP_I400)
251
-            {
252
-                primitives.planecopy_sp(uShort, pic->stride[1] / sizeof(*uShort), m_inputPic[1], pic->stride[1] / sizeof(*uShort), widthC, heightC, shift, mask);
253
-                primitives.planecopy_sp(vShort, pic->stride[2] / sizeof(*vShort), m_inputPic[2], pic->stride[2] / sizeof(*vShort), widthC, heightC, shift, mask);
254
-            }
255
-        }
256
-        else /* Case for (pic.bitDepth < X265_DEPTH) */
257
-        {
258
-            /* shift left and mask pixels to final size */
259
-            primitives.planecopy_sp_shl(yShort, pic->stride[0] / sizeof(*yShort), m_inputPic[0], pic->stride[0] / sizeof(*yShort), pic->width, pic->height, shift, mask);
260
-            if (m_param->internalCsp != X265_CSP_I400)
261
-            {
262
-                primitives.planecopy_sp_shl(uShort, pic->stride[1] / sizeof(*uShort), m_inputPic[1], pic->stride[1] / sizeof(*uShort), widthC, heightC, shift, mask);
263
-                primitives.planecopy_sp_shl(vShort, pic->stride[2] / sizeof(*vShort), m_inputPic[2], pic->stride[2] / sizeof(*vShort), widthC, heightC, shift, mask);
264
-            }
265
-        }
266
-
267
-        src = m_inputPic[0];
268
-        planeU = m_inputPic[1];
269
-        planeV = m_inputPic[2];
270
-    }
271
-
272
-    size_t bufSize = sizeof(pixel) * m_planeSizes[0];
273
-    memset(m_edgePic, 0, bufSize);
274
-
275
-    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false, 1))
276
-    {
277
-        x265_log(m_param, X265_LOG_ERROR, "Failed to compute edge!");
278
-        return false;
279
-    }
280
-
281
-    pixel pixelVal;
282
-    int32_t *edgeHist = m_curEdgeHist;
283
-    memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t));
284
-    for (uint32_t i = 0; i < m_planeSizes[0]; i++)
285
-    {
286
-        if (m_edgePic[i])
287
-            edgeHist[1]++;
288
-        else
289
-            edgeHist[0]++;
290
-    }
291
-
292
-    /* Y Histogram Calculation */
293
-    int32_t *yHist = m_curYUVHist[0];
294
-    memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t));
295
-    for (uint32_t i = 0; i < m_planeSizes[0]; i++)
296
-    {
297
-        pixelVal = src[i];
298
-        yHist[pixelVal]++;
299
+    case 1: newSliceType |= 1 << 0;
300
+        break;
301
+    case 2: newSliceType |= 1 << 0;
302
+        break;
303
+    case 3: newSliceType |= 1 << 1;
304
+        break;
305
+    case 4: newSliceType |= 1 << 2;
306
+        break;
307
+    case 5: newSliceType |= 1 << 3;
308
+        break;
309
+    default: return 0;
310
     }
311
+    return ((sliceTypeConfig & newSliceType) != 0);
312
+}
313
 
314
-    if (pic->colorSpace != X265_CSP_I400)
315
-    {
316
-        /* U Histogram Calculation */
317
-        int32_t *uHist = m_curYUVHist[1];
318
-        memset(uHist, 0, sizeof(m_curYUVHist[1]));
319
-        for (uint32_t i = 0; i < m_planeSizes[1]; i++)
320
-        {
321
-            pixelVal = planeU[i];
322
-            uHist[pixelVal]++;
323
-        }
324
+inline int enqueueRefFrame(FrameEncoder* curframeEncoder, Frame* iterFrame, Frame* curFrame, bool isPreFiltered, int16_t i)
325
+{
326
+    TemporalFilterRefPicInfo* dest = &curframeEncoder->m_mcstfRefList[curFrame->m_mcstf->m_numRef];
327
+    dest->picBuffer = iterFrame->m_fencPic;
328
+    dest->picBufferSubSampled2 = iterFrame->m_fencPicSubsampled2;
329
+    dest->picBufferSubSampled4 = iterFrame->m_fencPicSubsampled4;
330
+    dest->isFilteredFrame = isPreFiltered;
331
+    dest->isSubsampled = iterFrame->m_isSubSampled;
332
+    dest->origOffset = i;
333
+    curFrame->m_mcstf->m_numRef++;
334
 
335
-        /* V Histogram Calculation */
336
-        pixelVal = 0;
337
-        int32_t *vHist = m_curYUVHist[2];
338
-        memset(vHist, 0, sizeof(m_curYUVHist[2]));
339
-        for (uint32_t i = 0; i < m_planeSizes[2]; i++)
340
-        {
341
-            pixelVal = planeV[i];
342
-            vHist[pixelVal]++;
343
-        }
344
-    }
345
-    return true;
346
+    return 1;
347
 }
348
 
349
-void Encoder::computeHistogramSAD(double *normalizedMaxUVSad, double *normalizedEdgeSad, int curPoc)
350
+bool Encoder::generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder)
351
 {
352
+    frameEnc->m_mcstf->m_numRef = 0;
353
 
354
-    if (curPoc == 0)
355
-    {   /* first frame is scenecut by default no sad computation for the same. */
356
-        *normalizedMaxUVSad = 0.0;
357
-        *normalizedEdgeSad = 0.0;
358
-    }
359
-    else
360
+    for (int iterPOC = (frameEnc->m_poc - frameEnc->m_mcstf->m_range);
361
+        iterPOC <= (frameEnc->m_poc + frameEnc->m_mcstf->m_range); iterPOC++)
362
     {
363
-        /* compute sum of absolute differences of histogram bins of chroma and luma edge response between the current and prev pictures. */
364
-        int32_t edgeHistSad = 0;
365
-        int32_t uHistSad = 0;
366
-        int32_t vHistSad = 0;
367
-        double normalizedUSad = 0.0;
368
-        double normalizedVSad = 0.0;
369
-
370
-        for (int j = 0; j < HISTOGRAM_BINS; j++)
371
+        bool isFound = false;
372
+        if (iterPOC != frameEnc->m_poc)
373
         {
374
-            if (j < 2)
375
+            //search for the reference frame in the Original Picture Buffer
376
+            if (!isFound)
377
             {
378
-                edgeHistSad += abs(m_curEdgeHist[j] - m_prevEdgeHist[j]);
379
-            }
380
-            uHistSad += abs(m_curYUVHist[1][j] - m_prevYUVHist[1][j]);
381
-            vHistSad += abs(m_curYUVHist[2][j] - m_prevYUVHist[2][j]);
382
-        }
383
-        *normalizedEdgeSad = normalizeRange(edgeHistSad, 0, 2 * m_planeSizes[0], 0.0, 1.0);
384
-        normalizedUSad = normalizeRange(uHistSad, 0, 2 * m_planeSizes[1], 0.0, 1.0);
385
-        normalizedVSad = normalizeRange(vHistSad, 0, 2 * m_planeSizes[2], 0.0, 1.0);
386
-        *normalizedMaxUVSad = x265_max(normalizedUSad, normalizedVSad);
387
-    }
388
-
389
-    /* store histograms of previous frame for reference */
390
-    memcpy(m_prevEdgeHist, m_curEdgeHist, sizeof(m_curEdgeHist));
391
-    memcpy(m_prevYUVHist, m_curYUVHist, sizeof(m_curYUVHist));
392
-}
393
+                for (int j = 0; j < (2 * frameEnc->m_mcstf->m_range); j++)
394
+                {
395
+                    if (iterPOC < 0)
396
+                        continue;
397
+                    if (iterPOC >= m_pocLast)
398
+                    {
399
 
400
-double Encoder::normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd)
401
-{
402
-    return (double)(value - minValue) * (rangeEnd - rangeStart) / (maxValue - minValue) + rangeStart;
403
-}
404
+                        TemporalFilter* mcstf = frameEnc->m_mcstf;
405
+                        while (mcstf->m_numRef)
406
+                        {
407
+                            memset(currEncoder->m_mcstfRefList[mcstf->m_numRef].mvs0,  0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
408
+                            memset(currEncoder->m_mcstfRefList[mcstf->m_numRef].mvs1,  0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
409
+                            memset(currEncoder->m_mcstfRefList[mcstf->m_numRef].mvs2,  0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
410
+                            memset(currEncoder->m_mcstfRefList[mcstf->m_numRef].mvs,   0, sizeof(MV) * ((mcstf->m_sourceWidth /  4) * (mcstf->m_sourceHeight /  4)));
411
+                            memset(currEncoder->m_mcstfRefList[mcstf->m_numRef].noise, 0, sizeof(int) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
412
+                            memset(currEncoder->m_mcstfRefList[mcstf->m_numRef].error, 0, sizeof(int) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
413
 
414
-void Encoder::findSceneCuts(x265_picture *pic, bool& bDup, double maxUVSad, double edgeSad, bool& isMaxThres, bool& isHardSC)
415
-{
416
-    double minEdgeT = m_edgeHistThreshold * MIN_EDGE_FACTOR;
417
-    double minChromaT = minEdgeT * SCENECUT_CHROMA_FACTOR;
418
-    double maxEdgeT = m_edgeHistThreshold * MAX_EDGE_FACTOR;
419
-    double maxChromaT = maxEdgeT * SCENECUT_CHROMA_FACTOR;
420
-    pic->frameData.bScenecut = false;
421
+                            mcstf->m_numRef--;
422
+                        }
423
 
424
-    if (pic->poc == 0)
425
-    {
426
-        /* for first frame */
427
-        pic->frameData.bScenecut = false;
428
-        bDup = false;
429
-    }
430
-    else
431
-    {
432
-        if (edgeSad == 0.0 && maxUVSad == 0.0)
433
-        {
434
-            bDup = true;
435
-        }
436
-        else if (edgeSad < minEdgeT && maxUVSad < minChromaT)
437
-        {
438
-            pic->frameData.bScenecut = false;
439
-        }
440
-        else if (edgeSad > maxEdgeT && maxUVSad > maxChromaT)
441
-        {
442
-            pic->frameData.bScenecut = true;
443
-            isMaxThres = true;
444
-            isHardSC = true;
445
-        }
446
-        else if (edgeSad > m_scaledEdgeThreshold || maxUVSad >= m_scaledChromaThreshold
447
-                 || (edgeSad > m_edgeHistThreshold && maxUVSad >= m_chromaHistThreshold))
448
-        {
449
-            pic->frameData.bScenecut = true;
450
-            bDup = false;
451
-            if (edgeSad > m_scaledEdgeThreshold || maxUVSad >= m_scaledChromaThreshold)
452
-                isHardSC = true;
453
+                        break;
454
+                    }
455
+                    Frame* iterFrame = frameEnc->m_encData->m_slice->m_mcstfRefFrameList[1][j];
456
+                    if (iterFrame->m_poc == iterPOC)
457
+                    {
458
+                        if (!enqueueRefFrame(currEncoder, iterFrame, frameEnc, false, (int16_t)(iterPOC - frameEnc->m_poc)))
459
+                        {
460
+                            return false;
461
+                        };
462
+                        break;
463
+                    }
464
+                }
465
+            }
466
         }
467
     }
468
+
469
+    return true;
470
 }
471
 
472
 /**
473
@@ -1595,40 +1473,24 @@
474
     const x265_picture* inputPic = NULL;
475
     static int written = 0, read = 0;
476
     bool dontRead = false;
477
-    bool bdropFrame = false;
478
     bool dropflag = false;
479
-    bool isMaxThres = false;
480
-    bool isHardSC = false;
481
 
482
     if (m_exportedPic)
483
     {
484
         if (!m_param->bUseAnalysisFile && m_param->analysisSave)
485
             x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
486
+
487
         ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
488
+
489
         m_exportedPic = NULL;
490
         m_dpb->recycleUnreferenced();
491
+
492
+        if (m_param->bEnableTemporalFilter)
493
+            m_origPicBuffer->recycleOrigPicList();
494
     }
495
+
496
     if ((pic_in && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd))) || (m_param->bEnableFrameDuplication && !pic_in && (read < written)))
497
     {
498
-        if (m_param->bHistBasedSceneCut && pic_in)
499
-        {
500
-            x265_picture *pic = (x265_picture *) pic_in;
501
-
502
-            if (pic->poc == 0)
503
-            {
504
-                /* for entire encode compute the chroma plane sizes only once */
505
-                for (int i = 1; i < x265_cli_csps[m_param->internalCsp].planes; i++)
506
-                    m_planeSizes[i] = (pic->width >> x265_cli_csps[m_param->internalCsp].width[i]) * (pic->height >> x265_cli_csps[m_param->internalCsp].height[i]);
507
-            }
508
-
509
-            if (computeHistograms(pic))
510
-            {
511
-                double maxUVSad = 0.0, edgeSad = 0.0;
512
-                computeHistogramSAD(&maxUVSad, &edgeSad, pic_in->poc);
513
-                findSceneCuts(pic, bdropFrame, maxUVSad, edgeSad, isMaxThres, isHardSC);
514
-            }
515
-        }
516
-
517
         if ((m_param->bEnableFrameDuplication && !pic_in && (read < written)))
518
             dontRead = true;
519
         else
520
@@ -1672,20 +1534,7 @@
521
                     written++;
522
                 }
523
 
524
-                if (m_param->bEnableFrameDuplication && m_param->bHistBasedSceneCut)
525
-                {
526
-                    if (!bdropFrame && m_dupBuffer[1]->dupPic->frameData.bScenecut == false)
527
-                    {
528
-                        psnrWeight = ComputePSNR(m_dupBuffer[0]->dupPic, m_dupBuffer[1]->dupPic, m_param);
529
-                        if (psnrWeight >= m_param->dupThreshold)
530
-                            dropflag = true;
531
-                    }
532
-                    else
533
-                    {
534
-                        dropflag = true;
535
-                    }
536
-                }
537
-                else if (m_param->bEnableFrameDuplication)
538
+                if (m_param->bEnableFrameDuplication)
539
                 {
540
                     psnrWeight = ComputePSNR(m_dupBuffer[0]->dupPic, m_dupBuffer[1]->dupPic, m_param);
541
                     if (psnrWeight >= m_param->dupThreshold)
542
@@ -1768,12 +1617,6 @@
543
                         }
544
                     }
545
                 }
546
-                if (m_param->recursionSkipMode == EDGE_BASED_RSKIP && m_param->bHistBasedSceneCut)
547
-                {
548
-                    pixel* src = m_edgePic;
549
-                    primitives.planecopy_pp_shr(src, inFrame->m_fencPic->m_picWidth, inFrame->m_edgeBitPic, inFrame->m_fencPic->m_stride,
550
-                        inFrame->m_fencPic->m_picWidth, inFrame->m_fencPic->m_picHeight, 0);
551
-                }
552
             }
553
             else
554
             {
555
@@ -1794,6 +1637,8 @@
556
             inFrame->m_lowres.satdCost = (int64_t)-1;
557
             inFrame->m_lowresInit = false;
558
             inFrame->m_isInsideWindow = 0;
559
+            inFrame->m_tempLayer = 0;
560
+            inFrame->m_sameLayerRefPic = 0;
561
         }
562
 
563
         /* Copy input picture into a Frame and PicYuv, send to lookahead */
564
@@ -1802,13 +1647,6 @@
565
         inFrame->m_poc       = ++m_pocLast;
566
         inFrame->m_userData  = inputPic->userData;
567
         inFrame->m_pts       = inputPic->pts;
568
-        if (m_param->bHistBasedSceneCut)
569
-        {
570
-            inFrame->m_lowres.bScenecut = (inputPic->frameData.bScenecut == 1) ? true : false;
571
-            inFrame->m_lowres.m_bIsMaxThres = isMaxThres;
572
-            if (m_param->radl && m_param->keyframeMax != m_param->keyframeMin)
573
-                inFrame->m_lowres.m_bIsHardScenecut = isHardSC;
574
-        }
575
 
576
         if ((m_param->bEnableSceneCutAwareQp & BACKWARD) && m_param->rc.bStatRead)
577
         {
578
@@ -1816,7 +1654,7 @@
579
             rcEntry = &(m_rateControl->m_rce2Pass[inFrame->m_poc]);
580
             if(rcEntry->scenecut)
581
             {
582
-                int backwardWindow = X265_MIN(int((m_param->bwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom)), p->lookaheadDepth);
583
+                int backwardWindow = X265_MIN(int((m_param->bwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom)), p->lookaheadDepth);
584
                 for (int i = 1; i <= backwardWindow; i++)
585
                 {
586
                     int frameNum = inFrame->m_poc - i;
587
@@ -1826,16 +1664,7 @@
588
                 }
589
             }
590
         }
591
-        if (m_param->bHistBasedSceneCut && m_param->analysisSave)
592
-        {
593
-            memcpy(inFrame->m_analysisData.edgeHist, m_curEdgeHist, EDGE_BINS * sizeof(int32_t));
594
-            memcpy(inFrame->m_analysisData.yuvHist[0], m_curYUVHist[0], HISTOGRAM_BINS * sizeof(int32_t));
595
-            if (inputPic->colorSpace != X265_CSP_I400)
596
-            {
597
-                memcpy(inFrame->m_analysisData.yuvHist[1], m_curYUVHist[1], HISTOGRAM_BINS * sizeof(int32_t));
598
-                memcpy(inFrame->m_analysisData.yuvHist2, m_curYUVHist2, HISTOGRAM_BINS * sizeof(int32_t));
599
-            }
600
-        }
601
+
602
         inFrame->m_forceqp   = inputPic->forceqp;
603
         inFrame->m_param     = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
604
         inFrame->m_picStruct = inputPic->picStruct;
605
@@ -1881,7 +1710,8 @@
606
         }
607
 
608
         /* Use the frame types from the first pass, if available */
609
-        int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : inputPic->sliceType;
610
+        int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : X265_TYPE_AUTO;
611
+        inFrame->m_lowres.sliceTypeReq = inputPic->sliceType;
612
 
613
         /* In analysisSave mode, x265_analysis_data is allocated in inputPic and inFrame points to this */
614
         /* Load analysis data before lookahead->addPicture, since sliceType has been decided */
615
@@ -1977,6 +1807,59 @@
616
         if (m_reconfigureRc)
617
             inFrame->m_reconfigureRc = true;
618
 
619
+        if (m_param->bEnableTemporalFilter)
620
+        {
621
+            if (!m_pocLast)
622
+            {
623
+                /*One shot allocation of frames in OriginalPictureBuffer*/
624
+                int numFramesinOPB = X265_MAX(m_param->bframes, (inFrame->m_mcstf->m_range << 1)) + 1;
625
+                for (int i = 0; i < numFramesinOPB; i++)
626
+                {
627
+                    Frame* dupFrame = new Frame;
628
+                    if (!(dupFrame->create(m_param, pic_in->quantOffsets)))
629
+                    {
630
+                        m_aborted = true;
631
+                        x265_log(m_param, X265_LOG_ERROR, "Memory allocation failure, aborting encode\n");
632
+                        fflush(stderr);
633
+                        dupFrame->destroy();
634
+                        delete dupFrame;
635
+                        return -1;
636
+                    }
637
+                    else
638
+                    {
639
+                        if (m_sps.cuOffsetY)
640
+                        {
641
+                            dupFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
642
+                            dupFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
643
+                            dupFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
644
+                            dupFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
645
+                            if (m_param->internalCsp != X265_CSP_I400)
646
+                            {
647
+                                dupFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
648
+                                dupFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
649
+                            }
650
+                            m_origPicBuffer->addEncPicture(dupFrame);
651
+                        }
652
+                    }
653
+                }
654
+            }
655
+
656
+            inFrame->m_refPicCnt[1] = 2 * inFrame->m_mcstf->m_range + 1;
657
+            if (inFrame->m_poc < inFrame->m_mcstf->m_range)
658
+                inFrame->m_refPicCnt[1] -= (uint8_t)(inFrame->m_mcstf->m_range - inFrame->m_poc);
659
+            if (m_param->totalFrames && (inFrame->m_poc >= (m_param->totalFrames - inFrame->m_mcstf->m_range)))
660
+                inFrame->m_refPicCnt[1] -= (uint8_t)(inFrame->m_poc + inFrame->m_mcstf->m_range - m_param->totalFrames + 1);
661
+
662
+            //Extend full-res original picture border
663
+            PicYuv *orig = inFrame->m_fencPic;
664
+            extendPicBorder(orig->m_picOrg[0], orig->m_stride, orig->m_picWidth, orig->m_picHeight, orig->m_lumaMarginX, orig->m_lumaMarginY);
665
+            extendPicBorder(orig->m_picOrg[1], orig->m_strideC, orig->m_picWidth >> orig->m_hChromaShift, orig->m_picHeight >> orig->m_vChromaShift, orig->m_chromaMarginX, orig->m_chromaMarginY);
666
+            extendPicBorder(orig->m_picOrg[2], orig->m_strideC, orig->m_picWidth >> orig->m_hChromaShift, orig->m_picHeight >> orig->m_vChromaShift, orig->m_chromaMarginX, orig->m_chromaMarginY);
667
+
668
+            //TODO: Add subsampling here if required
669
+            m_origPicBuffer->addPicture(inFrame);
670
+        }
671
+
672
         m_lookahead->addPicture(*inFrame, sliceType);
673
         m_numDelayedPic++;
674
     }
675
@@ -2019,6 +1902,7 @@
676
                 pic_out->bitDepth = X265_DEPTH;
677
                 pic_out->userData = outFrame->m_userData;
678
                 pic_out->colorSpace = m_param->internalCsp;
679
+                pic_out->frameData.tLayer = outFrame->m_tempLayer;
680
                 frameData = &(pic_out->frameData);
681
 
682
                 pic_out->pts = outFrame->m_pts;
683
@@ -2041,16 +1925,6 @@
684
                     pic_out->analysisData.poc = pic_out->poc;
685
                     pic_out->analysisData.sliceType = pic_out->sliceType;
686
                     pic_out->analysisData.bScenecut = outFrame->m_lowres.bScenecut;
687
-                    if (m_param->bHistBasedSceneCut)
688
-                    {
689
-                        memcpy(pic_out->analysisData.edgeHist, outFrame->m_analysisData.edgeHist, EDGE_BINS * sizeof(int32_t));
690
-                        memcpy(pic_out->analysisData.yuvHist[0], outFrame->m_analysisData.yuvHist[0], HISTOGRAM_BINS * sizeof(int32_t));
691
-                        if (pic_out->colorSpace != X265_CSP_I400)
692
-                        {
693
-                            memcpy(pic_out->analysisData.yuvHist[1], outFrame->m_analysisData.yuvHist[1], HISTOGRAM_BINS * sizeof(int32_t));
694
-                            memcpy(pic_out->analysisData.yuvHist[2], outFrame->m_analysisData.yuvHist[2], HISTOGRAM_BINS * sizeof(int32_t));
695
-                        }
696
-                    }
697
                     pic_out->analysisData.satdCost  = outFrame->m_lowres.satdCost;
698
                     pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
699
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
700
@@ -2198,7 +2072,7 @@
701
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
702
                     m_aborted = true;
703
             if (pic_out)
704
-            { 
705
+            {
706
                 /* m_rcData is allocated for every frame */
707
                 pic_out->rcData = outFrame->m_rcData;
708
                 outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
709
@@ -2216,6 +2090,18 @@
710
                 outFrame->m_rcData->iCuCount = outFrame->m_encData->m_frameStats.percent8x8Intra * m_rateControl->m_ncu;
711
                 outFrame->m_rcData->pCuCount = outFrame->m_encData->m_frameStats.percent8x8Inter * m_rateControl->m_ncu;
712
                 outFrame->m_rcData->skipCuCount = outFrame->m_encData->m_frameStats.percent8x8Skip  * m_rateControl->m_ncu;
713
+                outFrame->m_rcData->currentSatd = curEncoder->m_rce.coeffBits;
714
+            }
715
+
716
+            if (m_param->bEnableTemporalFilter)
717
+            {
718
+                Frame *curFrame = m_origPicBuffer->m_mcstfPicList.getPOCMCSTF(outFrame->m_poc);
719
+                X265_CHECK(curFrame, "Outframe not found in DPB's mcstfPicList");
720
+                curFrame->m_refPicCnt[0]--;
721
+                curFrame->m_refPicCnt[1]--;
722
+                curFrame = m_origPicBuffer->m_mcstfOrigPicList.getPOCMCSTF(outFrame->m_poc);
723
+                X265_CHECK(curFrame, "Outframe not found in OPB's mcstfOrigPicList");
724
+                curFrame->m_refPicCnt[1]--;
725
             }
726
 
727
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
728
@@ -2223,6 +2109,8 @@
729
             {
730
                 ATOMIC_DEC(&outFrame->m_countRefEncoders);
731
                 m_dpb->recycleUnreferenced();
732
+                if (m_param->bEnableTemporalFilter)
733
+                    m_origPicBuffer->recycleOrigPicList();
734
             }
735
             else
736
                 m_exportedPic = outFrame;
737
@@ -2253,7 +2141,7 @@
738
                         m_rateControl->m_lastScenecut = frameEnc->m_poc;
739
                     else
740
                     {
741
-                        int maxWindowSize = int((m_param->fwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
742
+                        int maxWindowSize = int((m_param->fwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
743
                         if (frameEnc->m_poc > (m_rateControl->m_lastScenecut + maxWindowSize))
744
                             m_rateControl->m_lastScenecut = frameEnc->m_poc;
745
                     }
746
@@ -2422,8 +2310,36 @@
747
                 analysis->numPartitions  = m_param->num4x4Partitions;
748
                 x265_alloc_analysis_data(m_param, analysis);
749
             }
750
+            if (m_param->bEnableTemporalSubLayers > 2)
751
+            {
752
+                //Re-assign temporalid if the current frame is at the end of encode or when I slice is encountered
753
+                if ((frameEnc->m_poc == (m_param->totalFrames - 1)) || (frameEnc->m_lowres.sliceType == X265_TYPE_I) || (frameEnc->m_lowres.sliceType == X265_TYPE_IDR))
754
+                {
755
+                    frameEnc->m_tempLayer = (int8_t)0;
756
+                }
757
+            }
758
             /* determine references, setup RPS, etc */
759
             m_dpb->prepareEncode(frameEnc);
760
+
761
+            if (m_param->bEnableTemporalFilter)
762
+            {
763
+                X265_CHECK(!m_origPicBuffer->m_mcstfOrigPicFreeList.empty(), "Frames not available in Encoded OPB");
764
+
765
+                Frame *dupFrame = m_origPicBuffer->m_mcstfOrigPicFreeList.popBackMCSTF();
766
+                dupFrame->m_fencPic->copyFromFrame(frameEnc->m_fencPic);
767
+                dupFrame->m_poc = frameEnc->m_poc;
768
+                dupFrame->m_encodeOrder = frameEnc->m_encodeOrder;
769
+                dupFrame->m_refPicCnt[1] = 2 * dupFrame->m_mcstf->m_range + 1;
770
+
771
+                if (dupFrame->m_poc < dupFrame->m_mcstf->m_range)
772
+                    dupFrame->m_refPicCnt[1] -= (uint8_t)(dupFrame->m_mcstf->m_range - dupFrame->m_poc);
773
+                if (m_param->totalFrames && (dupFrame->m_poc >= (m_param->totalFrames - dupFrame->m_mcstf->m_range)))
774
+                    dupFrame->m_refPicCnt[1] -= (uint8_t)(dupFrame->m_poc + dupFrame->m_mcstf->m_range - m_param->totalFrames + 1);
775
+
776
+                m_origPicBuffer->addEncPictureToPicList(dupFrame);
777
+                m_origPicBuffer->setOrigPicList(frameEnc, m_pocLast);
778
+            }
779
+
780
             if (!!m_param->selectiveSAO)
781
             {
782
                 Slice* slice = frameEnc->m_encData->m_slice;
783
@@ -2449,9 +2365,72 @@
784
 
785
             if (m_param->rc.rateControlMode != X265_RC_CQP)
786
                 m_lookahead->getEstimatedPictureCost(frameEnc);
787
+
788
             if (m_param->bIntraRefresh)
789
                  calcRefreshInterval(frameEnc);
790
 
791
+            // Generate MCSTF References and perform HME
792
+            if (m_param->bEnableTemporalFilter && isFilterThisframe(frameEnc->m_mcstf->m_sliceTypeConfig, frameEnc->m_lowres.sliceType))
793
+            {
794
+
795
+                if (!generateMcstfRef(frameEnc, curEncoder))
796
+                {
797
+                    m_aborted = true;
798
+                    x265_log(m_param, X265_LOG_ERROR, "Failed to initialize MCSTFReferencePicInfo at POC %d\n", frameEnc->m_poc);
799
+                    fflush(stderr);
800
+                    return -1;
801
+                }
802
+
803
+
804
+                if (!*frameEnc->m_isSubSampled)
805
+                {
806
+                    primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPic->m_picOrg[0],frameEnc->m_fencPicSubsampled2->m_picOrg[0], frameEnc->m_fencPic->m_stride, frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight);
807
+                    extendPicBorder(frameEnc->m_fencPicSubsampled2->m_picOrg[0], frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight, frameEnc->m_fencPicSubsampled2->m_lumaMarginX, frameEnc->m_fencPicSubsampled2->m_lumaMarginY);
808
+                    primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPicSubsampled2->m_picOrg[0],frameEnc->m_fencPicSubsampled4->m_picOrg[0], frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight);
809
+                    extendPicBorder(frameEnc->m_fencPicSubsampled4->m_picOrg[0], frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight, frameEnc->m_fencPicSubsampled4->m_lumaMarginX, frameEnc->m_fencPicSubsampled4->m_lumaMarginY);
810
+                    *frameEnc->m_isSubSampled = true;
811
+                }
812
+
813
+                for (uint8_t i = 1; i <= frameEnc->m_mcstf->m_numRef; i++)
814
+                {
815
+                    TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i - 1];
816
+                    if (!*ref->isSubsampled)
817
+                    {
818
+                        primitives.frameSubSampleLuma((const pixel *)ref->picBuffer->m_picOrg[0], ref->picBufferSubSampled2->m_picOrg[0], ref->picBuffer->m_stride, ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight);
819
+                        extendPicBorder(ref->picBufferSubSampled2->m_picOrg[0], ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight, ref->picBufferSubSampled2->m_lumaMarginX, ref->picBufferSubSampled2->m_lumaMarginY);
820
+                        primitives.frameSubSampleLuma((const pixel *)ref->picBufferSubSampled2->m_picOrg[0],ref->picBufferSubSampled4->m_picOrg[0], ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight);
821
+                        extendPicBorder(ref->picBufferSubSampled4->m_picOrg[0], ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight, ref->picBufferSubSampled4->m_lumaMarginX, ref->picBufferSubSampled4->m_lumaMarginY);
822
+                        *ref->isSubsampled = true;
823
+                    }
824
+                }
825
+
826
+                for (uint8_t i = 1; i <= frameEnc->m_mcstf->m_numRef; i++)
827
+                {
828
+                    TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i - 1];
829
+
830
+                    curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs0, ref->mvsStride0, frameEnc->m_fencPicSubsampled4, ref->picBufferSubSampled4, 16);
831
+                    curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs1, ref->mvsStride1, frameEnc->m_fencPicSubsampled2, ref->picBufferSubSampled2, 16, ref->mvs0, ref->mvsStride0, 2);
832
+                    curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs2, ref->mvsStride2, frameEnc->m_fencPic, ref->picBuffer, 16, ref->mvs1, ref->mvsStride1, 2);
833
+                    curEncoder->m_frameEncTF->motionEstimationLumaDoubleRes(ref->mvs,  ref->mvsStride, frameEnc->m_fencPic, ref->picBuffer, 8, ref->mvs2, ref->mvsStride2, 1, ref->error);
834
+                }
835
+
836
+                for (int i = 0; i < frameEnc->m_mcstf->m_numRef; i++)
837
+                {
838
+                    TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i];
839
+                    ref->slicetype = m_lookahead->findSliceType(frameEnc->m_poc + ref->origOffset);
840
+                    Frame* dpbframePtr = m_dpb->m_picList.getPOC(frameEnc->m_poc + ref->origOffset);
841
+                    if (dpbframePtr != NULL)
842
+                    {
843
+                        if (dpbframePtr->m_encData->m_slice->m_sliceType == B_SLICE)
844
+                            ref->slicetype = X265_TYPE_B;
845
+                        else if (dpbframePtr->m_encData->m_slice->m_sliceType == P_SLICE)
846
+                            ref->slicetype = X265_TYPE_P;
847
+                        else
848
+                            ref->slicetype = X265_TYPE_I;
849
+                    }
850
+                }
851
+            }
852
+
853
             /* Allow FrameEncoder::compressFrame() to start in the frame encoder thread */
854
             if (!curEncoder->startCompressFrame(frameEnc))
855
                 m_aborted = true;
856
@@ -2523,7 +2502,11 @@
857
         encParam->dynamicRd = param->dynamicRd;
858
         encParam->bEnableTransformSkip = param->bEnableTransformSkip;
859
         encParam->bEnableAMP = param->bEnableAMP;
860
-
861
+        if (param->confWinBottomOffset == 0 && param->confWinRightOffset == 0)
862
+        {
863
+            encParam->confWinBottomOffset = param->confWinBottomOffset;
864
+            encParam->confWinRightOffset = param->confWinRightOffset;
865
+        }
866
         /* Resignal changes in params in Parameter Sets */
867
         m_sps.maxAMPDepth = (m_sps.bUseAMP = param->bEnableAMP && param->bEnableAMP) ? param->maxCUDepth : 0;
868
         m_pps.bTransformSkipEnabled = param->bEnableTransformSkip ? 1 : 0;
869
@@ -2729,18 +2712,7 @@
870
             (float)100.0 * m_numLumaWPBiFrames / m_analyzeB.m_numPics,
871
             (float)100.0 * m_numChromaWPBiFrames / m_analyzeB.m_numPics);
872
     }
873
-    int pWithB = 0;
874
-    for (int i = 0; i <= m_param->bframes; i++)
875
-        pWithB += m_lookahead->m_histogram[i];
876
 
877
-    if (pWithB)
878
-    {
879
-        int p = 0;
880
-        for (int i = 0; i <= m_param->bframes; i++)
881
-            p += sprintf(buffer + p, "%.1f%% ", 100. * m_lookahead->m_histogram[i] / pWithB);
882
-
883
-        x265_log(m_param, X265_LOG_INFO, "consecutive B-frames: %s\n", buffer);
884
-    }
885
     if (m_param->bLossless)
886
     {
887
         float frameSize = (float)(m_param->sourceWidth - m_sps.conformanceWindow.rightOffset) *
888
@@ -3341,6 +3313,19 @@
889
     }
890
 }
891
 
892
+void Encoder::getEndNalUnits(NALList& list, Bitstream& bs)
893
+{
894
+    NALList nalList;
895
+    bs.resetBits();
896
+
897
+    if (m_param->bEnableEndOfSequence)
898
+        nalList.serialize(NAL_UNIT_EOS, bs);
899
+    if (m_param->bEnableEndOfBitstream)
900
+        nalList.serialize(NAL_UNIT_EOB, bs);
901
+
902
+    list.takeContents(nalList);
903
+}
904
+
905
 void Encoder::initVPS(VPS *vps)
906
 {
907
     /* Note that much of the VPS is initialized by determineLevel() */
908
@@ -3375,10 +3360,14 @@
909
     sps->bUseAMP = m_param->bEnableAMP;
910
     sps->maxAMPDepth = m_param->bEnableAMP ? m_param->maxCUDepth : 0;
911
 
912
-    sps->maxTempSubLayers = m_param->bEnableTemporalSubLayers ? 2 : 1;
913
-    sps->maxDecPicBuffering = m_vps.maxDecPicBuffering;
914
-    sps->numReorderPics = m_vps.numReorderPics;
915
-    sps->maxLatencyIncrease = m_vps.maxLatencyIncrease = m_param->bframes;
916
+    sps->maxTempSubLayers = m_vps.maxTempSubLayers;// Getting the value from the user
917
+
918
+    for(uint8_t i = 0; i < sps->maxTempSubLayers; i++)
919
+    {
920
+        sps->maxDecPicBuffering[i] = m_vps.maxDecPicBuffering[i];
921
+        sps->numReorderPics[i] = m_vps.numReorderPics[i];
922
+        sps->maxLatencyIncrease[i] = m_vps.maxLatencyIncrease[i] = m_param->bframes;
923
+    }
924
 
925
     sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing;
926
     sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp;
927
@@ -3518,6 +3507,11 @@
928
             p->rc.aqMode = X265_AQ_NONE;
929
             p->rc.hevcAq = 0;
930
         }
931
+        if (p->rc.aqMode == 0 && p->rc.cuTree)
932
+        {
933
+            p->rc.aqMode = X265_AQ_VARIANCE;
934
+            p->rc.aqStrength = 0;
935
+        }
936
         p->radl = zone->radl;
937
     }
938
     memcpy(zone, p, sizeof(x265_param));
939
@@ -3548,6 +3542,65 @@
940
         p->crQpOffset = 3;
941
 }
942
 
943
+void Encoder::configureVideoSignalTypePreset(x265_param* p)
944
+{
945
+    char systemId[20] = {};
946
+    char colorVolume[20] = {};
947
+    sscanf(p->videoSignalTypePreset, "%[^:]:%s", systemId, colorVolume);
948
+    uint32_t sysId = 0;
949
+    while (strcmp(vstPresets[sysId].systemId, systemId))
950
+    {
951
+        if (sysId + 1 == sizeof(vstPresets) / sizeof(vstPresets[0]))
952
+        {
953
+            x265_log(NULL, X265_LOG_ERROR, "Incorrect system-id, aborting\n");
954
+            m_aborted = true;
955
+            break;
956
+        }
957
+        sysId++;
958
+    }
959
+
960
+    p->vui.bEnableVideoSignalTypePresentFlag = vstPresets[sysId].bEnableVideoSignalTypePresentFlag;
961
+    p->vui.bEnableColorDescriptionPresentFlag = vstPresets[sysId].bEnableColorDescriptionPresentFlag;
962
+    p->vui.bEnableChromaLocInfoPresentFlag = vstPresets[sysId].bEnableChromaLocInfoPresentFlag;
963
+    p->vui.colorPrimaries = vstPresets[sysId].colorPrimaries;
964
+    p->vui.transferCharacteristics = vstPresets[sysId].transferCharacteristics;
965
+    p->vui.matrixCoeffs = vstPresets[sysId].matrixCoeffs;
966
+    p->vui.bEnableVideoFullRangeFlag = vstPresets[sysId].bEnableVideoFullRangeFlag;
967
+    p->vui.chromaSampleLocTypeTopField = vstPresets[sysId].chromaSampleLocTypeTopField;
968
+    p->vui.chromaSampleLocTypeBottomField = vstPresets[sysId].chromaSampleLocTypeBottomField;
969
+
970
+    if (colorVolume[0] != '\0')
971
+    {
972
+        if (!strcmp(systemId, "BT2100_PQ_YCC") || !strcmp(systemId, "BT2100_PQ_ICTCP") || !strcmp(systemId, "BT2100_PQ_RGB"))
973
+        {
974
+            p->bEmitHDR10SEI = 1;
975
+            if (!strcmp(colorVolume, "P3D65x1000n0005"))
976
+            {
977
+                p->masteringDisplayColorVolume = strdup("G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)");
978
+            }
979
+            else if (!strcmp(colorVolume, "P3D65x4000n005"))
980
+            {
981
+                p->masteringDisplayColorVolume = strdup("G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)");
982
+            }
983
+            else if (!strcmp(colorVolume, "BT2100x108n0005"))
984
+            {
985
+                p->masteringDisplayColorVolume = strdup("G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)");
986
+            }
987
+            else
988
+            {
989
+                x265_log(NULL, X265_LOG_ERROR, "Incorrect color-volume, aborting\n");
990
+                m_aborted = true;
991
+            }
992
+        }
993
+        else
994
+        {
995
+            x265_log(NULL, X265_LOG_ERROR, "Color-volume is not supported with the given system-id, aborting\n");
996
+            m_aborted = true;
997
+        }
998
+    }
999
+
1000
+}
1001
+
1002
 void Encoder::configure(x265_param *p)
1003
 {
1004
     this->m_param = p;
1005
@@ -3610,6 +3663,12 @@
1006
     if (!p->rdoqLevel)
1007
         p->psyRdoq = 0;
1008
 
1009
+    if (p->craNal && p->keyframeMax > 1)
1010
+    {
1011
+        x265_log_file(NULL, X265_LOG_ERROR, " --cra-nal works only with keyint 1, but given keyint = %s\n", p->keyframeMax);
1012
+        m_aborted = true;
1013
+    }
1014
+
1015
     /* Disable features which are not supported by the current RD level */
1016
     if (p->rdLevel < 3)
1017
     {
1018
@@ -3848,12 +3907,37 @@
1019
         p->limitReferences = 0;
1020
     }
1021
 
1022
-    if (p->bEnableTemporalSubLayers && !p->bframes)
1023
+    if ((p->bEnableTemporalSubLayers > 2) && !p->bframes)
1024
     {
1025
         x265_log(p, X265_LOG_WARNING, "B frames not enabled, temporal sublayer disabled\n");
1026
         p->bEnableTemporalSubLayers = 0;
1027
     }
1028
 
1029
+    if (!!p->bEnableTemporalSubLayers && p->bEnableTemporalSubLayers < 2)
1030
+    {
1031
+        p->bEnableTemporalSubLayers = 0;
1032
+        x265_log(p, X265_LOG_WARNING, "No support for temporal sublayers less than 2; Disabling temporal layers\n");
1033
+    }
1034
+
1035
+    if (p->bEnableTemporalSubLayers > 5)
1036
+    {
1037
+        p->bEnableTemporalSubLayers = 5;
1038
+        x265_log(p, X265_LOG_WARNING, "No support for temporal sublayers more than 5; Reducing the temporal sublayers to 5\n");
1039
+    }
1040
+
1041
+    // Assign number of B frames for temporal layers
1042
+    if (p->bEnableTemporalSubLayers > 2)
1043
+            p->bframes = x265_temporal_layer_bframes[p->bEnableTemporalSubLayers - 1];
1044
+
1045
+    if (p->bEnableTemporalSubLayers > 2)
1046
+    {
1047
+        if (p->bFrameAdaptive)
1048
+        {
1049
+            x265_log(p, X265_LOG_WARNING, "Disabling adaptive B-frame placement to support temporal sub-layers\n");
1050
+            p->bFrameAdaptive = 0;
1051
+        }
1052
+    }
1053
+
1054
     m_bframeDelay = p->bframes ? (p->bBPyramid ? 2 : 1) : 0;
1055
 
1056
     p->bFrameBias = X265_MIN(X265_MAX(-90, p->bFrameBias), 100);
1057
@@ -3907,6 +3991,16 @@
1058
         p->rc.bStatRead = 0;
1059
     }
1060
 
1061
+    if ((p->rc.bStatWrite || p->rc.bStatRead) && p->rc.dataShareMode != X265_SHARE_MODE_FILE && p->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM)
1062
+    {
1063
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
1064
+    }
1065
+
1066
+    if (!p->rc.bStatRead || p->rc.rateControlMode != X265_RC_CRF)
1067
+    {
1068
+        p->rc.bEncFocusedFramesOnly = 0;
1069
+    }
1070
+
1071
     /* some options make no sense if others are disabled */
1072
     p->bSaoNonDeblocked &= p->bEnableSAO;
1073
     p->bEnableTSkipFast &= p->bEnableTransformSkip;
1074
@@ -4243,6 +4337,9 @@
1075
         }
1076
     }
1077
 
1078
+    if (p->videoSignalTypePreset)     // Default disabled.
1079
+        configureVideoSignalTypePreset(p);
1080
+
1081
     if (m_param->toneMapFile || p->bHDR10Opt || p->bEmitHDR10SEI)
1082
     {
1083
         if (!p->bRepeatHeaders)
1084
@@ -4313,12 +4410,26 @@
1085
             m_param->searchRange = m_param->hmeRange2;
1086
     }
1087
 
1088
-   if (p->bHistBasedSceneCut && !p->edgeTransitionThreshold)
1089
-   {
1090
-       p->edgeTransitionThreshold = 0.03;
1091
-       x265_log(p, X265_LOG_WARNING, "using  default threshold %.2lf for scene cut detection\n", p->edgeTransitionThreshold);
1092
-   }
1093
+    if (p->bEnableSBRC && (p->rc.rateControlMode != X265_RC_CRF || (p->rc.vbvBufferSize == 0 || p->rc.vbvMaxBitrate == 0)))
1094
+    {
1095
+        x265_log(p, X265_LOG_WARNING, "SBRC can be enabled only with CRF+VBV mode. Disabling SBRC\n");
1096
+        p->bEnableSBRC = 0;
1097
+    }
1098
 
1099
+    if (p->bEnableSBRC)
1100
+    {
1101
+        p->rc.ipFactor = p->rc.ipFactor * X265_IPRATIO_STRENGTH;
1102
+        if (p->bOpenGOP)
1103
+        {
1104
+            x265_log(p, X265_LOG_WARNING, "Segment based RateControl requires closed gop structure. Enabling closed GOP.\n");
1105
+            p->bOpenGOP = 0;
1106
+        }
1107
+        if (p->keyframeMax != p->keyframeMin)
1108
+        {
1109
+            x265_log(p, X265_LOG_WARNING, "Segment based RateControl requires fixed gop length. Force set min-keyint equal to keyint.\n");
1110
+            p->keyframeMin = p->keyframeMax;
1111
+        }
1112
+    }
1113
 }
1114
 
1115
 void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes)
1116
@@ -4379,16 +4490,6 @@
1117
     analysis->frameRecordSize = frameRecordSize;
1118
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1119
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1120
-    if (m_param->bHistBasedSceneCut)
1121
-    {
1122
-        X265_FREAD(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileIn, &m_curEdgeHist);
1123
-        X265_FREAD(&analysis->yuvHist[0], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[0]);
1124
-        if (m_param->internalCsp != X265_CSP_I400)
1125
-        {
1126
-            X265_FREAD(&analysis->yuvHist[1], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[1]);
1127
-            X265_FREAD(&analysis->yuvHist[2], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[2]);
1128
-        }
1129
-    }
1130
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1131
     X265_FREAD(&numCUsLoad, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1132
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1133
@@ -4711,16 +4812,6 @@
1134
     analysis->frameRecordSize = frameRecordSize;
1135
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1136
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1137
-    if (m_param->bHistBasedSceneCut)
1138
-    {
1139
-        X265_FREAD(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileIn, &m_curEdgeHist);
1140
-        X265_FREAD(&analysis->yuvHist[0], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[0]);
1141
-        if (m_param->internalCsp != X265_CSP_I400)
1142
-        {
1143
-            X265_FREAD(&analysis->yuvHist[1], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[1]);
1144
-            X265_FREAD(&analysis->yuvHist[2], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist[2]);
1145
-        }
1146
-    }
1147
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1148
     X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1149
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1150
@@ -4810,8 +4901,14 @@
1151
 
1152
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1153
     {
1154
-        if (m_param->analysisLoadReuseLevel < 2)
1155
-            return;
1156
+       if (m_param->analysisLoadReuseLevel < 2)
1157
+       {
1158
+           /* Restore to the current encode's numPartitions and numCUsInFrame */
1159
+           analysis->numPartitions = m_param->num4x4Partitions;
1160
+           analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1161
+           analysis->numCuInHeight = cuLoc.heightInCU;
1162
+           return;
1163
+       }
1164
 
1165
         uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL;
1166
         int8_t *cuQPBuf = NULL;
1167
@@ -4879,8 +4976,14 @@
1168
         uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
1169
         uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
1170
         X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFileIn, (picIn->analysisData.wt));
1171
-        if (m_param->analysisLoadReuseLevel < 2)
1172
-            return;
1173
+       if (m_param->analysisLoadReuseLevel < 2)
1174
+       {
1175
+           /* Restore to the current encode's numPartitions and numCUsInFrame */
1176
+           analysis->numPartitions = m_param->num4x4Partitions;
1177
+           analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1178
+           analysis->numCuInHeight = cuLoc.heightInCU;
1179
+           return;
1180
+       }
1181
 
1182
         uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL;
1183
         uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx[2];
1184
@@ -5167,7 +5270,7 @@
1185
 
1186
         int bcutree;
1187
         X265_FREAD(&bcutree, sizeof(int), 1, m_analysisFileIn, &(saveParam->cuTree));
1188
-        if (loadLevel == 10 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
1189
+        if (loadLevel >= 2 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
1190
         {
1191
             x265_log(NULL, X265_LOG_ERROR, "Error reading cu-tree info. Disabling cutree offsets. \n");
1192
             m_param->rc.cuTree = 0;
1193
@@ -5337,6 +5440,7 @@
1194
             distortionData->highDistortionCtuCount++;
1195
     }
1196
 }
1197
+
1198
 void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, int sliceType)
1199
 {
1200
 
1201
@@ -5486,17 +5590,6 @@
1202
     /* calculate frameRecordSize */
1203
     analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
1204
                       sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
1205
-    if (m_param->bHistBasedSceneCut)
1206
-    {
1207
-        analysis->frameRecordSize += sizeof(analysis->edgeHist);
1208
-        analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1209
-        if (m_param->internalCsp != X265_CSP_I400)
1210
-        {
1211
-            analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1212
-            analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1213
-        }
1214
-    }
1215
-
1216
     if (analysis->sliceType > X265_TYPE_I)
1217
     {
1218
         numDir = (analysis->sliceType == X265_TYPE_P) ? 1 : 2;
1219
@@ -5641,17 +5734,6 @@
1220
     X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFileOut);
1221
     X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFileOut);
1222
     X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFileOut);
1223
-    if (m_param->bHistBasedSceneCut)
1224
-    {
1225
-        X265_FWRITE(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileOut);
1226
-        X265_FWRITE(&analysis->yuvHist[0], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1227
-        if (m_param->internalCsp != X265_CSP_I400)
1228
-        {
1229
-            X265_FWRITE(&analysis->yuvHist[1], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1230
-            X265_FWRITE(&analysis->yuvHist[2], sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1231
-        }
1232
-    }
1233
-
1234
     X265_FWRITE(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileOut);
1235
     X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileOut);
1236
     X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFileOut);
1237
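
The m_refPicCnt[1] arithmetic in the hunks above follows a simple rule: a filtered frame pins one original picture per offset in [-range, +range] plus itself, clipped where the window runs off either end of the clip. A minimal standalone sketch of that count, assuming the same truncation as the diff (the function name is illustrative, not x265 API):

    #include <cstdint>

    // Expected number of pictures the temporal filter pins for the frame at
    // 'poc', given filter radius 'range' and 'totalFrames' pictures in the
    // clip (0 when the total is unknown, e.g. piped input).
    static uint8_t mcstfRefCount(int poc, int range, int totalFrames)
    {
        int cnt = 2 * range + 1;                       // symmetric window plus the frame itself
        if (poc < range)                               // truncated at the start of the clip
            cnt -= range - poc;
        if (totalFrames && poc >= totalFrames - range) // truncated at the end of the clip
            cnt -= poc + range - totalFrames + 1;
        return (uint8_t)cnt;
    }
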
x265_3.5.tar.gz/source/encoder/encoder.h -> x265_3.6.tar.gz/source/encoder/encoder.h Changed
72
 
1
@@ -32,6 +32,7 @@
2
 #include "nal.h"
3
 #include "framedata.h"
4
 #include "svt.h"
5
+#include "temporalfilter.h"
6
 #ifdef ENABLE_HDR10_PLUS
7
     #include "dynamicHDR10/hdr10plus.h"
8
 #endif
9
@@ -256,19 +257,6 @@
10
     int                m_bToneMap; // Enables tone-mapping
11
     int                m_enableNal;
12
 
13
-    /* For histogram based scene-cut detection */
14
-    pixel*             m_edgePic;
15
-    pixel*             m_inputPic[3];
16
-    int32_t            m_curYUVHist[3][HISTOGRAM_BINS];
17
-    int32_t            m_prevYUVHist[3][HISTOGRAM_BINS];
18
-    int32_t            m_curEdgeHist[2];
19
-    int32_t            m_prevEdgeHist[2];
20
-    uint32_t           m_planeSizes[3];
21
-    double             m_edgeHistThreshold;
22
-    double             m_chromaHistThreshold;
23
-    double             m_scaledEdgeThreshold;
24
-    double             m_scaledChromaThreshold;
25
-
26
 #ifdef ENABLE_HDR10_PLUS
27
     const hdr10plus_api     *m_hdr10plus_api;
28
     uint8_t                 **m_cim;
29
@@ -295,6 +283,9 @@
30
 
31
     ThreadSafeInteger* zoneReadCount;
32
     ThreadSafeInteger* zoneWriteCount;
33
+    /* Film grain model file */
34
+    FILE* m_filmGrainIn;
35
+    OrigPicBuffer*          m_origPicBuffer;
36
 
37
     Encoder();
38
     ~Encoder()
39
@@ -327,6 +318,8 @@
40
 
41
     void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
42
 
43
+    void getEndNalUnits(NALList& list, Bitstream& bs);
44
+
45
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
46
 
47
     void printSummary();
48
@@ -373,11 +366,6 @@
49
 
50
     void copyPicture(x265_picture *dest, const x265_picture *src);
51
 
52
-    bool computeHistograms(x265_picture *pic);
53
-    void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
54
-    double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
55
-    void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
56
-
57
     void initRefIdx();
58
     void analyseRefIdx(int *numRefIdx);
59
     void updateRefIdx();
60
@@ -387,6 +375,11 @@
61
 
62
     void configureDolbyVisionParams(x265_param* p);
63
 
64
+    void configureVideoSignalTypePreset(x265_param* p);
65
+
66
+    bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
67
+    bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
68
+
69
 protected:
70
 
71
     void initVPS(VPS *vps);
72
x265_3.5.tar.gz/source/encoder/entropy.cpp -> x265_3.6.tar.gz/source/encoder/entropy.cpp Changed
41
 
1
@@ -245,9 +245,9 @@
2
 
3
     for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
4
     {
5
-        WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]");
6
-        WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_pics[i]");
7
-        WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1[i]");
8
+        WRITE_UVLC(vps.maxDecPicBuffering[i] - 1, "vps_max_dec_pic_buffering_minus1[i]");
9
+        WRITE_UVLC(vps.numReorderPics[i],         "vps_num_reorder_pics[i]");
10
+        WRITE_UVLC(vps.maxLatencyIncrease[i] + 1, "vps_max_latency_increase_plus1[i]");
11
     }
12
 
13
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
14
@@ -291,9 +291,9 @@
15
 
16
     for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
17
     {
18
-        WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
19
-        WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_pics[i]");
20
-        WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
21
+        WRITE_UVLC(sps.maxDecPicBuffering[i] - 1, "sps_max_dec_pic_buffering_minus1[i]");
22
+        WRITE_UVLC(sps.numReorderPics[i],         "sps_num_reorder_pics[i]");
23
+        WRITE_UVLC(sps.maxLatencyIncrease[i] + 1, "sps_max_latency_increase_plus1[i]");
24
     }
25
 
26
     WRITE_UVLC(sps.log2MinCodingBlockSize - 3,    "log2_min_coding_block_size_minus3");
27
@@ -418,8 +418,11 @@
28
 
29
     if (maxTempSubLayers > 1)
30
     {
31
-         WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
32
-         WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
33
+        for(int i = 0; i < maxTempSubLayers - 1; i++)
34
+        {
35
+            WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
36
+            WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
37
+        }
38
          for (int i = maxTempSubLayers - 1; i < 8 ; i++)
39
              WRITE_CODE(0, 2, "reserved_zero_2bits");
40
     }
41
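
The WRITE_UVLC calls above emit ue(v) unsigned Exp-Golomb codes, now one maxDecPicBuffering/numReorderPics/maxLatencyIncrease triple per temporal sub-layer. For reference, a self-contained sketch of the ue(v) encoding itself; writeBits is a stand-in for the bitstream back end, not the actual x265 Bitstream class:

    #include <cstdint>

    // ue(v): emit value+1 as N-1 leading zero bits followed by the N
    // significant bits of value+1, where N is the bit length of value+1.
    static void writeUvlc(uint32_t value, void (*writeBits)(uint32_t bits, int count))
    {
        uint32_t code = value + 1;
        int len = 0;
        for (uint32_t tmp = code; tmp; tmp >>= 1)
            len++;                    // bit length of code
        if (len > 1)
            writeBits(0, len - 1);    // leading zeros
        writeBits(code, len);         // the code itself, MSB first
    }
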
x265_3.5.tar.gz/source/encoder/frameencoder.cpp -> x265_3.6.tar.gz/source/encoder/frameencoder.cpp Changed
200
 
1
@@ -34,6 +34,7 @@
2
 #include "common.h"
3
 #include "slicetype.h"
4
 #include "nal.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
9
@@ -101,6 +102,16 @@
10
         delete m_rce.picTimingSEI;
11
         delete m_rce.hrdTiming;
12
     }
13
+
14
+    if (m_param->bEnableTemporalFilter)
15
+    {
16
+        delete m_frameEncTF->m_metld;
17
+
18
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
19
+            m_frameEncTF->destroyRefPicInfo(&m_mcstfRefList[i]);
20
+
21
+        delete m_frameEncTF;
22
+    }
23
 }
24
 
25
 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
26
@@ -195,6 +206,16 @@
27
         m_sliceAddrBits = (uint16_t)(tmp + 1);
28
     }
29
 
30
+    if (m_param->bEnableTemporalFilter)
31
+    {
32
+        m_frameEncTF = new TemporalFilter();
33
+        if (m_frameEncTF)
34
+            m_frameEncTF->init(m_param);
35
+
36
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
37
+            ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefList[i], m_param);
38
+    }
39
+
40
     return ok;
41
 }
42
 
43
@@ -450,7 +471,7 @@
44
     m_ssimCnt = 0;
45
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
46
 
47
-    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
48
+    if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
49
     {
50
         int height = m_frame->m_fencPic->m_picHeight;
51
         int width = m_frame->m_fencPic->m_picWidth;
52
@@ -467,6 +488,12 @@
53
      * unit) */
54
     Slice* slice = m_frame->m_encData->m_slice;
55
 
56
+    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
57
+    {
58
+        m_bs.resetBits();
59
+        m_nalList.serialize(NAL_UNIT_EOS, m_bs);
60
+    }
61
+
62
     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
63
     {
64
         m_bs.resetBits();
65
@@ -573,6 +600,12 @@
66
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
67
     m_rce.newQp = qp;
68
 
69
+    if (m_param->bEnableTemporalFilter)
70
+    {
71
+        m_frameEncTF->m_QP = qp;
72
+        m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
73
+    }
74
+
75
     if (m_nr)
76
     {
77
         if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
78
@@ -744,7 +777,7 @@
79
             // wait after removal of the access unit with the most recent
80
             // buffering period SEI message
81
             sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
82
-            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
83
+            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics[m_frame->m_tempLayer] + poc - m_rce.encodeOrder;
84
         }
85
 
86
         sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
87
@@ -756,7 +789,14 @@
88
         m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
89
         m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
90
     }
91
-
92
+    /* Write Film grain characteristics if present */
93
+    if (this->m_top->m_filmGrainIn)
94
+    {
95
+        FilmGrainCharacteristics m_filmGrain;
96
+        /* Read the Film grain model file */
97
+        readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
98
+        m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
99
+    }
100
     /* Write user SEI */
101
     for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
102
     {
103
@@ -933,6 +973,23 @@
104
     if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
105
         collectDynDataFrame();
106
 
107
+    if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
108
+    {
109
+        //Reset the MCSTF context in Frame Encoder and Frame
110
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
111
+        {
112
+            memset(m_mcstfRefList[i].mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
113
+            memset(m_mcstfRefList[i].mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
114
+            memset(m_mcstfRefList[i].mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
115
+            memset(m_mcstfRefList[i].mvs,  0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
116
+            memset(m_mcstfRefList[i].noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
117
+            memset(m_mcstfRefList[i].error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
118
+
119
+            m_frame->m_mcstf->m_numRef = 0;
120
+        }
121
+    }
122
+
123
+
124
     if (m_param->rc.bStatWrite)
125
     {
126
         int totalI = 0, totalP = 0, totalSkip = 0;
127
@@ -1041,7 +1098,7 @@
128
             
129
             m_bs.writeByteAlignment();
130
 
131
-            m_nalList.serialize(slice->m_nalUnitType, m_bs);
132
+            m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
133
         }
134
     }
135
     else
136
@@ -1062,7 +1119,7 @@
137
             m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
138
         m_bs.writeByteAlignment();
139
 
140
-        m_nalList.serialize(slice->m_nalUnitType, m_bs);
141
+        m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
142
     }
143
 
144
     if (m_param->decodedPictureHashSEI)
145
@@ -2127,6 +2184,54 @@
146
         m_nr->nrOffsetDenoise[cat][0] = 0;
147
     }
148
 }
149
+
150
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
151
+{
152
+    char const* errorMessage = "Error reading FilmGrain characteristics\n";
153
+    FilmGrain m_fg;
154
+    x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
155
+    m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
156
+    m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
157
+    m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
158
+    m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
159
+    if (m_filmGrain->m_separateColourDescriptionPresentFlag)
160
+    {
161
+        ColourDescription m_clr;
162
+        x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
163
+        m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
164
+        m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
165
+        m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
166
+        m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
167
+        m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
168
+        m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
169
+    }
170
+    FGPresent m_present;
171
+    x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
172
+    m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
173
+    m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
174
+    m_filmGrain->m_compModel[0].bPresentFlag = m_present.m_presentFlag[0];
175
+    m_filmGrain->m_compModel[1].bPresentFlag = m_present.m_presentFlag[1];
176
+    m_filmGrain->m_compModel[2].bPresentFlag = m_present.m_presentFlag[2];
177
+    for (int i = 0; i < MAX_NUM_COMPONENT; i++)
178
+    {
179
+        if (m_filmGrain->m_compModel[i].bPresentFlag)
180
+        {
181
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
182
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
183
+            m_filmGrain->m_compModel[i].intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1+1)) ;
184
+            for (int j = 0; j <= m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1; j++)
185
+            {
186
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
187
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
188
+                m_filmGrain->m_compModel[i].intensityValues[j].compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModel[i].numModelValues));
189
+                for (int k = 0; k < m_filmGrain->m_compModel[i].numModelValues; k++)
190
+                {
191
+                    x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].compModelValue[k]), sizeof(int), 1, filmgrain, errorMessage);
192
+                }
193
+            }
194
+        }
195
+    }
196
+}
197
 #if ENABLE_LIBVMAF
198
 void FrameEncoder::vmafFrameLevelScore()
199
 {
200
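
The memset sizes in the MCSTF reset above follow from the hierarchical motion search driven from encoder.cpp: 16x16-block estimation over the 1/4- and 1/2-resolution planes and the full-resolution picture (mvs0/mvs1/mvs2), then an 8x8-block pass at doubled resolution (mvs, with per-block noise and error). A sketch of the worst-case element counts those buffers are cleared with, matching the expressions in the hunks (names are illustrative):

    #include <cstddef>

    // mvs0/mvs1/mvs2 are sized for one MV per 16x16 block of the
    // full-resolution picture, the largest grid any of the three passes uses.
    static size_t coarseMvCount(int width, int height)
    {
        return (size_t)(width / 16) * (size_t)(height / 16);
    }

    // mvs/noise/error are sized for one entry per 8x8 block at doubled
    // resolution: (2W / 8) * (2H / 8) = (W / 4) * (H / 4).
    static size_t fineMvCount(int width, int height)
    {
        return (size_t)(width / 4) * (size_t)(height / 4);
    }
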
x265_3.5.tar.gz/source/encoder/frameencoder.h -> x265_3.6.tar.gz/source/encoder/frameencoder.h Changed
63
 
1
@@ -40,6 +40,7 @@
2
 #include "ratecontrol.h"
3
 #include "reference.h"
4
 #include "nal.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 // private x265 namespace
9
@@ -113,6 +114,34 @@
10
     }
11
 };
12
 
13
+/*Film grain characteristics*/
14
+struct FilmGrain
15
+{
16
+    bool    m_filmGrainCharacteristicsCancelFlag;
17
+    bool    m_filmGrainCharacteristicsPersistenceFlag;
18
+    bool    m_separateColourDescriptionPresentFlag;
19
+    uint8_t m_filmGrainModelId;
20
+    uint8_t m_blendingModeId;
21
+    uint8_t m_log2ScaleFactor;
22
+};
23
+
24
+struct ColourDescription
25
+{
26
+    bool        m_filmGrainFullRangeFlag;
27
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
28
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
29
+    uint8_t     m_filmGrainColourPrimaries;
30
+    uint8_t     m_filmGrainTransferCharacteristics;
31
+    uint8_t     m_filmGrainMatrixCoeffs;
32
+};
33
+
34
+struct FGPresent
35
+{
36
+    uint8_t     m_blendingModeId;
37
+    uint8_t     m_log2ScaleFactor;
38
+    bool        m_presentFlag[3];
39
+};
40
+
41
 // Manages the wave-front processing of a single encoding frame
42
 class FrameEncoder : public WaveFront, public Thread
43
 {
44
@@ -205,6 +234,10 @@
45
     FrameFilter              m_frameFilter;
46
     NALList                  m_nalList;
47
 
48
+    // initialization for mcstf
49
+    TemporalFilter*          m_frameEncTF;
50
+    TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
51
+
52
     class WeightAnalysis : public BondedTaskGroup
53
     {
54
     public:
55
@@ -250,6 +283,7 @@
56
     void collectDynDataFrame();
57
     void computeAvgTrainingData();
58
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
59
+    void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
60
 };
61
 }
62
 
63
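
Taken together with readModel() in frameencoder.cpp above, these structs imply a simple packed layout for the --film-grain model file: a FilmGrain chunk (three bools plus the model id), an optional ColourDescription chunk, an FGPresent chunk, then per-component intensity intervals and model values. A sketch of a writer for the fixed-size header chunks, assuming field order as declared and no padding inside the chunks consumed by x265_fread; this layout is inferred from the diff, not a documented format:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical generator for the two fixed-size chunks readModel() reads
    // first. The optional ColourDescription chunk is omitted because
    // separateColourDescriptionPresentFlag is written as false here.
    static void writeFilmGrainHeader(FILE* f)
    {
        // FilmGrain chunk: 3 bools + 1 byte
        bool cancelFlag = false, persistenceFlag = true, separateColour = false;
        uint8_t modelId = 0;                          // 0 = frequency-filtering model
        fwrite(&cancelFlag, sizeof(bool), 1, f);
        fwrite(&persistenceFlag, sizeof(bool), 1, f);
        fwrite(&separateColour, sizeof(bool), 1, f);
        fwrite(&modelId, sizeof(uint8_t), 1, f);

        // FGPresent chunk: 2 bytes + 3 bools, in declaration order
        uint8_t blendingModeId = 0;                   // 0 = additive blending
        uint8_t log2ScaleFactor = 2;
        bool present[3] = { true, false, false };     // grain on luma only
        fwrite(&blendingModeId, sizeof(uint8_t), 1, f);
        fwrite(&log2ScaleFactor, sizeof(uint8_t), 1, f);
        fwrite(present, sizeof(bool), 3, f);
    }
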
x265_3.5.tar.gz/source/encoder/level.cpp -> x265_3.6.tar.gz/source/encoder/level.cpp Changed
86
 
1
@@ -72,7 +72,7 @@
2
      * for intra-only profiles (vps.ptl.intraConstraintFlag) */
3
     vps.ptl.lowerBitRateConstraintFlag = true;
4
 
5
-    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
6
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
7
     
8
     if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
9
     {
10
@@ -167,7 +167,7 @@
11
 
12
         /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than
13
          * or equal to MaxDpbSize */
14
-        if (vps.maxDecPicBuffering > maxDpbSize)
15
+        if (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize)
16
             continue;
17
 
18
         /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
19
@@ -182,8 +182,8 @@
20
         }
21
 
22
         /* The value of NumPocTotalCurr shall be less than or equal to 8 */
23
-        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
24
-        if (numPocTotalCurr > 8)
25
+        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics[vps.maxTempSubLayers - 1];
26
+        if (numPocTotalCurr > 10)
27
         {
28
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levels[i].name);
29
             vps.ptl.profileIdc = Profile::NONE;
30
@@ -289,9 +289,40 @@
31
  * circumstances it will be quite noisy */
32
 bool enforceLevel(x265_param& param, VPS& vps)
33
 {
34
-    vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
35
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
36
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
37
+    for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
38
+    {
39
+        vps.numReorderPics[i] = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
40
+        vps.maxDecPicBuffering[i] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[i] + 2, (uint32_t)param.maxNumReferences) + 1);
41
+    }
42
 
43
+    if (!!param.bEnableTemporalSubLayers)
44
+    {
45
+        for (int i = 0; i < MAX_T_LAYERS - 1; i++)
46
+        {
47
+            // a lower layer can not have higher value of numReorderPics than a higher layer
48
+            if (vps.numReorderPics[i + 1] < vps.numReorderPics[i])
49
+            {
50
+                vps.numReorderPics[i + 1] = vps.numReorderPics[i];
51
+            }
52
+            // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
53
+            if (vps.numReorderPics[i] > vps.maxDecPicBuffering[i] - 1)
54
+            {
55
+                vps.maxDecPicBuffering[i] = vps.numReorderPics[i] + 1;
56
+            }
57
+            // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
58
+            if (vps.maxDecPicBuffering[i + 1] < vps.maxDecPicBuffering[i])
59
+            {
60
+                vps.maxDecPicBuffering[i + 1] = vps.maxDecPicBuffering[i];
61
+            }
62
+        }
63
+
64
+        // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[ i ] - 1, inclusive
65
+        if (vps.numReorderPics[MAX_T_LAYERS - 1] > vps.maxDecPicBuffering[MAX_T_LAYERS - 1] - 1)
66
+        {
67
+            vps.maxDecPicBuffering[MAX_T_LAYERS - 1] = vps.numReorderPics[MAX_T_LAYERS - 1] + 1;
68
+        }
69
+    }
70
     /* no level specified by user, just auto-detect from the configuration */
71
     if (param.levelIdc <= 0)
72
         return true;
73
@@ -391,10 +422,10 @@
74
     }
75
 
76
     int savedRefCount = param.maxNumReferences;
77
-    while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
78
+    while (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize && param.maxNumReferences > 1)
79
     {
80
         param.maxNumReferences--;
81
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
82
+        vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[vps.maxTempSubLayers - 1] + 1, (uint32_t)param.maxNumReferences) + 1);
83
     }
84
     if (param.maxNumReferences != savedRefCount)
85
         x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
86
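
The enforceLevel() additions above encode three HEVC constraints across temporal sub-layers: num_reorder_pics must be non-decreasing in the layer index, max_dec_pic_buffering must be non-decreasing as well, and each layer needs maxDecPicBuffering[i] >= numReorderPics[i] + 1. The same normalization restated as a compact standalone pass (a sketch; the upstream loop interleaves the three fixes in a single walk over the layers):

    #include <algorithm>

    static void normalizeDpbParams(unsigned* numReorder, unsigned* maxDecPicBuf, int layers)
    {
        for (int i = 0; i + 1 < layers; i++)  // reorder count non-decreasing
            numReorder[i + 1] = std::max(numReorder[i + 1], numReorder[i]);
        for (int i = 0; i < layers; i++)      // DPB holds reordered frames plus the current one
            maxDecPicBuf[i] = std::max(maxDecPicBuf[i], numReorder[i] + 1);
        for (int i = 0; i + 1 < layers; i++)  // DPB size non-decreasing
            maxDecPicBuf[i + 1] = std::max(maxDecPicBuf[i + 1], maxDecPicBuf[i]);
    }
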
x265_3.5.tar.gz/source/encoder/motion.cpp -> x265_3.6.tar.gz/source/encoder/motion.cpp Changed
33
 
1
@@ -190,6 +190,31 @@
2
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
3
 }
4
 
5
+/* Called by lookahead, luma only, no use of PicYuv */
6
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
7
+{
8
+    partEnum = partitionFromSizes(pwidth, pheight);
9
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
10
+    sad = primitives.pu[partEnum].sad;
11
+    ads = primitives.pu[partEnum].ads;
12
+    satd = primitives.pu[partEnum].satd;
13
+    sad_x3 = primitives.pu[partEnum].sad_x3;
14
+    sad_x4 = primitives.pu[partEnum].sad_x4;
15
+
16
+
17
+    blockwidth = pwidth;
18
+    blockOffset = offset;
19
+    absPartIdx = ctuAddr = -1;
20
+
21
+    /* Search params */
22
+    searchMethod = method;
23
+    subpelRefine = refine;
24
+
25
+    /* copy PU block into cache */
26
+    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
27
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
28
+}
29
+
30
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
31
 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
32
 {
33
x265_3.5.tar.gz/source/encoder/motion.h -> x265_3.6.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -77,7 +77,7 @@
2
     void init(int csp);
3
 
4
     /* Methods called at slice setup */
5
-
6
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
7
     void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
8
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
9
 
10
x265_3.5.tar.gz/source/encoder/nal.cpp -> x265_3.6.tar.gz/source/encoder/nal.cpp Changed
19
 
1
@@ -57,7 +57,7 @@
2
     other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
 }
4
 
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
7
 {
8
     static const char startCodePrefix[] = { 0, 0, 0, 1 };
9
 
10
@@ -114,7 +114,7 @@
11
      * nuh_reserved_zero_6bits  6-bits
12
      * nuh_temporal_id_plus1    3-bits */
13
     out[bytes++] = (uint8_t)nalUnitType << 1;
14
-    out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
15
+    out[bytes++] = temporalID;
16
 
17
     /* 7.4.1 ...
18
      * Within the NAL unit, the following three-byte sequences shall not occur at
19
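
Note: the two bytes written here are the HEVC NAL unit header: forbidden_zero_bit(1), nal_unit_type(6), nuh_layer_id(6), nuh_temporal_id_plus1(3). A self-contained sketch of the layout, assuming nuh_layer_id == 0 as in a single-layer stream (writeNalHeader is illustrative, not an x265 function):

    #include <cstdint>

    void writeNalHeader(uint8_t out[2], uint8_t nalUnitType, uint8_t temporalIdPlus1)
    {
        out[0] = (uint8_t)(nalUnitType << 1); // forbidden bit and layer-id high bit are 0
        out[1] = temporalIdPlus1;             // layer-id low bits are 0, so only tid + 1 remains
    }

So serialize() now receives nuh_temporal_id_plus1 directly; the default of 1 in nal.h corresponds to temporal layer 0.
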
x265_3.5.tar.gz/source/encoder/nal.h -> x265_3.6.tar.gz/source/encoder/nal.h Changed
10
 
1
@@ -56,7 +56,7 @@
2
 
3
     void takeContents(NALList& other);
4
 
5
-    void serialize(NalUnitType nalUnitType, const Bitstream& bs);
6
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
7
 
8
     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
9
 };
10
x265_3.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.6.tar.gz/source/encoder/ratecontrol.cpp Changed
1457
 
1
@@ -41,6 +41,10 @@
2
 #define BR_SHIFT  6
3
 #define CPB_SHIFT 4
4
 
5
+#define SHARED_DATA_ALIGNMENT      4 ///< 4 byte, 32 bit
6
+#define CUTREE_SHARED_MEM_NAME     "cutree"
7
+#define GOP_CNT_CU_TREE            3
8
+
9
 using namespace X265_NS;
10
 
11
 /* Amortize the partial cost of I frames over the next N frames */
12
@@ -104,6 +108,37 @@
13
     return output;
14
 }
15
 
16
+typedef struct CUTreeSharedDataItem
17
+{
18
+    uint8_t  *type;
19
+    uint16_t *stats;
20
+}CUTreeSharedDataItem;
21
+
22
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
23
+{
24
+    CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
25
+    uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
26
+    *statsDst->type = *typeSrc;
27
+
28
+    ///< for memory alignment, the type will take 32bit in the shared memory
29
+    int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
30
+    uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
31
+    memcpy(statsDst->stats, statsSrc, size - offset);
32
+}
33
+
34
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
35
+{
36
+    CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
37
+    uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
38
+    *typeDst = *statsSrc->type;
39
+
40
+    ///< for memory alignment, the type will take 32bit in the shared memory
41
+    int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
42
+    uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
43
+    memcpy(statsDst, statsSrc->stats, size - offset);
44
+}
45
+
46
+
47
 inline double qScale2bits(RateControlEntry *rce, double qScale)
48
 {
49
     if (qScale < 0.1)
50
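
Note: the (x + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1) expression above is the usual power-of-two round-up; it pads the one-byte slice-type tag to a full 32-bit slot so the uint16_t stats that follow it in each shared item stay aligned. A standalone sketch:

    #include <cassert>
    #include <cstdint>

    // Smallest multiple of a (a must be a power of two) that is >= x.
    static inline int32_t alignUp(int32_t x, int32_t a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main()
    {
        assert(alignUp(1, 4) == 4); // the uint8_t type tag occupies a 4-byte slot
        assert(alignUp(4, 4) == 4);
        assert(alignUp(5, 4) == 8);
        return 0;
    }
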
@@ -209,6 +244,7 @@
51
     m_lastAbrResetPoc = -1;
52
     m_statFileOut = NULL;
53
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
54
+    m_cutreeShrMem = NULL;
55
     m_rce2Pass = NULL;
56
     m_encOrder = NULL;
57
     m_lastBsliceSatdCost = 0;
58
@@ -224,6 +260,8 @@
59
     m_initVbv = false;
60
     m_singleFrameVbv = 0;
61
     m_rateTolerance = 1.0;
62
+    m_encodedSegmentBits = 0;
63
+    m_segDur = 0;
64
 
65
     if (m_param->rc.vbvBufferSize)
66
     {
67
@@ -320,47 +358,86 @@
68
         m_cuTreeStats.qpBuffer[i] = NULL;
69
 }
70
 
71
-bool RateControl::init(const SPS& sps)
72
+bool RateControl::initCUTreeSharedMem()
73
 {
74
-    if (m_isVbv && !m_initVbv)
75
-    {
76
-        /* We don't support changing the ABR bitrate right now,
77
-         * so if the stream starts as CBR, keep it CBR. */
78
-        if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
79
+    if (!m_cutreeShrMem) {
80
+        m_cutreeShrMem = new RingMem();
81
+        if (!m_cutreeShrMem)
82
         {
83
-            m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
84
-            x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
85
-                     m_param->rc.vbvBufferSize);
86
+            return false;
87
         }
88
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
89
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
90
 
91
-        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
92
+        ///< now cutree data form at most 3 gops would be stored in the shared memory at the same time
93
+        int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
94
+        if (m_param->rc.qgSize == 8)
95
         {
96
-            const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
97
-            vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
98
-            vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
99
+            itemSize += sizeof(uint16_t) * m_ncu * 4;
100
         }
101
-        m_bufferRate = vbvMaxBitrate / m_fps;
102
-        m_vbvMaxRate = vbvMaxBitrate;
103
-        m_bufferSize = vbvBufferSize;
104
-        m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
105
+        else
106
+        {
107
+            itemSize += sizeof(uint16_t) * m_ncu;
108
+        }
109
+
110
+        int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
111
+        itemCnt *= GOP_CNT_CU_TREE;
112
 
113
-        if (m_param->rc.vbvBufferInit > 1.)
114
-            m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
115
-        if (m_param->vbvBufferEnd > 1.)
116
-            m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
117
-        if (m_param->vbvEndFrameAdjust > 1.)
118
-            m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
119
-        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
120
-        m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
121
-        m_bufferFillActual = m_bufferFillFinal;
122
-        m_bufferExcess = 0;
123
-        m_minBufferFill = m_param->minVbvFullness / 100;
124
-        m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
125
-        m_initVbv = true;
126
+        char shrname[MAX_SHR_NAME_LEN] = { 0 };
127
+        strcpy(shrname, m_param->rc.sharedMemName);
128
+        strcat(shrname, CUTREE_SHARED_MEM_NAME);
129
+
130
+        if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
131
+        {
132
+            return false;
133
+        }
134
     }
135
 
136
+    return true;
137
+}
138
+
139
+void RateControl::initVBV(const SPS& sps)
140
+{
141
+    /* We don't support changing the ABR bitrate right now,
142
+ * so if the stream starts as CBR, keep it CBR. */
143
+    if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
144
+    {
145
+        m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
146
+        x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
147
+            m_param->rc.vbvBufferSize);
148
+    }
149
+    int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
150
+    int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
151
+
152
+    if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
153
+    {
154
+        const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
155
+        vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
156
+        vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
157
+    }
158
+    m_bufferRate = vbvMaxBitrate / m_fps;
159
+    m_vbvMaxRate = vbvMaxBitrate;
160
+    m_bufferSize = vbvBufferSize;
161
+    m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
162
+
163
+    if (m_param->rc.vbvBufferInit > 1.)
164
+        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
165
+    if (m_param->vbvBufferEnd > 1.)
166
+        m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
167
+    if (m_param->vbvEndFrameAdjust > 1.)
168
+        m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
169
+    m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
170
+    m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
171
+    m_bufferFillActual = m_bufferFillFinal;
172
+    m_bufferExcess = 0;
173
+    m_minBufferFill = m_param->minVbvFullness / 100;
174
+    m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
175
+    m_initVbv = true;
176
+}
177
+
178
+bool RateControl::init(const SPS& sps)
179
+{
180
+    if (m_isVbv && !m_initVbv)
181
+        initVBV(sps);
182
+
183
     if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
184
     {
185
         m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
186
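
Note: initVBV() above normalizes rc.vbvBufferInit: values above 1 are treated as an absolute size and rescaled to a fraction of the buffer, and the result is floored at one frame's worth of bits. A condensed sketch of that normalization (standalone, not the x265 API):

    #include <algorithm>

    double normalizeVbvInit(double vbvBufferInit, double vbvBufferSize,
                            double bufferRate, double bufferSize)
    {
        if (vbvBufferInit > 1.0) // absolute size -> fraction of the buffer
            vbvBufferInit = std::min(std::max(vbvBufferInit / vbvBufferSize, 0.0), 1.0);
        // never start below one frame of bits (bufferRate / bufferSize)
        return std::min(std::max(vbvBufferInit, bufferRate / bufferSize), 1.0);
    }
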
@@ -373,7 +450,9 @@
187
 
188
     m_totalBits = 0;
189
     m_encodedBits = 0;
190
+    m_encodedSegmentBits = 0;
191
     m_framesDone = 0;
192
+    m_segDur = 0;
193
     m_residualCost = 0;
194
     m_partialResidualCost = 0;
195
     m_amortizeFraction = 0.85;
196
@@ -421,244 +500,257 @@
197
         /* Load stat file and init 2pass algo */
198
         if (m_param->rc.bStatRead)
199
         {
200
-            m_expectedBitsSum = 0;
201
-            char *p, *statsIn, *statsBuf;
202
-            /* read 1st pass stats */
203
-            statsIn = statsBuf = x265_slurp_file(fileName);
204
-            if (!statsBuf)
205
-                return false;
206
-            if (m_param->rc.cuTree)
207
+            if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
208
             {
209
-                char *tmpFile = strcatFilename(fileName, ".cutree");
210
-                if (!tmpFile)
211
+                m_expectedBitsSum = 0;
212
+                char *p, *statsIn, *statsBuf;
213
+                /* read 1st pass stats */
214
+                statsIn = statsBuf = x265_slurp_file(fileName);
215
+                if (!statsBuf)
216
                     return false;
217
-                m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
218
-                X265_FREE(tmpFile);
219
-                if (!m_cutreeStatFileIn)
220
+                if (m_param->rc.cuTree)
221
                 {
222
-                    x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
223
-                    return false;
224
+                    char *tmpFile = strcatFilename(fileName, ".cutree");
225
+                    if (!tmpFile)
226
+                        return false;
227
+                    m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
228
+                    X265_FREE(tmpFile);
229
+                    if (!m_cutreeStatFileIn)
230
+                    {
231
+                        x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
232
+                        return false;
233
+                    }
234
                 }
235
-            }
236
 
237
-            /* check whether 1st pass options were compatible with current options */
238
-            if (strncmp(statsBuf, "#options:", 9))
239
-            {
240
-                x265_log(m_param, X265_LOG_ERROR,"options list in stats file not valid\n");
241
-                return false;
242
-            }
243
-            {
244
-                int i, j, m;
245
-                uint32_t k , l;
246
-                bool bErr = false;
247
-                char *opts = statsBuf;
248
-                statsIn = strchr(statsBuf, '\n');
249
-                if (!statsIn)
250
-                {
251
-                    x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n");
252
-                    return false;
253
-                }
254
-                *statsIn = '\0';
255
-                statsIn++;
256
-                if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
257
-                {
258
-                    x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
259
-                    return false;
260
-                }
261
-                if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
262
-                {
263
-                    x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
264
-                    return false;
265
-                }
266
-                if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
267
-                {
268
-                    x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
269
-                    return false;
270
-                }
271
-                if (k != m_param->fpsNum || l != m_param->fpsDenom)
272
+                /* check whether 1st pass options were compatible with current options */
273
+                if (strncmp(statsBuf, "#options:", 9))
274
                 {
275
-                    x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
276
-                              m_param->fpsNum, m_param->fpsDenom, k, l);
277
+                    x265_log(m_param, X265_LOG_ERROR, "options list in stats file not valid\n");
278
                     return false;
279
                 }
280
-                if (m_param->analysisMultiPassRefine)
281
                 {
282
-                    p = strstr(opts, "ref=");
283
-                    sscanf(p, "ref=%d", &i);
284
-                    if (i > m_param->maxNumReferences)
285
+                    int i, j, m;
286
+                    uint32_t k, l;
287
+                    bool bErr = false;
288
+                    char *opts = statsBuf;
289
+                    statsIn = strchr(statsBuf, '\n');
290
+                    if (!statsIn)
291
                     {
292
-                        x265_log(m_param, X265_LOG_ERROR, "maxNumReferences cannot be less than 1st pass (%d vs %d)\n",
293
-                            i, m_param->maxNumReferences);
294
+                        x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n");
295
                         return false;
296
                     }
297
-                }
298
-                if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
299
-                {
300
-                    p = strstr(opts, "ctu=");
301
-                    sscanf(p, "ctu=%u", &k);
302
-                    if (k != m_param->maxCUSize)
303
+                    *statsIn = '\0';
304
+                    statsIn++;
305
+                    if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
306
                     {
307
-                        x265_log(m_param, X265_LOG_ERROR, "maxCUSize mismatch with 1st pass (%u vs %u)\n",
308
-                            k, m_param->maxCUSize);
309
+                        x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
310
                         return false;
311
                     }
312
+                    if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
313
+                    {
314
+                        x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
315
+                        return false;
316
+                    }
317
+                    if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
318
+                    {
319
+                        x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
320
+                        return false;
321
+                    }
322
+                    if (k != m_param->fpsNum || l != m_param->fpsDenom)
323
+                    {
324
+                        x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
325
+                            m_param->fpsNum, m_param->fpsDenom, k, l);
326
+                        return false;
327
+                    }
328
+                    if (m_param->analysisMultiPassRefine)
329
+                    {
330
+                        p = strstr(opts, "ref=");
331
+                        sscanf(p, "ref=%d", &i);
332
+                        if (i > m_param->maxNumReferences)
333
+                        {
334
+                            x265_log(m_param, X265_LOG_ERROR, "maxNumReferences cannot be less than 1st pass (%d vs %d)\n",
335
+                                i, m_param->maxNumReferences);
336
+                            return false;
337
+                        }
338
+                    }
339
+                    if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
340
+                    {
341
+                        p = strstr(opts, "ctu=");
342
+                        sscanf(p, "ctu=%u", &k);
343
+                        if (k != m_param->maxCUSize)
344
+                        {
345
+                            x265_log(m_param, X265_LOG_ERROR, "maxCUSize mismatch with 1st pass (%u vs %u)\n",
346
+                                k, m_param->maxCUSize);
347
+                            return false;
348
+                        }
349
+                    }
350
+                    CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth);
351
+                    CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred);
352
+                    CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
353
+                    CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
354
+                    CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
355
+                    CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
356
+                    CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
357
+                    CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
358
+                    CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
359
+                    if (m_param->bMultiPassOptRPS)
360
+                    {
361
+                        CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
362
+                        CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
363
+                        CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
364
+                    }
365
+
366
+                    if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
367
+                    {
368
+                        m_param->bFrameAdaptive = i;
369
+                    }
370
+                    else if (m_param->bframes)
371
+                    {
372
+                        x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n");
373
+                        return false;
374
+                    }
375
+
376
+                    if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i))
377
+                        m_param->lookaheadDepth = i;
378
                 }
379
-                CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth);
380
-                CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred);
381
-                CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
382
-                CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
383
-                CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
384
-                CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
385
-                CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
386
-                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
387
-                CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
388
-                if (m_param->bMultiPassOptRPS)
389
+                /* find number of pics */
390
+                p = statsIn;
391
+                int numEntries;
392
+                for (numEntries = -1; p; numEntries++)
393
+                    p = strchr(p + 1, ';');
394
+                if (!numEntries)
395
                 {
396
-                    CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
397
-                    CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
398
-                    CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
399
+                    x265_log(m_param, X265_LOG_ERROR, "empty stats file\n");
400
+                    return false;
401
                 }
402
+                m_numEntries = numEntries;
403
 
404
-                if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
405
+                if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0)
406
                 {
407
-                    m_param->bFrameAdaptive = i;
408
+                    x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
409
+                        m_param->totalFrames, m_numEntries);
410
                 }
411
-                else if (m_param->bframes)
412
+                if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
413
                 {
414
-                    x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n");
415
+                    x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
416
+                        m_param->totalFrames, m_numEntries);
417
                     return false;
418
                 }
419
 
420
-                if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i))
421
-                    m_param->lookaheadDepth = i;
422
-            }
423
-            /* find number of pics */
424
-            p = statsIn;
425
-            int numEntries;
426
-            for (numEntries = -1; p; numEntries++)
427
-                p = strchr(p + 1, ';');
428
-            if (!numEntries)
429
-            {
430
-                x265_log(m_param, X265_LOG_ERROR, "empty stats file\n");
431
-                return false;
432
-            }
433
-            m_numEntries = numEntries;
434
-
435
-            if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0)
436
-            {
437
-                x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
438
-                         m_param->totalFrames, m_numEntries);
439
-            }
440
-            if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
441
-            {
442
-                x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
443
-                         m_param->totalFrames, m_numEntries);
444
-                return false;
445
-            }
446
-
447
-            m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries);
448
-            if (!m_rce2Pass)
449
-            {
450
-                 x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
451
-                 return false;
452
-            }
453
-            m_encOrder = X265_MALLOC(int, m_numEntries);
454
-            if (!m_encOrder)
455
-            {
456
-                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
457
-                return false;
458
-            }
459
-            /* init all to skipped p frames */
460
-            for (int i = 0; i < m_numEntries; i++)
461
-            {
462
-                RateControlEntry *rce = &m_rce2Pass[i];
463
-                rce->sliceType = P_SLICE;
464
-                rce->qScale = rce->newQScale = x265_qp2qScale(20);
465
-                rce->miscBits = m_ncu + 10;
466
-                rce->newQp = 0;
467
-            }
468
-            /* read stats */
469
-            p = statsIn;
470
-            double totalQpAq = 0;
471
-            for (int i = 0; i < m_numEntries; i++)
472
-            {
473
-                RateControlEntry *rce, *rcePocOrder;
474
-                int frameNumber;
475
-                int encodeOrder;
476
-                char picType;
477
-                int e;
478
-                char *next;
479
-                double qpRc, qpAq, qNoVbv, qRceq;
480
-                next = strstr(p, ";");
481
-                if (next)
482
-                    *next++ = 0;
483
-                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
484
-                if (frameNumber < 0 || frameNumber >= m_numEntries)
485
+                m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries);
486
+                if (!m_rce2Pass)
487
                 {
488
-                    x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
489
+                    x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
490
                     return false;
491
                 }
492
-                rce = &m_rce2Pass[encodeOrder];
493
-                rcePocOrder = &m_rce2Pass[frameNumber];
494
-                m_encOrder[frameNumber] = encodeOrder;
495
-                if (!m_param->bMultiPassOptRPS)
496
-                {
497
-                    int scenecut = 0;
498
-                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf sc:%d",
499
-                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
500
-                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
501
-                        &rce->skipCuCount, &scenecut);
502
-                    rcePocOrder->scenecut = scenecut != 0;
503
+                m_encOrder = X265_MALLOC(int, m_numEntries);
504
+                if (!m_encOrder)
505
+                {
506
+                    x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
507
+                    return false;
508
                 }
509
-                else
510
+                /* init all to skipped p frames */
511
+                for (int i = 0; i < m_numEntries; i++)
512
                 {
513
-                    char deltaPOC[128];
514
-                    char bUsed[40];
515
-                    memset(deltaPOC, 0, sizeof(deltaPOC));
516
-                    memset(bUsed, 0, sizeof(bUsed));
517
-                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
518
-                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
519
-                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
520
-                        &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
521
-                    splitdeltaPOC(deltaPOC, rce);
522
-                    splitbUsed(bUsed, rce);
523
-                    rce->rpsIdx = -1;
524
-                }
525
-                rce->keptAsRef = true;
526
-                rce->isIdr = false;
527
-                if (picType == 'b' || picType == 'p')
528
-                    rce->keptAsRef = false;
529
-                if (picType == 'I')
530
-                    rce->isIdr = true;
531
-                if (picType == 'I' || picType == 'i')
532
-                    rce->sliceType = I_SLICE;
533
-                else if (picType == 'P' || picType == 'p')
534
+                    RateControlEntry *rce = &m_rce2Pass[i];
535
                     rce->sliceType = P_SLICE;
536
-                else if (picType == 'B' || picType == 'b')
537
-                    rce->sliceType = B_SLICE;
538
-                else
539
-                    e = -1;
540
-                if (e < 10)
541
+                    rce->qScale = rce->newQScale = x265_qp2qScale(20);
542
+                    rce->miscBits = m_ncu + 10;
543
+                    rce->newQp = 0;
544
+                }
545
+                /* read stats */
546
+                p = statsIn;
547
+                double totalQpAq = 0;
548
+                for (int i = 0; i < m_numEntries; i++)
549
+                {
550
+                    RateControlEntry *rce, *rcePocOrder;
551
+                    int frameNumber;
552
+                    int encodeOrder;
553
+                    char picType;
554
+                    int e;
555
+                    char *next;
556
+                    double qpRc, qpAq, qNoVbv, qRceq;
557
+                    next = strstr(p, ";");
558
+                    if (next)
559
+                        *next++ = 0;
560
+                    e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
561
+                    if (frameNumber < 0 || frameNumber >= m_numEntries)
562
+                    {
563
+                        x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
564
+                        return false;
565
+                    }
566
+                    rce = &m_rce2Pass[encodeOrder];
567
+                    rcePocOrder = &m_rce2Pass[frameNumber];
568
+                    m_encOrder[frameNumber] = encodeOrder;
569
+                    if (!m_param->bMultiPassOptRPS)
570
+                    {
571
+                        int scenecut = 0;
572
+                        e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf sc:%d",
573
+                            &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
574
+                            &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
575
+                            &rce->skipCuCount, &scenecut);
576
+                        rcePocOrder->scenecut = scenecut != 0;
577
+                    }
578
+                    else
579
+                    {
580
+                        char deltaPOC[128];
581
+                        char bUsed[40];
582
+                        memset(deltaPOC, 0, sizeof(deltaPOC));
583
+                        memset(bUsed, 0, sizeof(bUsed));
584
+                        e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
585
+                            &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
586
+                            &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
587
+                            &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
588
+                        splitdeltaPOC(deltaPOC, rce);
589
+                        splitbUsed(bUsed, rce);
590
+                        rce->rpsIdx = -1;
591
+                    }
592
+                    rce->keptAsRef = true;
593
+                    rce->isIdr = false;
594
+                    if (picType == 'b' || picType == 'p')
595
+                        rce->keptAsRef = false;
596
+                    if (picType == 'I')
597
+                        rce->isIdr = true;
598
+                    if (picType == 'I' || picType == 'i')
599
+                        rce->sliceType = I_SLICE;
600
+                    else if (picType == 'P' || picType == 'p')
601
+                        rce->sliceType = P_SLICE;
602
+                    else if (picType == 'B' || picType == 'b')
603
+                        rce->sliceType = B_SLICE;
604
+                    else
605
+                        e = -1;
606
+                    if (e < 10)
607
+                    {
608
+                        x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
609
+                        return false;
610
+                    }
611
+                    rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
612
+                    totalQpAq += qpAq;
613
+                    rce->qpNoVbv = qNoVbv;
614
+                    rce->qpaRc = qpRc;
615
+                    rce->qpAq = qpAq;
616
+                    rce->qRceq = qRceq;
617
+                    p = next;
618
+                }
619
+                X265_FREE(statsBuf);
620
+                if (m_param->rc.rateControlMode != X265_RC_CQP)
621
+                {
622
+                    m_start = 0;
623
+                    m_isQpModified = true;
624
+                    if (!initPass2())
625
+                        return false;
626
+                } /* else we're using constant quant, so no need to run the bitrate allocation */
627
+            }
628
+            else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
629
+            {
630
+                if (m_param->rc.cuTree)
631
                 {
632
-                    x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
633
-                    return false;
634
+                    if (!initCUTreeSharedMem())
635
+                    {
636
+                        return false;
637
+                    }
638
                 }
639
-                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
640
-                totalQpAq += qpAq;
641
-                rce->qpNoVbv = qNoVbv;
642
-                rce->qpaRc = qpRc;
643
-                rce->qpAq = qpAq;
644
-                rce->qRceq = qRceq;
645
-                p = next;
646
-            }
647
-            X265_FREE(statsBuf);
648
-            if (m_param->rc.rateControlMode != X265_RC_CQP)
649
-            {
650
-                m_start = 0;
651
-                m_isQpModified = true;
652
-                if (!initPass2())
653
-                    return false;
654
-            } /* else we're using constant quant, so no need to run the bitrate allocation */
655
+            }
656
         }
657
         /* Open output file */
658
         /* If input and output files are the same, output to a temp file
659
@@ -682,19 +774,29 @@
660
             X265_FREE(p);
661
             if (m_param->rc.cuTree && !m_param->rc.bStatRead)
662
             {
663
-                statFileTmpname = strcatFilename(fileName, ".cutree.temp");
664
-                if (!statFileTmpname)
665
-                    return false;
666
-                m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
667
-                X265_FREE(statFileTmpname);
668
-                if (!m_cutreeStatFileOut)
669
+                if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
670
                 {
671
-                    x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
672
-                    return false;
673
+                    statFileTmpname = strcatFilename(fileName, ".cutree.temp");
674
+                    if (!statFileTmpname)
675
+                        return false;
676
+                    m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
677
+                    X265_FREE(statFileTmpname);
678
+                    if (!m_cutreeStatFileOut)
679
+                    {
680
+                        x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
681
+                        return false;
682
+                    }
683
+                }
684
+                else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
685
+                {
686
+                    if (!initCUTreeSharedMem())
687
+                    {
688
+                        return false;
689
+                    }
690
                 }
691
             }
692
         }
693
-        if (m_param->rc.cuTree)
694
+        if (m_param->rc.cuTree && !m_cuTreeStats.qpBuffer[0])
695
         {
696
             if (m_param->rc.qgSize == 8)
697
             {
698
@@ -714,6 +816,10 @@
699
     return true;
700
 }
701
 
702
+void RateControl::skipCUTreeSharedMemRead(int32_t cnt)
703
+{
704
+    m_cutreeShrMem->skipRead(cnt);
705
+}
706
 void RateControl::reconfigureRC()
707
 {
708
     if (m_isVbv)
709
@@ -806,7 +912,7 @@
710
 
711
     TimingInfo *time = &sps.vuiParameters.timingInfo;
712
     int maxCpbOutputDelay = (int)(X265_MIN(m_param->keyframeMax * MAX_DURATION * time->timeScale / time->numUnitsInTick, INT_MAX));
713
-    int maxDpbOutputDelay = (int)(sps.maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick);
714
+    int maxDpbOutputDelay = (int)(sps.maxDecPicBuffering[sps.maxTempSubLayers - 1] * MAX_DURATION * time->timeScale / time->numUnitsInTick);
715
     int maxDelay = (int)(90000.0 * cpbSizeUnscale / bitRateUnscale + 0.5);
716
 
717
     hrd->initialCpbRemovalDelayLength = 2 + x265_clip3(4, 22, 32 - calcLength(maxDelay));
718
@@ -1000,125 +1106,103 @@
719
 {
720
     uint64_t allConstBits = 0, allCodedBits = 0;
721
     uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
722
-    int startIndex, framesCount, endIndex;
723
+    int startIndex, endIndex;
724
     int fps = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
725
-    startIndex = endIndex = framesCount = 0;
726
-    int diffQp = 0;
727
+    int distance = fps << 1;
728
+    distance = distance > m_param->keyframeMax ? (m_param->keyframeMax << 1) : m_param->keyframeMax;
729
+    startIndex = endIndex = 0;
730
     double targetBits = 0;
731
     double expectedBits = 0;
732
-    for (startIndex = m_start, endIndex = m_start; endIndex < m_numEntries; endIndex++)
733
+    double targetBits2 = 0;
734
+    double expectedBits2 = 0;
735
+    double cpxSum = 0;
736
+    double cpxSum2 = 0;
737
+
738
+    if (m_param->rc.rateControlMode == X265_RC_ABR)
739
     {
740
-        allConstBits += m_rce2Pass[endIndex].miscBits;
741
-        allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
742
-        if (m_param->rc.rateControlMode == X265_RC_CRF)
743
+        for (endIndex = m_start; endIndex < m_numEntries; endIndex++)
744
         {
745
-            framesCount = endIndex - startIndex + 1;
746
-            diffQp += int (m_rce2Pass[endIndex].qpaRc - m_rce2Pass[endIndex].qpNoVbv);
747
-            if (framesCount > fps)
748
-                diffQp -= int (m_rce2Pass[endIndex - fps].qpaRc - m_rce2Pass[endIndex - fps].qpNoVbv);
749
-            if (framesCount >= fps)
750
-            {
751
-                if (diffQp >= 1)
752
-                {
753
-                    if (!m_isQpModified && endIndex > fps)
754
-                    {
755
-                        double factor = 2;
756
-                        double step = 0;
757
-                        if (endIndex + fps >= m_numEntries)
758
-                        {
759
-                            m_start = endIndex - (endIndex % fps);
760
-                            return true;
761
-                        }
762
-                        for (int start = endIndex + 1; start <= endIndex + fps && start < m_numEntries; start++)
763
-                        {
764
-                            RateControlEntry *rce = &m_rce2Pass[start];
765
-                            targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
766
-                            expectedBits += qScale2bits(rce, rce->qScale);
767
-                        }
768
-                        if (expectedBits < 0.95 * targetBits)
769
-                        {
770
-                            m_isQpModified = true;
771
-                            m_isGopReEncoded = true;
772
-                            while (endIndex + fps < m_numEntries)
773
-                            {
774
-                                step = pow(2, factor / 6.0);
775
-                                expectedBits = 0;
776
-                                for (int start = endIndex + 1; start <= endIndex + fps; start++)
777
-                                {
778
-                                    RateControlEntry *rce = &m_rce2Pass[start];
779
-                                    rce->newQScale = rce->qScale / step;
780
-                                    X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
781
-                                    expectedBits += qScale2bits(rce, rce->newQScale);
782
-                                    rce->newQp = x265_qScale2qp(rce->newQScale);
783
-                                }
784
-                                if (expectedBits >= targetBits && step > 1)
785
-                                    factor *= 0.90;
786
-                                else
787
-                                    break;
788
-                            }
789
-
790
-                            if (m_isVbv && endIndex + fps < m_numEntries)
791
-                                if (!vbv2Pass((uint64_t)targetBits, endIndex + fps, endIndex + 1))
792
-                                    return false;
793
-
794
-                            targetBits = 0;
795
-                            expectedBits = 0;
796
-
797
-                            for (int start = endIndex - fps + 1; start <= endIndex; start++)
798
-                            {
799
-                                RateControlEntry *rce = &m_rce2Pass[start];
800
-                                targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
801
-                            }
802
-                            while (1)
803
-                            {
804
-                                step = pow(2, factor / 6.0);
805
-                                expectedBits = 0;
806
-                                for (int start = endIndex - fps + 1; start <= endIndex; start++)
807
-                                {
808
-                                    RateControlEntry *rce = &m_rce2Pass[start];
809
-                                    rce->newQScale = rce->qScale * step;
810
-                                    X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
811
-                                    expectedBits += qScale2bits(rce, rce->newQScale);
812
-                                    rce->newQp = x265_qScale2qp(rce->newQScale);
813
-                                }
814
-                                if (expectedBits > targetBits && step > 1)
815
-                                    factor *= 1.1;
816
-                                else
817
-                                     break;
818
-                            }
819
-                            if (m_isVbv)
820
-                                if (!vbv2Pass((uint64_t)targetBits, endIndex, endIndex - fps + 1))
821
-                                    return false;
822
-                            diffQp = 0;
823
-                            m_reencode = endIndex - fps + 1;
824
-                            endIndex = endIndex + fps;
825
-                            startIndex = endIndex + 1;
826
-                            m_start = startIndex;
827
-                            targetBits = expectedBits = 0;
828
-                        }
829
-                        else
830
-                            targetBits = expectedBits = 0;
831
-                    }
832
-                }
833
-                else
834
-                    m_isQpModified = false;
835
-            }
836
+            allConstBits += m_rce2Pass[endIndex].miscBits;
837
+            allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
838
         }
839
-    }
840
 
841
-    if (m_param->rc.rateControlMode == X265_RC_ABR)
842
-    {
843
         if (allAvailableBits < allConstBits)
844
         {
845
             x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
846
-                     (int)(allConstBits * m_fps / framesCount * 1000.));
847
+                (int)(allConstBits * m_fps / (m_numEntries - m_start) * 1000.));
848
             return false;
849
         }
850
         if (!analyseABR2Pass(allAvailableBits))
851
             return false;
852
+
853
+        return true;
854
+    }
855
+
856
+    if (m_isQpModified)
857
+    {
858
+        return true;
859
+    }
860
+
861
+    if (m_start + (fps << 1) > m_numEntries)
862
+    {
863
+        return true;
864
+    }
865
+
866
+    for (startIndex = m_start, endIndex = m_numEntries - 1; startIndex < endIndex; startIndex++, endIndex--)
867
+    {
868
+        cpxSum += m_rce2Pass[startIndex].qScale / m_rce2Pass[startIndex].coeffBits;
869
+        cpxSum2 += m_rce2Pass[endIndex].qScale / m_rce2Pass[endIndex].coeffBits;
870
+
871
+        RateControlEntry *rce = &m_rce2Pass[startIndex];
872
+        targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
873
+        expectedBits += qScale2bits(rce, rce->qScale);
874
+
875
+        rce = &m_rce2Pass[endIndex];
876
+        targetBits2 += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
877
+        expectedBits2 += qScale2bits(rce, rce->qScale);
878
     }
879
 
880
-    m_start = X265_MAX(m_start, endIndex - fps);
881
+    if (expectedBits < 0.95 * targetBits || expectedBits2 < 0.95 * targetBits2)
882
+    {
883
+        if (cpxSum / cpxSum2 < 0.95 || cpxSum2 / cpxSum < 0.95)
884
+        {
885
+            m_isQpModified = true;
886
+            m_isGopReEncoded = true;
887
+
888
+            m_shortTermCplxSum = 0;
889
+            m_shortTermCplxCount = 0;
890
+            m_framesDone = m_start;
891
+
892
+            for (startIndex = m_start; startIndex < m_numEntries; startIndex++)
893
+            {
894
+                m_shortTermCplxSum *= 0.5;
895
+                m_shortTermCplxCount *= 0.5;
896
+                m_shortTermCplxSum += m_rce2Pass[startIndex].currentSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION);
897
+                m_shortTermCplxCount++;
898
+            }
899
+
900
+            m_bufferFill = m_rce2Pass[m_start - 1].bufferFill;
901
+            m_bufferFillFinal = m_rce2Pass[m_start - 1].bufferFillFinal;
902
+            m_bufferFillActual = m_rce2Pass[m_start - 1].bufferFillActual;
903
+
904
+            m_reencode = m_start;
905
+            m_start = m_numEntries;
906
+        }
907
+        else
908
+        {
909
+
910
+            m_isQpModified = false;
911
+            m_isGopReEncoded = false;
912
+        }
913
+    }
914
+    else
915
+    {
916
+
917
+        m_isQpModified = false;
918
+        m_isGopReEncoded = false;
919
+    }
920
+
921
+    m_start = X265_MAX(m_start, m_numEntries - distance + m_param->keyframeMax);
922
 
923
     return true;
924
 }
925
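
Note: the rewritten CRF pass-2 analysis above walks the stats array from both ends at once, accumulating qScale per coefficient bit as a complexity proxy, and only triggers a GOP re-encode when the two halves disagree by more than 5%. The trigger condition reduced to a sketch (illustrative helper, not the x265 API):

    // cpxSum / cpxSum2 are the front-half and back-half complexity sums.
    bool complexityDiverges(double cpxSum, double cpxSum2)
    {
        return cpxSum / cpxSum2 < 0.95 || cpxSum2 / cpxSum < 0.95;
    }
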
@@ -1271,6 +1355,16 @@
926
     m_predType = getPredictorType(curFrame->m_lowres.sliceType, m_sliceType);
927
     rce->poc = m_curSlice->m_poc;
928
 
929
+    if (m_param->bEnableSBRC)
930
+    {
931
+        if (rce->poc == 0 || (m_framesDone % m_param->keyframeMax == 0))
932
+        {
933
+            //Reset SBRC buffer
934
+            m_encodedSegmentBits = 0;
935
+            m_segDur = 0;
936
+        }
937
+    }
938
+
939
     if (!m_param->bResetZoneConfig && (rce->encodeOrder % m_param->reconfigWindowSize == 0))
940
     {
941
         int index = m_zoneBufferIdx % m_param->rc.zonefileCount;
942
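
Note: under SBRC a segment spans keyframeMax frames, so the per-segment bit and duration counters are reset on the first frame and at every segment boundary, as the hunk above shows. The boundary test in isolation (illustrative helper):

    bool isSegmentStart(int poc, int framesDone, int keyframeMax)
    {
        return poc == 0 || (framesDone % keyframeMax == 0);
    }
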
@@ -1304,7 +1398,8 @@
943
             {
944
                 m_param = m_param->rc.zones[i].zoneParam;
945
                 reconfigureRC();
946
-                init(*m_curSlice->m_sps);
947
+                if (!m_param->bNoResetZoneConfig)
948
+                    init(*m_curSlice->m_sps);
949
             }
950
         }
951
     }
952
@@ -1391,15 +1486,57 @@
953
             rce->frameSizeMaximum *= m_param->maxAUSizeFactor;
954
         }
955
     }
956
+
957
+    ///< regenerate the qp
958
     if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
959
     {
960
-        rce->qpPrev = x265_qScale2qp(rce->qScale);
961
-        rce->qScale = rce->newQScale;
962
-        rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
963
-        m_qp = int(rce->qpaRc + 0.5);
964
-        rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
965
-        m_framesDone++;
966
-        return m_qp;
967
+        if (!m_param->rc.bEncFocusedFramesOnly)
968
+        {
969
+            rce->qpPrev = x265_qScale2qp(rce->qScale);
970
+            if (m_param->bEnableSceneCutAwareQp)
971
+            {
972
+                double lqmin = m_lmin[m_sliceType];
973
+                double lqmax = m_lmax[m_sliceType];
974
+                if (m_param->bEnableSceneCutAwareQp & FORWARD)
975
+                    rce->newQScale = forwardMasking(curFrame, rce->newQScale);
976
+                if (m_param->bEnableSceneCutAwareQp & BACKWARD)
977
+                    rce->newQScale = backwardMasking(curFrame, rce->newQScale);
978
+                rce->newQScale = x265_clip3(lqmin, lqmax, rce->newQScale);
979
+            }
980
+            rce->qScale = rce->newQScale;
981
+            rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
982
+            m_qp = int(rce->qpaRc + 0.5);
983
+            rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
984
+            m_framesDone++;
985
+            return m_qp;
986
+        }
987
+        else
988
+        { 
989
+            int index = m_encOrder[rce->poc];
990
+            index++;
991
+            double totalDuration = m_frameDuration;
992
+            for (int j = 0; totalDuration < 1.0 && index < m_numEntries; j++)
993
+            {
994
+                switch (m_rce2Pass[index].sliceType)
995
+                {
996
+                case B_SLICE:
997
+                    curFrame->m_lowres.plannedType[j] = m_rce2Pass[index].keptAsRef ? X265_TYPE_BREF : X265_TYPE_B;
998
+                    break;
999
+                case P_SLICE:
1000
+                    curFrame->m_lowres.plannedType[j] = X265_TYPE_P;
1001
+                    break;
1002
+                case I_SLICE:
1003
+                    curFrame->m_lowres.plannedType[j] = m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
1004
+                    break;
1005
+                default:
1006
+                    break;
1007
+                }
1008
+
1009
+                curFrame->m_lowres.plannedSatd[j] = m_rce2Pass[index].currentSatd;
1010
+                totalDuration += m_frameDuration;
1011
+                index++;
1012
+            }
1013
+        }
1014
     }
1015
 
1016
     if (m_isAbr || m_2pass) // ABR,CRF
1017
@@ -1655,10 +1792,25 @@
1018
             {
1019
                 m_cuTreeStats.qpBufPos++;
1020
 
1021
-                if (!fread(&type, 1, 1, m_cutreeStatFileIn))
1022
-                    goto fail;
1023
-                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
1024
-                    goto fail;
1025
+                if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
1026
+                {
1027
+                    if (!fread(&type, 1, 1, m_cutreeStatFileIn))
1028
+                        goto fail;
1029
+                    if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
1030
+                        goto fail;
1031
+                }
1032
+                else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
1033
+                {
1034
+                    if (!m_cutreeShrMem)
1035
+                    {
1036
+                        goto fail;
1037
+                    }
1038
+
1039
+                    CUTreeSharedDataItem shrItem;
1040
+                    shrItem.type = &type;
1041
+                    shrItem.stats = m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos];
1042
+                    m_cutreeShrMem->readNext(&shrItem, ReadSharedCUTreeData);
1043
+                }
1044
 
1045
                 if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1)
1046
                 {
1047
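
Note: readNext() hands the ring a destination struct plus the ReadSharedCUTreeData callback defined earlier, which splits each shared item back into its type tag and stats array. A toy model of that callback protocol (the real RingMem also handles wrap-around and producer/consumer synchronization):

    #include <cstdint>

    typedef void (*ItemCopyFn)(void *dst, void *src, int32_t size);

    struct ToyRing
    {
        uint8_t *items;    // itemCnt * itemSize bytes of shared storage
        int32_t  itemSize;
        int32_t  readPos;

        void readNext(void *dst, ItemCopyFn copy)
        {
            copy(dst, items + readPos * itemSize, itemSize); // copy-out via callback
            readPos++; // wrap-around and blocking omitted in this sketch
        }
    };
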
@@ -1785,7 +1937,7 @@
1048
         m_sliderPos++;
1049
     }
1050
 
1051
-    if (m_sliceType == B_SLICE)
1052
+    if((!m_param->bEnableSBRC && m_sliceType == B_SLICE) || (m_param->bEnableSBRC && !IS_REFERENCED(curFrame)))
1053
     {
1054
         /* B-frames don't have independent rate control, but rather get the
1055
          * average QP of the two adjacent P-frames + an offset */
1056
@@ -1836,8 +1988,16 @@
1057
             double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN); 
1058
             m_lastQScaleFor[P_SLICE] = X265_MAX(minScenecutQscale, m_lastQScaleFor[P_SLICE]);
1059
         }
1060
+
1061
         double qScale = x265_qp2qScale(q);
1062
         rce->qpNoVbv = q;
1063
+
1064
+        if (m_param->bEnableSBRC)
1065
+        {
1066
+            qScale = tuneQscaleForSBRC(curFrame, qScale);
1067
+            rce->qpNoVbv = x265_qScale2qp(qScale);
1068
+        }
1069
+
1070
         double lmin = 0, lmax = 0;
1071
         if (m_isGrainEnabled && m_isFirstMiniGop)
1072
         {
1073
@@ -1890,7 +2050,7 @@
1074
                 qScale = x265_clip3(lqmin, lqmax, qScale);
1075
             }
1076
 
1077
-            if (!m_2pass || m_param->bliveVBV2pass)
1078
+            if (!m_2pass || m_param->bliveVBV2pass || (m_2pass && m_param->rc.rateControlMode == X265_RC_CRF && m_param->rc.bEncFocusedFramesOnly))
1079
             {
1080
                 /* clip qp to permissible range after vbv-lookahead estimation to avoid possible 
1081
                  * mispredictions by initial frame size predictors */
1082
@@ -1927,7 +2087,7 @@
1083
     else
1084
     {
1085
         double abrBuffer = 2 * m_rateTolerance * m_bitrate;
1086
-        if (m_2pass)
1087
+        if (m_2pass && (m_param->rc.rateControlMode != X265_RC_CRF || !m_param->rc.bEncFocusedFramesOnly))
1088
         {
1089
             double lmin = m_lmin[m_sliceType];
1090
             double lmax = m_lmax[m_sliceType];
1091
@@ -2057,6 +2217,19 @@
1092
 
1093
             if (m_param->rc.rateControlMode == X265_RC_CRF)
1094
             {
1095
+                if (m_param->bEnableSBRC)
1096
+                {
1097
+                    double rfConstant = m_param->rc.rfConstant;
1098
+                    if (m_currentSatd < rce->movingAvgSum)
1099
+                        rfConstant += 2;
1100
+                    double ipOffset = (curFrame->m_lowres.bScenecut ? m_ipOffset : m_ipOffset / 2.0);
1101
+                    rfConstant = (rce->sliceType == I_SLICE ? rfConstant - ipOffset :
1102
+                        (rce->sliceType == B_SLICE ? rfConstant + m_pbOffset : rfConstant));
1103
+                    double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0;
1104
+                    double qComp = (m_param->rc.cuTree && !m_param->rc.hevcAq) ? 0.99 : m_param->rc.qCompress;
1105
+                    m_rateFactorConstant = pow(m_currentSatd, 1.0 - qComp) /
1106
+                        x265_qp2qScale(rfConstant + mbtree_offset);
1107
+                }
1108
                 q = getQScale(rce, m_rateFactorConstant);
1109
                 x265_zone* zone = getZone();
1110
                 if (zone)
1111
@@ -2082,7 +2255,7 @@
1112
                 }
1113
                 double tunedQScale = tuneAbrQScaleFromFeedback(initialQScale);
1114
                 overflow = tunedQScale / initialQScale;
1115
-                q = !m_partialResidualFrames? tunedQScale : initialQScale;
1116
+                q = !m_partialResidualFrames ? tunedQScale : initialQScale;
1117
                 bool isEncodeEnd = (m_param->totalFrames && 
1118
                     m_framesDone > 0.75 * m_param->totalFrames) ? 1 : 0;
1119
                 bool isEncodeBeg = m_framesDone < (int)(m_fps + 0.5);
1120
@@ -2138,6 +2311,9 @@
1121
                 q = X265_MAX(minScenecutQscale, q);
1122
                 m_lastQScaleForP_SLICE = X265_MAX(minScenecutQscale, m_lastQScaleForP_SLICE);
1123
             }
1124
+            if (m_param->bEnableSBRC)
1125
+                q = tuneQscaleForSBRC(curFrame, q);
1126
+
1127
             rce->qpNoVbv = x265_qScale2qp(q);
1128
             if (m_sliceType == P_SLICE)
1129
             {
1130
@@ -2319,6 +2495,43 @@
1131
     return (p->coeff * var + p->offset) / (q * p->count);
1132
 }
1133
 
1134
+double RateControl::tuneQscaleForSBRC(Frame* curFrame, double q)
1135
+{
1136
+    int depth = 0;
1137
+    int framesDoneInSeg = m_framesDone % m_param->keyframeMax;
1138
+    if (framesDoneInSeg + m_param->lookaheadDepth <= m_param->keyframeMax)
1139
+        depth = m_param->lookaheadDepth;
1140
+    else
1141
+        depth = m_param->keyframeMax - framesDoneInSeg;
1142
+    for (int iterations = 0; iterations < 1000; iterations++)
1143
+    {
1144
+        double totalDuration = m_segDur;
1145
+        double frameBitsTotal = m_encodedSegmentBits + predictSize(&m_pred[m_predType], q, (double)m_currentSatd);
1146
+        for (int i = 0; i < depth; i++)
1147
+        {
1148
+            int type = curFrame->m_lowres.plannedType[i];
1149
+            if (type == X265_TYPE_AUTO)
1150
+                break;
1151
+            int64_t satd = curFrame->m_lowres.plannedSatd[i] >> (X265_DEPTH - 8);
1152
+            type = IS_X265_TYPE_I(curFrame->m_lowres.plannedType[i]) ? I_SLICE : IS_X265_TYPE_B(curFrame->m_lowres.plannedType[i]) ? B_SLICE : P_SLICE;
1153
+            int predType = getPredictorType(curFrame->m_lowres.plannedType[i], type);
1154
+            double curBits = predictSize(&m_pred[predType], q, (double)satd);
1155
+            frameBitsTotal += curBits;
1156
+            totalDuration += m_frameDuration;
1157
+        }
1158
+        //Check for segment buffer overflow and adjust QP accordingly
1159
+        double segDur = m_param->keyframeMax / m_fps;
1160
+        double allowedSize = m_vbvMaxRate * segDur;
1161
+        double remDur = segDur - totalDuration;
1162
+        double remainingBits = frameBitsTotal / totalDuration * remDur;
1163
+        if (frameBitsTotal + remainingBits > 0.9 * allowedSize)
1164
+            q = q * 1.01;
1165
+        else
1166
+            break;
1167
+    }
1168
+    return q;
1169
+}
1170
+
1171
 double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
1172
 {
1173
     // B-frames are not directly subject to VBV,
1174
@@ -2395,7 +2608,7 @@
1175
                     {
1176
                         finalDur = x265_clip3(0.4, 1.0, totalDuration);
1177
                     }
1178
-                    targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (1 - m_minBufferFill * finalDur));
1179
+                    targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (m_minBufferFill * finalDur));
1180
                     if (bufferFillCur < targetFill)
1181
                     {
1182
                         q *= 1.01;
1183
@@ -2828,7 +3041,7 @@
1184
 
1185
     if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion || bEnableDistOffset)
1186
     {
1187
-        if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
1188
+        if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF && !m_param->rc.bEncFocusedFramesOnly))
1189
         {
1190
             double avgQpRc = 0;
1191
             /* determine avg QP decided by VBV rate control */
1192
@@ -2862,8 +3075,9 @@
1193
     if (m_param->rc.rateControlMode == X265_RC_CRF)
1194
     {
1195
         double crfVal, qpRef = curEncData.m_avgQpRc;
1196
+
1197
         bool is2passCrfChange = false;
1198
-        if (m_2pass)
1199
+        if (m_2pass && !m_param->rc.bEncFocusedFramesOnly)
1200
         {
1201
             if (fabs(curEncData.m_avgQpRc - rce->qpPrev) > 0.1)
1202
             {
1203
@@ -2921,6 +3135,8 @@
1204
         m_wantedBitsWindow += m_frameDuration * m_bitrate;
1205
         m_totalBits += bits - rce->rowTotalBits;
1206
         m_encodedBits += actualBits;
1207
+        m_encodedSegmentBits += actualBits;
1208
+        m_segDur += m_frameDuration;
1209
         int pos = m_sliderPos - m_param->frameNumThreads;
1210
         if (pos >= 0)
1211
             m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
1212
@@ -3048,10 +3264,26 @@
1213
     {
1214
         uint8_t sliceType = (uint8_t)rce->sliceType;
1215
         primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, ncu);
1216
-        if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
1217
-            goto writeFailure;
1218
-        if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
1219
-            goto writeFailure;
1220
+
1221
+        if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
1222
+        {
1223
+            if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
1224
+                goto writeFailure;
1225
+            if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
1226
+                goto writeFailure;
1227
+        }
1228
+        else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
1229
+        {
1230
+            if (!m_cutreeShrMem)
1231
+            {
1232
+                goto writeFailure;
1233
+            }
1234
+
1235
+            CUTreeSharedDataItem shrItem;
1236
+            shrItem.type = &sliceType;
1237
+            shrItem.stats = m_cuTreeStats.qpBuffer[0];
1238
+            m_cutreeShrMem->writeData(&shrItem, WriteSharedCUTreeData);
1239
+        } 
1240
     }
1241
     return 0;
1242
 
1243
@@ -3127,6 +3359,13 @@
1244
     if (m_cutreeStatFileIn)
1245
         fclose(m_cutreeStatFileIn);
1246
 
1247
+    if (m_cutreeShrMem)
1248
+    {
1249
+        m_cutreeShrMem->release();
1250
+        delete m_cutreeShrMem;
1251
+        m_cutreeShrMem = NULL;
1252
+    }
1253
+
1254
     X265_FREE(m_rce2Pass);
1255
     X265_FREE(m_encOrder);
1256
     for (int i = 0; i < 2; i++)
1257
@@ -3186,13 +3425,20 @@
1258
 double RateControl::forwardMasking(Frame* curFrame, double q)
1259
 {
1260
     double qp = x265_qScale2qp(q);
1261
-    uint32_t maxWindowSize = uint32_t((m_param->fwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
1262
-    uint32_t windowSize = maxWindowSize / 3;
1263
+    uint32_t maxWindowSize = uint32_t((m_param->fwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
1264
+    uint32_t windowSize[6], prevWindow = 0;
1265
     int lastScenecut = m_top->m_rateControl->m_lastScenecut;
1266
-    int lastIFrame = m_top->m_rateControl->m_lastScenecutAwareIFrame;
1267
-    double fwdRefQpDelta = double(m_param->fwdRefQpDelta);
1268
-    double fwdNonRefQpDelta = double(m_param->fwdNonRefQpDelta);
1269
-    double sliceTypeDelta = SLICE_TYPE_DELTA * fwdRefQpDelta;
1270
+
1271
+    double fwdRefQpDelta[6], fwdNonRefQpDelta[6], sliceTypeDelta[6];
1272
+    for (int i = 0; i < 6; i++)
1273
+    {
1274
+        windowSize[i] = prevWindow + (uint32_t((m_param->fwdScenecutWindow[i] / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5));
1275
+        fwdRefQpDelta[i] = double(m_param->fwdRefQpDelta[i]);
1276
+        fwdNonRefQpDelta[i] = double(m_param->fwdNonRefQpDelta[i]);
1277
+        sliceTypeDelta[i] = SLICE_TYPE_DELTA * fwdRefQpDelta[i];
1278
+        prevWindow = windowSize[i];
1279
+    }
1280
+
1281
 
1282
     //Check whether the current frame is within the forward window
1283
     if (curFrame->m_poc > lastScenecut && curFrame->m_poc <= (lastScenecut + int(maxWindowSize)))
1284
@@ -3205,45 +3451,51 @@
1285
         }
1286
         else if (curFrame->m_lowres.sliceType == X265_TYPE_P)
1287
         {
1288
-            if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1289
-                && curFrame->m_poc >= lastIFrame))
1290
-            {
1291
-                //Add offsets corresponding to the window in which the P-frame occurs
1292
-                if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1293
-                    qp += WINDOW1_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1294
-                else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1295
-                    qp += WINDOW2_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1296
-                else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1297
-                    qp += WINDOW3_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1298
-            }
1299
+            //Add offsets corresponding to the window in which the P-frame occurs
1300
+            if (curFrame->m_poc <= (lastScenecut + int(windowSize[0])))
1301
+                qp += fwdRefQpDelta[0] - sliceTypeDelta[0];
1302
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[0]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[1]))))
1303
+                qp += fwdRefQpDelta[1] - sliceTypeDelta[1];
1304
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[1]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[2]))))
1305
+                qp += fwdRefQpDelta[2] - sliceTypeDelta[2];
1306
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[2]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[3]))))
1307
+                qp += fwdRefQpDelta[3] - sliceTypeDelta[3];
1308
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[3]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[4]))))
1309
+                qp += fwdRefQpDelta[4] - sliceTypeDelta[4];
1310
+            else if (curFrame->m_poc > lastScenecut + int(windowSize[4]))
1311
+                qp += fwdRefQpDelta[5] - sliceTypeDelta[5];
1312
         }
1313
         else if (curFrame->m_lowres.sliceType == X265_TYPE_BREF)
1314
         {
1315
-            if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1316
-                && curFrame->m_poc >= lastIFrame))
1317
-            {
1318
-                //Add offsets corresponding to the window in which the B-frame occurs
1319
-                if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1320
-                    qp += WINDOW1_DELTA * fwdRefQpDelta;
1321
-                else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1322
-                    qp += WINDOW2_DELTA * fwdRefQpDelta;
1323
-                else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1324
-                    qp += WINDOW3_DELTA * fwdRefQpDelta;
1325
-            }
1326
+            //Add offsets corresponding to the window in which the B-frame occurs
1327
+            if (curFrame->m_poc <= (lastScenecut + int(windowSize[0])))
1328
+                qp += fwdRefQpDelta[0];
1329
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[0]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[1]))))
1330
+                qp += fwdRefQpDelta[1];
1331
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[1]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[2]))))
1332
+                qp += fwdRefQpDelta[2];
1333
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[2]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[3]))))
1334
+                qp += fwdRefQpDelta[3];
1335
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[3]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[4]))))
1336
+                qp += fwdRefQpDelta[4];
1337
+            else if (curFrame->m_poc > lastScenecut + int(windowSize[4]))
1338
+                qp += fwdRefQpDelta[5];
1339
         }
1340
         else if (curFrame->m_lowres.sliceType == X265_TYPE_B)
1341
         {
1342
-            if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1343
-                && curFrame->m_poc >= lastIFrame))
1344
-            {
1345
-                //Add offsets corresponding to the window in which the b-frame occurs
1346
-                if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1347
-                    qp += WINDOW1_DELTA * fwdNonRefQpDelta;
1348
-                else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1349
-                    qp += WINDOW2_DELTA * fwdNonRefQpDelta;
1350
-                else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1351
-                    qp += WINDOW3_DELTA * fwdNonRefQpDelta;
1352
-            }
1353
+            //Add offsets corresponding to the window in which the b-frame occurs
1354
+            if (curFrame->m_poc <= (lastScenecut + int(windowSize[0])))
1356
+                qp += fwdNonRefQpDelta[0];
1357
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[0]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[1]))))
1358
+                qp += fwdNonRefQpDelta[1];
1359
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[1]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[2]))))
1360
+                qp += fwdNonRefQpDelta[2];
1361
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[2]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[3]))))
1362
+                qp += fwdNonRefQpDelta[3];
1363
+            else if (((curFrame->m_poc) > (lastScenecut + int(windowSize[3]))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize[4]))))
1364
+                qp += fwdNonRefQpDelta[4];
1365
+            else if (curFrame->m_poc > lastScenecut + int(windowSize[4]))
1366
+                qp += fwdNonRefQpDelta[5];
1366
         }
1367
     }
1368
 
1369
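
Reviewer note: forwardMasking() above replaces the old three fixed windows (WINDOW1/2/3_DELTA scaling a single --fwd-ref-qp-delta) with six independently configurable windows; windowSize[] is cumulative, so each per-window duration from --masking-strength is converted from milliseconds to frames and stacked onto the previous boundary. A short sketch of that boundary arithmetic (illustrative helper, not the RateControl API):

    // Illustrative: build cumulative window boundaries, in frames, from six
    // per-window durations in milliseconds, as the loop above does.
    #include <cstdint>

    void cumulativeWindows(const int durationsMs[6], double fps, uint32_t bounds[6])
    {
        uint32_t prev = 0;
        for (int i = 0; i < 6; i++)
        {
            bounds[i] = prev + (uint32_t)(durationsMs[i] / 1000.0 * fps + 0.5);
            prev = bounds[i];
        }
        // A frame at POC p, after a scenecut at POC s, falls in window k when
        // s + bounds[k-1] < p <= s + bounds[k]; frames beyond bounds[4] take
        // the sixth (index 5) delta, matching the if/else ladders above.
    }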
@@ -3252,24 +3504,75 @@
1370
 double RateControl::backwardMasking(Frame* curFrame, double q)
1371
 {
1372
     double qp = x265_qScale2qp(q);
1373
-    double fwdRefQpDelta = double(m_param->fwdRefQpDelta);
1374
-    double bwdRefQpDelta = double(m_param->bwdRefQpDelta);
1375
-    double bwdNonRefQpDelta = double(m_param->bwdNonRefQpDelta);
1376
+    uint32_t windowSize[6], prevWindow = 0;
1377
+    int lastScenecut = m_top->m_rateControl->m_lastScenecut;
1378
 
1379
-    if (curFrame->m_isInsideWindow == BACKWARD_WINDOW)
1380
+    double bwdRefQpDelta[6], bwdNonRefQpDelta[6], sliceTypeDelta[6];
1381
+    for (int i = 0; i < 6; i++)
1382
     {
1383
-        if (bwdRefQpDelta < 0)
1384
-            bwdRefQpDelta = WINDOW3_DELTA * fwdRefQpDelta;
1385
-        double sliceTypeDelta = SLICE_TYPE_DELTA * bwdRefQpDelta;
1386
-        if (bwdNonRefQpDelta < 0)
1387
-            bwdNonRefQpDelta = bwdRefQpDelta + sliceTypeDelta;
1388
+        windowSize[i] = prevWindow + (uint32_t((m_param->bwdScenecutWindow[i] / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5));
1389
+        prevWindow = windowSize[i];
1390
+        bwdRefQpDelta[i] = double(m_param->bwdRefQpDelta[i]);
1391
+        bwdNonRefQpDelta[i] = double(m_param->bwdNonRefQpDelta[i]);
1392
+
1393
+        if (bwdRefQpDelta[i] < 0)
1394
+            bwdRefQpDelta[i] = BWD_WINDOW_DELTA * m_param->fwdRefQpDelta[i];
1395
+        sliceTypeDelta[i] = SLICE_TYPE_DELTA * bwdRefQpDelta[i];
1396
+
1397
+        if (bwdNonRefQpDelta[i] < 0)
1398
+            bwdNonRefQpDelta[i] = bwdRefQpDelta[i] + sliceTypeDelta[i];
1399
+    }
1400
 
1401
+    if (curFrame->m_isInsideWindow == BACKWARD_WINDOW)
1402
+    {
1403
         if (curFrame->m_lowres.sliceType == X265_TYPE_P)
1404
-            qp += bwdRefQpDelta - sliceTypeDelta;
1405
+        {
1406
+            //Add offsets corresponding to the window in which the P-frame occurs
1407
+            if (curFrame->m_poc >= (lastScenecut - int(windowSize[0])))
1408
+                qp += bwdRefQpDelta[0] - sliceTypeDelta[0];
1409
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[0]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[1]))))
1410
+                qp += bwdRefQpDelta[1] - sliceTypeDelta[1];
1411
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[1]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[2]))))
1412
+                qp += bwdRefQpDelta[2] - sliceTypeDelta[2];
1413
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[2]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[3]))))
1414
+                qp += bwdRefQpDelta[3] - sliceTypeDelta[3];
1415
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[3]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[4]))))
1416
+                qp += bwdRefQpDelta[4] - sliceTypeDelta[4];
1417
+            else if (curFrame->m_poc < lastScenecut - int(windowSize[4]))
1418
+                qp += bwdRefQpDelta[5] - sliceTypeDelta[5];
1419
+        }
1420
         else if (curFrame->m_lowres.sliceType == X265_TYPE_BREF)
1421
-            qp += bwdRefQpDelta;
1422
+        {
1423
+            //Add offsets corresponding to the window in which the B-frame occurs
1424
+            if (curFrame->m_poc >= (lastScenecut - int(windowSize[0])))
1425
+                qp += bwdRefQpDelta[0];
1426
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[0]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[1]))))
1427
+                qp += bwdRefQpDelta[1];
1428
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[1]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[2]))))
1429
+                qp += bwdRefQpDelta[2];
1430
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[2]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[3]))))
1431
+                qp += bwdRefQpDelta[3];
1432
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[3]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[4]))))
1433
+                qp += bwdRefQpDelta[4];
1434
+            else if (curFrame->m_poc < lastScenecut - int(windowSize[4]))
1435
+                qp += bwdRefQpDelta[5];
1436
+        }
1437
         else if (curFrame->m_lowres.sliceType == X265_TYPE_B)
1438
-            qp += bwdNonRefQpDelta;
1439
+        {
1440
+            //Add offsets corresponding to the window in which the b-frame occurs
1441
+            if (curFrame->m_poc >= (lastScenecut - int(windowSize[0])))
1442
+                qp += bwdNonRefQpDelta[0];
1443
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[0]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[1]))))
1444
+                qp += bwdNonRefQpDelta[1];
1445
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[1]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[2]))))
1446
+                qp += bwdNonRefQpDelta[2];
1447
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[2]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[3]))))
1448
+                qp += bwdNonRefQpDelta[3];
1449
+            else if (((curFrame->m_poc) < (lastScenecut - int(windowSize[3]))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize[4]))))
1450
+                qp += bwdNonRefQpDelta[4];
1451
+            else if (curFrame->m_poc < lastScenecut - int(windowSize[4]))
1452
+                qp += bwdNonRefQpDelta[5];
1453
+        }
1454
     }
1455
 
1456
     return x265_qp2qScale(qp);
1457
x265_3.5.tar.gz/source/encoder/ratecontrol.h -> x265_3.6.tar.gz/source/encoder/ratecontrol.h Changed
90
 
1
@@ -28,6 +28,7 @@
2
 
3
 #include "common.h"
4
 #include "sei.h"
5
+#include "ringmem.h"
6
 
7
 namespace X265_NS {
8
 // encoder namespace
9
@@ -46,11 +47,6 @@
10
 #define MIN_AMORTIZE_FRACTION 0.2
11
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
12
 
13
-/*Scenecut Aware QP*/
14
-#define WINDOW1_DELTA           1.0 /* The offset for the frames coming in the window-1*/
15
-#define WINDOW2_DELTA           0.7 /* The offset for the frames coming in the window-2*/
16
-#define WINDOW3_DELTA           0.4 /* The offset for the frames coming in the window-3*/
17
-
18
 struct Predictor
19
 {
20
     double coeffMin;
21
@@ -73,6 +69,7 @@
22
     Predictor  rowPreds[3][2];
23
     Predictor* rowPred2;
24
 
25
+    int64_t currentSatd;
26
     int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
27
     int64_t leadingNoBSatd;
28
     int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
29
@@ -87,6 +84,8 @@
30
     double  rowCplxrSum;
31
     double  qpNoVbv;
32
     double  bufferFill;
33
+    double  bufferFillFinal;
34
+    double  bufferFillActual;
35
     double  targetFill;
36
     bool    vbvEndAdj;
37
     double  frameDuration;
38
@@ -192,6 +191,8 @@
39
     double  m_qCompress;
40
     int64_t m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
41
     int64_t m_encodedBits;      /* bits used for encoded frames (without ammortization) */
42
+    int64_t m_encodedSegmentBits;      /* bits used for encoded frames in a segment*/
43
+    double  m_segDur;
44
     double  m_fps;
45
     int64_t m_satdCostWindow[50];
46
     int64_t m_encodedBitsWindow[50];
47
@@ -237,6 +238,8 @@
48
     FILE*   m_statFileOut;
49
     FILE*   m_cutreeStatFileOut;
50
     FILE*   m_cutreeStatFileIn;
51
+    ///< store the cutree data in memory instead of file
52
+    RingMem *m_cutreeShrMem;
53
     double  m_lastAccumPNorm;
54
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
55
     int64_t m_predictedBits;
56
@@ -254,6 +257,7 @@
57
     RateControl(x265_param& p, Encoder *enc);
58
     bool init(const SPS& sps);
59
     void initHRD(SPS& sps);
60
+    void initVBV(const SPS& sps);
61
     void reconfigureRC();
62
 
63
     void setFinalFrameCount(int count);
64
@@ -271,6 +275,9 @@
65
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
66
     bool   initPass2();
67
 
68
+    bool initCUTreeSharedMem();
69
+    void skipCUTreeSharedMemRead(int32_t cnt);
70
+
71
     double forwardMasking(Frame* curFrame, double q);
72
     double backwardMasking(Frame* curFrame, double q);
73
 
74
@@ -291,6 +298,7 @@
75
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
76
     double tuneAbrQScaleFromFeedback(double qScale);
77
     double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
78
+    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
79
     void   accumPQpUpdate();
80
 
81
     int    getPredictorType(int lowresSliceType, int sliceType);
82
@@ -311,6 +319,7 @@
83
     double tuneQScaleForGrain(double rcOverflow);
84
     void   splitdeltaPOC(char deltapoc[], RateControlEntry *rce);
85
     void   splitbUsed(char deltapoc[], RateControlEntry *rce);
86
+    void   checkAndResetCRF(RateControlEntry* rce);
87
 };
88
 }
89
 #endif // ifndef X265_RATECONTROL_H
90
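
Reviewer note: the header above wires in the other half of the shared-memory path seen in ratecontrol.cpp: when rc.dataShareMode selects shared memory instead of the cutree stat file, pass 1 writes each frame's slice type plus fix8-packed qpCuTreeOffset block into m_cutreeShrMem, and pass 2 reads the records back in order, replacing the fwrite/fread pair. The exchange is shaped like a single-producer/single-consumer ring of fixed-size records; the sketch below shows that shape only and is not the x265 RingMem API.

    // Illustrative SPSC ring of fixed-size stat records; not the x265 RingMem API.
    #include <atomic>
    #include <cstring>
    #include <vector>

    class StatRing
    {
        std::vector<uint8_t> buf;
        size_t slotSize, slots;
        std::atomic<size_t> head{0}, tail{0}; // head: next write, tail: next read
    public:
        StatRing(size_t slotBytes, size_t n) : buf(slotBytes * n), slotSize(slotBytes), slots(n) {}
        bool write(const void* rec)           // producer (pass 1)
        {
            size_t h = head.load(std::memory_order_relaxed);
            if (h - tail.load(std::memory_order_acquire) == slots)
                return false;                 // full: consumer has not caught up
            memcpy(&buf[(h % slots) * slotSize], rec, slotSize);
            head.store(h + 1, std::memory_order_release);
            return true;
        }
        bool read(void* rec)                  // consumer (pass 2)
        {
            size_t t = tail.load(std::memory_order_relaxed);
            if (head.load(std::memory_order_acquire) == t)
                return false;                 // empty: producer is behind
            memcpy(rec, &buf[(t % slots) * slotSize], slotSize);
            tail.store(t + 1, std::memory_order_release);
            return true;
        }
    };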
x265_3.5.tar.gz/source/encoder/sei.cpp -> x265_3.6.tar.gz/source/encoder/sei.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
     {
3
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
4
             bs.writeByteAlignment();
5
-        list.serialize(nalUnitType, bs);
6
+        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
7
     }
8
 }
9
 
10
x265_3.5.tar.gz/source/encoder/sei.h -> x265_3.6.tar.gz/source/encoder/sei.h Changed
103
 
1
@@ -73,6 +73,101 @@
2
     }
3
 };
4
 
5
+/* Film grain characteristics */
6
+class FilmGrainCharacteristics : public SEI
7
+{
8
+  public:
9
+
10
+    FilmGrainCharacteristics()
11
+    {
12
+        m_payloadType = FILM_GRAIN_CHARACTERISTICS;
13
+        m_payloadSize = 0;
14
+    }
15
+
16
+    struct CompModelIntensityValues
17
+    {
18
+        uint8_t intensityIntervalLowerBound;
19
+        uint8_t intensityIntervalUpperBound;
20
+        int*    compModelValue;
21
+    };
22
+
23
+    struct CompModel
24
+    {
25
+        bool    bPresentFlag;
26
+        uint8_t numModelValues;
27
+        uint8_t m_filmGrainNumIntensityIntervalMinus1;
28
+        CompModelIntensityValues* intensityValues;
29
+    };
30
+
31
+    CompModel   m_compModel[MAX_NUM_COMPONENT];
32
+    bool        m_filmGrainCharacteristicsPersistenceFlag;
33
+    bool        m_filmGrainCharacteristicsCancelFlag;
34
+    bool        m_separateColourDescriptionPresentFlag;
35
+    bool        m_filmGrainFullRangeFlag;
36
+    uint8_t     m_filmGrainModelId;
37
+    uint8_t     m_blendingModeId;
38
+    uint8_t     m_log2ScaleFactor;
39
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
40
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
41
+    uint8_t     m_filmGrainColourPrimaries;
42
+    uint8_t     m_filmGrainTransferCharacteristics;
43
+    uint8_t     m_filmGrainMatrixCoeffs;
44
+
45
+    void writeSEI(const SPS&)
46
+    {
47
+        WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
48
+
49
+        if (!m_filmGrainCharacteristicsCancelFlag)
50
+        {
51
+            WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
52
+            WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
53
+            if (m_separateColourDescriptionPresentFlag)
54
+            {
55
+                WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
56
+                WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
57
+                WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
58
+                WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
59
+                WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
60
+                WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
61
+            }
62
+            WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
63
+            WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
64
+            for (uint8_t c = 0; c < 3; c++)
65
+            {
66
+                WRITE_FLAG(m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0, "comp_model_present_flag[c]");
67
+            }
68
+            for (uint8_t c = 0; c < 3; c++)
69
+            {
70
+                if (m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0)
71
+                {
72
+                    assert(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
73
+                    assert(m_compModel[c].numModelValues <= X265_BYTE);
74
+                    WRITE_CODE(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1, X265_BYTE, "num_intensity_intervals_minus1[c]");
75
+                    WRITE_CODE(m_compModel[c].numModelValues - 1, 3, "num_model_values_minus1[c]");
76
+                    for (uint8_t interval = 0; interval < m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
77
+                    {
78
+                        WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_bound[c][i]");
79
+                        WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_bound[c][i]");
80
+                        for (uint8_t j = 0; j < m_compModel[c].numModelValues; j++)
81
+                        {
82
+                            WRITE_SVLC(m_compModel[c].intensityValues[interval].compModelValue[j], "comp_model_value[c][i]");
83
+                        }
84
+                    }
85
+                }
86
+            }
87
+            WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
88
+        }
89
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
90
+        {
91
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
92
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
93
+            {
94
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
95
+            }
96
+        }
97
+    }
98
+};
99
+
100
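
Reviewer note: the FilmGrainCharacteristics class above follows the HEVC film_grain_characteristics() SEI syntax; with --film-grain <filename>, x265 3.6 fills these fields from the supplied characteristics file rather than estimating grain itself. As a hedged illustration only, an application-side sketch of populating the fields declared above for a single full-range luma interval (all values are placeholders, not a recommended grain model; MAX_NUM_COMPONENT is assumed to cover the three colour components):

    // Illustrative only: placeholder values for one luma intensity interval.
    static void fillExampleFilmGrainSei(FilmGrainCharacteristics& fgc)
    {
        static int lumaValue[1] = { 8 };
        static FilmGrainCharacteristics::CompModelIntensityValues lumaInterval =
            { 0, 255, lumaValue };                        // full intensity range

        fgc.m_filmGrainCharacteristicsCancelFlag = false; // keep grain active
        fgc.m_filmGrainModelId = 0;                       // frequency-filtering model
        fgc.m_separateColourDescriptionPresentFlag = false;
        fgc.m_blendingModeId = 0;                         // additive blending
        fgc.m_log2ScaleFactor = 2;
        fgc.m_compModel[0].bPresentFlag = true;           // luma only
        fgc.m_compModel[0].m_filmGrainNumIntensityIntervalMinus1 = 0;
        fgc.m_compModel[0].numModelValues = 1;
        fgc.m_compModel[0].intensityValues = &lumaInterval;
        fgc.m_compModel[1].bPresentFlag = fgc.m_compModel[2].bPresentFlag = false;
        fgc.m_filmGrainCharacteristicsPersistenceFlag = true;
    }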
 static const uint32_t ISO_IEC_11578_LEN = 16;
101
 
102
 class SEIuserDataUnregistered : public SEI
103
x265_3.5.tar.gz/source/encoder/slicetype.cpp -> x265_3.6.tar.gz/source/encoder/slicetype.cpp Changed
1444
 
1
@@ -87,6 +87,14 @@
2
 
3
 namespace X265_NS {
4
 
5
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)
6
+{
7
+    uint32_t sum = (uint32_t)sum_ssd;
8
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
9
+
10
+    return ssd - ((uint64_t)sum * sum >> shift);
11
+}
12
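
Reviewer note: acEnergyVarHist() assumes the var primitive packs a block's pixel sum into the low 32 bits of its return value and the sum of squares into the high 32 bits; the expression then computes the AC energy ssd - sum*sum/N with N supplied as a shift (6 for the 64 samples of an 8x8 block, 4 for a 4x4 block). A self-contained sketch of the same math on a plain array, under that packing assumption:

    #include <cstdint>

    // Illustrative: pack sum / sum-of-squares the way the var primitive is
    // assumed to, then recover the variance (AC energy) exactly as above.
    uint64_t packedVar8x8(const uint8_t* blk, int stride)
    {
        uint32_t sum = 0, ssd = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
            {
                uint32_t p = blk[y * stride + x];
                sum += p;
                ssd += p * p;
            }
        return sum | ((uint64_t)ssd << 32);
    }

    uint32_t acEnergy(uint64_t sum_ssd, int shift) // shift = log2(sample count)
    {
        uint32_t sum = (uint32_t)sum_ssd;
        uint32_t ssd = (uint32_t)(sum_ssd >> 32);
        return ssd - (uint32_t)(((uint64_t)sum * sum) >> shift);
    }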
+
13
 bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
14
 {
15
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
16
@@ -184,7 +192,7 @@
17
     {
18
         for (int colNum = 0; colNum < width; colNum++)
19
         {
20
-            if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
21
+            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
22
             {
23
                 /*  5x5 Gaussian filter
24
                     2   4   5   4   2
25
@@ -519,7 +527,7 @@
26
                 if (param->rc.aqMode == X265_AQ_EDGE)
27
                     edgeFilter(curFrame, param);
28
 
29
-                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
30
+                if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
31
                 {
32
                     pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
33
                     primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
34
@@ -1050,7 +1058,48 @@
35
     m_countPreLookahead = 0;
36
 #endif
37
 
38
-    memset(m_histogram, 0, sizeof(m_histogram));
39
+    m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
40
+    m_accHistDiffRunningAvgCb[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
41
+    memset(m_accHistDiffRunningAvgCb[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
42
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
43
+        m_accHistDiffRunningAvgCb[w] = m_accHistDiffRunningAvgCb[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
44
+    }
45
+
46
+    m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
47
+    m_accHistDiffRunningAvgCr[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
48
+    memset(m_accHistDiffRunningAvgCr[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
49
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
50
+        m_accHistDiffRunningAvgCr[w] = m_accHistDiffRunningAvgCr[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
51
+    }
52
+
53
+    m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
54
+    m_accHistDiffRunningAvg[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
55
+    memset(m_accHistDiffRunningAvg[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
56
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
57
+        m_accHistDiffRunningAvg[w] = m_accHistDiffRunningAvg[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
58
+    }
59
+
60
+    m_resetRunningAvg = true;
61
+
62
+    m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
63
+
64
+    if (m_param->bEnableTemporalSubLayers > 2)
65
+    {
66
+        switch (m_param->bEnableTemporalSubLayers)
67
+        {
68
+        case 3:
69
+            m_gopId = 0;
70
+            break;
71
+        case 4:
72
+            m_gopId = 1;
73
+            break;
74
+        case 5:
75
+            m_gopId = 2;
76
+            break;
77
+        default:
78
+            break;
79
+        }
80
+    }
81
 }
82
 
83
 #if DETAILED_CU_STATS
84
@@ -1098,6 +1147,7 @@
85
             m_pool[i].stopWorkers();
86
     }
87
 }
88
+
89
 void Lookahead::destroy()
90
 {
91
     // these two queues will be empty unless the encode was aborted
92
@@ -1309,32 +1359,32 @@
93
     default:
94
         return;
95
     }
96
-    if (!m_param->analysisLoad || !m_param->bDisableLookahead)
97
+    if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
98
     {
99
         X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
100
 
101
-        if (m_param->rc.cuTree && !m_param->rc.bStatRead)
102
+        if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
103
             /* update row satds based on cutree offsets */
104
             curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
105
-        else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
106
+        else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
107
         {
108
-            if (m_param->rc.aqMode)
109
+            if (curFrame->m_param->rc.aqMode)
110
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
111
             else
112
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
113
         }
114
-        if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
115
+        if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
116
         {
117
             /* aggregate lowres row satds to CTU resolution */
118
             curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
119
             uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
120
-            uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
121
-            uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
122
+            uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
123
+            uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
124
             uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
125
             double *qp_offset = 0;
126
             /* Factor in qpoffsets based on Aq/Cutree in CU costs */
127
-            if (m_param->rc.aqMode || m_param->bAQMotion)
128
-                qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
129
+            if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
130
+                qp_offset = (frames[b]->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
131
 
132
             for (uint32_t row = 0; row < numCuInHeight; row++)
133
             {
134
@@ -1350,7 +1400,7 @@
135
                         if (qp_offset)
136
                         {
137
                             double qpOffset;
138
-                            if (m_param->rc.qgSize == 8)
139
+                            if (curFrame->m_param->rc.qgSize == 8)
140
                                 qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
141
                                 qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
142
                                 qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
143
@@ -1361,7 +1411,7 @@
144
                             int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
145
                             curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
146
                         }
147
-                        if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
148
+                        if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
149
                             for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
150
                                 diff += curFrame->m_lowres.intraCost[lowresCuIdx] - lowresCuCost;
151
                         curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
152
@@ -1377,6 +1427,291 @@
153
     }
154
 }
155
 
156
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)
157
+{
158
+    pixel* src = inpSrc + blockOffset;
159
+
160
+    uint32_t var;
161
+    if (!plane)
162
+        var = acEnergyVarHist(primitives.cu[BLOCK_8x8].var(src, stride), 6);
163
+    else
164
+        var = acEnergyVarHist(primitives.cu[BLOCK_4x4].var(src, stride), 4);
165
+
166
+    x265_emms();
167
+    return var;
168
+}
169
+
170
+/*
171
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
172
+*/
173
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
174
+{
175
+    int maxCol = curFrame->m_fencPic->m_picWidth;
176
+    int maxRow = curFrame->m_fencPic->m_picHeight;
177
+    intptr_t inpStride = curFrame->m_fencPic->m_stride;
178
+
179
+    // Variance
180
+    uint64_t picTotVariance = 0;
181
+    uint32_t variance;
182
+
183
+    uint64_t blockXY = 0;
184
+    pixel* src = curFrame->m_fencPic->m_picOrg[0];
185
+
186
+    for (int blockY = 0; blockY < maxRow; blockY += 8)
187
+    {
188
+        uint64_t rowVariance = 0;
189
+        for (int blockX = 0; blockX < maxCol; blockX += 8)
190
+        {
191
+            intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
192
+
193
+            variance = calcVariance(
194
+                src,
195
+                inpStride,
196
+                blockOffsetLuma, 0);
197
+
198
+            rowVariance += variance;
199
+            blockXY++;
200
+        }
201
+        picTotVariance += (uint16_t)(rowVariance / maxCol);
202
+    }
203
+
204
+    curFrame->m_lowres.picAvgVariance = (uint16_t)(picTotVariance / maxRow);
205
+
206
+    // Collect chroma variance
207
+    int hShift = curFrame->m_fencPic->m_hChromaShift;
208
+    int vShift = curFrame->m_fencPic->m_vChromaShift;
209
+
210
+    int maxColChroma = curFrame->m_fencPic->m_picWidth >> hShift;
211
+    int maxRowChroma = curFrame->m_fencPic->m_picHeight >> vShift;
212
+    intptr_t cStride = curFrame->m_fencPic->m_strideC;
213
+
214
+    pixel* srcCb = curFrame->m_fencPic->m_picOrg[1];
215
+
216
+    picTotVariance = 0;
217
+    for (int blockY = 0; blockY < maxRowChroma; blockY += 4)
218
+    {
219
+        uint64_t rowVariance = 0;
220
+        for (int blockX = 0; blockX < maxColChroma; blockX += 4)
221
+        {
222
+            intptr_t blockOffsetChroma = blockX + blockY * cStride;
223
+
224
+            variance = calcVariance(
225
+                srcCb,
226
+                cStride,
227
+                blockOffsetChroma, 1);
228
+
229
+            rowVariance += variance;
230
+            blockXY++;
231
+        }
232
+        picTotVariance += (uint16_t)(rowVariance / maxColChroma);
233
+    }
234
+
235
+    curFrame->m_lowres.picAvgVarianceCb = (uint16_t)(picTotVariance / maxRowChroma);
236
+
237
+
238
+    pixel* srcCr = curFrame->m_fencPic->m_picOrg[2];
239
+
240
+    picTotVariance = 0;
241
+    for (int blockY = 0; blockY < maxRowChroma; blockY += 4)
242
+    {
243
+        uint64_t rowVariance = 0;
244
+        for (int blockX = 0; blockX < maxColChroma; blockX += 4)
245
+        {
246
+            intptr_t blockOffsetChroma = blockX + blockY * cStride;
247
+
248
+            variance = calcVariance(
249
+                srcCr,
250
+                cStride,
251
+                blockOffsetChroma, 2);
252
+
253
+            rowVariance += variance;
254
+            blockXY++;
255
+        }
256
+        picTotVariance += (uint16_t)(rowVariance / maxColChroma);
257
+    }
258
+
259
+    curFrame->m_lowres.picAvgVarianceCr = (uint16_t)(picTotVariance / maxRowChroma);
260
+}
261
+
262
+/*
263
+* Compute histogram of n-bins for the input
264
+*/
265
+void LookaheadTLD::calculateHistogram(
266
+    pixel     *inputSrc,
267
+    uint32_t   inputWidth,
268
+    uint32_t   inputHeight,
269
+    intptr_t   stride,
270
+    uint8_t    dsFactor,
271
+    uint32_t  *histogram,
272
+    uint64_t  *sum)
273
+
274
+{
275
+    *sum = 0;
276
+
277
+    for (uint32_t verticalIdx = 0; verticalIdx < inputHeight; verticalIdx += dsFactor)
278
+    {
279
+        for (uint32_t horizontalIdx = 0; horizontalIdx < inputWidth; horizontalIdx += dsFactor)
280
+        {
281
+            ++(histogram[inputSrc[horizontalIdx]]);
282
+            *sum += inputSrc[horizontalIdx];
283
+        }
284
+        inputSrc += (stride << (dsFactor >> 1));
285
+    }
286
+
287
+    return;
288
+}
289
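
Reviewer note: calculateHistogram() above subsamples by dsFactor in both directions: the column loop steps dsFactor pixels, and the row pointer advances by stride << (dsFactor >> 1), which equals stride * dsFactor for the factors actually passed (1 for the quarter-res luma plane, 4 for full-res chroma) but not in general. The callers compensate for the skipped pixels by scaling bin counts and sums back up with << dsFactor (or << 4 for luma, whose plane is already 4x4 downsampled). A quick check of that arithmetic:

    // Illustrative check: for dsFactor in {1, 2, 4}, 1 << (dsFactor >> 1)
    // equals dsFactor, so the row advance skips exactly dsFactor rows, and
    // sampling every dsFactor-th pixel in both directions visits
    // 1 / (dsFactor * dsFactor) of the plane, which << dsFactor (x16 for
    // dsFactor 4) undoes. The identity fails for dsFactor 8 (1 << 4 == 16).
    #include <cassert>
    #include <initializer_list>

    int main()
    {
        for (int dsFactor : { 1, 2, 4 })
            assert((1 << (dsFactor >> 1)) == dsFactor);
        return 0;
    }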
+
290
+/*
291
+* Compute histogram bins and chroma pixel intensity *
292
+*/
293
+void LookaheadTLD::computeIntensityHistogramBinsChroma(
294
+    Frame    *curFrame,
295
+    uint64_t *sumAverageIntensityCb,
296
+    uint64_t *sumAverageIntensityCr)
297
+{
298
+    uint64_t    sum;
299
+    uint8_t     dsFactor = 4;
300
+
301
+    uint32_t segmentWidth = curFrame->m_lowres.widthFullRes / NUMBER_OF_SEGMENTS_IN_WIDTH;
302
+    uint32_t segmentHeight = curFrame->m_lowres.heightFullRes / NUMBER_OF_SEGMENTS_IN_HEIGHT;
303
+
304
+    for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
305
+    {
306
+        for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
307
+        {
308
+            // Initialize bins to 1
309
+            for (uint32_t cuIndex = 0; cuIndex < 256; cuIndex++) {
310
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][cuIndex] = 1;
311
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][cuIndex] = 1;
312
+            }
313
+
314
+            uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
315
+                curFrame->m_lowres.widthFullRes - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
316
+
317
+            uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
318
+                curFrame->m_lowres.heightFullRes - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
319
+
320
+
321
+            // U Histogram
322
+            calculateHistogram(
323
+                curFrame->m_fencPic->m_picOrg[1] + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
324
+                (segmentWidth + segmentWidthOffset) >> 1,
325
+                (segmentHeight + segmentHeightOffset) >> 1,
326
+                curFrame->m_fencPic->m_strideC,
327
+                dsFactor,
328
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1],
329
+                &sum);
330
+
331
+            sum = (sum << dsFactor);
332
+            *sumAverageIntensityCb += sum;
333
+            curFrame->m_lowres.averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1] =
334
+                (uint8_t)((sum + (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 3)) / (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 2));
335
+
336
+            for (uint16_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++) {
337
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][histogramBin] =
338
+                    curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][histogramBin] << dsFactor;
339
+            }
340
+
341
+            // V Histogram
342
+            calculateHistogram(
343
+                curFrame->m_fencPic->m_picOrg[2] + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
344
+                (segmentWidth + segmentWidthOffset) >> 1,
345
+                (segmentHeight + segmentHeightOffset) >> 1,
346
+                curFrame->m_fencPic->m_strideC,
347
+                dsFactor,
348
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2],
349
+                &sum);
350
+
351
+            sum = (sum << dsFactor);
352
+            *sumAverageIntensityCr += sum;
353
+            curFrame->m_lowres.averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2] =
354
+                (uint8_t)((sum + (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 3)) / (((segmentWidth + segmentHeightOffset) * (segmentHeight + segmentHeightOffset)) >> 2));
355
+
356
+            for (uint16_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++) {
357
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][histogramBin] =
358
+                    curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][histogramBin] << dsFactor;
359
+            }
360
+        }
361
+    }
362
+    return;
363
+
364
+}
365
+
366
+/*
367
+* Compute histogram bins and luma pixel intensity *
368
+*/
369
+void LookaheadTLD::computeIntensityHistogramBinsLuma(
370
+    Frame    *curFrame,
371
+    uint64_t *sumAvgIntensityTotalSegmentsLuma)
372
+{
373
+    uint64_t sum;
374
+
375
+    uint32_t segmentWidth = curFrame->m_lowres.quarterSampleLowResWidth / NUMBER_OF_SEGMENTS_IN_WIDTH;
376
+    uint32_t segmentHeight = curFrame->m_lowres.quarterSampleLowResHeight / NUMBER_OF_SEGMENTS_IN_HEIGHT;
377
+
378
+    for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
379
+    {
380
+        for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
381
+        {
382
+            // Initialize bins to 1
383
+            for (uint32_t cuIndex = 0; cuIndex < 256; cuIndex++) {
384
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][cuIndex] = 1;
385
+            }
386
+
387
+            uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
388
+                curFrame->m_lowres.quarterSampleLowResWidth - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
389
+
390
+            uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
391
+                curFrame->m_lowres.quarterSampleLowResHeight - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
392
+
393
+            // Y Histogram
394
+            calculateHistogram(
395
+                curFrame->m_lowres.quarterSampleLowResBuffer + (curFrame->m_lowres.quarterSampleLowResOriginX + segmentInFrameWidthIndex * segmentWidth) + ((curFrame->m_lowres.quarterSampleLowResOriginY + segmentInFrameHeightIndex * segmentHeight) * curFrame->m_lowres.quarterSampleLowResStrideY),
396
+                segmentWidth + segmentWidthOffset,
397
+                segmentHeight + segmentHeightOffset,
398
+                curFrame->m_lowres.quarterSampleLowResStrideY,
399
+                1,
400
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0],
401
+                &sum);
402
+
403
+            curFrame->m_lowres.averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0] = (uint8_t)((sum + (((segmentWidth + segmentWidthOffset)*(segmentWidth + segmentHeightOffset)) >> 1)) / ((segmentWidth + segmentWidthOffset)*(segmentHeight + segmentHeightOffset)));
404
+            (*sumAvgIntensityTotalSegmentsLuma) += (sum << 4);
405
+            for (uint32_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++)
406
+            {
407
+                curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][histogramBin] =
408
+                    curFrame->m_lowres.picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][histogramBin] << 4;
409
+            }
410
+        }
411
+    }
412
+}
413
+
414
+void LookaheadTLD::collectPictureStatistics(Frame *curFrame)
415
+{
416
+
417
+    uint64_t sumAverageIntensityCb = 0;
418
+    uint64_t sumAverageIntensityCr = 0;
419
+    uint64_t sumAverageIntensity = 0;
420
+
421
+    // Histogram bins for Luma
422
+    computeIntensityHistogramBinsLuma(
423
+        curFrame,
424
+        &sumAverageIntensity);
425
+
426
+    // Histogram bins for Chroma
427
+    computeIntensityHistogramBinsChroma(
428
+        curFrame,
429
+        &sumAverageIntensityCb,
430
+        &sumAverageIntensityCr);
431
+
432
+    curFrame->m_lowres.averageIntensity[0] = (uint8_t)((sumAverageIntensity + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 1)) / (curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes));
433
+    curFrame->m_lowres.averageIntensity[1] = (uint8_t)((sumAverageIntensityCb + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 3)) / ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 2));
434
+    curFrame->m_lowres.averageIntensity[2] = (uint8_t)((sumAverageIntensityCr + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 3)) / ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 2));
435
+
436
+    computePictureStatistics(curFrame);
437
+
438
+    curFrame->m_lowres.bHistScenecutAnalyzed = false;
439
+}
440
+
441
 void PreLookaheadGroup::processTasks(int workerThreadID)
442
 {
443
     if (workerThreadID < 0)
444
@@ -1393,6 +1728,10 @@
445
         preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc);
446
         if (m_lookahead.m_bAdaptiveQuant)
447
             tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param);
448
+
449
+        if (m_lookahead.m_param->bHistBasedSceneCut)
450
+            tld.collectPictureStatistics(preFrame);
451
+
452
         tld.lowresIntraEstimate(preFrame->m_lowres, m_lookahead.m_param->rc.qgSize);
453
         preFrame->m_lowresInit = true;
454
 
455
@@ -1401,6 +1740,53 @@
456
     m_lock.release();
457
 }
458
 
459
+
460
+void Lookahead::placeBref(Frame** frames, int start, int end, int num, int *brefs)
461
+{
462
+    int avg = (start + end) / 2;
463
+    if (m_param->bEnableTemporalSubLayers < 2)
464
+    {
465
+        (*frames[avg]).m_lowres.sliceType = X265_TYPE_BREF;
466
+        (*brefs)++;
467
+        return;
468
+    }
469
+    else
470
+    {
471
+        if (num <= 2)
472
+            return;
473
+        else
474
+        {
475
+            (*frames[avg]).m_lowres.sliceType = X265_TYPE_BREF;
476
+            (*brefs)++;
477
+            placeBref(frames, start, avg, avg - start, brefs);
478
+            placeBref(frames, avg + 1, end, end - avg, brefs);
479
+            return;
480
+        }
481
+    }
482
+}
483
+
484
+
485
+void Lookahead::compCostBref(Lowres **frames, int start, int end, int num)
486
+{
487
+    CostEstimateGroup estGroup(*this, frames);
488
+    int avg = (start + end) / 2;
489
+    if (num <= 2)
490
+    {
491
+        for (int i = start; i < end; i++)
492
+        {
493
+            estGroup.singleCost(start, end + 1, i + 1);
494
+        }
495
+        return;
496
+    }
497
+    else
498
+    {
499
+        estGroup.singleCost(start, end + 1, avg + 1);
500
+        compCostBref(frames, start, avg, avg - start);
501
+        compCostBref(frames, avg + 1, end, end - avg);
502
+        return;
503
+    }
504
+}
505
+
506
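
Reviewer note: placeBref() above builds the hierarchical B pyramid for the temporal-layer GOPs by recursively marking the midpoint of each (start, end) interval as a B-ref, and compCostBref() walks the same midpoint recursion so singleCost() is evaluated with the reference pair each frame will actually use. An index-only sketch of the recursion order (hypothetical helper, not the Lookahead API):

    #include <cstdio>

    // Illustrative index-only version of the placeBref() midpoint recursion:
    // prints the order in which B-ref frames would be marked for a mini-GOP.
    static void markBref(int start, int end, int num, int depth)
    {
        if (num <= 2)
            return;                      // too short to split further
        int avg = (start + end) / 2;     // midpoint becomes the B-ref
        printf("layer %d: frame %d -> BREF\n", depth, avg);
        markBref(start, avg, avg - start, depth + 1);
        markBref(avg + 1, end, end - avg, depth + 1);
    }

    int main()
    {
        markBref(0, 7, 8, 1); // 8-frame mini-GOP: marks 3, then 1 and 5
        return 0;
    }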
 /* called by API thread or worker thread with inputQueueLock acquired */
507
 void Lookahead::slicetypeDecide()
508
 {
509
@@ -1416,6 +1802,18 @@
510
         ScopedLock lock(m_inputLock);
511
 
512
         Frame *curFrame = m_inputQueue.first();
513
+        if (m_param->bResetZoneConfig)
514
+        {
515
+            for (int i = 0; i < m_param->rc.zonefileCount; i++)
516
+            {
517
+                if (m_param->rc.zones[i].startFrame == curFrame->m_poc)
518
+                    m_param = m_param->rc.zones[i].zoneParam;
519
+                int nextZoneStart = m_param->rc.zones[i].startFrame;
520
+                nextZoneStart += nextZoneStart ? m_param->rc.zones[i].zoneParam->radl : 0;
521
+                if (nextZoneStart < curFrame->m_poc + maxSearch && curFrame->m_poc < nextZoneStart)
522
+                    maxSearch = nextZoneStart - curFrame->m_poc;
523
+            }
524
+        }
525
         int j;
526
         for (j = 0; j < m_param->bframes + 2; j++)
527
         {
528
@@ -1502,7 +1900,7 @@
529
          m_param->rc.cuTree || m_param->scenecutThreshold || m_param->bHistBasedSceneCut ||
530
          (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
531
     {
532
-        if(!m_param->rc.bStatRead)
533
+        if (!m_param->rc.bStatRead)
534
             slicetypeAnalyse(frames, false);
535
         bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
536
         if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
537
@@ -1526,6 +1924,8 @@
538
         {
539
             Lowres& frm = list[bframes]->m_lowres;
540
 
541
+            if (frm.sliceTypeReq != X265_TYPE_AUTO && frm.sliceTypeReq != frm.sliceType)
542
+                frm.sliceType = frm.sliceTypeReq;
543
             if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
544
             {
545
                 frm.sliceType = X265_TYPE_B;
546
@@ -1583,12 +1983,9 @@
547
             }
548
             if (frm.sliceType == X265_TYPE_IDR && frm.bScenecut && isClosedGopRadl)
549
             {
550
-                if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frm.m_bIsHardScenecut))
551
-                {
552
-                    for (int i = bframes; i < bframes + m_param->radl; i++)
553
-                        list[i]->m_lowres.sliceType = X265_TYPE_B;
554
-                    list[bframes + m_param->radl]->m_lowres.sliceType = X265_TYPE_IDR;
555
-                }
556
+                for (int i = bframes; i < bframes + m_param->radl; i++)
557
+                    list[i]->m_lowres.sliceType = X265_TYPE_B;
558
+                list[bframes + m_param->radl]->m_lowres.sliceType = X265_TYPE_IDR;
559
             }
560
             if (frm.sliceType == X265_TYPE_IDR)
561
             {
562
@@ -1649,138 +2046,454 @@
563
                 break;
564
         }
565
     }
566
-    if (bframes)
567
-        list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
568
-    list[bframes]->m_lowres.leadingBframes = bframes;
569
-    m_lastNonB = &list[bframes]->m_lowres;
570
-    m_histogram[bframes]++;
571
-
572
-    /* insert a bref into the sequence */
573
-    if (m_param->bBPyramid && bframes > 1 && !brefs)
574
-    {
575
-        listbframes / 2->m_lowres.sliceType = X265_TYPE_BREF;
576
-        brefs++;
577
-    }
578
-    /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
579
-    if (m_param->rc.rateControlMode != X265_RC_CQP)
580
-    {
581
-        int p0, p1, b;
582
-        /* For zero latency tuning, calculate frame cost to be used later in RC */
583
-        if (!maxSearch)
584
+
585
+    if (m_param->bEnableTemporalSubLayers > 2)
586
+    {
587
+        //Split the partial mini GOP into sub mini GOPs when temporal sub layers are enabled
588
+        if (bframes < m_param->bframes)
589
         {
590
-            for (int i = 0; i <= bframes; i++)
591
-               framesi + 1 = &listi->m_lowres;
592
-        }
593
+            int leftOver = bframes + 1;
594
+            int8_t gopId = m_gopId - 1;
595
+            int gopLen = x265_gop_ra_lengthgopId;
596
+            int listReset = 0;
597
 
598
-        /* estimate new non-B cost */
599
-        p1 = b = bframes + 1;
600
-        p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
601
+            m_outputLock.acquire();
602
 
603
-        CostEstimateGroup estGroup(*this, frames);
604
+            while ((gopId >= 0) && (leftOver > 3))
605
+            {
606
+                if (leftOver < gopLen)
607
+                {
608
+                    gopId = gopId - 1;
609
+                    gopLen = x265_gop_ra_lengthgopId;
610
+                    continue;
611
+                }
612
+                else
613
+                {
614
+                    int newbFrames = listReset + gopLen - 1;
615
+                    //Re-assign GOP
616
+                    listnewbFrames->m_lowres.sliceType = IS_X265_TYPE_I(listnewbFrames->m_lowres.sliceType) ? listnewbFrames->m_lowres.sliceType : X265_TYPE_P;
617
+                    if (newbFrames)
618
+                        listnewbFrames - 1->m_lowres.bLastMiniGopBFrame = true;
619
+                    listnewbFrames->m_lowres.leadingBframes = newbFrames;
620
+                    m_lastNonB = &listnewbFrames->m_lowres;
621
+
622
+                    /* insert a bref into the sequence */
623
+                    if (m_param->bBPyramid && newbFrames)
624
+                    {
625
+                        placeBref(list, listReset, newbFrames, newbFrames + 1, &brefs);
626
+                    }
627
+                    if (m_param->rc.rateControlMode != X265_RC_CQP)
628
+                    {
629
+                        int p0, p1, b;
630
+                        /* For zero latency tuning, calculate frame cost to be used later in RC */
631
+                        if (!maxSearch)
632
+                        {
633
+                            for (int i = listReset; i <= newbFrames; i++)
634
+                                framesi + 1 = &listlistReset + i->m_lowres;
635
+                        }
636
 
637
-        estGroup.singleCost(p0, p1, b);
638
+                        /* estimate new non-B cost */
639
+                        p1 = b = newbFrames + 1;
640
+                        p0 = (IS_X265_TYPE_I(framesnewbFrames + 1->sliceType)) ? b : listReset;
641
 
642
-        if (bframes)
643
+                        CostEstimateGroup estGroup(*this, frames);
644
+
645
+                        estGroup.singleCost(p0, p1, b);
646
+
647
+                        if (newbFrames)
648
+                            compCostBref(frames, listReset, newbFrames, newbFrames + 1);
649
+                    }
650
+
651
+                    m_inputLock.acquire();
652
+                    /* dequeue all frames from inputQueue that are about to be enqueued
653
+                     * in the output queue. The order is important because Frame can
654
+                     * only be in one list at a time */
655
+                    int64_t ptsX265_BFRAME_MAX + 1;
656
+                    for (int i = 0; i < gopLen; i++)
657
+                    {
658
+                        Frame *curFrame;
659
+                        curFrame = m_inputQueue.popFront();
660
+                        ptsi = curFrame->m_pts;
661
+                        maxSearch--;
662
+                    }
663
+                    m_inputLock.release();
664
+
665
+                    int idx = 0;
666
+                    /* add non-B to output queue */
667
+                    listnewbFrames->m_reorderedPts = ptsidx++;
668
+                    listnewbFrames->m_gopOffset = 0;
669
+                    listnewbFrames->m_gopId = gopId;
670
+                    listnewbFrames->m_tempLayer = x265_gop_ragopId0.layer;
671
+                    m_outputQueue.pushBack(*listnewbFrames);
672
+
673
+                    /* add B frames to output queue */
674
+                    int i = 1, j = 1;
675
+                    while (i < gopLen)
676
+                    {
677
+                        int offset = listReset + (x265_gop_ragopIdj.poc_offset - 1);
678
+                        if (!listoffset || offset == newbFrames)
679
+                            continue;
680
+
681
+                        // Assign gop offset and temporal layer of frames
682
+                        listoffset->m_gopOffset = j;
683
+                        listbframes->m_gopId = gopId;
684
+                        listoffset->m_tempLayer = x265_gop_ragopIdj++.layer;
685
+
686
+                        listoffset->m_reorderedPts = ptsidx++;
687
+                        m_outputQueue.pushBack(*listoffset);
688
+                        i++;
689
+                    }
690
+
691
+                    listReset += gopLen;
692
+                    leftOver = leftOver - gopLen;
693
+                    gopId -= 1;
694
+                    gopLen = (gopId >= 0) ? x265_gop_ra_lengthgopId : 0;
695
+                }
696
+            }
697
+
698
+            if (leftOver > 0 && leftOver < 4)
699
+            {
700
+                int64_t ptsX265_BFRAME_MAX + 1;
701
+                int idx = 0;
702
+
703
+                int newbFrames = listReset + leftOver - 1;
704
+                listnewbFrames->m_lowres.sliceType = IS_X265_TYPE_I(listnewbFrames->m_lowres.sliceType) ? listnewbFrames->m_lowres.sliceType : X265_TYPE_P;
705
+                if (newbFrames)
706
+                        listnewbFrames - 1->m_lowres.bLastMiniGopBFrame = true;
707
+                listnewbFrames->m_lowres.leadingBframes = newbFrames;
708
+                m_lastNonB = &listnewbFrames->m_lowres;
709
+
710
+                /* insert a bref into the sequence */
711
+                if (m_param->bBPyramid && (newbFrames- listReset) > 1)
712
+                    placeBref(list, listReset, newbFrames, newbFrames + 1, &brefs);
713
+
714
+                if (m_param->rc.rateControlMode != X265_RC_CQP)
715
+                {
716
+                    int p0, p1, b;
717
+                    /* For zero latency tuning, calculate frame cost to be used later in RC */
718
+                    if (!maxSearch)
719
+                    {
720
+                        for (int i = listReset; i <= newbFrames; i++)
721
+                            framesi + 1 = &listlistReset + i->m_lowres;
722
+                    }
723
+
724
+                        /* estimate new non-B cost */
725
+                    p1 = b = newbFrames + 1;
726
+                    p0 = (IS_X265_TYPE_I(framesnewbFrames + 1->sliceType)) ? b : listReset;
727
+
728
+                    CostEstimateGroup estGroup(*this, frames);
729
+
730
+                    estGroup.singleCost(p0, p1, b);
731
+
732
+                    if (newbFrames)
733
+                        compCostBref(frames, listReset, newbFrames, newbFrames + 1);
734
+                }
735
+
736
+                m_inputLock.acquire();
737
+                /* dequeue all frames from inputQueue that are about to be enqueued
738
+                 * in the output queue. The order is important because Frame can
739
+                 * only be in one list at a time */
740
+                for (int i = 0; i < leftOver; i++)
741
+                {
742
+                    Frame *curFrame;
743
+                    curFrame = m_inputQueue.popFront();
744
+                    ptsi = curFrame->m_pts;
745
+                    maxSearch--;
746
+                }
747
+                m_inputLock.release();
748
+
749
+                m_lastNonB = &listnewbFrames->m_lowres;
750
+                listnewbFrames->m_reorderedPts = ptsidx++;
751
+                listnewbFrames->m_gopOffset = 0;
752
+                listnewbFrames->m_gopId = -1;
753
+                listnewbFrames->m_tempLayer = 0;
754
+                m_outputQueue.pushBack(*listnewbFrames);
755
+                if (brefs)
756
+                {
757
+                    for (int i = listReset; i < newbFrames; i++)
758
+                    {
759
+                        if (listi->m_lowres.sliceType == X265_TYPE_BREF)
760
+                        {
761
+                            listi->m_reorderedPts = ptsidx++;
762
+                            listi->m_gopOffset = 0;
763
+                            listi->m_gopId = -1;
764
+                            listi->m_tempLayer = 0;
765
+                            m_outputQueue.pushBack(*listi);
766
+                        }
767
+                    }
768
+                }
769
+
770
+                /* add B frames to output queue */
771
+                for (int i = listReset; i < newbFrames; i++)
772
+                {
773
+                    /* push all the B frames into output queue except B-ref, which already pushed into output queue */
774
+                    if (listi->m_lowres.sliceType != X265_TYPE_BREF)
775
+                    {
776
+                        listi->m_reorderedPts = ptsidx++;
777
+                        listi->m_gopOffset = 0;
778
+                        listi->m_gopId = -1;
779
+                        listi->m_tempLayer = 1;
780
+                        m_outputQueue.pushBack(*listi);
781
+                    }
782
+                }
783
+            }
784
+        }
785
+        else
786
+        // Fill the complete mini GOP when temporal sub layers are enabled
787
         {
788
-            p0 = 0; // last nonb
789
-            bool isp0available = framesbframes + 1->sliceType == X265_TYPE_IDR ? false : true;
790
 
791
-            for (b = 1; b <= bframes; b++)
792
+            listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
793
+            listbframes->m_lowres.leadingBframes = bframes;
794
+            m_lastNonB = &listbframes->m_lowres;
795
+
796
+            /* insert a bref into the sequence */
797
+            if (m_param->bBPyramid && !brefs)
798
             {
799
-                if (!isp0available)
800
-                    p0 = b;
801
+                placeBref(list, 0, bframes, bframes + 1, &brefs);
802
+            }
803
 
804
-                if (framesb->sliceType == X265_TYPE_B)
805
-                    for (p1 = b; framesp1->sliceType == X265_TYPE_B; p1++)
806
-                        ; // find new nonb or bref
807
-                else
808
-                    p1 = bframes + 1;
809
+            /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
810
+            if (m_param->rc.rateControlMode != X265_RC_CQP)
811
+            {
812
+                int p0, p1, b;
813
+                /* For zero latency tuning, calculate frame cost to be used later in RC */
814
+                if (!maxSearch)
815
+                {
816
+                    for (int i = 0; i <= bframes; i++)
817
+                        framesi + 1 = &listi->m_lowres;
818
+                }
819
 
820
+                /* estimate new non-B cost */
821
+                p1 = b = bframes + 1;
822
+                p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
823
+
824
+                CostEstimateGroup estGroup(*this, frames);
825
                 estGroup.singleCost(p0, p1, b);
826
 
827
-                if (framesb->sliceType == X265_TYPE_BREF)
828
+                compCostBref(frames, 0, bframes, bframes + 1);
829
+            }
830
+
831
+            m_inputLock.acquire();
832
+            /* dequeue all frames from inputQueue that are about to be enqueued
833
+            * in the output queue. The order is important because Frame can
834
+            * only be in one list at a time */
835
+            int64_t ptsX265_BFRAME_MAX + 1;
836
+            for (int i = 0; i <= bframes; i++)
837
+            {
838
+                Frame *curFrame;
839
+                curFrame = m_inputQueue.popFront();
840
+                ptsi = curFrame->m_pts;
841
+                maxSearch--;
842
+            }
843
+            m_inputLock.release();
844
+
845
+            m_outputLock.acquire();
846
+
847
+            int idx = 0;
848
+            /* add non-B to output queue */
849
+            listbframes->m_reorderedPts = ptsidx++;
850
+            listbframes->m_gopOffset = 0;
851
+            listbframes->m_gopId = m_gopId;
852
+            listbframes->m_tempLayer = x265_gop_ram_gopId0.layer;
853
+            m_outputQueue.pushBack(*listbframes);
854
+
855
+            int i = 1, j = 1;
856
+            while (i <= bframes)
857
+            {
858
+                int offset = x265_gop_ram_gopIdj.poc_offset - 1;
859
+                if (!listoffset || offset == bframes)
860
+                    continue;
861
+
862
+                // Assign gop offset and temporal layer of frames
863
+                listoffset->m_gopOffset = j;
864
+                listoffset->m_gopId = m_gopId;
865
+                listoffset->m_tempLayer = x265_gop_ram_gopIdj++.layer;
866
+
867
+                /* add B frames to output queue */
868
+                listoffset->m_reorderedPts = ptsidx++;
869
+                m_outputQueue.pushBack(*listoffset);
870
+                i++;
871
+            }
872
+        }
873
+
874
+        bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
875
+        if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
876
+        {
877
+            m_inputLock.acquire();
878
+            Frame *curFrame = m_inputQueue.first();
879
+            frames0 = m_lastNonB;
880
+            int j;
881
+            for (j = 0; j < maxSearch; j++)
882
+            {
883
+                framesj + 1 = &curFrame->m_lowres;
884
+                curFrame = curFrame->m_next;
885
+            }
886
+            m_inputLock.release();
887
+
888
+            framesj + 1 = NULL;
889
+            if (!m_param->rc.bStatRead)
890
+                slicetypeAnalyse(frames, true);
891
+            bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
892
+            if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
893
+            {
894
+                int numFrames;
895
+                for (numFrames = 0; numFrames < maxSearch; numFrames++)
896
                 {
897
-                    p0 = b;
898
-                    isp0available = true;
899
+                    Lowres *fenc = framesnumFrames + 1;
900
+                    if (!fenc)
901
+                        break;
902
                 }
903
+                vbvLookahead(frames, numFrames, true);
904
             }
905
         }
906
-    }
907
 
908
-    m_inputLock.acquire();
909
-    /* dequeue all frames from inputQueue that are about to be enqueued
910
-     * in the output queue. The order is important because Frame can
911
-     * only be in one list at a time */
912
-    int64_t ptsX265_BFRAME_MAX + 1;
913
-    for (int i = 0; i <= bframes; i++)
914
-    {
915
-        Frame *curFrame;
916
-        curFrame = m_inputQueue.popFront();
917
-        ptsi = curFrame->m_pts;
918
-        maxSearch--;
919
-    }
920
-    m_inputLock.release();
921
 
922
-    m_outputLock.acquire();
923
-    /* add non-B to output queue */
924
-    int idx = 0;
925
-    listbframes->m_reorderedPts = ptsidx++;
926
-    m_outputQueue.pushBack(*listbframes);
927
-    /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
928
-    if (brefs)
929
+        m_outputLock.release();
930
+    }
931
+    else
932
     {
933
-        for (int i = 0; i < bframes; i++)
934
+
935
+        if (bframes)
936
+            listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
937
+        listbframes->m_lowres.leadingBframes = bframes;
938
+        m_lastNonB = &listbframes->m_lowres;
939
+
940
+        /* insert a bref into the sequence */
941
+        if (m_param->bBPyramid && bframes > 1 && !brefs)
942
         {
943
-            if (listi->m_lowres.sliceType == X265_TYPE_BREF)
944
+            placeBref(list, 0, bframes, bframes + 1, &brefs);
945
+        }
946
+        /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
947
+        if (m_param->rc.rateControlMode != X265_RC_CQP)
948
+        {
949
+            int p0, p1, b;
950
+            /* For zero latency tuning, calculate frame cost to be used later in RC */
951
+            if (!maxSearch)
952
             {
953
-                listi->m_reorderedPts = ptsidx++;
954
-                m_outputQueue.pushBack(*listi);
955
+                for (int i = 0; i <= bframes; i++)
956
+                    framesi + 1 = &listi->m_lowres;
957
+            }
958
+
959
+            /* estimate new non-B cost */
960
+            p1 = b = bframes + 1;
961
+            p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
962
+
963
+            CostEstimateGroup estGroup(*this, frames);
964
+            estGroup.singleCost(p0, p1, b);
965
+
966
+            if (m_param->bEnableTemporalSubLayers > 1 && bframes)
967
+            {
968
+                compCostBref(frames, 0, bframes, bframes + 1);
969
+            }
970
+            else
971
+            {
972
+                if (bframes)
973
+                {
974
+                    p0 = 0; // last nonb
975
+                    bool isp0available = framesbframes + 1->sliceType == X265_TYPE_IDR ? false : true;
976
+
977
+                    for (b = 1; b <= bframes; b++)
978
+                    {
979
+                        if (!isp0available)
980
+                            p0 = b;
981
+
982
+                        if (framesb->sliceType == X265_TYPE_B)
983
+                            for (p1 = b; framesp1->sliceType == X265_TYPE_B; p1++)
984
+                                ; // find new nonb or bref
985
+                        else
986
+                            p1 = bframes + 1;
987
+
988
+                        estGroup.singleCost(p0, p1, b);
989
+
990
+                        if (framesb->sliceType == X265_TYPE_BREF)
991
+                        {
992
+                            p0 = b;
993
+                            isp0available = true;
994
+                        }
995
+                    }
996
+                }
997
             }
998
         }
999
-    }
1000
 
1001
-    /* add B frames to output queue */
1002
-    for (int i = 0; i < bframes; i++)
1003
-    {
1004
-        /* push all the B frames into output queue except B-ref, which already pushed into output queue */
1005
-        if (listi->m_lowres.sliceType != X265_TYPE_BREF)
1006
+        m_inputLock.acquire();
1007
+        /* dequeue all frames from inputQueue that are about to be enqueued
1008
+         * in the output queue. The order is important because Frame can
1009
+         * only be in one list at a time */
1010
+        int64_t ptsX265_BFRAME_MAX + 1;
1011
+        for (int i = 0; i <= bframes; i++)
1012
+        {
1013
+            Frame *curFrame;
1014
+            curFrame = m_inputQueue.popFront();
1015
+            ptsi = curFrame->m_pts;
1016
+            maxSearch--;
1017
+        }
1018
+        m_inputLock.release();
1019
+
1020
+        m_outputLock.acquire();
1021
+
1022
+        /* add non-B to output queue */
1023
+        int idx = 0;
1024
+        listbframes->m_reorderedPts = ptsidx++;
1025
+        m_outputQueue.pushBack(*listbframes);
1026
+
1027
+        /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
1028
+        if (brefs)
1029
         {
1030
-            listi->m_reorderedPts = ptsidx++;
1031
-            m_outputQueue.pushBack(*listi);
1032
+            for (int i = 0; i < bframes; i++)
1033
+            {
1034
+                if (listi->m_lowres.sliceType == X265_TYPE_BREF)
1035
+                {
1036
+                    listi->m_reorderedPts = ptsidx++;
1037
+                    m_outputQueue.pushBack(*listi);
1038
+                }
1039
+            }
1040
         }
1041
-    }
1042
 
1043
-    bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
1044
-    if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
1045
-    {
1046
-        m_inputLock.acquire();
1047
-        Frame *curFrame = m_inputQueue.first();
1048
-        frames0 = m_lastNonB;
1049
-        int j;
1050
-        for (j = 0; j < maxSearch; j++)
1051
+        /* add B frames to output queue */
1052
+        for (int i = 0; i < bframes; i++)
1053
         {
1054
-            framesj + 1 = &curFrame->m_lowres;
1055
-            curFrame = curFrame->m_next;
1056
+            /* push all the B frames into output queue except B-ref, which already pushed into output queue */
1057
+            if (listi->m_lowres.sliceType != X265_TYPE_BREF)
1058
+            {
1059
+                listi->m_reorderedPts = ptsidx++;
1060
+                m_outputQueue.pushBack(*listi);
1061
+            }
1062
         }
1063
-        m_inputLock.release();
1064
 
1065
-        framesj + 1 = NULL;
1066
-        if (!m_param->rc.bStatRead)
1067
-            slicetypeAnalyse(frames, true);
1068
-        bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1069
-        if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
1070
+
1071
+        bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
1072
+        if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
1073
         {
1074
-            int numFrames;
1075
-            for (numFrames = 0; numFrames < maxSearch; numFrames++)
1076
+            m_inputLock.acquire();
1077
+            Frame *curFrame = m_inputQueue.first();
1078
+            frames0 = m_lastNonB;
1079
+            int j;
1080
+            for (j = 0; j < maxSearch; j++)
1081
+            {
1082
+                framesj + 1 = &curFrame->m_lowres;
1083
+                curFrame = curFrame->m_next;
1084
+            }
1085
+            m_inputLock.release();
1086
+
1087
+            framesj + 1 = NULL;
1088
+            if (!m_param->rc.bStatRead)
1089
+                slicetypeAnalyse(frames, true);
1090
+            bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1091
+            if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
1092
             {
1093
-                Lowres *fenc = framesnumFrames + 1;
1094
-                if (!fenc)
1095
-                    break;
1096
+                int numFrames;
1097
+                for (numFrames = 0; numFrames < maxSearch; numFrames++)
1098
+                {
1099
+                    Lowres *fenc = framesnumFrames + 1;
1100
+                    if (!fenc)
1101
+                        break;
1102
+                }
1103
+                vbvLookahead(frames, numFrames, true);
1104
             }
1105
-            vbvLookahead(frames, numFrames, true);
1106
         }
1107
+
1108
+        m_outputLock.release();
1109
     }
1110
-    m_outputLock.release();
1111
 }
1112
 
1113
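
In the bEnableTemporalSubLayers > 2 paths above, encode order and temporal layers come from the fixed random-access tables (x265_gop_ra, indexed by gopId), not from measured costs. A self-contained sketch of that table-driven reordering for one 4-frame mini GOP, using a made-up table of the same shape (the real x265_gop_ra entries live in the 3.6 sources and are not reproduced here):

    #include <cstdio>

    // Hypothetical stand-in for one x265_gop_ra row: poc_offset is the 1-based
    // display position inside the mini GOP, layer the temporal sub-layer id.
    struct GopEntry { int poc_offset; int layer; };
    static const GopEntry gop4[4] = {
        { 4, 0 },   // the anchor (non-B) is encoded first
        { 2, 1 },   // middle B-ref
        { 1, 2 },   // leading B
        { 3, 2 },   // trailing B
    };

    int main()
    {
        int poc[4] = { 1, 2, 3, 4 }; // display order of one mini GOP
        for (int j = 0; j < 4; j++)
        {
            int offset = gop4[j].poc_offset - 1; // index into display order
            std::printf("encode poc %d, gopOffset %d, tempLayer %d\n",
                        poc[offset], j, gop4[j].layer);
        }
        return 0;
    }
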
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
@@ -1909,6 +2622,8 @@
             nextZoneStart += (i + 1 < m_param->rc.zonefileCount) ? m_param->rc.zones[i + 1].startFrame + m_param->rc.zones[i + 1].zoneParam->radl : m_param->totalFrames;
             if (curZoneStart <= frames[0]->frameNum && nextZoneStart > frames[0]->frameNum)
                 m_param->keyframeMax = nextZoneStart - curZoneStart;
+            if (m_param->rc.zones[m_param->rc.zonefileCount - 1].startFrame <= frames[0]->frameNum && nextZoneStart == 0)
+                m_param->keyframeMax = m_param->rc.zones[0].keyframeMax;
         }
     }
     int keylimit = m_param->keyframeMax;
@@ -2013,44 +2728,13 @@
     int numAnalyzed = numFrames;
     bool isScenecut = false;
 
-    /* Temporal computations for scenecut detection */
     if (m_param->bHistBasedSceneCut)
-    {
-        for (int i = numFrames - 1; i > 0; i--)
-        {
-            if (frames[i]->interPCostPercDiff > 0.0)
-                continue;
-            int64_t interCost = frames[i]->costEst[1][0];
-            int64_t intraCost = frames[i]->costEst[0][0];
-            if (interCost < 0 || intraCost < 0)
-                continue;
-            int times = 0;
-            double averagePcost = 0.0, averageIcost = 0.0;
-            for (int j = i - 1; j >= 0 && times < 5; j--, times++)
-            {
-                if (frames[j]->costEst[0][0] > 0 && frames[j]->costEst[1][0] > 0)
-                {
-                    averageIcost += frames[j]->costEst[0][0];
-                    averagePcost += frames[j]->costEst[1][0];
-                }
-                else
-                    times--;
-            }
-            if (times)
-            {
-                averageIcost = averageIcost / times;
-                averagePcost = averagePcost / times;
-                frames[i]->interPCostPercDiff = abs(interCost - averagePcost) / X265_MIN(interCost, averagePcost) * 100;
-                frames[i]->intraCostPercDiff = abs(intraCost - averageIcost) / X265_MIN(intraCost, averageIcost) * 100;
-            }
-        }
-    }
-
-    /* When scenecut threshold is set, use scenecut detection for I frame placements */
-    if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frames[1]->bScenecut))
+        isScenecut = histBasedScenecut(frames, 0, 1, origNumFrames);
+    else
         isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
 
-    if (isScenecut && (m_param->bHistBasedSceneCut || m_param->scenecutThreshold))
+    /* When scenecut threshold is set, use scenecut detection for I frame placements */
+    if (m_param->scenecutThreshold && isScenecut)
     {
         frames[1]->sliceType = X265_TYPE_I;
         return;
@@ -2061,8 +2745,7 @@
         m_extendGopBoundary = false;
         for (int i = m_param->bframes + 1; i < origNumFrames; i += m_param->bframes + 1)
         {
-            if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frames[i + 1]->bScenecut))
-                scenecut(frames, i, i + 1, true, origNumFrames);
+            scenecut(frames, i, i + 1, true, origNumFrames);
 
             for (int j = i + 1; j <= X265_MIN(i + m_param->bframes + 1, origNumFrames); j++)
             {
@@ -2175,10 +2858,8 @@
         {
             for (int j = 1; j < numBFrames + 1; j++)
             {
-                bool isNextScenecut = false;
-                if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frames[j + 1]->bScenecut))
-                    isNextScenecut = scenecut(frames, j, j + 1, false, origNumFrames);
-                if (isNextScenecut || (bForceRADL && frames[j]->frameNum == preRADL))
+                if (scenecut(frames, j, j + 1, false, origNumFrames) ||
+                    (bForceRADL && (frames[j]->frameNum == preRADL)))
                 {
                     frames[j]->sliceType = X265_TYPE_P;
                     numAnalyzed = j;
@@ -2244,9 +2925,10 @@
         /* Where A and B are scenes: AAAAAABBBAAAAAA
          * If BBB is shorter than (maxp1-p0), it is detected as a flash
          * and not considered a scenecut. */
+
         for (int cp1 = p1; cp1 <= maxp1; cp1++)
         {
-            if (!scenecutInternal(frames, p0, cp1, false) && !m_param->bHistBasedSceneCut)
+            if (!scenecutInternal(frames, p0, cp1, false))
             {
                 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
                 for (int i = cp1; i > p0; i--)
@@ -2255,7 +2937,7 @@
                     noScenecuts = false;
                 }
             }
-            else if ((m_param->bHistBasedSceneCut && frames[cp1]->m_bIsMaxThres) || scenecutInternal(frames, cp1 - 1, cp1, false))
+            else if (scenecutInternal(frames, cp1 - 1, cp1, false))
            {
                 /* If current frame is a Scenecut from p0 frame as well as Scenecut from
                  * preceeding frame, mark it as a Scenecut */
@@ -2316,9 +2998,6 @@
 
     if (!frames[p1]->bScenecut)
         return false;
-    /* Check only scene transitions if max threshold */
-    if (m_param->bHistBasedSceneCut && frames[p1]->m_bIsMaxThres)
-        return frames[p1]->bScenecut;
 
     return scenecutInternal(frames, p0, p1, bRealScenecut);
 }
@@ -2336,19 +3015,8 @@
     /* magic numbers pulled out of thin air */
     float threshMin = (float)(threshMax * 0.25);
     double bias = m_param->scenecutBias;
-    if (m_param->bHistBasedSceneCut)
-    {
-        double minT = TEMPORAL_SCENECUT_THRESHOLD * (1 + m_param->edgeTransitionThreshold);
-        if (frame->interPCostPercDiff > minT || frame->intraCostPercDiff > minT)
-        {
-            if (bRealScenecut && frame->bScenecut)
-                x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d \n", frame->frameNum);
-            return frame->bScenecut;
-        }
-        else
-            return false;
-    }
-    else if (bRealScenecut)
+
+    if (bRealScenecut)
     {
         if (m_param->keyframeMin == m_param->keyframeMax)
             threshMin = threshMax;
@@ -2375,6 +3043,167 @@
     return res;
 }
 
+bool Lookahead::detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2)
+{
+    bool isAbruptChange;
+    bool isSceneChange;
+
+    Lowres  *previousFrame = frames[p0];
+    Lowres  *currentFrame = frames[p1];
+    Lowres  *futureFrame = frames[p2];
+
+    currentFrame->bHistScenecutAnalyzed = true;
+
+    uint32_t **accHistDiffRunningAvgCb = m_accHistDiffRunningAvgCb;
+    uint32_t **accHistDiffRunningAvgCr = m_accHistDiffRunningAvgCr;
+    uint32_t **accHistDiffRunningAvg = m_accHistDiffRunningAvg;
+
+    uint8_t absIntDiffFuturePast = 0;
+    uint8_t absIntDiffFuturePresent = 0;
+    uint8_t absIntDiffPresentPast = 0;
+
+    uint32_t abruptChangeCount = 0;
+    uint32_t sceneChangeCount = 0;
+
+    uint32_t segmentWidth = frames[1]->widthFullRes / NUMBER_OF_SEGMENTS_IN_WIDTH;
+    uint32_t segmentHeight = frames[1]->heightFullRes / NUMBER_OF_SEGMENTS_IN_HEIGHT;
+
+    for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
+    {
+        for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
+        {
+            isAbruptChange = false;
+            isSceneChange = false;
+
+            // accumulative absolute histogram differences between the past and current frame
+            uint32_t accHistDiff = 0;
+            uint32_t accHistDiffCb = 0;
+            uint32_t accHistDiffCr = 0;
+
+            uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
+                frames[1]->widthFullRes - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
+
+            uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
+                frames[1]->heightFullRes - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
+
+            segmentWidth += segmentWidthOffset;
+            segmentHeight += segmentHeightOffset;
+
+            uint32_t segmentThreshHold = (
+                ((X265_ABS((int64_t)currentFrame->picAvgVariance - (int64_t)previousFrame->picAvgVariance)) > PICTURE_DIFF_VARIANCE_TH) &&
+                (currentFrame->picAvgVariance > PICTURE_VARIANCE_TH || previousFrame->picAvgVariance > PICTURE_VARIANCE_TH)) ?
+                HIGH_VAR_SCENE_CHANGE_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
+
+            uint32_t segmentThreshHoldCb = (
+                ((X265_ABS((int64_t)currentFrame->picAvgVarianceCb - (int64_t)previousFrame->picAvgVarianceCb)) > PICTURE_DIFF_VARIANCE_CHROMA_TH) &&
+                (currentFrame->picAvgVarianceCb > PICTURE_VARIANCE_CHROMA_TH || previousFrame->picAvgVarianceCb > PICTURE_VARIANCE_CHROMA_TH)) ?
+                HIGH_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
+
+            uint32_t segmentThreshHoldCr = (
+                ((X265_ABS((int64_t)currentFrame->picAvgVarianceCr - (int64_t)previousFrame->picAvgVarianceCr)) > PICTURE_DIFF_VARIANCE_CHROMA_TH) &&
+                (currentFrame->picAvgVarianceCr > PICTURE_VARIANCE_CHROMA_TH || previousFrame->picAvgVarianceCr > PICTURE_VARIANCE_CHROMA_TH)) ?
+                HIGH_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
+
+            for (uint32_t bin = 0; bin < HISTOGRAM_NUMBER_OF_BINS; ++bin) {
+                accHistDiff += X265_ABS((int32_t)currentFrame->picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][bin] - (int32_t)previousFrame->picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0][bin]);
+                accHistDiffCb += X265_ABS((int32_t)currentFrame->picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][bin] - (int32_t)previousFrame->picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][1][bin]);
+                accHistDiffCr += X265_ABS((int32_t)currentFrame->picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][bin] - (int32_t)previousFrame->picHistogram[segmentInFrameWidthIndex][segmentInFrameHeightIndex][2][bin]);
+            }
+
+            if (m_resetRunningAvg) {
+                accHistDiffRunningAvg[segmentInFrameWidthIndex][segmentInFrameHeightIndex] = accHistDiff;
+                accHistDiffRunningAvgCb[segmentInFrameWidthIndex][segmentInFrameHeightIndex] = accHistDiffCb;
+                accHistDiffRunningAvgCr[segmentInFrameWidthIndex][segmentInFrameHeightIndex] = accHistDiffCr;
+            }
+
+            // difference between accumulative absolute histogram differences and the running average at the current frame.
+            uint32_t accHistDiffError = X265_ABS((int32_t)accHistDiffRunningAvg[segmentInFrameWidthIndex][segmentInFrameHeightIndex] - (int32_t)accHistDiff);
+            uint32_t accHistDiffErrorCb = X265_ABS((int32_t)accHistDiffRunningAvgCb[segmentInFrameWidthIndex][segmentInFrameHeightIndex] - (int32_t)accHistDiffCb);
+            uint32_t accHistDiffErrorCr = X265_ABS((int32_t)accHistDiffRunningAvgCr[segmentInFrameWidthIndex][segmentInFrameHeightIndex] - (int32_t)accHistDiffCr);
+
+            if ((accHistDiffError > segmentThreshHold     && accHistDiff >= accHistDiffError) ||
+                (accHistDiffErrorCb > segmentThreshHoldCb && accHistDiffCb >= accHistDiffErrorCb) ||
+                (accHistDiffErrorCr > segmentThreshHoldCr && accHistDiffCr >= accHistDiffErrorCr)) {
+
+                isAbruptChange = true;
+            }
+
+            if (isAbruptChange)
+            {
+                absIntDiffFuturePast = (uint8_t)X265_ABS((int16_t)futureFrame->averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0] - (int16_t)previousFrame->averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0]);
+                absIntDiffFuturePresent = (uint8_t)X265_ABS((int16_t)futureFrame->averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0] - (int16_t)currentFrame->averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0]);
+                absIntDiffPresentPast = (uint8_t)X265_ABS((int16_t)currentFrame->averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0] - (int16_t)previousFrame->averageIntensityPerSegment[segmentInFrameWidthIndex][segmentInFrameHeightIndex][0]);
+
+                if (absIntDiffFuturePresent >= FLASH_TH * absIntDiffFuturePast && absIntDiffPresentPast >= FLASH_TH * absIntDiffFuturePast) {
+                    x265_log(m_param, X265_LOG_DEBUG, "Flash in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
+                }
+                else if (absIntDiffFuturePresent < FADE_TH && absIntDiffPresentPast < FADE_TH) {
+                    x265_log(m_param, X265_LOG_DEBUG, "Fade in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
+                }
+                else if (X265_ABS(absIntDiffFuturePresent - absIntDiffPresentPast) < INTENSITY_CHANGE_TH && absIntDiffFuturePresent + absIntDiffPresentPast >= absIntDiffFuturePast) {
+                    x265_log(m_param, X265_LOG_DEBUG, "Intensity Change in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
+                }
+                else {
+                    isSceneChange = true;
+                    x265_log(m_param, X265_LOG_DEBUG, "Scene change in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
+                }
+
+            }
+            else {
+                accHistDiffRunningAvg[segmentInFrameWidthIndex][segmentInFrameHeightIndex] = (3 * accHistDiffRunningAvg[segmentInFrameWidthIndex][segmentInFrameHeightIndex] + accHistDiff) / 4;
+            }
+
+            abruptChangeCount += isAbruptChange;
+            sceneChangeCount += isSceneChange;
+        }
+    }
+
+    if (abruptChangeCount >= m_segmentCountThreshold) {
+        m_resetRunningAvg = true;
+    }
+    else {
+        m_resetRunningAvg = false;
+    }
+
+    if ((sceneChangeCount >= m_segmentCountThreshold)) {
+        x265_log(m_param, X265_LOG_DEBUG, "Scene Change in Pic Number# %i\n", currentFrame->frameNum);
+
+        return true;
+    }
+    else {
+        return false;
+    }
+
+}
+
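
Each segment's verdict above hinges on a decayed running average of accumulated histogram differences: only when the new difference deviates from the average by more than a variance-dependent threshold is the segment flagged abrupt, and flashes/fades are then filtered out by the three-frame intensity test. A minimal sketch of just the running-average update, using the same (3 * avg + diff) / 4 weighting (threshold value illustrative; the real one scales with NUM64x64INPIC):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint32_t runningAvg = 1000;                        // one segment's state
        uint32_t diffs[5] = { 950, 1020, 980, 6000, 990 }; // 6000 ~ hard cut
        uint32_t threshold = 2250;                         // cf. LOW_VAR_SCENE_CHANGE_TH
        for (int i = 0; i < 5; i++)
        {
            uint32_t err = diffs[i] > runningAvg ? diffs[i] - runningAvg
                                                 : runningAvg - diffs[i];
            if (err > threshold && diffs[i] >= err)
                std::printf("frame %d: abrupt change (err %u)\n", i, err);
            else
                runningAvg = (3 * runningAvg + diffs[i]) / 4; // decay update
        }
        return 0;
    }
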
+bool Lookahead::histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames)
+{
+    /* Only do analysis during a normal scenecut check. */
+    if (m_param->bframes)
+    {
+        int origmaxp1 = p0 + 1;
+        /* Look ahead to avoid coding short flashes as scenecuts. */
+        origmaxp1 += m_param->bframes;
+        int maxp1 = X265_MIN(origmaxp1, numFrames);
+
+        for (int cp1 = p0; cp1 < maxp1; cp1++)
+        {
+            if (frames[cp1 + 1]->bHistScenecutAnalyzed == true)
+                continue;
+
+            if (frames[cp1 + 2] != NULL && detectHistBasedSceneChange(frames, cp1, cp1 + 1, cp1 + 2))
+            {
+                /* If current frame is a Scenecut from p0 frame as well as Scenecut from
+                 * preceeding frame, mark it as a Scenecut */
+                frames[cp1+1]->bScenecut = true;
+            }
+        }
+
+    }
+
+    return frames[p1]->bScenecut;
+}
+
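
Worked example of the window above: with p0 = 0 and --bframes 4, origmaxp1 is 0 + 1 + 4 = 5, so the loop considers cp1 = 0..4 and hands detectHistBasedSceneChange() the (past, current, future) triples (0, 1, 2) through (4, 5, 6), provided those lookahead slots are populated and not already flagged bHistScenecutAnalyzed. Any cut found inside the next mini GOP is therefore marked before slice types are fixed.
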
 void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1])
 {
     char paths[2][X265_LOOKAHEAD_MAX + 1];
@@ -2404,6 +3233,27 @@
     memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length);
 }
 
+// Find slicetype of the frame with poc # in lookahead buffer
+int Lookahead::findSliceType(int poc)
+{
+    int out_slicetype = X265_TYPE_AUTO;
+    if (m_filled)
+    {
+        m_outputLock.acquire();
+        Frame* out = m_outputQueue.first();
+        while (out != NULL) {
+            if (poc == out->m_poc)
+            {
+                out_slicetype = out->m_lowres.sliceType;
+                break;
+            }
+            out = out->m_next;
+        }
+        m_outputLock.release();
+    }
+    return out_slicetype;
+}
+
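
findSliceType() gives API-side code a non-blocking way to read a decided slice type before the frame is encoded; X265_TYPE_AUTO signals that the POC is not (yet) in the output queue. A hedged usage sketch, assuming the caller holds a Lookahead pointer inside the encoder (this compiles against the internal 3.6 headers, not the public API; the helper name is hypothetical):

    // Sketch: returns true once the lookahead has committed a type for `poc`.
    static bool sliceTypeDecided(Lookahead* lookahead, int poc, int& typeOut)
    {
        typeOut = lookahead->findSliceType(poc);
        return typeOut != X265_TYPE_AUTO; // AUTO: not decided/queued yet
    }
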
 int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold)
 {
     int64_t cost = 0;

x265_3.5.tar.gz/source/encoder/slicetype.h -> x265_3.6.tar.gz/source/encoder/slicetype.h Changed
 
@@ -44,6 +44,24 @@
 #define EDGE_INCLINATION 45
 #define TEMPORAL_SCENECUT_THRESHOLD 50
 
+#define X265_ABS(a)                        (((a) < 0) ? (-(a)) : (a))
+
+#define PICTURE_DIFF_VARIANCE_TH            390
+#define PICTURE_VARIANCE_TH                 1500
+#define LOW_VAR_SCENE_CHANGE_TH             2250
+#define HIGH_VAR_SCENE_CHANGE_TH            3500
+
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH     10
+#define PICTURE_VARIANCE_CHROMA_TH          20
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH      2250/4
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH     3500/4
+
+#define FLASH_TH                            1.5
+#define FADE_TH                             4
+#define INTENSITY_CHANGE_TH                 4
+
+#define NUM64x64INPIC(w,h)                  ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
+
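
With MAX_LOG2_CU_SIZE = 6 (the x265 default), NUM64x64INPIC(w, h) is (w*h) >> 12, i.e. the count of 64x64 blocks: for 1920x1080 that is 2073600 / 4096, truncated to 506, so LOW_VAR_SCENE_CHANGE_TH scales to roughly 506 * 2250 for a full frame. Note the macro body does not parenthesize w and h, so it is only safe with simple variable arguments, which is how slicetype.cpp calls it. A minimal check (MAX_LOG2_CU_SIZE hard-coded here as an assumption):

    #include <cstdio>
    #define MAX_LOG2_CU_SIZE 6 // assumed x265 default
    #define NUM64x64INPIC(w,h) ((w*h) >> (MAX_LOG2_CU_SIZE << 1))

    int main()
    {
        std::printf("%d\n", NUM64x64INPIC(1920, 1080)); // prints 506
        return 0;
    }
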
 #if HIGH_BIT_DEPTH
 #define EDGE_THRESHOLD 1023.0
 #else
@@ -93,7 +111,29 @@
 
     ~LookaheadTLD() { X265_FREE(wbuffer[0]); }
 
+    void collectPictureStatistics(Frame *curFrame);
+    void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
+
+    void computeIntensityHistogramBinsChroma(
+        Frame    *curFrame,
+        uint64_t *sumAverageIntensityCb,
+        uint64_t *sumAverageIntensityCr);
+
+    void calculateHistogram(
+        pixel    *inputSrc,
+        uint32_t  inputWidth,
+        uint32_t  inputHeight,
+        intptr_t  stride,
+        uint8_t   dsFactor,
+        uint32_t *histogram,
+        uint64_t *sum);
+
+    void computePictureStatistics(Frame *curFrame);
+
+    uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
+
     void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void calcFrameSegment(Frame *curFrame);
     void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
 
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
@@ -124,7 +164,6 @@
 
     /* pre-lookahead */
     int           m_fullQueueSize;
-    int           m_histogram[X265_BFRAME_MAX + 1];
     int           m_lastKeyframe;
     int           m_8x8Width;
     int           m_8x8Height;
@@ -153,6 +192,16 @@
     bool          m_isFadeIn;
     uint64_t      m_fadeCount;
     int           m_fadeStart;
+
+    uint32_t    **m_accHistDiffRunningAvgCb;
+    uint32_t    **m_accHistDiffRunningAvgCr;
+    uint32_t    **m_accHistDiffRunningAvg;
+
+    bool          m_resetRunningAvg;
+    uint32_t      m_segmentCountThreshold;
+
+    int8_t                  m_gopId;
+
     Lookahead(x265_param *param, ThreadPool *pool);
 #if DETAILED_CU_STATS
     int64_t       m_slicetypeDecideElapsedTime;
@@ -174,6 +223,7 @@
 
     void    getEstimatedPictureCost(Frame *pic);
     void    setLookaheadQueue();
+    int     findSliceType(int poc);
 
 protected:
 
@@ -184,6 +234,10 @@
     /* called by slicetypeAnalyse() to make slice decisions */
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
     bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
+
+    bool    histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
+    bool    detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
+
     void    slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
     int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
     int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
@@ -199,6 +253,9 @@
 
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
+    /*Compute index for positioning B-Ref frames*/
+    void     placeBref(Frame** frames, int start, int end, int num, int *brefs);
+    void     compCostBref(Lowres **frame, int start, int end, int num);
 };
 
 class PreLookaheadGroup : public BondedTaskGroup

x265_3.5.tar.gz/source/output/output.cpp -> x265_3.6.tar.gz/source/output/output.cpp Changed
 
@@ -30,14 +30,14 @@
 
 using namespace X265_NS;
 
-ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth)
 {
     const char * s = strrchr(fname, '.');
 
     if (s && !strcmp(s, ".y4m"))
-        return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp);
+        return new Y4MOutput(fname, width, height, bitdepth, fpsNum, fpsDenom, csp, sourceBitDepth);
     else
-        return new YUVOutput(fname, width, height, bitdepth, csp);
+        return new YUVOutput(fname, width, height, bitdepth, csp, sourceBitDepth);
 }
 
 OutputFile* OutputFile::open(const char *fname, InputFileInfo& inputInfo)
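
Call sites of ReconFile::open() gain a trailing sourceBitDepth argument so the writers can choose between pass-through and down-shifting. A hedged call-site sketch (file name and dimensions illustrative; X265_CSP_I420 from x265.h):

    // Sketch: 10-bit recon written as y4m while the source input was 8-bit.
    ReconFile* recon = ReconFile::open("recon.y4m", 1920, 1080,
                                       10,            /* recon bitdepth */
                                       25, 1,         /* fps 25/1 */
                                       X265_CSP_I420,
                                       8              /* sourceBitDepth, new in 3.6 */);
    if (recon && !recon->isFail())
    {
        /* call recon->writePicture(pic) once per reconstructed frame */
    }
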

x265_3.5.tar.gz/source/output/output.h -> x265_3.6.tar.gz/source/output/output.h Changed
 
@@ -42,7 +42,7 @@
     ReconFile()           {}
 
     static ReconFile* open(const char *fname, int width, int height, uint32_t bitdepth,
-                           uint32_t fpsNum, uint32_t fpsDenom, int csp);
+                           uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth);
 
     virtual bool isFail() const = 0;
 

x265_3.5.tar.gz/source/output/y4m.cpp -> x265_3.6.tar.gz/source/output/y4m.cpp Changed
 
@@ -28,11 +28,13 @@
 using namespace X265_NS;
 using namespace std;
 
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
     : width(w)
     , height(h)
+    , bitDepth(bitdepth)
     , colorSpace(csp)
     , frameSize(0)
+    , inputDepth(inputdepth)
 {
     ofs.open(filename, ios::binary | ios::out);
     buf = new char[width];
@@ -41,7 +43,13 @@
 
     if (ofs)
     {
-        ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+        if (bitDepth == 10)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
+        else if (bitDepth == 12)
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
+        else
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
+
         header = ofs.tellp();
     }
 
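
With cf = "420", a 10-bit 1920x1080 25 fps stream therefore starts with the signature below; note the code above emits spaces around the '=' of XYSCSS, which strict y4m parsers may not accept:

    YUV4MPEG2 W1920 H1080 F25:1 Ip C420p10 XYSCSS = 420P10
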
@@ -58,52 +66,81 @@
 bool Y4MOutput::writePicture(const x265_picture& pic)
 {
     std::ofstream::pos_type outPicPos = header;
-    outPicPos += (uint64_t)pic.poc * (6 + frameSize);
+    if (pic.bitDepth > 8)
+        outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
+    else
+        outPicPos += (uint64_t)pic.poc * (6 + frameSize);
     ofs.seekp(outPicPos);
     ofs << "FRAME\n";
 
-#if HIGH_BIT_DEPTH
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
-#else
-    if (pic.bitDepth > 8 && pic.poc == 0)
-        x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
-#endif
+    if (inputDepth > 8)
+    {
+        if (pic.bitDepth == 8 && pic.poc == 0)
+            x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
+    }
 
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
 
-#if HIGH_BIT_DEPTH
-
-    // encoder gave us short pixels, downshift, then write
-    X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
-    int shift = pic.bitDepth - 8;
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+    if (inputDepth > 8)//if HIGH_BIT_DEPTH
     {
-        uint16_t *src = (uint16_t*)pic.planes[i];
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+        if (pic.bitDepth == 8)
         {
-            for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
-                buf[w] = (char)(src[w] >> shift);
-
-            ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
-            src += pic.stride[i] / sizeof(*src);
+            // encoder gave us short pixels, downshift, then write
+            X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+            int shift = pic.bitDepth - 8;
+            for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+            {
+                char *src = (char*)pic.planes[i];
+                for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+                {
+                    for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
+                        buf[w] = (char)(src[w] >> shift);
+
+                    ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
+                    src += pic.stride[i] / sizeof(*src);
+                }
+            }
+        }
+        else
+        {
+            X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+            for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+            {
+                uint16_t *src = (uint16_t*)pic.planes[i];
+                for (int h = 0; h < (height * 1) >> x265_cli_csps[colorSpace].height[i]; h++)
+                {
+                    ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
+                    src += pic.stride[i] / sizeof(*src);
+                }
+            }
         }
     }
-
-#else // if HIGH_BIT_DEPTH
-
-    X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+    else if (inputDepth == 8 && pic.bitDepth > 8)
     {
-        char *src = (char*)pic.planes[i];
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+        X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
        {
-            ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
-            src += pic.stride[i] / sizeof(*src);
+            uint16_t* src = (uint16_t*)pic.planes[i];
+            for (int h = 0; h < (height * 1) >> x265_cli_csps[colorSpace].height[i]; h++)
+            {
+                ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
+                src += pic.stride[i] / sizeof(*src);
+            }
+        }
+    }
+    else
+    {
+        X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
+        {
+            char *src = (char*)pic.planes[i];
+            for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
+            {
+                ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
+                src += pic.stride[i] / sizeof(*src);
+            }
         }
     }
-
-#endif // if HIGH_BIT_DEPTH
 
     return true;
 }

x265_3.5.tar.gz/source/output/y4m.h -> x265_3.6.tar.gz/source/output/y4m.h Changed
 
@@ -38,10 +38,14 @@
 
     int height;
 
+    uint32_t bitDepth;
+
     int colorSpace;
 
     uint32_t frameSize;
 
+    int inputDepth;
+
     std::ofstream ofs;
 
     std::ofstream::pos_type header;
@@ -52,7 +56,7 @@
 
 public:
 
-    Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp);
+    Y4MOutput(const char *filename, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputDepth);
 
     virtual ~Y4MOutput();
 
x265_3.5.tar.gz/source/output/yuv.cpp -> x265_3.6.tar.gz/source/output/yuv.cpp Changed
107
 
1
@@ -28,12 +28,13 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 
5
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
6
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
7
     : width(w)
8
     , height(h)
9
     , depth(d)
10
     , colorSpace(csp)
11
     , frameSize(0)
12
+    , inputDepth(inputdepth)
13
 {
14
     ofs.open(filename, ios::binary | ios::out);
15
     buf = new charwidth;
16
@@ -56,50 +57,52 @@
17
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
18
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
19
 
20
-#if HIGH_BIT_DEPTH
21
-    if (depth == 8)
22
+    if (inputDepth > 8)
23
     {
24
-        int shift = pic.bitDepth - 8;
25
-        ofs.seekp((std::streamoff)fileOffset);
26
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
27
-        {
28
-            uint16_t *src = (uint16_t*)pic.planesi;
29
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
30
-            {
31
-                for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
32
-                    bufw = (char)(srcw >> shift);
33
+   if (depth == 8)
34
+   {
35
+       int shift = pic.bitDepth - 8;
36
+       ofs.seekp((std::streamoff)fileOffset);
37
+       for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
38
+       {
39
+           uint16_t *src = (uint16_t*)pic.planesi;
40
+           for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
41
+           {
42
+               for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
43
+                   bufw = (char)(srcw >> shift);
44
 
45
-                ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
46
-                src += pic.stridei / sizeof(*src);
47
-            }
48
-        }
49
+               ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
50
+               src += pic.stridei / sizeof(*src);
51
+           }
52
+       }
53
+   }
54
+   else
55
+   {
56
+       ofs.seekp((std::streamoff)(fileOffset * 2));
57
+       for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
58
+       {
59
+           uint16_t *src = (uint16_t*)pic.planes[i];
60
+           for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
61
+           {
62
+               ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
63
+               src += pic.stride[i] / sizeof(*src);
64
+           }
65
+       }
66
+   }
67
     }
68
     else
69
     {
70
-        ofs.seekp((std::streamoff)(fileOffset * 2));
71
-        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
72
-        {
73
-            uint16_t *src = (uint16_t*)pic.planes[i];
74
-            for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
75
-            {
76
-                ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]);
77
-                src += pic.stride[i] / sizeof(*src);
78
-            }
79
-        }
80
+   ofs.seekp((std::streamoff)fileOffset);
81
+   for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
82
+   {
83
+       char *src = (char*)pic.planes[i];
84
+       for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
85
+       {
86
+           ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
87
+           src += pic.stride[i] / sizeof(*src);
88
+       }
89
+   }
90
     }
91
-#else // if HIGH_BIT_DEPTH
92
-    ofs.seekp((std::streamoff)fileOffset);
93
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
94
-    {
95
-        char *src = (char*)pic.planes[i];
96
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
97
-        {
98
-            ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
99
-            src += pic.stride[i] / sizeof(*src);
100
-        }
101
-    }
102
-
103
-#endif // if HIGH_BIT_DEPTH
104
 
105
     return true;
106
 }
107
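
In the rewritten outputFrame() above, the compile-time HIGH_BIT_DEPTH split becomes a runtime check on inputDepth: when the encoder runs at a high bit depth but the output file is 8-bit, every sample is right-shifted by (bitDepth - 8) before being written. A minimal standalone sketch of that conversion, with hypothetical buffer names and no plane/stride handling:

    #include <cstdint>
    #include <cstddef>

    // Downshift one row of high-bit-depth samples to 8 bits, mirroring the
    // depth == 8 path above; e.g. a 10-bit input gives shift = 2.
    static void downshiftRow(const uint16_t* src, char* dst, size_t n, int bitDepth)
    {
        const int shift = bitDepth - 8;
        for (size_t w = 0; w < n; w++)
            dst[w] = (char)(src[w] >> shift); // keep the 8 most significant bits
    }
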
x265_3.5.tar.gz/source/output/yuv.h -> x265_3.6.tar.gz/source/output/yuv.h Changed
18
 
1
@@ -46,13 +46,15 @@
2
 
3
     uint32_t frameSize;
4
 
5
+    int inputDepth;
6
+
7
     char *buf;
8
 
9
     std::ofstream ofs;
10
 
11
 public:
12
 
13
-    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp);
14
+    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp, int inputDepth);
15
 
16
     virtual ~YUVOutput();
17
 
18
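
The widened constructor lets the YUV writer distinguish the encoder's internal bit depth from the input depth, which drives the runtime conversion in yuv.cpp above. A hedged usage sketch, with illustrative values only:

    // Hypothetical call site (assumes output/yuv.h and x265.h are included);
    // a 10-bit input written back as a 10-bit reconstruction file.
    YUVOutput recon("recon.yuv", 1920, 1080,
                    10 /* encoder bit depth */, X265_CSP_I420,
                    10 /* inputDepth */);
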
x265_3.5.tar.gz/source/test/CMakeLists.txt -> x265_3.6.tar.gz/source/test/CMakeLists.txt Changed
24
 
1
@@ -23,15 +23,13 @@
2
 
3
 # add ARM assembly files
4
 if(ARM OR CROSS_COMPILE_ARM)
5
-    if(NOT ARM64)
6
-        enable_language(ASM)
7
-        set(NASM_SRC checkasm-arm.S)
8
-        add_custom_command(
9
-            OUTPUT checkasm-arm.obj
10
-            COMMAND ${CMAKE_CXX_COMPILER}
11
-            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
12
-            DEPENDS checkasm-arm.S)
13
-    endif()
14
+    enable_language(ASM)
15
+    set(NASM_SRC checkasm-arm.S)
16
+    add_custom_command(
17
+        OUTPUT checkasm-arm.obj
18
+        COMMAND ${CMAKE_CXX_COMPILER}
19
+        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
20
+        DEPENDS checkasm-arm.S)
21
 endif(ARM OR CROSS_COMPILE_ARM)
22
 
23
 # add PowerPC assembly files
24
x265_3.5.tar.gz/source/test/pixelharness.cpp -> x265_3.6.tar.gz/source/test/pixelharness.cpp Changed
63
 
1
@@ -406,6 +406,32 @@
2
     return true;
3
 }
4
 
5
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
6
+{
7
+    ALIGN_VAR_16(pixel, ref_destf[32 * 32]);
8
+    ALIGN_VAR_16(pixel, opt_destf[32 * 32]);
9
+
10
+    intptr_t src_stride = 64;
11
+    intptr_t dst_stride = 32;
12
+    int bx = 32;
13
+    int by = 32;
14
+    int j = 0;
15
+    for (int i = 0; i < ITERS; i++)
16
+    {
17
+        int index = i % TEST_CASES;
18
+        ref(pixel_test_buff[index] + j, ref_destf, src_stride, dst_stride, bx, by);
19
+        checked(opt, pixel_test_buff[index] + j, opt_destf, src_stride, dst_stride, bx, by);
20
+
21
+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
22
+            return false;
23
+
24
+        reportfail();
25
+        j += INCR;
26
+    }
27
+
28
+    return true;
29
+}
30
+
31
 bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
32
 {
33
     ALIGN_VAR_16(int16_t, ref_dest64 * 64);
34
@@ -2793,6 +2819,15 @@
35
         }
36
     }
37
 
38
+    if (opt.frameSubSampleLuma)
39
+    {
40
+        if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
41
+        {
42
+            printf("SubSample Luma failed!\n");
43
+            return false;
44
+        }
45
+    }
46
+
47
     if (opt.scale1D_128to64NONALIGNED)
48
     {
49
         if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
50
@@ -3492,6 +3527,12 @@
51
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
52
     }
53
 
54
+    if (opt.frameSubSampleLuma)
55
+    {
56
+        HEADER0("downscaleluma");
57
+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
58
+    }
59
+
60
     if (opt.scale1D_128to64NONALIGNED)
61
     {
62
         HEADER0("scale1D_128to64");
63
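
frameSubSampleLuma is the new luma-subsampling primitive (added for the motion-compensated temporal filter) that the harness validates above by comparing the C reference against the optimized version, then benchmarks with REPORT_SPEEDUP. A rough sketch of what such a primitive computes, assuming a 2x2 box average with rounding (the exact arithmetic is defined by the C reference in the x265 source):

    #include <cstdint>

    typedef uint8_t pixel; // 8-bit build; 10/12-bit builds use uint16_t

    // Rough sketch: each output pixel is the rounded average of a 2x2 input
    // block, halving the frame in both dimensions.
    static void subsampleLuma(const pixel* src, pixel* dst,
                              intptr_t srcStride, intptr_t dstStride,
                              int width, int height)
    {
        for (int y = 0; y < height; y++, src += 2 * srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
                dst[x] = (pixel)((src[2 * x] + src[2 * x + 1] +
                                  src[2 * x + srcStride] + src[2 * x + srcStride + 1] + 2) >> 2);
    }
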
x265_3.5.tar.gz/source/test/pixelharness.h -> x265_3.6.tar.gz/source/test/pixelharness.h Changed
9
 
1
@@ -138,6 +138,7 @@
2
     bool check_integral_inith(integralh_t ref, integralh_t opt);
3
     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
4
     bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
5
+    bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
6
 
7
 public:
8
 
9
x265_3.5.tar.gz/source/test/rate-control-tests.txt -> x265_3.6.tar.gz/source/test/rate-control-tests.txt Changed
10
 
1
@@ -15,7 +15,7 @@
2
 112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
3
 Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
4
 Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
5
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
6
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
7
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
8
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000  --tune grain
9
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
10
x265_3.5.tar.gz/source/test/regression-tests.txt -> x265_3.6.tar.gz/source/test/regression-tests.txt Changed
91
 
1
@@ -18,12 +18,12 @@
2
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
3
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
4
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
5
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
6
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
7
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
8
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
9
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
10
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
11
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
12
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
13
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
14
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
15
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
16
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
17
@@ -33,7 +33,7 @@
18
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
19
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
20
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
21
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
22
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
23
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
24
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
25
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
26
@@ -41,7 +41,7 @@
27
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
28
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
29
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
30
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
31
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
32
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
33
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
34
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
35
@@ -49,11 +49,11 @@
36
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
37
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
38
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
39
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
40
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
41
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
42
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
43
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
44
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
45
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
46
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
47
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
48
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
49
@@ -158,13 +158,10 @@
50
 ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
51
 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
52
 Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
53
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
54
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
55
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
56
 crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
57
 crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
58
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
59
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
60
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
61
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
62
  
63
 # Main12 intraCost overflow bug test
64
 720p50_parkrun_ter.y4m,--preset medium
65
@@ -182,14 +179,22 @@
66
 
67
 #scaled save/load test
68
 crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000 
69
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
70
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
71
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
72
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
73
 crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
74
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
75
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
76
 ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat --refine-intra 4 --refine-inter 2
77
 #save/load with ctu distortion refinement
78
 CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
79
 #segment encoding
80
 BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
81
 
82
+#Test FG SEI message addition
83
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
84
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
85
+
86
+#Temporal layers tests
87
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
88
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
89
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
90
 # vim: tw=200
91
x265_3.5.tar.gz/source/test/save-load-tests.txt -> x265_3.6.tar.gz/source/test/save-load-tests.txt Changed
16
 
1
@@ -12,10 +12,10 @@
2
 # not auto-detected.
3
 crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
4
 crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
5
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
6
-crowd_run_1080p50.y4m,  --preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
7
-crowd_run_1080p50.y4m,   --preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
8
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
9
+crowd_run_1080p50.y4m,  --preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
10
+crowd_run_1080p50.y4m,   --preset medium --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
11
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
12
-crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
13
+crowd_run_540p50.y4m,   --preset veryslow --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
14
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
15
 News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
16
x265_3.5.tar.gz/source/test/smoke-tests.txt -> x265_3.6.tar.gz/source/test/smoke-tests.txt Changed
9
 
1
@@ -23,3 +23,7 @@
2
 # Main12 intraCost overflow bug test
3
 720p50_parkrun_ter.y4m,--preset medium
4
 720p50_parkrun_ter.y4m,--preset=fast --hevc-aq --no-cutree
5
+# Test FG SEI message addition
6
+# CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
7
+# DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
8
+# NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
9
x265_3.5.tar.gz/source/test/testbench.cpp -> x265_3.6.tar.gz/source/test/testbench.cpp Changed
43
 
1
@@ -174,6 +174,8 @@
2
         { "AVX512", X265_CPU_AVX512 },
3
         { "ARMv6", X265_CPU_ARMV6 },
4
         { "NEON", X265_CPU_NEON },
5
+        { "SVE2", X265_CPU_SVE2 },
6
+        { "SVE", X265_CPU_SVE },
7
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
         { "", 0 },
9
     };
10
@@ -208,15 +210,8 @@
11
 
12
         EncoderPrimitives asmprim;
13
         memset(&asmprim, 0, sizeof(asmprim));
14
-        setupAssemblyPrimitives(asmprim, test_arch[i].flag);
15
-
16
-#if X265_ARCH_ARM64
17
-        /* Temporary workaround because luma_vsp assembly primitive has not been completed
18
-         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
19
-         * Otherwise, segment fault occurs. */
20
-        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
21
-#endif
22
 
23
+        setupAssemblyPrimitives(asmprim, test_arch[i].flag);
24
         setupAliasPrimitives(asmprim);
25
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
26
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
27
@@ -239,14 +234,8 @@
28
 #if X265_ARCH_X86
29
     setupInstrinsicPrimitives(optprim, cpuid);
30
 #endif
31
-    setupAssemblyPrimitives(optprim, cpuid);
32
 
33
-#if X265_ARCH_ARM64
34
-    /* Temporary workaround because luma_vsp assembly primitive has not been completed
35
-     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
36
-     * Otherwise, segment fault occurs. */
37
-    setupAliasCPrimitives(cprim, optprim, cpuid);
38
-#endif
39
+    setupAssemblyPrimitives(optprim, cpuid);
40
 
41
     /* Note that we do not setup aliases for performance tests, that would be
42
      * redundant. The testbench only verifies they are correctly aliased */
43
x265_3.5.tar.gz/source/test/testharness.h -> x265_3.6.tar.gz/source/test/testharness.h Changed
48
 
1
@@ -73,7 +73,7 @@
2
 #include <x86intrin.h>
3
 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
 #include <arm_neon.h>
5
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
6
+#else
7
 /* fallback for older GCC/MinGW */
8
 static inline uint32_t __rdtsc(void)
9
 {
10
@@ -82,15 +82,13 @@
11
 #if X265_ARCH_X86
12
     asm volatile("rdtsc" : "=a" (a) ::"edx");
13
 #elif X265_ARCH_ARM
14
-#if X265_ARCH_ARM64
15
-    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
16
-#else
17
     // TO-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
18
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
19
 
20
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
21
     a = clock();
22
-#endif
23
+#elif  X265_ARCH_ARM64
24
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
25
 #endif
26
     return a;
27
 }
28
@@ -128,8 +126,8 @@
29
         x265_emms(); \
30
         float optperf = (10.0f * cycles / runs) / 4; \
31
         float refperf = (10.0f * refcycles / refruns) / 4; \
32
-        printf("\t%3.2fx ", refperf / optperf); \
33
-        printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
34
+        printf(" | \t%3.2fx | ", refperf / optperf); \
35
+        printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
36
     }
37
 
38
 extern "C" {
39
@@ -140,7 +138,7 @@
40
  * needs an explicit asm check because it only sometimes crashes in normal use. */
41
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
42
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
43
-#elif X265_ARCH_ARM == 0
44
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
45
 #define PFX(stack_pagealign)(func, align) func()
46
 #endif
47
 
48
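
On AArch64 the harness now reads the virtual counter register rather than falling back to clock(). As a standalone illustration of the mrs instruction used above (GCC/Clang inline asm, AArch64 only; note that cntvct_el0 ticks at a fixed timer frequency, not at core clock speed):

    #include <cstdint>

    // Read the AArch64 virtual counter, matching the mrs read in the diff.
    static inline uint64_t readVirtualCounter(void)
    {
        uint64_t a;
        asm volatile("mrs %0, cntvct_el0" : "=r"(a));
        return a;
    }
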
x265_3.5.tar.gz/source/x265.cpp -> x265_3.6.tar.gz/source/x265.cpp Changed
18
 
1
@@ -296,6 +296,16 @@
2
 
3
     int ret = 0;
4
 
5
+    if (cliopt[0].scenecutAwareQpConfig)
6
+    {
7
+        if (!cliopt[0].parseScenecutAwareQpConfig())
8
+        {
9
+            x265_log(NULL, X265_LOG_ERROR, "Unable to parse scenecut aware qp config file \n");
10
+            fclose(cliopt[0].scenecutAwareQpConfig);
11
+            cliopt[0].scenecutAwareQpConfig = NULL;
12
+        }
13
+    }
14
+
15
     AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
16
     int threadsActive = abrEnc->m_numActiveEncodes.get();
17
     while (threadsActive)
18
x265_3.5.tar.gz/source/x265.h -> x265_3.6.tar.gz/source/x265.h Changed
470
 
1
@@ -26,6 +26,7 @@
2
 #define X265_H
3
 #include <stdint.h>
4
 #include <stdio.h>
5
+#include <sys/stat.h>
6
 #include "x265_config.h"
7
 #ifdef __cplusplus
8
 extern "C" {
9
@@ -59,7 +60,7 @@
10
     NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
11
     NAL_UNIT_CODED_SLICE_TRAIL_R,
12
     NAL_UNIT_CODED_SLICE_TSA_N,
13
-    NAL_UNIT_CODED_SLICE_TLA_R,
14
+    NAL_UNIT_CODED_SLICE_TSA_R,
15
     NAL_UNIT_CODED_SLICE_STSA_N,
16
     NAL_UNIT_CODED_SLICE_STSA_R,
17
     NAL_UNIT_CODED_SLICE_RADL_N,
18
@@ -311,6 +312,7 @@
19
     double           vmafFrameScore;
20
     double           bufferFillFinal;
21
     double           unclippedBufferFillFinal;
22
+    uint8_t          tLayer;
23
 } x265_frame_stats;
24
 
25
 typedef struct x265_ctu_info_t
26
@@ -536,6 +538,8 @@
27
 /* ARM */
28
 #define X265_CPU_ARMV6           0x0000001
29
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
30
+#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
31
+#define X265_CPU_SVE             0x0000010  /* ARM SVE */
32
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
33
 
34
 /* IBM Power8 */
35
@@ -613,6 +617,13 @@
36
 #define SLICE_TYPE_DELTA        0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
37
 #define BACKWARD_WINDOW         1 /* Scenecut window before a scenecut */
38
 #define FORWARD_WINDOW          2 /* Scenecut window after a scenecut */
39
+#define BWD_WINDOW_DELTA        0.4
40
+
41
+#define X265_MAX_GOP_CONFIG 3
42
+#define X265_MAX_GOP_LENGTH 16
43
+#define MAX_T_LAYERS 7
44
+
45
+#define X265_IPRATIO_STRENGTH   1.43
46
 
47
 typedef struct x265_cli_csp
48
 {
49
@@ -696,6 +707,7 @@
50
 typedef struct x265_zone
51
 {
52
     int   startFrame, endFrame; /* range of frame numbers */
53
+    int   keyframeMax;          /* it stores the default/user defined keyframeMax value*/
54
     int   bForceQp;             /* whether to use qp vs bitrate factor */
55
     int   qp;
56
     float bitrateFactor;
57
@@ -747,6 +759,271 @@
58
 
59
 static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
60
 
61
+typedef struct x265_temporal_layer {
62
+    int poc_offset;      /* POC offset */
63
+    int8_t layer;        /* Current layer */
64
+    int8_t qp_offset;    /* QP offset */
65
+} x265_temporal_layer;
66
+
67
+static const int8_t x265_temporal_layer_bframes[MAX_T_LAYERS] = {-1, -1, 3, 7, 15, -1, -1};
68
+
69
+static const int8_t x265_gop_ra_length[X265_MAX_GOP_CONFIG] = { 4, 8, 16};
70
+static const x265_temporal_layer x265_gop_ra[X265_MAX_GOP_CONFIG][X265_MAX_GOP_LENGTH] = {
71
+    {
72
+        {
73
+            4,
74
+            0,
75
+            1,
76
+        },
77
+        {
78
+            2,
79
+            1,
80
+            5,
81
+        },
82
+        {
83
+            1,
84
+            2,
85
+            3,
86
+        },
87
+        {
88
+            3,
89
+            2,
90
+            5,
91
+        },
92
+        {
93
+            -1,
94
+            -1,
95
+            -1,
96
+        },
97
+        {
98
+            -1,
99
+            -1,
100
+            -1,
101
+        },
102
+        {
103
+            -1,
104
+            -1,
105
+            -1,
106
+        },
107
+        {
108
+            -1,
109
+            -1,
110
+            -1,
111
+        },
112
+        {
113
+            -1,
114
+            -1,
115
+            -1,
116
+        },
117
+        {
118
+            -1,
119
+            -1,
120
+            -1,
121
+        },
122
+        {
123
+            -1,
124
+            -1,
125
+            -1,
126
+        },
127
+        {
128
+            -1,
129
+            -1,
130
+            -1,
131
+        },
132
+        {
133
+            -1,
134
+            -1,
135
+            -1,
136
+        },
137
+        {
138
+            -1,
139
+            -1,
140
+            -1,
141
+        },
142
+        {
143
+            -1,
144
+            -1,
145
+            -1,
146
+        },
147
+        {
148
+            -1,
149
+            -1,
150
+            -1,
151
+        }
152
+    },
153
+
154
+    {
155
+        {
156
+            8,
157
+            0,
158
+            1,
159
+        },
160
+        {
161
+            4,
162
+            1,
163
+            5,
164
+        },
165
+        {
166
+            2,
167
+            2,
168
+            4,
169
+        },
170
+        {
171
+            1,
172
+            3,
173
+            5,
174
+        },
175
+        {
176
+            3,
177
+            3,
178
+            2,
179
+        },
180
+        {
181
+            6,
182
+            2,
183
+            5,
184
+        },
185
+        {
186
+            5,
187
+            3,
188
+            4,
189
+        },
190
+        {
191
+            7,
192
+            3,
193
+            5,
194
+        },
195
+        {
196
+            -1,
197
+            -1,
198
+            -1,
199
+        },
200
+        {
201
+            -1,
202
+            -1,
203
+            -1,
204
+        },
205
+        {
206
+            -1,
207
+            -1,
208
+            -1,
209
+        },
210
+        {
211
+            -1,
212
+            -1,
213
+            -1,
214
+        },
215
+        {
216
+            -1,
217
+            -1,
218
+            -1,
219
+        },
220
+        {
221
+            -1,
222
+            -1,
223
+            -1,
224
+        },
225
+        {
226
+            -1,
227
+            -1,
228
+            -1,
229
+        },
230
+        {
231
+            -1,
232
+            -1,
233
+            -1,
234
+        },
235
+    },
236
+    {
237
+        {
238
+            16,
239
+            0,
240
+            1,
241
+        },
242
+        {
243
+            8,
244
+            1,
245
+            6,
246
+        },
247
+        {
248
+            4,
249
+            2,
250
+            5,
251
+        },
252
+        {
253
+            2,
254
+            3,
255
+            6,
256
+        },
257
+        {
258
+            1,
259
+            4,
260
+            4,
261
+        },
262
+        {
263
+            3,
264
+            4,
265
+            6,
266
+        },
267
+        {
268
+            6,
269
+            3,
270
+            5,
271
+        },
272
+        {
273
+            5,
274
+            4,
275
+            6,
276
+        },
277
+        {
278
+            7,
279
+            4,
280
+            1,
281
+        },
282
+        {
283
+            12,
284
+            2,
285
+            6,
286
+        },
287
+        {
288
+            10,
289
+            3,
290
+            5,
291
+        },
292
+        {
293
+            9,
294
+            4,
295
+            6,
296
+        },
297
+        {
298
+            11,
299
+            4,
300
+            4,
301
+        },
302
+        {
303
+            14,
304
+            3,
305
+            6,
306
+        },
307
+        {
308
+            13,
309
+            4,
310
+            5,
311
+        },
312
+        {
313
+            15,
314
+            4,
315
+            6,
316
+        }
317
+    }
318
+};
319
+
320
+typedef enum
321
+{
322
+    X265_SHARE_MODE_FILE = 0,
323
+    X265_SHARE_MODE_SHAREDMEM
324
+}X265_DATA_SHARE_MODES;
325
+
326
 /* x265 input parameters
327
  *
328
  * For version safety you may use x265_param_alloc/free() to manage the
329
@@ -983,6 +1260,9 @@
330
      * performance impact, but the use case may preclude it.  Default true */
331
     int       bOpenGOP;
332
 
333
+   /*Force nal type to CRA for all frames except the first frame. Default disabled*/
334
+   int       craNal;
335
+
336
     /* Scene cuts closer together than this are coded as I, not IDR. */
337
     int       keyframeMin;
338
 
339
@@ -1433,10 +1713,10 @@
340
         double    rfConstantMin;
341
 
342
         /* Multi-pass encoding */
343
-        /* Enable writing the stats in a multi-pass encode to the stat output file */
344
+        /* Enable writing the stats in a multi-pass encode to the stat output file/memory */
345
         int       bStatWrite;
346
 
347
-        /* Enable loading data from the stat input file in a multi pass encode */
348
+        /* Enable loading data from the stat input file/memory in a multi pass encode */
349
         int       bStatRead;
350
 
351
         /* Filename of the 2pass output/input stats file, if unspecified the
352
@@ -1489,6 +1769,21 @@
353
         /* internally enable if tune grain is set */
354
         int      bEnableConstVbv;
355
 
356
+        /* whether only the focused frames would be re-encoded or not */
357
+        int       bEncFocusedFramesOnly;
358
+
359
+        /* Share the data with stats file or shared memory.
360
+        It must be one of the X265_DATA_SHARE_MODES enum values
361
+        Available if the bStatWrite or bStatRead is true.
362
+        Use stats file by default.
363
+        The stats file mode would be used among the encoders running in sequence.
364
+        The shared memory mode could only be used among the encoders running in parallel.
365
+        Now only the cutree data could be shared via shared memory. More data would be supported in the future.*/
366
+        int       dataShareMode;
367
+
368
+        /* Unique shared memory name. Required if the shared memory mode enabled. NULL by default */
369
+        const char* sharedMemName;
370
+
371
     } rc;
372
 
373
     /*== Video Usability Information ==*/
374
@@ -1850,6 +2145,10 @@
375
       Default 1 (Enabled). API only. */
376
     int       bResetZoneConfig;
377
 
378
+    /*Flag to indicate that rate-control history should not be reset during zone reconfiguration.
379
+      Default 0 (Disabled) */
380
+    int       bNoResetZoneConfig;
381
+
382
     /* It reduces the bits spent on the inter-frames within the scenecutWindow before and / or after a scenecut
383
      * by increasing their QP in ratecontrol pass2 algorithm without any deterioration in visual quality.
384
      * 0 - Disabled (default).
385
@@ -1860,20 +2159,15 @@
386
 
387
     /* The duration(in milliseconds) for which there is a reduction in the bits spent on the inter-frames after a scenecut
388
      * by increasing their QP, when bEnableSceneCutAwareQp is 1 or 3. Default is 500ms.*/
389
-    int       fwdScenecutWindow;
390
+    int       fwdMaxScenecutWindow;
391
+    int       fwdScenecutWindow[6];
392
 
393
     /* The offset by which QP is incremented for inter-frames after a scenecut when bEnableSceneCutAwareQp is 1 or 3.
394
      * Default is +5. */
395
-    double    fwdRefQpDelta;
396
+    double    fwdRefQpDelta[6];
397
 
398
     /* The offset by which QP is incremented for non-referenced inter-frames after a scenecut when bEnableSceneCutAwareQp is 1 or 3. */
399
-    double    fwdNonRefQpDelta;
400
-
401
-    /* A genuine threshold used for histogram based scene cut detection.
402
-     * This threshold determines whether a frame is a scenecut or not
403
-     * when compared against the edge and chroma histogram sad values.
404
-     * Default 0.03. Range: Real number in the interval (0,1). */
405
-    double    edgeTransitionThreshold;
406
+    double    fwdNonRefQpDelta[6];
407
 
408
     /* Enables histogram based scenecut detection algorithm to detect scenecuts. Default disabled */
409
     int       bHistBasedSceneCut;
410
@@ -1941,13 +2235,39 @@
411
 
412
     /* The duration(in milliseconds) for which there is a reduction in the bits spent on the inter-frames before a scenecut
413
      * by increasing their QP, when bEnableSceneCutAwareQp is 2 or 3. Default is 100ms.*/
414
-    int       bwdScenecutWindow;
415
+    int       bwdMaxScenecutWindow;
416
+    int       bwdScenecutWindow[6];
417
 
418
     /* The offset by which QP is incremented for inter-frames before a scenecut when bEnableSceneCutAwareQp is 2 or 3. */
419
-    double    bwdRefQpDelta;
420
+    double    bwdRefQpDelta[6];
421
 
422
     /* The offset by which QP is incremented for non-referenced inter-frames before a scenecut when bEnableSceneCutAwareQp is 2 or 3. */
423
-    double    bwdNonRefQpDelta;
424
+    double    bwdNonRefQpDelta[6];
425
+
426
+    /* Specify combinations of color primaries, transfer characteristics, color matrix,
427
+    * range of luma and chroma signals, and chroma sample location. This has higher
428
+    * precedence than individual VUI parameters. If any individual VUI option is specified
429
+    * together with this, which changes the values set corresponding to the system-id
430
+    * or color-volume, it will be discarded. */
431
+    const char* videoSignalTypePreset;
432
+
433
+    /* Flag indicating whether the encoder should emit an End of Bitstream
434
+     * NAL at the end of bitstream. Default false */
435
+    int      bEnableEndOfBitstream;
436
+
437
+    /* Flag indicating whether the encoder should emit an End of Sequence
438
+     * NAL at the end of every Coded Video Sequence. Default false */
439
+    int      bEnableEndOfSequence;
440
+
441
+    /* Film Grain Characteristic file */
442
+    char* filmGrain;
443
+
444
+    /*Motion compensated temporal filter*/
445
+    int      bEnableTemporalFilter;
446
+    double   temporalFilterStrength;
447
+
448
+    /*SBRC*/
449
+    int      bEnableSBRC;
450
 } x265_param;
451
 
452
 /* x265_param_alloc:
453
@@ -1982,6 +2302,8 @@
454
 
455
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
456
 
457
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
458
+
459
 static const char * const x265_profile_names[] = {
460
     /* HEVC v1 */
461
     "main", "main10", "mainstillpicture", /* alias */ "msp",
462
@@ -2251,6 +2573,7 @@
463
     void          (*param_free)(x265_param*);
464
     void          (*param_default)(x265_param*);
465
     int           (*param_parse)(x265_param*, const char*, const char*);
466
+    int           (*scenecut_aware_qp_param_parse)(x265_param*, const char*, const char*);
467
     int           (*param_apply_profile)(x265_param*, const char*);
468
     int           (*param_default_preset)(x265_param*, const char*, const char *);
469
     x265_picture* (*picture_alloc)(void);
470
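
The x265_gop_ra tables above encode the hierarchical-B mini-GOP patterns for GOP lengths 4, 8 and 16: each valid entry maps a POC offset inside the mini-GOP to its temporal layer and QP offset, and entries of -1 are unused slots. A hedged lookup sketch over these tables (helper name hypothetical; assumes x265.h is included and gopConfig is in [0, X265_MAX_GOP_CONFIG)):

    // Find the layer/QP-offset entry for a given POC offset within a
    // mini-GOP, scanning only the valid entries of the chosen config.
    static const x265_temporal_layer* findGopEntry(int gopConfig, int pocOffset)
    {
        for (int i = 0; i < x265_gop_ra_length[gopConfig]; i++)
            if (x265_gop_ra[gopConfig][i].poc_offset == pocOffset)
                return &x265_gop_ra[gopConfig][i];
        return 0; // offset not part of this mini-GOP pattern
    }
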
x265_3.5.tar.gz/source/x265cli.cpp -> x265_3.6.tar.gz/source/x265cli.cpp Changed
393
 
1
@@ -28,8 +28,8 @@
2
 #include "x265cli.h"
3
 #include "svt.h"
4
 
5
-#define START_CODE 0x00000001
6
-#define START_CODE_BYTES 4
7
+#define START_CODE 0x00000001
8
+#define START_CODE_BYTES 4
9
 
10
 #ifdef __cplusplus
11
 namespace X265_NS {
12
@@ -166,6 +166,7 @@
13
         H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
14
         H0("\nSlice decision options:\n");
15
         H0("   --no-open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
16
+       H0("   --cra-nal                     Force nal type to CRA to all frames expect first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
17
         H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
18
         H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
19
         H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
20
@@ -174,7 +175,6 @@
21
         H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
22
         H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
23
         H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
24
-        H1("   --hist-threshold <0.0..1.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
25
         H0("   --no-fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
26
         H1("   --scenecut-aware-qp <0..3>    Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
27
         H1("                                 0 - Disabled\n");
28
@@ -182,6 +182,7 @@
29
         H1("                                 2 - Backward masking\n");
30
         H1("                                 3 - Bidirectional masking\n");
31
         H1("   --masking-strength <string>   Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
32
+        H1("   --scenecut-qp-config <file>   File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
33
         H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
34
         H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
35
         H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
36
@@ -262,6 +263,7 @@
37
         H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
38
         H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
39
         H0("   --no-aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
40
+        H1("   --no-sbrc                   Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
41
         H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
42
         H0("   --no-cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
43
         H0("   --no-rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
44
@@ -282,6 +284,7 @@
45
         H1("                                       q=<integer> (force QP)\n");
46
         H1("                                   or  b=<float> (bitrate multiplier)\n");
47
         H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
48
+        H0("   --no-zonefile-rc-init         This allow to use rate-control history across zones in zonefile.\n");
49
         H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
50
         H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
51
         H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
52
@@ -314,6 +317,30 @@
53
         H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
54
         H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
55
         H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
56
+        H0("   --video-signal-type-preset <string>    Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
57
+        H0("                                            format: <system-id>:<color-volume>\n");
58
+        H0("                                            This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
59
+        H0("                                            which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
60
+        H0("                                            The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
61
+        H0("                                            system-id options and their corresponding values:\n");
62
+        H0("                                              BT601_525:       --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
63
+        H0("                                              BT601_626:       --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
64
+        H0("                                              BT709_YCC:       --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
65
+        H0("                                              BT709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
66
+        H0("                                              BT2020_YCC_NCL:  --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
67
+        H0("                                              BT2020_RGB:      --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
68
+        H0("                                              BT2100_PQ_YCC:   --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
69
+        H0("                                              BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
70
+        H0("                                              BT2100_PQ_RGB:   --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
71
+        H0("                                              BT2100_HLG_YCC:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
72
+        H0("                                              BT2100_HLG_RGB:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
73
+        H0("                                              FR709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
74
+        H0("                                              FR2020_RGB:      --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
75
+        H0("                                              FRP3D65_YCC:     --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
76
+        H0("                                            color-volume options and their corresponding values:\n");
77
+        H0("                                              P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
78
+        H0("                                              P3D65x4000n005:  --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
79
+        H0("                                              BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
80
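
(For illustration: the preset takes a system-id, optionally followed by a colon and a color-volume, and per the help text above the color-volume part is valid only with the BT2100_PQ_* system-ids. A hypothetical invocation — input/output file names are assumed, not taken from the package — could be:

    x265 --input in.y4m --video-signal-type-preset BT2100_PQ_YCC:P3D65x1000n0005 --output out.hevc

Any individual VUI option passed alongside that conflicts with the preset's values is discarded, as the help text states.)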
         H0("   --no-cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
81
         H0("   --no-hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
82
         H0("   --no-hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
83
@@ -324,9 +351,11 @@
84
         H0("   --no-repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
85
         H0("   --no-info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
86
         H0("   --no-hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
87
-        H0("   --no-idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
88
-        H0("   --no-temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
89
+        H0("   --no-idr-recovery-sei       Emit recovery point infor SEI at each IDR frame \n");
90
+        H0("   --temporal-layers             Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
91
         H0("   --no-aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
92
+        H0("   --no-eob                    Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
93
+        H0("   --no-eos                    Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
94
         H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
95
         H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
96
         H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
97
@@ -344,6 +373,7 @@
98
         H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
99
         H0("   --no-frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
100
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
101
+        H0("   --no-mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
102
 #ifdef SVT_HEVC
103
         H0("   --nosvt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
104
         H0("   --no-svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
105
@@ -365,6 +395,9 @@
106
         H1("    2 - unable to open encoder\n");
107
         H1("    3 - unable to generate stream headers\n");
108
         H1("    4 - encoder abort\n");
109
+        H0("\nSEI Message Options\n");
110
+        H0("   --film-grain <filename>           File containing Film Grain Characteristics to be written as a SEI Message\n");
111
+
112
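
(For illustration: the film-grain SEI is driven by a characteristics file, matching the changelog's "--film-grain <filename>" entry; a hypothetical call — the file name here is assumed — would be:

    x265 --input in.y4m --film-grain grain.bin --output out.hevc)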
 #undef OPT
 #undef H0
 #undef H1
@@ -484,6 +517,9 @@
 
         memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
 
+        if (zonefileCount == 0)
+            globalParam->rc.zones[zonefileCount].keyframeMax = globalParam->keyframeMax;
+
         for (optind = 0;;)
         {
             int long_options_index = -1;
@@ -708,12 +744,19 @@
                         return true;
                     }
                 }
+                OPT("scenecut-qp-config")
+                {
+                    this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
+                    if (!this->scenecutAwareQpConfig)
+                        x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
+                }
                 OPT("zonefile")
                 {
                     this->zoneFile = x265_fopen(optarg, "rb");
                     if (!this->zoneFile)
                         x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
                 }
+                OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
                 OPT("fullhelp")
                 {
                     param->logLevel = X265_LOG_FULL;
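
(For illustration: with the new flag, rate-control history carries across zones instead of being re-initialized per zone; a hypothetical invocation — file names assumed — would be:

    x265 --input in.y4m --zonefile zones.txt --no-zonefile-rc-init --output out.hevc)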
@@ -875,7 +918,7 @@
             if (reconFileBitDepth == 0)
                 reconFileBitDepth = param->internalBitDepth;
             this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
-                param->fpsNum, param->fpsDenom, param->internalCsp);
+                param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
             if (this->recon->isFail())
             {
                 x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
@@ -973,6 +1016,7 @@
         param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
         for (int i = 0; i < param->rc.zonefileCount; i++)
         {
+            param->rc.zones[i].startFrame = -1;
             while (fgets(line, sizeof(line), zoneFile))
             {
                 if (*line == '#' || (strcmp(line, "\r\n") == 0))
@@ -1010,57 +1054,179 @@
         return 1;
     }
 
-    /* Parse the RPU file and extract the RPU corresponding to the current picture
-    * and fill the rpu field of the input picture */
-    int CLIOptions::rpuParser(x265_picture * pic)
-    {
-        uint8_t byteVal;
-        uint32_t code = 0;
-        int bytesRead = 0;
-        pic->rpu.payloadSize = 0;
-
-        if (!pic->pts)
-        {
-            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
-                code = (code << 8) | byteVal;
-
-            if (code != START_CODE)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
-                return 1;
-            }
-        }
-
-        bytesRead = 0;
-        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
-        {
-            code = (code << 8) | byteVal;
-            if (bytesRead++ < 3)
-                continue;
-            if (bytesRead >= 1024)
-            {
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
-                return 1;
-            }
-
-            if (code != START_CODE)
-                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
-            else
-                return 0;
-        }
-
-        int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
-        int bytesLeft = bytesRead - pic->rpu.payloadSize;
-        code = (code << ShiftBytes * 8);
-        for (int i = 0; i < bytesLeft; i++)
-        {
-            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
-            code = (code << 8);
-        }
-        if (!pic->rpu.payloadSize)
-            x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
-        return 0;
-    }
+    /* Parse the RPU file and extract the RPU corresponding to the current picture
+    * and fill the rpu field of the input picture */
+    int CLIOptions::rpuParser(x265_picture * pic)
+    {
+        uint8_t byteVal;
+        uint32_t code = 0;
+        int bytesRead = 0;
+        pic->rpu.payloadSize = 0;
+
+        if (!pic->pts)
+        {
+            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
+                code = (code << 8) | byteVal;
+
+            if (code != START_CODE)
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
+                return 1;
+            }
+        }
+
+        bytesRead = 0;
+        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
+        {
+            code = (code << 8) | byteVal;
+            if (bytesRead++ < 3)
+                continue;
+            if (bytesRead >= 1024)
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
+                return 1;
+            }
+
+            if (code != START_CODE)
+                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
+            else
+                return 0;
+        }
+
+        int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
+        int bytesLeft = bytesRead - pic->rpu.payloadSize;
+        code = (code << ShiftBytes * 8);
+        for (int i = 0; i < bytesLeft; i++)
+        {
+            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
+            code = (code << 8);
+        }
+        if (!pic->rpu.payloadSize)
+            x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
+        return 0;
+    }
+
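
(For context: rpuParser locates RPU boundaries by shifting each byte into a rolling 32-bit window and comparing the window against the 4-byte start code. A minimal standalone sketch of that idea — START_CODE is defined in the surrounding source, and rpuFile stands in for the dolbyVisionRpu handle:

    uint32_t code = 0;
    uint8_t byteVal;
    while (fread(&byteVal, sizeof(uint8_t), 1, rpuFile))
    {
        code = (code << 8) | byteVal;   // slide the 4-byte window one byte
        if (code == START_CODE)
            break;                      // boundary of the next RPU found
    }

Payload bytes are emitted from the top of the window, (code >> (3 * 8)) & 0xFF, once four bytes have been read ahead, which is why the real function skips output for the first three bytes of each scan.)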
+    bool CLIOptions::parseScenecutAwareQpConfig()
+    {
+        char line[256];
+        char* argLine;
+        rewind(scenecutAwareQpConfig);
+        while (fgets(line, sizeof(line), scenecutAwareQpConfig))
+        {
+            if (*line == '#' || (strcmp(line, "\r\n") == 0))
+                continue;
+            int index = (int)strcspn(line, "\r\n");
+            line[index] = '\0';
+            argLine = line;
+            while (isspace((unsigned char)*argLine)) argLine++;
+            char* start = strchr(argLine, '-');
+            int argCount = 0;
+            char **args = (char**)malloc(256 * sizeof(char *));
+            //Adding a dummy string to avoid file parsing error
+            args[argCount++] = (char *)"x265";
+            char* token = strtok(start, " ");
+            while (token)
+            {
+                args[argCount++] = token;
+                token = strtok(NULL, " ");
+            }
+            args[argCount] = NULL;
+            CLIOptions cliopt;
+            if (cliopt.parseScenecutAwareQpParam(argCount, args, param))
+            {
+                cliopt.destroy();
+                if (cliopt.api)
+                    cliopt.api->param_free(cliopt.param);
+                exit(1);
+            }
+            break;
+        }
+        return 1;
+    }
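
(For illustration: the parser above skips '#' comments and blank lines, then tokenizes the first remaining line from its first '-' onward as ordinary CLI options, so a plausible config file — the flag names come from this release's changelog, while <strength-spec> is a placeholder for the masking-strength value format, which is not shown in this diff — could read:

    # scenecut-aware qp configuration
    --scenecut-aware-qp 1 --masking-strength <strength-spec>)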
+    bool CLIOptions::parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam)
+    {
+        bool bError = false;
+        int bShowHelp = false;
+        int outputBitDepth = 0;
+        const char *profile = NULL;
+        /* Presets are applied before all other options. */
+        for (optind = 0;;)
+        {
+            int c = getopt_long(argc, argv, short_options, long_options, NULL);
+            if (c == -1)
+                break;
+            else if (c == 'D')
+                outputBitDepth = atoi(optarg);
+            else if (c == 'P')
+                profile = optarg;
+            else if (c == '?')
+                bShowHelp = true;
+        }
+        if (!outputBitDepth && profile)
+        {
+            /*try to derive the output bit depth from the requested profile*/
+            if (strstr(profile, "10"))
+                outputBitDepth = 10;
+            else if (strstr(profile, "12"))
+                outputBitDepth = 12;
+            else
+                outputBitDepth = 8;
+        }
+        api = x265_api_get(outputBitDepth);
+        if (!api)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
+            api = x265_api_get(0);
+        }
+        if (bShowHelp)
+        {
+            printVersion(globalParam, api);
+            showHelp(globalParam);
+        }
+        for (optind = 0;;)
+        {
+            int long_options_index = -1;
+            int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
+            if (c == -1)
+                break;
+            if (long_options_index < 0 && c > 0)
+            {
+                for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
+                {
+                    if (long_options[i].val == c)
+                    {
+                        long_options_index = (int)i;
+                        break;
+                    }
+                }
+                if (long_options_index < 0)
+                {
+                    /* getopt_long might have already printed an error message */
+                    if (c != 63)
+                        x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
+                    return true;
+                }
+            }
+            if (long_options_index < 0)
+            {
+                x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
+                return true;
+            }
+            bError |= !!api->scenecut_aware_qp_param_parse(globalParam, long_options[long_options_index].name, optarg);
+            if (bError)
+            {
+                const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2];
+                x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
+                return true;
+            }
+        }
+        if (optind < argc)
+        {
+            x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]);
+            return true;
+        }
+        return false;
+    }
 
 #ifdef __cplusplus
 }
x265_3.5.tar.gz/source/x265cli.h -> x265_3.6.tar.gz/source/x265cli.h Changed
 
@@ -135,6 +135,7 @@
     { "no-fast-intra",        no_argument, NULL, 0 },
     { "no-open-gop",          no_argument, NULL, 0 },
     { "open-gop",             no_argument, NULL, 0 },
+    { "cra-nal",              no_argument, NULL, 0 },
     { "keyint",         required_argument, NULL, 'I' },
     { "min-keyint",     required_argument, NULL, 'i' },
     { "gop-lookahead",  required_argument, NULL, 0 },
@@ -143,7 +144,6 @@
     { "scenecut-bias",  required_argument, NULL, 0 },
     { "hist-scenecut",        no_argument, NULL, 0},
     { "no-hist-scenecut",     no_argument, NULL, 0},
-    { "hist-threshold", required_argument, NULL, 0},
     { "fades",                no_argument, NULL, 0 },
     { "no-fades",             no_argument, NULL, 0 },
     { "scenecut-aware-qp", required_argument, NULL, 0 },
@@ -182,6 +182,8 @@
     { "qp",             required_argument, NULL, 'q' },
     { "aq-mode",        required_argument, NULL, 0 },
     { "aq-strength",    required_argument, NULL, 0 },
+    { "sbrc",                 no_argument, NULL, 0 },
+    { "no-sbrc",              no_argument, NULL, 0 },
     { "rc-grain",             no_argument, NULL, 0 },
     { "no-rc-grain",          no_argument, NULL, 0 },
     { "ipratio",        required_argument, NULL, 0 },
@@ -244,6 +246,7 @@
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
+    {"video-signal-type-preset", required_argument, NULL, 0 },
     { "min-luma",       required_argument, NULL, 0 },
     { "max-luma",       required_argument, NULL, 0 },
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
@@ -263,11 +266,16 @@
     { "repeat-headers",       no_argument, NULL, 0 },
     { "aud",                  no_argument, NULL, 0 },
     { "no-aud",               no_argument, NULL, 0 },
+    { "eob",                  no_argument, NULL, 0 },
+    { "no-eob",               no_argument, NULL, 0 },
+    { "eos",                  no_argument, NULL, 0 },
+    { "no-eos",               no_argument, NULL, 0 },
     { "info",                 no_argument, NULL, 0 },
     { "no-info",              no_argument, NULL, 0 },
     { "zones",          required_argument, NULL, 0 },
     { "qpfile",         required_argument, NULL, 0 },
     { "zonefile",       required_argument, NULL, 0 },
+    { "no-zonefile-rc-init",  no_argument, NULL, 0 },
     { "lambda-file",    required_argument, NULL, 0 },
     { "b-intra",              no_argument, NULL, 0 },
     { "no-b-intra",           no_argument, NULL, 0 },
@@ -298,8 +306,7 @@
     { "dynamic-refine",       no_argument, NULL, 0 },
     { "no-dynamic-refine",    no_argument, NULL, 0 },
     { "strict-cbr",           no_argument, NULL, 0 },
-    { "temporal-layers",      no_argument, NULL, 0 },
-    { "no-temporal-layers",   no_argument, NULL, 0 },
+    { "temporal-layers",      required_argument, NULL, 0 },
    { "qg-size",        required_argument, NULL, 0 },
     { "recon-y4m-exec", required_argument, NULL, 0 },
     { "analyze-src-pics", no_argument, NULL, 0 },
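
(Note: --temporal-layers now takes a required argument rather than acting as an on/off flag, consistent with the hierarchical B-frame feature in this release; invocations change from a bare flag to a value, e.g. "x265 ... --temporal-layers <n>", though the accepted range is not visible in this diff.)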
@@ -349,6 +356,8 @@
     { "frame-dup",            no_argument, NULL, 0 },
     { "no-frame-dup", no_argument, NULL, 0 },
     { "dup-threshold", required_argument, NULL, 0 },
+    { "mcstf",                 no_argument, NULL, 0 },
+    { "no-mcstf",              no_argument, NULL, 0 },
 #ifdef SVT_HEVC
     { "svt",     no_argument, NULL, 0 },
     { "no-svt",  no_argument, NULL, 0 },
@@ -373,6 +382,8 @@
     { "abr-ladder", required_argument, NULL, 0 },
     { "min-vbv-fullness", required_argument, NULL, 0 },
     { "max-vbv-fullness", required_argument, NULL, 0 },
+    { "scenecut-qp-config", required_argument, NULL, 0 },
+    { "film-grain", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -388,6 +399,7 @@
         FILE*       qpfile;
         FILE*       zoneFile;
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
+        FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
         const char* reconPlayCmd;
         const x265_api* api;
         x265_param* param;
@@ -425,6 +437,7 @@
             qpfile = NULL;
             zoneFile = NULL;
             dolbyVisionRpu = NULL;
+            scenecutAwareQpConfig = NULL;
             reconPlayCmd = NULL;
             api = NULL;
             param = NULL;
@@ -455,6 +468,8 @@
         bool parseQPFile(x265_picture &pic_org);
         bool parseZoneFile();
         int rpuParser(x265_picture * pic);
+        bool parseScenecutAwareQpConfig();
+        bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
     };
 #ifdef __cplusplus
 }
x265_3.5.tar.gz/x265Version.txt -> x265_3.6.tar.gz/x265Version.txt Changed
 
@@ -1,4 +1,4 @@
 #Attribute:         Values
-repositorychangeset: f0c1022b6
+repositorychangeset: aa7f602f7
 releasetagdistance: 1
-releasetag: 3.5
+releasetag: 3.6
Request History

Aloysius created request 10 months ago

Update to version 3.6

Aloysius accepted request 10 months ago