Overview
x265.changes
Changed
x
1
2
-------------------------------------------------------------------
3
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
4
+
5
+- Update to version 3.6
6
+ New features:
7
+ * Segment based Ratecontrol (SBRC) feature
8
+ * Motion-Compensated Spatio-Temporal Filtering
9
+ * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
10
+ Quantization)
11
+ * Histogram-Based Scene Change Detection
12
+ * Film-Grain characteristics as a SEI message to support Film
13
+ Grain Synthesis(FGS)
14
+ * Add temporal layer implementation(Hierarchical B-frame
15
+ implementation)
16
+ Enhancements to existing features:
17
+ * Added Dolby Vision 8.4 Profile Support
18
+ API changes:
19
+ * Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
20
+ * Add command line parameter for mcstf feature: "--no-mctf".
21
+ * Add command line parameters for the scene cut aware qp
22
+ feature: "--scenecut-aware-qp" and "--masking-strength".
23
+ * Add command line parameters for Histogram-Based Scene Change
24
+ Detection: "--hist-scenecut".
25
+ * Add film grain characteristics as a SEI message to the
26
+ bitstream: "--film-grain <filename>"
27
+ * cli: add new option --cra-nal (Force nal type to CRA to all
28
+ frames except for the first frame, works only with keyint 1)
29
+ Optimizations:
30
+ * ARM64 NEON optimizations: Several time-consuming C
31
+ functions have been optimized for the targeted platform -
32
+ aarch64. The overall performance increased by around 20%.
33
+ * SVE/SVE2 optimizations
34
+ Bug fixes:
35
+ * Linux bug to utilize all the cores
36
+ * Crash with hist-scenecut build when source resolution is not
37
+ multiple of minCuSize
38
+ * 32bit and 64bit builds generation for ARM
39
+ * bugs in zonefile feature (Reflect Zonefile Parameters inside
40
+ Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
41
+ * Add x86 ASM implementation for subsampling luma
42
+ * Fix for abrladder segfault with load reuse level 1
43
+ * Reorder miniGOP based on temporal layer hierarchy and add
44
+ support for more B frames
45
+ * Add MacOS aarch64 build support
46
+ * Fix boundary condition issue for Gaussian filter
47
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
48
+ and 0004-Do-not-build-with-assembly-support-on-arm.patch
49
+ (courtesy of Debian)
50
+
51
+-------------------------------------------------------------------
52
Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
53
54
- Build libx265_main10 and libx265_main12 unconditionally and
55
x265.spec
Changed
46
1
2
#
3
# spec file for package x265
4
#
5
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
6
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
7
# Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
8
#
9
# All modifications and additions to the file contributed by third parties
10
11
#
12
13
14
-%define sover 199
15
+%define sover 209
16
%define libname lib%{name}
17
%define libsoname %{libname}-%{sover}
18
-%define uver 3_5
19
+%define uver 3_6
20
Name: x265
21
-Version: 3.5
22
+Version: 3.6
23
Release: 0
24
Summary: A free h265/HEVC encoder - encoder binary
25
License: GPL-2.0-or-later
26
Group: Productivity/Multimedia/Video/Editors and Convertors
27
URL: https://bitbucket.org/multicoreware/x265_git
28
Source0: https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
29
-Patch0: arm.patch
30
Patch1: x265.pkgconfig.patch
31
Patch2: x265-fix_enable512.patch
32
+Patch3: 0001-Fix-arm-flags.patch
33
+Patch4: 0004-Do-not-build-with-assembly-support-on-arm.patch
34
BuildRequires: cmake >= 2.8.8
35
BuildRequires: gcc-c++
36
BuildRequires: nasm >= 2.13
37
38
%cmake_install
39
find %{buildroot} -type f -name "*.a" -delete -print0
40
41
+%check
42
+
43
%post -n %{libsoname} -p /sbin/ldconfig
44
%postun -n %{libsoname} -p /sbin/ldconfig
45
46
0001-Fix-arm-flags.patch
Added
41
1
2
+From: Sebastian Ramacher <sramacher@debian.org>
3
+Date: Sun, 21 Jun 2020 17:54:56 +0200
4
+Subject: Fix arm* flags
5
+
6
+---
7
+ source/CMakeLists.txt | 7 ++-----
8
+ 1 file changed, 2 insertions(+), 5 deletions(-)
9
+
10
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
11
+index ab5ddfe..eb9b19b 100755
12
+--- a/source/CMakeLists.txt
13
++++ b/source/CMakeLists.txt
14
+@@ -253,10 +253,7 @@ if(GCC)
15
+ elseif(ARM)
16
+ find_package(Neon)
17
+ if(CPU_HAS_NEON)
18
+- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
19
+ add_definitions(-DHAVE_NEON)
20
+- else()
21
+- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
22
+ endif()
23
+ endif()
24
+ if(ARM64 OR CROSS_COMPILE_ARM64)
25
+@@ -265,13 +262,13 @@ if(GCC)
26
+ find_package(SVE2)
27
+ if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
28
+ message(STATUS "Found SVE2")
29
+- set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
30
++ set(ARM_ARGS -fPIC -flax-vector-conversions)
31
+ add_definitions(-DHAVE_SVE2)
32
+ add_definitions(-DHAVE_SVE)
33
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
34
+ elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
35
+ message(STATUS "Found SVE")
36
+- set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
37
++ set(ARM_ARGS -fPIC -flax-vector-conversions)
38
+ add_definitions(-DHAVE_SVE)
39
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
40
+ elseif(CPU_HAS_NEON)
41
0004-Do-not-build-with-assembly-support-on-arm.patch
Added
30
1
2
+From: Sebastian Ramacher <sramacher@debian.org>
3
+Date: Fri, 31 May 2024 23:38:23 +0200
4
+Subject: Do not build with assembly support on arm*
5
+
6
+---
7
+ source/CMakeLists.txt | 9 ---------
8
+ 1 file changed, 9 deletions(-)
9
+
10
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
11
+index 672cc2d..f112330 100755
12
+--- a/source/CMakeLists.txt
13
++++ b/source/CMakeLists.txt
14
+@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
15
+ add_definitions(-DPPC64=1)
16
+ message(STATUS "Detected POWER PPC64 target processor")
17
+ endif()
18
+-elseif(ARMMATCH GREATER "-1")
19
+- if(CROSS_COMPILE_ARM)
20
+- message(STATUS "Cross compiling for ARM arch")
21
+- else()
22
+- set(CROSS_COMPILE_ARM 0)
23
+- endif()
24
+- message(STATUS "Detected ARM target processor")
25
+- set(ARM 1)
26
+- add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
27
+ elseif(ARM64MATCH GREATER "-1")
28
+ #if(CROSS_COMPILE_ARM64)
29
+ #message(STATUS "Cross compiling for ARM64 arch")
30
arm.patch
Deleted
110
1
2
-Index: x265_3.4/source/CMakeLists.txt
3
-===================================================================
4
---- x265_3.4.orig/source/CMakeLists.txt
5
-+++ x265_3.4/source/CMakeLists.txt
6
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
7
- add_definitions(-DPPC64=1)
8
- message(STATUS "Detected POWER PPC64 target processor")
9
- endif()
10
--elseif(ARMMATCH GREATER "-1")
11
-- if(CROSS_COMPILE_ARM)
12
-- message(STATUS "Cross compiling for ARM arch")
13
-- else()
14
-- set(CROSS_COMPILE_ARM 0)
15
-- endif()
16
-- set(ARM 1)
17
-- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
-- message(STATUS "Detected ARM64 target processor")
19
-- set(ARM64 1)
20
-- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
21
-- else()
22
-- message(STATUS "Detected ARM target processor")
23
-- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
24
-- endif()
25
-+elseif(${SYSPROC} MATCHES "armv5.*")
26
-+ message(STATUS "Detected ARMV5 system processor")
27
-+ set(ARMV5 1)
28
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
29
-+elseif(${SYSPROC} STREQUAL "armv6l")
30
-+ message(STATUS "Detected ARMV6 system processor")
31
-+ set(ARMV6 1)
32
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
33
-+elseif(${SYSPROC} STREQUAL "armv7l")
34
-+ message(STATUS "Detected ARMV7 system processor")
35
-+ set(ARMV7 1)
36
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
37
-+elseif(${SYSPROC} STREQUAL "aarch64")
38
-+ message(STATUS "Detected AArch64 system processor")
39
-+ set(ARMV7 1)
40
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
41
- else()
42
- message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
43
- message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
44
- endif()
45
--
46
- if(UNIX)
47
- list(APPEND PLATFORM_LIBS pthread)
48
- find_library(LIBRT rt)
49
-@@ -238,28 +238,9 @@ if(GCC)
50
- endif()
51
- endif()
52
- endif()
53
-- if(ARM AND CROSS_COMPILE_ARM)
54
-- if(ARM64)
55
-- set(ARM_ARGS -fPIC)
56
-- else()
57
-- set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
58
-- endif()
59
-- message(STATUS "cross compile arm")
60
-- elseif(ARM)
61
-- if(ARM64)
62
-- set(ARM_ARGS -fPIC)
63
-- add_definitions(-DHAVE_NEON)
64
-- else()
65
-- find_package(Neon)
66
-- if(CPU_HAS_NEON)
67
-- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
68
-- add_definitions(-DHAVE_NEON)
69
-- else()
70
-- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
71
-- endif()
72
-- endif()
73
-+ if(ARMV7)
74
-+ add_definitions(-fPIC)
75
- endif()
76
-- add_definitions(${ARM_ARGS})
77
- if(FPROFILE_GENERATE)
78
- if(INTEL_CXX)
79
- add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
80
-Index: x265_3.4/source/common/cpu.cpp
81
-===================================================================
82
---- x265_3.4.orig/source/common/cpu.cpp
83
-+++ x265_3.4/source/common/cpu.cpp
84
-@@ -39,7 +39,7 @@
85
- #include <machine/cpu.h>
86
- #endif
87
-
88
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
89
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
90
- #include <signal.h>
91
- #include <setjmp.h>
92
- static sigjmp_buf jmpbuf;
93
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
94
- }
95
-
96
- canjump = 1;
97
-- PFX(cpu_neon_test)();
98
- canjump = 0;
99
- signal(SIGILL, oldsig);
100
- #endif // if !HAVE_NEON
101
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
102
- // which may result in incorrect detection and the counters stuck enabled.
103
- // right now Apple does not seem to support performance counters for this test
104
- #ifndef __MACH__
105
-- flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
106
-+ //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
107
- #endif
108
- // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
109
- #elif X265_ARCH_ARM64
110
baselibs.conf
Changed
4
1
2
-libx265-199
3
+libx265-209
4
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.S
Deleted
416
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm.S"
26
-
27
-.section .rodata
28
-
29
-.align 4
30
-
31
-.text
32
-
33
-
34
-
35
-.macro qpel_filter_0_32b
36
- movi v24.8h, #64
37
- uxtl v19.8h, v5.8b
38
- smull v17.4s, v19.4h, v24.4h
39
- smull2 v18.4s, v19.8h, v24.8h
40
-.endm
41
-
42
-.macro qpel_filter_1_32b
43
- movi v16.8h, #58
44
- uxtl v19.8h, v5.8b
45
- smull v17.4s, v19.4h, v16.4h
46
- smull2 v18.4s, v19.8h, v16.8h
47
-
48
- movi v24.8h, #10
49
- uxtl v21.8h, v1.8b
50
- smull v19.4s, v21.4h, v24.4h
51
- smull2 v20.4s, v21.8h, v24.8h
52
-
53
- movi v16.8h, #17
54
- uxtl v23.8h, v2.8b
55
- smull v21.4s, v23.4h, v16.4h
56
- smull2 v22.4s, v23.8h, v16.8h
57
-
58
- movi v24.8h, #5
59
- uxtl v1.8h, v6.8b
60
- smull v23.4s, v1.4h, v24.4h
61
- smull2 v16.4s, v1.8h, v24.8h
62
-
63
- sub v17.4s, v17.4s, v19.4s
64
- sub v18.4s, v18.4s, v20.4s
65
-
66
- uxtl v1.8h, v4.8b
67
- sshll v19.4s, v1.4h, #2
68
- sshll2 v20.4s, v1.8h, #2
69
-
70
- add v17.4s, v17.4s, v21.4s
71
- add v18.4s, v18.4s, v22.4s
72
-
73
- uxtl v1.8h, v0.8b
74
- uxtl v2.8h, v3.8b
75
- ssubl v21.4s, v2.4h, v1.4h
76
- ssubl2 v22.4s, v2.8h, v1.8h
77
-
78
- add v17.4s, v17.4s, v19.4s
79
- add v18.4s, v18.4s, v20.4s
80
- sub v21.4s, v21.4s, v23.4s
81
- sub v22.4s, v22.4s, v16.4s
82
- add v17.4s, v17.4s, v21.4s
83
- add v18.4s, v18.4s, v22.4s
84
-.endm
85
-
86
-.macro qpel_filter_2_32b
87
- movi v16.4s, #11
88
- uxtl v19.8h, v5.8b
89
- uxtl v20.8h, v2.8b
90
- saddl v17.4s, v19.4h, v20.4h
91
- saddl2 v18.4s, v19.8h, v20.8h
92
-
93
- uxtl v21.8h, v1.8b
94
- uxtl v22.8h, v6.8b
95
- saddl v19.4s, v21.4h, v22.4h
96
- saddl2 v20.4s, v21.8h, v22.8h
97
-
98
- mul v19.4s, v19.4s, v16.4s
99
- mul v20.4s, v20.4s, v16.4s
100
-
101
- movi v16.4s, #40
102
- mul v17.4s, v17.4s, v16.4s
103
- mul v18.4s, v18.4s, v16.4s
104
-
105
- uxtl v21.8h, v4.8b
106
- uxtl v22.8h, v3.8b
107
- saddl v23.4s, v21.4h, v22.4h
108
- saddl2 v16.4s, v21.8h, v22.8h
109
-
110
- uxtl v1.8h, v0.8b
111
- uxtl v2.8h, v7.8b
112
- saddl v21.4s, v1.4h, v2.4h
113
- saddl2 v22.4s, v1.8h, v2.8h
114
-
115
- shl v23.4s, v23.4s, #2
116
- shl v16.4s, v16.4s, #2
117
-
118
- add v19.4s, v19.4s, v21.4s
119
- add v20.4s, v20.4s, v22.4s
120
- add v17.4s, v17.4s, v23.4s
121
- add v18.4s, v18.4s, v16.4s
122
- sub v17.4s, v17.4s, v19.4s
123
- sub v18.4s, v18.4s, v20.4s
124
-.endm
125
-
126
-.macro qpel_filter_3_32b
127
- movi v16.8h, #17
128
- movi v24.8h, #5
129
-
130
- uxtl v19.8h, v5.8b
131
- smull v17.4s, v19.4h, v16.4h
132
- smull2 v18.4s, v19.8h, v16.8h
133
-
134
- uxtl v21.8h, v1.8b
135
- smull v19.4s, v21.4h, v24.4h
136
- smull2 v20.4s, v21.8h, v24.8h
137
-
138
- movi v16.8h, #58
139
- uxtl v23.8h, v2.8b
140
- smull v21.4s, v23.4h, v16.4h
141
- smull2 v22.4s, v23.8h, v16.8h
142
-
143
- movi v24.8h, #10
144
- uxtl v1.8h, v6.8b
145
- smull v23.4s, v1.4h, v24.4h
146
- smull2 v16.4s, v1.8h, v24.8h
147
-
148
- sub v17.4s, v17.4s, v19.4s
149
- sub v18.4s, v18.4s, v20.4s
150
-
151
- uxtl v1.8h, v3.8b
152
- sshll v19.4s, v1.4h, #2
153
- sshll2 v20.4s, v1.8h, #2
154
-
155
- add v17.4s, v17.4s, v21.4s
156
- add v18.4s, v18.4s, v22.4s
157
-
158
- uxtl v1.8h, v4.8b
159
- uxtl v2.8h, v7.8b
160
- ssubl v21.4s, v1.4h, v2.4h
161
- ssubl2 v22.4s, v1.8h, v2.8h
162
-
163
- add v17.4s, v17.4s, v19.4s
164
- add v18.4s, v18.4s, v20.4s
165
- sub v21.4s, v21.4s, v23.4s
166
- sub v22.4s, v22.4s, v16.4s
167
- add v17.4s, v17.4s, v21.4s
168
- add v18.4s, v18.4s, v22.4s
169
-.endm
170
-
171
-
172
-
173
-
174
-.macro vextin8
175
- ld1 {v3.16b}, x11, #16
176
- mov v7.d0, v3.d1
177
- ext v0.8b, v3.8b, v7.8b, #1
178
- ext v4.8b, v3.8b, v7.8b, #2
179
- ext v1.8b, v3.8b, v7.8b, #3
180
- ext v5.8b, v3.8b, v7.8b, #4
181
- ext v2.8b, v3.8b, v7.8b, #5
182
- ext v6.8b, v3.8b, v7.8b, #6
183
- ext v3.8b, v3.8b, v7.8b, #7
184
-.endm
185
-
186
-
187
-
188
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
189
-.macro HPS_FILTER a b filterhps
190
- mov w12, #8192
191
- mov w6, w10
192
- sub x3, x3, #\a
193
- lsl x3, x3, #1
194
- mov w9, #\a
195
- cmp w9, #4
196
- b.eq 14f
197
- cmp w9, #12
198
- b.eq 15f
199
- b 7f
200
-14:
201
- HPS_FILTER_4 \a \b \filterhps
202
- b 10f
203
-15:
204
- HPS_FILTER_12 \a \b \filterhps
205
- b 10f
206
-7:
207
- cmp w5, #0
208
- b.eq 8f
209
- cmp w5, #1
210
- b.eq 9f
211
-8:
212
-loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
213
- mov w7, #\a
214
- lsr w7, w7, #3
215
- mov x11, x0
216
- sub x11, x11, #4
217
-loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
218
- vextin8
219
- \filterhps
220
- dup v16.4s, w12
221
- sub v17.4s, v17.4s, v16.4s
222
- sub v18.4s, v18.4s, v16.4s
223
- xtn v0.4h, v17.4s
224
- xtn2 v0.8h, v18.4s
225
- st1 {v0.8h}, x2, #16
226
- subs w7, w7, #1
227
- sub x11, x11, #8
228
- b.ne loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
229
- subs w6, w6, #1
230
- add x0, x0, x1
231
- add x2, x2, x3
232
- b.ne loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
233
- b 10f
234
-9:
235
-loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
236
- mov w7, #\a
237
- lsr w7, w7, #3
238
- mov x11, x0
239
- sub x11, x11, #4
240
-loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
241
- vextin8
242
- \filterhps
243
- dup v16.4s, w12
244
- sub v17.4s, v17.4s, v16.4s
245
- sub v18.4s, v18.4s, v16.4s
246
- xtn v0.4h, v17.4s
247
- xtn2 v0.8h, v18.4s
248
- st1 {v0.8h}, x2, #16
249
- subs w7, w7, #1
250
- sub x11, x11, #8
251
- b.ne loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
252
- subs w6, w6, #1
253
- add x0, x0, x1
254
- add x2, x2, x3
255
- b.ne loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
256
-10:
257
-.endm
258
-
259
-.macro HPS_FILTER_4 w h filterhps
260
- cmp w5, #0
261
- b.eq 11f
262
- cmp w5, #1
263
- b.eq 12f
264
-11:
265
-loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
266
- mov x11, x0
267
- sub x11, x11, #4
268
- vextin8
269
- \filterhps
270
- dup v16.4s, w12
271
- sub v17.4s, v17.4s, v16.4s
272
- xtn v0.4h, v17.4s
273
- st1 {v0.4h}, x2, #8
274
- sub x11, x11, #8
275
- subs w6, w6, #1
276
- add x0, x0, x1
277
- add x2, x2, x3
278
- b.ne loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
279
- b 13f
280
-12:
281
-loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
282
- mov x11, x0
283
- sub x11, x11, #4
284
- vextin8
285
- \filterhps
286
- dup v16.4s, w12
287
- sub v17.4s, v17.4s, v16.4s
288
- xtn v0.4h, v17.4s
289
- st1 {v0.4h}, x2, #8
290
- sub x11, x11, #8
291
- subs w6, w6, #1
292
- add x0, x0, x1
293
- add x2, x2, x3
294
- b.ne loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
295
-13:
296
-.endm
297
-
298
-.macro HPS_FILTER_12 w h filterhps
299
- cmp w5, #0
300
- b.eq 14f
301
- cmp w5, #1
302
- b.eq 15f
303
-14:
304
-loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
305
- mov x11, x0
306
- sub x11, x11, #4
307
- vextin8
308
- \filterhps
309
- dup v16.4s, w12
310
- sub v17.4s, v17.4s, v16.4s
311
- sub v18.4s, v18.4s, v16.4s
312
- xtn v0.4h, v17.4s
313
- xtn2 v0.8h, v18.4s
314
- st1 {v0.8h}, x2, #16
315
- sub x11, x11, #8
316
-
317
- vextin8
318
- \filterhps
319
- dup v16.4s, w12
320
- sub v17.4s, v17.4s, v16.4s
321
- xtn v0.4h, v17.4s
322
- st1 {v0.4h}, x2, #8
323
- add x2, x2, x3
324
- subs w6, w6, #1
325
- add x0, x0, x1
326
- b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
327
- b 16f
328
-15:
329
-loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
330
- mov x11, x0
331
- sub x11, x11, #4
332
- vextin8
333
- \filterhps
334
- dup v16.4s, w12
335
- sub v17.4s, v17.4s, v16.4s
336
- sub v18.4s, v18.4s, v16.4s
337
- xtn v0.4h, v17.4s
338
- xtn2 v0.8h, v18.4s
339
- st1 {v0.8h}, x2, #16
340
- sub x11, x11, #8
341
-
342
- vextin8
343
- \filterhps
344
- dup v16.4s, w12
345
- sub v17.4s, v17.4s, v16.4s
346
- xtn v0.4h, v17.4s
347
- st1 {v0.4h}, x2, #8
348
- add x2, x2, x3
349
- subs w6, w6, #1
350
- add x0, x0, x1
351
- b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
352
-16:
353
-.endm
354
-
355
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
356
-.macro LUMA_HPS w h
357
-function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
358
- mov w10, #\h
359
- cmp w5, #0
360
- b.eq 6f
361
- sub x0, x0, x1, lsl #2
362
-
363
- add x0, x0, x1
364
- add w10, w10, #7
365
-6:
366
- cmp w4, #0
367
- b.eq 0f
368
- cmp w4, #1
369
- b.eq 1f
370
- cmp w4, #2
371
- b.eq 2f
372
- cmp w4, #3
373
- b.eq 3f
374
-0:
375
- HPS_FILTER \w \h qpel_filter_0_32b
376
- b 5f
377
-1:
378
- HPS_FILTER \w \h qpel_filter_1_32b
379
- b 5f
380
-2:
381
- HPS_FILTER \w \h qpel_filter_2_32b
382
- b 5f
383
-3:
384
- HPS_FILTER \w \h qpel_filter_3_32b
385
- b 5f
386
-5:
387
- ret
388
-endfunc
389
-.endm
390
-
391
-LUMA_HPS 4 4
392
-LUMA_HPS 4 8
393
-LUMA_HPS 4 16
394
-LUMA_HPS 8 4
395
-LUMA_HPS 8 8
396
-LUMA_HPS 8 16
397
-LUMA_HPS 8 32
398
-LUMA_HPS 12 16
399
-LUMA_HPS 16 4
400
-LUMA_HPS 16 8
401
-LUMA_HPS 16 12
402
-LUMA_HPS 16 16
403
-LUMA_HPS 16 32
404
-LUMA_HPS 16 64
405
-LUMA_HPS 24 32
406
-LUMA_HPS 32 8
407
-LUMA_HPS 32 16
408
-LUMA_HPS 32 24
409
-LUMA_HPS 32 32
410
-LUMA_HPS 32 64
411
-LUMA_HPS 48 64
412
-LUMA_HPS 64 16
413
-LUMA_HPS 64 32
414
-LUMA_HPS 64 48
415
-LUMA_HPS 64 64
416
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.h
Deleted
57
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_IPFILTER8_AARCH64_H
26
-#define X265_IPFILTER8_AARCH64_H
27
-
28
-
29
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
30
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
31
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
32
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
33
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
34
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
35
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
36
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
37
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
38
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
39
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
40
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
41
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
42
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
43
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
44
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
45
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
46
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
47
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
48
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
49
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
50
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
51
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
52
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
53
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
54
-
55
-
56
-#endif // ifndef X265_IPFILTER8_AARCH64_H
57
x265_3.5.tar.gz/source/common/aarch64/pixel-util.h
Deleted
42
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- * Hongbin Liu <liuhongbin1@huawei.com>
7
- *
8
- * This program is free software; you can redistribute it and/or modify
9
- * it under the terms of the GNU General Public License as published by
10
- * the Free Software Foundation; either version 2 of the License, or
11
- * (at your option) any later version.
12
- *
13
- * This program is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- * GNU General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU General Public License
19
- * along with this program; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
- *
22
- * This program is also available under a commercial proprietary license.
23
- * For more information, contact us at license @ x265.com.
24
- *****************************************************************************/
25
-
26
-#ifndef X265_PIXEL_UTIL_AARCH64_H
27
-#define X265_PIXEL_UTIL_AARCH64_H
28
-
29
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
30
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
31
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
32
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
33
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
34
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
35
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
36
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
37
-
38
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
39
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
40
-
41
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
42
x265_3.5.tar.gz/source/common/aarch64/pixel.h
Deleted
107
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_I386_PIXEL_AARCH64_H
26
-#define X265_I386_PIXEL_AARCH64_H
27
-
28
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
29
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
30
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
31
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
32
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
33
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
34
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
35
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
36
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
37
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
38
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
39
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
40
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
41
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
42
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
43
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
44
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
45
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
46
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
47
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
48
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
49
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
50
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
51
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
52
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
53
-
54
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
55
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
56
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
57
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
58
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
59
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
60
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
61
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
62
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
63
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
64
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
65
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
66
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
67
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
68
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
69
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
70
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
71
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
72
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
73
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
74
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
75
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
76
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
77
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
78
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
79
-
80
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
81
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
82
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
83
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
84
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
85
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
86
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
87
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
88
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
89
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
90
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
91
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
92
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
93
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
94
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
95
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
96
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
97
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
98
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
99
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
100
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
101
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
102
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
103
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
104
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
105
-
106
-#endif // ifndef X265_I386_PIXEL_AARCH64_H
107
x265_3.6.tar.gz/.gitignore
Added
38
1
2
+# Prerequisites
3
+*.d
4
+
5
+# Compiled Object files
6
+*.slo
7
+*.lo
8
+*.o
9
+*.obj
10
+
11
+# Precompiled Headers
12
+*.gch
13
+*.pch
14
+
15
+# Compiled Dynamic libraries
16
+*.so
17
+*.dylib
18
+*.dll
19
+
20
+# Fortran module files
21
+*.mod
22
+*.smod
23
+
24
+# Compiled Static libraries
25
+*.lai
26
+*.la
27
+*.a
28
+*.lib
29
+
30
+# Executables
31
+*.exe
32
+*.out
33
+*.app
34
+
35
+# Build directory
36
+build/
37
+
38
x265_3.5.tar.gz/build/README.txt -> x265_3.6.tar.gz/build/README.txt
Changed
37
1
2
3
Note: MSVC12 requires cmake 2.8.11 or later
4
5
+Note: When the SVE/SVE2 instruction set of Arm AArch64 architecture is to be used, the GCC10.x and onwards must
6
+ be installed in order to compile x265.
7
+
8
9
= Optional Prerequisites =
10
11
12
building out of a Mercurial source repository. If you are building out of
13
a release source package, the version will not change. If Mercurial is not
14
found, the version will be "unknown".
15
+
16
+= Build Instructions for cross-compilation for Arm AArch64 Targets=
17
+
18
+When the target platform is based on Arm AArch64 architecture, the x265 can be
19
+built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
20
+environment variables should be set to point to the cross compilers of the
21
+appropriate gcc. For example:
22
+
23
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
24
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
25
+
26
+The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
27
+Then, the normal building process can be followed.
28
+
29
+Moreover, if the target platform supports SVE or SVE2 instruction set, the
30
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
31
+to true, respectively. For example:
32
+
33
+1. export CROSS_COMPILE_SVE2=true
34
+2. export CROSS_COMPILE_SVE=true
35
+
36
+Then, the normal building process can be followed.
37
x265_3.6.tar.gz/build/aarch64-darwin
Added
2
1
+(directory)
2
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake
Added
25
1
2
+# CMake toolchain file for cross compiling x265 for aarch64
3
+# This feature is only supported as experimental. Use with caution.
4
+# Please report bugs on bitbucket
5
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+
7
+set(CROSS_COMPILE_ARM64 1)
8
+set(CMAKE_SYSTEM_NAME Darwin)
9
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
10
+
11
+# specify the cross compiler
12
+set(CMAKE_C_COMPILER gcc-12)
13
+set(CMAKE_CXX_COMPILER g++-12)
14
+
15
+# specify the target environment
16
+SET(CMAKE_FIND_ROOT_PATH /opt/homebrew/bin/)
17
+
18
+# specify whether SVE/SVE2 is supported by the target platform
19
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
20
+ set(CROSS_COMPILE_SVE2 1)
21
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
22
+ set(CROSS_COMPILE_SVE 1)
23
+endif()
24
+
25
x265_3.6.tar.gz/build/aarch64-darwin/make-Makefiles.bash
Added
6
1
2
+#!/bin/bash
3
+# Run this from within a bash shell
4
+
5
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
6
x265_3.5.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake
Changed
34
1
2
# Please report bugs on bitbucket
3
# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
4
5
-set(CROSS_COMPILE_ARM 1)
6
+set(CROSS_COMPILE_ARM64 1)
7
set(CMAKE_SYSTEM_NAME Linux)
8
set(CMAKE_SYSTEM_PROCESSOR aarch64)
9
10
# specify the cross compiler
11
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
12
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
13
+if(DEFINED ENV{CMAKE_C_COMPILER})
14
+ set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
15
+else()
16
+ set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
17
+endif()
18
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
19
+ set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
20
+else()
21
+ set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
22
+endif()
23
24
# specify the target environment
25
SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
26
+
27
+# specify whether SVE/SVE2 is supported by the target platform
28
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
29
+ set(CROSS_COMPILE_SVE2 1)
30
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
31
+ set(CROSS_COMPILE_SVE 1)
32
+endif()
33
+
34
x265_3.5.tar.gz/build/arm-linux/make-Makefiles.bash -> x265_3.6.tar.gz/build/arm-linux/make-Makefiles.bash
Changed
7
1
2
#!/bin/bash
3
# Run this from within a bash shell
4
5
-cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
7
x265_3.5.tar.gz/doc/reST/cli.rst -> x265_3.6.tar.gz/doc/reST/cli.rst
Changed
405
1
2
auto-detection by the encoder. If specified, the encoder will
3
attempt to bring the encode specifications within that specified
4
level. If the encoder is unable to reach the level it issues a
5
- warning and aborts the encode. If the requested requirement level is
6
- higher than the actual level, the actual requirement level is
7
- signaled.
8
+ warning and aborts the encode. The requested level will be signaled
9
+ in the bitstream even if it is higher than the actual level.
10
11
Beware, specifying a decoder level will force the encoder to enable
12
VBV for constant rate factor encodes, which may introduce
13
14
(main, main10, etc). Second, an encoder is created from this
15
x265_param instance and the :option:`--level-idc` and
16
:option:`--high-tier` parameters are used to reduce bitrate or other
17
- features in order to enforce the target level. Finally, the encoder
18
- re-examines the final set of parameters and detects the actual
19
- minimum decoder requirement level and this is what is signaled in
20
- the bitstream headers. The detected decoder level will only use High
21
- tier if the user specified a High tier level.
22
+ features in order to enforce the target level. The detected decoder level
23
+ will only use High tier if the user specified a High tier level.
24
25
The signaled profile will be determined by the encoder's internal
26
bitdepth and input color space. If :option:`--keyint` is 0 or 1,
27
28
Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
29
with :option:`--analysis-save` and :option:`--analysis-load` respectively.
30
31
- +--------------+------------------------------------------+
32
- | Level | Description |
33
- +==============+==========================================+
34
- | 1 | Lookahead information |
35
- +--------------+------------------------------------------+
36
- | 2 to 4 | Level 1 + intra/inter modes, ref's |
37
- +--------------+------------------------------------------+
38
- | 5 and 6 | Level 2 + rect-amp |
39
- +--------------+------------------------------------------+
40
- | 7 | Level 5 + AVC size CU refinement |
41
- +--------------+------------------------------------------+
42
- | 8 and 9 | Level 5 + AVC size Full CU analysis-info |
43
- +--------------+------------------------------------------+
44
- | 10 | Level 5 + Full CU analysis-info |
45
- +--------------+------------------------------------------+
46
+ +--------------+---------------------------------------------------+
47
+ | Level | Description |
48
+ +==============+===================================================+
49
+ | 1 | Lookahead information |
50
+ +--------------+---------------------------------------------------+
51
+ | 2 to 4 | Level 1 + intra/inter modes, depth, ref's, cutree |
52
+ +--------------+---------------------------------------------------+
53
+ | 5 and 6 | Level 2 + rect-amp |
54
+ +--------------+---------------------------------------------------+
55
+ | 7 | Level 5 + AVC size CU refinement |
56
+ +--------------+---------------------------------------------------+
57
+ | 8 and 9 | Level 5 + AVC size Full CU analysis-info |
58
+ +--------------+---------------------------------------------------+
59
+ | 10 | Level 5 + Full CU analysis-info |
60
+ +--------------+---------------------------------------------------+
61
62
.. option:: --refine-mv-type <string>
63
64
65
Search range for HME level 0, 1 and 2.
66
The Search Range for each HME level must be between 0 and 32768(excluding).
67
Default search range is 16,32,48 for level 0,1,2 respectively.
68
+
69
+.. option:: --mcstf, --no-mcstf
70
+
71
+ Enable Motion Compensated Temporal filtering.
72
+ Default: disabled
73
74
Spatial/intra options
75
=====================
76
77
78
.. option:: --hist-scenecut, --no-hist-scenecut
79
80
- Indicates that scenecuts need to be detected using luma edge and chroma histograms.
81
- :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
82
- :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
83
-
84
-.. option:: --hist-threshold <0.0..1.0>
85
-
86
- This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
87
- This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value
88
- greater than 0.2 against the previous frame as scenecut.
89
- Increasing the threshold reduces the number of scenecuts detected.
90
- Default 0.03.
91
+ Scenecuts detected based on histogram, intensity and variance of the picture.
92
+ :option:`--hist-scenecut` enables or :option:`--no-hist-scenecut` disables scenecut detection based on
93
+ histogram.
94
95
.. option:: --radl <integer>
96
97
98
Default 1.0.
99
**Range of values:** 0.0 to 3.0
100
101
+.. option:: --sbrc --no-sbrc
102
+
103
+ To enable and disable segment based rate control. Segment duration depends on the
104
+ keyframe interval specified. If unspecified, the default keyframe interval will be used.
105
+ Default: disabled.
106
+
107
.. option:: --hevc-aq
108
109
Enable adaptive quantization
110
111
112
**CLI ONLY**
113
114
+.. option:: --scenecut-qp-config <filename>
115
+
116
+ Specify a text file which contains the scenecut aware QP options.
117
+ The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`
118
+
119
+ **CLI ONLY**
120
+
121
.. option:: --scenecut-aware-qp <integer>
122
123
It reduces the bits spent on the inter-frames within the scenecut window
124
before and after a scenecut by increasing their QP in ratecontrol pass2 algorithm
125
- without any deterioration in visual quality. If a scenecut falls within the window,
126
- the QP of the inter-frames after this scenecut will not be modified.
127
+ without any deterioration in visual quality.
128
:option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
129
130
+-------+---------------------------------------------------------------+
131
132
for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
133
is enabled.
134
135
- When :option:`--scenecut-aware-qp` is::
136
+ When :option:`--scenecut-aware-qp` is:
137
+
138
* 1 (Forward masking):
139
- --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
140
+ --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
141
+ or
142
+ --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
143
+ fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
144
+ fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
145
* 2 (Backward masking):
146
- --masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
147
+ --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
148
+ or
149
+ --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
150
+ bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
151
+ bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
152
* 3 (Bi-directional masking):
153
- --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
154
+ --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
155
+ or
156
+ --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
157
+ fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
158
+ fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
159
+ bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
160
+ bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
161
+ bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
162
163
+-----------------+---------------------------------------------------------------+
164
| Parameter | Description |
165
+=================+===============================================================+
166
- | fwdWindow | The duration(in milliseconds) for which there is a reduction |
167
- | | in the bits spent on the inter-frames after a scenecut by |
168
- | | increasing their QP. Default 500ms. |
169
- | | **Range of values:** 0 to 1000 |
170
+ | fwdMaxWindow | The maximum duration(in milliseconds) for which there is a |
171
+ | | reduction in the bits spent on the inter-frames after a |
172
+ | | scenecut by increasing their QP. Default 500ms. |
173
+ | | **Range of values:** 0 to 2000 |
174
+ +-----------------+---------------------------------------------------------------+
175
+ | fwdWindow | The duration of a sub-window(in milliseconds) for which there |
176
+ | | is a reduction in the bits spent on the inter-frames after a |
177
+ | | scenecut by increasing their QP. Default 500ms. |
178
+ | | **Range of values:** 0 to 2000 |
179
+-----------------+---------------------------------------------------------------+
180
| fwdRefQPDelta | The offset by which QP is incremented for inter-frames |
181
| | after a scenecut. Default 5. |
182
- | | **Range of values:** 0 to 10 |
183
+ | | **Range of values:** 0 to 20 |
184
+-----------------+---------------------------------------------------------------+
185
| fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced |
186
| | inter-frames after a scenecut. The offset is computed from |
187
| | fwdRefQPDelta when it is not explicitly specified. |
188
- | | **Range of values:** 0 to 10 |
189
+ | | **Range of values:** 0 to 20 |
190
+ +-----------------+---------------------------------------------------------------+
191
+ | bwdMaxWindow | The maximum duration(in milliseconds) for which there is a |
192
+ | | reduction in the bits spent on the inter-frames before a |
193
+ | | scenecut by increasing their QP. Default 100ms. |
194
+ | | **Range of values:** 0 to 2000 |
195
+-----------------+---------------------------------------------------------------+
196
- | bwdWindow | The duration(in milliseconds) for which there is a reduction |
197
- | | in the bits spent on the inter-frames before a scenecut by |
198
- | | increasing their QP. Default 100ms. |
199
- | | **Range of values:** 0 to 1000 |
200
+ | bwdWindow | The duration of a sub-window(in milliseconds) for which there |
201
+ | | is a reduction in the bits spent on the inter-frames before a |
202
+ | | scenecut by increasing their QP. Default 100ms. |
203
+ | | **Range of values:** 0 to 2000 |
204
+-----------------+---------------------------------------------------------------+
205
| bwdRefQPDelta | The offset by which QP is incremented for inter-frames |
206
| | before a scenecut. The offset is computed from |
207
| | fwdRefQPDelta when it is not explicitly specified. |
208
- | | **Range of values:** 0 to 10 |
209
+ | | **Range of values:** 0 to 20 |
210
+-----------------+---------------------------------------------------------------+
211
| bwdNonRefQPDelta| The offset by which QP is incremented for non-referenced |
212
| | inter-frames before a scenecut. The offset is computed from |
213
| | bwdRefQPDelta when it is not explicitly specified. |
214
- | | **Range of values:** 0 to 10 |
215
+ | | **Range of values:** 0 to 20 |
216
+-----------------+---------------------------------------------------------------+
217
218
- **CLI ONLY**
219
+ We can specify the value for the :option:`--masking-strength` parameter in different formats.
220
+ 1. If we don't specify --masking-strength and specify only --scenecut-aware-qp, then default offset and window size values are considered.
221
+ 2. If we specify --masking-strength with the format 1 mentioned above, the values of window, refQpDelta and nonRefQpDelta given by the user are taken for window 1 and the offsets for the remaining windows are derived with 15% difference between windows.
222
+ 3. If we specify the --masking-strength with the format 2 mentioned above, the values of window, refQpDelta and nonRefQpDelta given by the user for each window from 1 to 6 are directly used. NOTE: We can use this format to specify zero offsets for any particular window
223
+
224
+ Sample config file:: (Format 2 Forward masking explained here)
225
+
226
+ --scenecut-aware-qp 1 --masking-strength 1000,8,12
227
+
228
+ The above sample config file is available in `the downloads page <https://bitbucket.org/multicoreware/x265_git/downloads/scenecut_qp_config.txt>`_
229
230
.. option:: --vbv-live-multi-pass, --no-vbv-live-multi-pass
231
232
233
rate control mode.
234
235
Default disabled. **Experimental feature**
236
+
237
+
238
+.. option:: bEncFocusedFramesOnly
239
+
240
+ Used to trigger encoding of selective GOPs; Disabled by default.
241
+
242
+ **API ONLY**
243
+
244
245
Quantization Options
246
====================
247
248
Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.
249
Required for HLG (Hybrid Log Gamma) signaling. Not signaled by default.
250
251
+.. option:: --video-signal-type-preset <string>
252
+
253
+ Specify combinations of color primaries, transfer characteristics, color matrix,
254
+ range of luma and chroma signals, and chroma sample location.
255
+ String format: <system-id>:<color-volume>
256
+
257
+ This has higher precedence than individual VUI parameters. If any individual VUI option
258
+ is specified together with this, which changes the values set corresponding to the system-id
259
+ or color-volume, it will be discarded.
260
+
261
+ system-id options and their corresponding values:
262
+ +----------------+---------------------------------------------------------------+
263
+ | system-id | Value |
264
+ +================+===============================================================+
265
+ | BT601_525 | --colorprim smpte170m --transfer smpte170m |
266
+ | | --colormatrix smpte170m --range limited --chromaloc 0 |
267
+ +----------------+---------------------------------------------------------------+
268
+ | BT601_626 | --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg|
269
+ | | --range limited --chromaloc 0 |
270
+ +----------------+---------------------------------------------------------------+
271
+ | BT709_YCC | --colorprim bt709 --transfer bt709 --colormatrix bt709 |
272
+ | | --range limited --chromaloc 0 |
273
+ +----------------+---------------------------------------------------------------+
274
+ | BT709_RGB | --colorprim bt709 --transfer bt709 --colormatrix gbr |
275
+ | | --range limited |
276
+ +----------------+---------------------------------------------------------------+
277
+ | BT2020_YCC_NCL | --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 |
278
+ | | --range limited --chromaloc 2 |
279
+ +----------------+---------------------------------------------------------------+
280
+ | BT2020_RGB | --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc|
281
+ | | --range limited |
282
+ +----------------+---------------------------------------------------------------+
283
+ | BT2100_PQ_YCC | --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc|
284
+ | | --range limited --chromaloc 2 |
285
+ +----------------+---------------------------------------------------------------+
286
+ | BT2100_PQ_ICTCP| --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp |
287
+ | | --range limited --chromaloc 2 |
288
+ +----------------+---------------------------------------------------------------+
289
+ | BT2100_PQ_RGB | --colorprim bt2020 --transfer smpte2084 --colormatrix gbr |
290
+ | | --range limited |
291
+ +----------------+---------------------------------------------------------------+
292
+ | BT2100_HLG_YCC | --colorprim bt2020 --transfer arib-std-b67 |
293
+ | | --colormatrix bt2020nc --range limited --chromaloc 2 |
294
+ +----------------+---------------------------------------------------------------+
295
+ | BT2100_HLG_RGB | --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr |
296
+ | | --range limited |
297
+ +----------------+---------------------------------------------------------------+
298
+ | FR709_RGB | --colorprim bt709 --transfer bt709 --colormatrix gbr |
299
+ | | --range full |
300
+ +----------------+---------------------------------------------------------------+
301
+ | FR2020_RGB | --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr |
302
+ | | --range full |
303
+ +----------------+---------------------------------------------------------------+
304
+ | FRP3D65_YCC | --colorprim smpte432 --transfer bt709 --colormatrix smpte170m |
305
+ | | --range full --chromaloc 1 |
306
+ +----------------+---------------------------------------------------------------+
307
+
308
+ color-volume options and their corresponding values:
309
+ +----------------+---------------------------------------------------------------+
310
+ | color-volume | Value |
311
+ +================+===============================================================+
312
+ | P3D65x1000n0005| --master-display G(13250,34500)B(7500,3000)R(34000,16000) |
313
+ | | WP(15635,16450)L(10000000,5) |
314
+ +----------------+---------------------------------------------------------------+
315
+ | P3D65x4000n005 | --master-display G(13250,34500)B(7500,3000)R(34000,16000) |
316
+ | | WP(15635,16450)L(40000000,50) |
317
+ +----------------+---------------------------------------------------------------+
318
+ | BT2100x108n0005| --master-display G(8500,39850)B(6550,2300)R(34000,146000) |
319
+ | | WP(15635,16450)L(10000000,1) |
320
+ +----------------+---------------------------------------------------------------+
321
+
322
+ Note: The color-volume options can be used only with the system-id options BT2100_PQ_YCC,
323
+ BT2100_PQ_ICTCP, and BT2100_PQ_RGB. It is incompatible with other options.
324
+
325
+
326
Bitstream options
327
=================
328
329
330
the very first AUD will be skipped since it cannot be placed at the
331
start of the access unit, where it belongs. Default disabled
332
333
+.. option:: --eob, --no-eob
334
+
335
+ Emit an end of bitstream NAL unit at the end of the bitstream.
336
+ Default disabled
337
+
338
+.. option:: --eos, --no-eos
339
+
340
+ Emit an end of sequence NAL unit at the end of every coded
341
+ video sequence. Default disabled
342
+
343
.. option:: --hrd, --no-hrd
344
345
Enable the signaling of HRD parameters to the decoder. The HRD
346
347
The value is specified as a float or as an integer with the profile times 10,
348
for example profile 5 is specified as "5" or "5.0" or "50".
349
350
- Currently only profile 5, profile 8.1 and profile 8.2 enabled, Default 0 (disabled)
351
+ Currently only profile 5, profile 8.1, profile 8.2 and profile 8.4 enabled, Default 0 (disabled)
352
353
.. option:: --dolby-vision-rpu <filename>
354
355
356
2. CRC
357
3. Checksum
358
359
-.. option:: --temporal-layers,--no-temporal-layers
360
+.. option:: --temporal-layers <integer>
361
362
- Enable a temporal sub layer. All referenced I/P/B frames are in the
363
- base layer and all unreferenced B frames are placed in a temporal
364
- enhancement layer. A decoder may choose to drop the enhancement layer
365
- and only decode and display the base layer slices.
366
-
367
- If used with a fixed GOP (:option:`--b-adapt` 0) and :option:`--bframes`
368
- 3 then the two layers evenly split the frame rate, with a cadence of
369
- PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
370
- interval that is a multiple of 4.
371
+ Enable specified number of temporal sub layers. For any frame in layer N,
372
+ all referenced frames are in the layer N or N-1.A decoder may choose to drop the enhancement layer
373
+ and only decode and display the base layer slices.Allowed number of temporal sub-layers
374
+ are 2 to 5.(2 and 5 inclusive)
375
+
376
+ When enabled,temporal layers 3 through 5 configures a fixed miniGOP with the number of bframes as shown below
377
+ unless miniGOP size is modified due to lookahead decisions.Temporal layer 2 is a special case that has
378
+ all reference frames in base layer and non-reference frames in enhancement layer without any constraint on the
379
+ number of bframes.Default disabled.
380
+ +----------------+--------+
381
+ | temporal layer | bframes|
382
+ +================+========+
383
+ | 3 | 3 |
384
+ +----------------+--------+
385
+ | 4 | 7 |
386
+ +----------------+--------+
387
+ | 5 | 15 |
388
+ +----------------+--------+
389
390
.. option:: --log2-max-poc-lsb <integer>
391
392
393
Emit SEI messages in a single NAL unit instead of multiple NALs. Default disabled.
394
When HRD SEI is enabled the HM decoder will throw a warning.
395
396
+.. option:: --film-grain <filename>
397
+
398
+ Refers to the film grain model characteristics for signal enhancement information transmission
399
+
400
+ **CLI_ONLY**
401
+
402
DCT Approximations
403
=================
404
405
x265_3.5.tar.gz/doc/reST/introduction.rst -> x265_3.6.tar.gz/doc/reST/introduction.rst
Changed
9
1
2
to start is with the `Motion Picture Experts Group - Licensing Authority
3
- HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
4
5
-x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is
6
+x265 is a registered trademark of MulticoreWare, Inc. The X265 logo is
7
a trademark of MulticoreWare, and may only be used with explicit written
8
permission. All rights reserved.
9
x265_3.5.tar.gz/doc/reST/releasenotes.rst -> x265_3.6.tar.gz/doc/reST/releasenotes.rst
Changed
55
1
2
Release Notes
3
*************
4
5
+Version 3.6
6
+===========
7
+
8
+Release date - 4th April, 2024.
9
+
10
+New feature
11
+-----------
12
+1. Segment based Ratecontrol (SBRC) feature
13
+2. Motion-Compensated Spatio-Temporal Filtering
14
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
15
+4. Histogram-Based Scene Change Detection
16
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
17
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
18
+
19
+Enhancements to existing features
20
+---------------------------------
21
+1. Added Dolby Vision 8.4 Profile Support
22
+
23
+
24
+API changes
25
+-----------
26
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
27
+2. Add command line parameter for mcstf feature: "--no-mctf".
28
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
29
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
30
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
31
+6. cli: add new option --cra-nal (Force nal type to CRA to all frames expect for the first frame, works only with keyint 1)
32
+
33
+Optimizations
34
+---------------------
35
+ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
36
+SVE/SVE2 optimizations
37
+
38
+
39
+Bug fixes
40
+---------
41
+1. Linux bug to utilize all the cores
42
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
43
+3. 32bit and 64bit builds generation for ARM
44
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
45
+5. Add x86 ASM implementation for subsampling luma
46
+6. Fix for abrladder segfault with load reuse level 1
47
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame
48
+8. Add MacOS aarch64 build support
49
+9. Fix boundary condition issue for Gaussian filter
50
+
51
+
52
Version 3.5
53
===========
54
55
x265_3.5.tar.gz/readme.rst -> x265_3.6.tar.gz/readme.rst
Changed
10
1
2
x265 HEVC Encoder
3
=================
4
5
-| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
6
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/master/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265_git/wiki/>`_
7
| **Download:** | `releases <http://ftp.videolan.org/pub/videolan/x265/>`_
8
| **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
9
10
x265_3.5.tar.gz/source/CMakeLists.txt -> x265_3.6.tar.gz/source/CMakeLists.txt
Changed
232
1
2
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
# X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 199)
6
+set(X265_BUILD 209)
7
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
"${PROJECT_BINARY_DIR}/x265.def")
9
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
11
SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
12
13
# System architecture detection
14
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
15
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
16
+ string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
17
+else()
18
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
19
+endif()
20
set(X86_ALIASES x86 i386 i686 x86_64 amd64)
21
-set(ARM_ALIASES armv6l armv7l aarch64)
22
+set(ARM_ALIASES armv6l armv7l)
23
+set(ARM64_ALIASES arm64 arm64e aarch64)
24
list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
25
list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
26
-set(POWER_ALIASES ppc64 ppc64le)
27
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
28
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
29
list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
30
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
31
+if(X86MATCH GREATER "-1")
32
set(X86 1)
33
add_definitions(-DX265_ARCH_X86=1)
34
if(CMAKE_CXX_FLAGS STREQUAL "-m32")
35
36
else()
37
set(CROSS_COMPILE_ARM 0)
38
endif()
39
+ message(STATUS "Detected ARM target processor")
40
set(ARM 1)
41
- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
42
- message(STATUS "Detected ARM64 target processor")
43
- set(ARM64 1)
44
- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
45
- else()
46
- message(STATUS "Detected ARM target processor")
47
- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
48
- endif()
49
+ add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
50
+elseif(ARM64MATCH GREATER "-1")
51
+ #if(CROSS_COMPILE_ARM64)
52
+ #message(STATUS "Cross compiling for ARM64 arch")
53
+ #else()
54
+ #set(CROSS_COMPILE_ARM64 0)
55
+ #endif()
56
+ message(STATUS "Detected ARM64 target processor")
57
+ set(ARM64 1)
58
+ add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
59
else()
60
message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
61
message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
62
63
endif()
64
endif()
65
if(ARM AND CROSS_COMPILE_ARM)
66
- if(ARM64)
67
- set(ARM_ARGS -fPIC)
68
- else()
69
- set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
70
- endif()
71
message(STATUS "cross compile arm")
72
+ set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
73
elseif(ARM)
74
- if(ARM64)
75
- set(ARM_ARGS -fPIC)
76
+ find_package(Neon)
77
+ if(CPU_HAS_NEON)
78
+ set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
79
add_definitions(-DHAVE_NEON)
80
else()
81
- find_package(Neon)
82
- if(CPU_HAS_NEON)
83
- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
84
- add_definitions(-DHAVE_NEON)
85
- else()
86
- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
87
- endif()
88
+ set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
89
endif()
90
endif()
91
+ if(ARM64 OR CROSS_COMPILE_ARM64)
92
+ find_package(Neon)
93
+ find_package(SVE)
94
+ find_package(SVE2)
95
+ if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
96
+ message(STATUS "Found SVE2")
97
+ set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
98
+ add_definitions(-DHAVE_SVE2)
99
+ add_definitions(-DHAVE_SVE)
100
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
101
+ elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
102
+ message(STATUS "Found SVE")
103
+ set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
104
+ add_definitions(-DHAVE_SVE)
105
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
106
+ elseif(CPU_HAS_NEON)
107
+ message(STATUS "Found NEON")
108
+ set(ARM_ARGS -fPIC -flax-vector-conversions)
109
+ add_definitions(-DHAVE_NEON)
110
+ else()
111
+ set(ARM_ARGS -fPIC -flax-vector-conversions)
112
+ endif()
113
+ endif()
114
+ if(ENABLE_PIC)
115
+ list(APPEND ARM_ARGS -DPIC)
116
+ endif()
117
add_definitions(${ARM_ARGS})
118
if(FPROFILE_GENERATE)
119
if(INTEL_CXX)
120
121
endif(GCC)
122
123
find_package(Nasm)
124
-if(ARM OR CROSS_COMPILE_ARM)
125
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
126
option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
127
elseif(NASM_FOUND AND X86)
128
if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
129
130
endif(EXTRA_LIB)
131
mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
132
133
-if(X64)
134
+if(X64 OR ARM64 OR PPC64)
135
# NOTE: We only officially support high-bit-depth compiles of x265
136
# on 64bit architectures. Main10 plus large resolution plus slow
137
# preset plus 32bit address space usually means malloc failure. You
138
139
# license" so to speak. If it breaks you get to keep both halves.
140
# You will need to disable assembly manually.
141
option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
142
-endif(X64)
143
+endif(X64 OR ARM64 OR PPC64)
144
if(HIGH_BIT_DEPTH)
145
option(MAIN12 "Support Main12 instead of Main10" OFF)
146
if(MAIN12)
147
148
endif()
149
add_definitions(-DX265_NS=${X265_NS})
150
151
+if(ARM64)
152
+ if(HIGH_BIT_DEPTH)
153
+ if(MAIN12)
154
+ list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
155
+ else()
156
+ list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
157
+ endif()
158
+ else()
159
+ list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
160
+ endif()
161
+endif(ARM64)
162
+
163
option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
164
if(WARNINGS_AS_ERRORS)
165
if(GCC)
166
167
# compile ARM arch asm files here
168
enable_language(ASM)
169
foreach(ASM ${ARM_ASMS})
170
- if(ARM64)
171
- set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
172
- else()
173
- set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
174
- endif()
175
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
176
list(APPEND ASM_SRCS ${ASM_SRC})
177
list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
178
add_custom_command(
179
180
ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
181
DEPENDS ${ASM_SRC})
182
endforeach()
183
+ elseif(ARM64 OR CROSS_COMPILE_ARM64)
184
+ # compile ARM64 arch asm files here
185
+ enable_language(ASM)
186
+ foreach(ASM ${ARM_ASMS})
187
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
188
+ list(APPEND ASM_SRCS ${ASM_SRC})
189
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
190
+ add_custom_command(
191
+ OUTPUT ${ASM}.${SUFFIX}
192
+ COMMAND ${CMAKE_CXX_COMPILER}
193
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
194
+ DEPENDS ${ASM_SRC})
195
+ endforeach()
196
+ if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
197
+ foreach(ASM ${ARM_ASMS_SVE})
198
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
199
+ list(APPEND ASM_SRCS ${ASM_SRC})
200
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
201
+ add_custom_command(
202
+ OUTPUT ${ASM}.${SUFFIX}
203
+ COMMAND ${CMAKE_CXX_COMPILER}
204
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
205
+ DEPENDS ${ASM_SRC})
206
+ endforeach()
207
+ foreach(ASM ${ARM_ASMS_SVE2})
208
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
209
+ list(APPEND ASM_SRCS ${ASM_SRC})
210
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
211
+ add_custom_command(
212
+ OUTPUT ${ASM}.${SUFFIX}
213
+ COMMAND ${CMAKE_CXX_COMPILER}
214
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
215
+ DEPENDS ${ASM_SRC})
216
+ endforeach()
217
+ elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
218
+ foreach(ASM ${ARM_ASMS_SVE})
219
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
220
+ list(APPEND ASM_SRCS ${ASM_SRC})
221
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
222
+ add_custom_command(
223
+ OUTPUT ${ASM}.${SUFFIX}
224
+ COMMAND ${CMAKE_CXX_COMPILER}
225
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
226
+ DEPENDS ${ASM_SRC})
227
+ endforeach()
228
+ endif()
229
elseif(X86)
230
# compile X86 arch asm files here
231
foreach(ASM ${MSVC_ASMS})
232
x265_3.5.tar.gz/source/abrEncApp.cpp -> x265_3.6.tar.gz/source/abrEncApp.cpp
Changed
2220
1
2
-/*****************************************************************************
3
-* Copyright (C) 2013-2020 MulticoreWare, Inc
4
-*
5
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
6
-* Aruna Matheswaran <aruna@multicorewareinc.com>
7
-*
8
-* This program is free software; you can redistribute it and/or modify
9
-* it under the terms of the GNU General Public License as published by
10
-* the Free Software Foundation; either version 2 of the License, or
11
-* (at your option) any later version.
12
-*
13
-* This program is distributed in the hope that it will be useful,
14
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
-* GNU General Public License for more details.
17
-*
18
-* You should have received a copy of the GNU General Public License
19
-* along with this program; if not, write to the Free Software
20
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
-*
22
-* This program is also available under a commercial proprietary license.
23
-* For more information, contact us at license @ x265.com.
24
-*****************************************************************************/
25
-
26
-#include "abrEncApp.h"
27
-#include "mv.h"
28
-#include "slice.h"
29
-#include "param.h"
30
-
31
-#include <signal.h>
32
-#include <errno.h>
33
-
34
-#include <queue>
35
-
36
-using namespace X265_NS;
37
-
38
-/* Ctrl-C handler */
39
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
40
-static void sigint_handler(int)
41
-{
42
- b_ctrl_c = 1;
43
-}
44
-
45
-namespace X265_NS {
46
- // private namespace
47
-#define X265_INPUT_QUEUE_SIZE 250
48
-
49
- AbrEncoder::AbrEncoder(CLIOptions cliopt, uint8_t numEncodes, int &ret)
50
- {
51
- m_numEncodes = numEncodes;
52
- m_numActiveEncodes.set(numEncodes);
53
- m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
54
- m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
55
-
56
- for (uint8_t i = 0; i < m_numEncodes; i++)
57
- {
58
- m_passEnci = new PassEncoder(i, cliopti, this);
59
- if (!m_passEnci)
60
- {
61
- x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
62
- ret = 4;
63
- }
64
- m_passEnci->init(ret);
65
- }
66
-
67
- if (!allocBuffers())
68
- {
69
- x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
70
- ret = 4;
71
- }
72
-
73
- /* start passEncoder worker threads */
74
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
75
- m_passEncpass->startThreads();
76
- }
77
-
78
- bool AbrEncoder::allocBuffers()
79
- {
80
- m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
81
- m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
82
-
83
- m_picWriteCnt = new ThreadSafeIntegerm_numEncodes;
84
- m_picReadCnt = new ThreadSafeIntegerm_numEncodes;
85
- m_analysisWriteCnt = new ThreadSafeIntegerm_numEncodes;
86
- m_analysisReadCnt = new ThreadSafeIntegerm_numEncodes;
87
-
88
- m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
89
- m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
90
- m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
91
- m_readFlag = X265_MALLOC(int*, m_numEncodes);
92
-
93
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
94
- {
95
- m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize);
96
- for (uint32_t idx = 0; idx < m_queueSize; idx++)
97
- {
98
- m_inputPicBufferpassidx = x265_picture_alloc();
99
- x265_picture_init(m_passEncpass->m_param, m_inputPicBufferpassidx);
100
- }
101
-
102
- CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize);
103
- m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize;
104
- m_analysisWritepass = new ThreadSafeIntegerm_queueSize;
105
- m_analysisReadpass = new ThreadSafeIntegerm_queueSize;
106
- m_readFlagpass = X265_MALLOC(int, m_queueSize);
107
- }
108
- return true;
109
- fail:
110
- return false;
111
- }
112
-
113
- void AbrEncoder::destroy()
114
- {
115
- x265_cleanup(); /* Free library singletons */
116
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
117
- {
118
- for (uint32_t index = 0; index < m_queueSize; index++)
119
- {
120
- X265_FREE(m_inputPicBufferpassindex->planes0);
121
- x265_picture_free(m_inputPicBufferpassindex);
122
- }
123
-
124
- X265_FREE(m_inputPicBufferpass);
125
- X265_FREE(m_analysisBufferpass);
126
- X265_FREE(m_readFlagpass);
127
- delete m_picIdxReadCntpass;
128
- delete m_analysisWritepass;
129
- delete m_analysisReadpass;
130
- m_passEncpass->destroy();
131
- delete m_passEncpass;
132
- }
133
- X265_FREE(m_inputPicBuffer);
134
- X265_FREE(m_analysisBuffer);
135
- X265_FREE(m_readFlag);
136
-
137
- delete m_picWriteCnt;
138
- delete m_picReadCnt;
139
- delete m_analysisWriteCnt;
140
- delete m_analysisReadCnt;
141
-
142
- X265_FREE(m_picIdxReadCnt);
143
- X265_FREE(m_analysisWrite);
144
- X265_FREE(m_analysisRead);
145
-
146
- X265_FREE(m_passEnc);
147
- }
148
-
149
- PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
150
- {
151
- m_id = id;
152
- m_cliopt = cliopt;
153
- m_parent = parent;
154
- if(!(m_cliopt.enableScaler && m_id))
155
- m_input = m_cliopt.input;
156
- m_param = cliopt.param;
157
- m_inputOver = false;
158
- m_lastIdx = -1;
159
- m_encoder = NULL;
160
- m_scaler = NULL;
161
- m_reader = NULL;
162
- m_ret = 0;
163
- }
164
-
165
- int PassEncoder::init(int &result)
166
- {
167
- if (m_parent->m_numEncodes > 1)
168
- setReuseLevel();
169
-
170
- if (!(m_cliopt.enableScaler && m_id))
171
- m_reader = new Reader(m_id, this);
172
- else
173
- {
174
- VideoDesc *src = NULL, *dst = NULL;
175
- dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
176
- int dstW = m_parent->m_passEncm_id - 1->m_param->sourceWidth;
177
- int dstH = m_parent->m_passEncm_id - 1->m_param->sourceHeight;
178
- src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
179
- if (src != NULL && dst != NULL)
180
- {
181
- m_scaler = new Scaler(0, 1, m_id, src, dst, this);
182
- if (!m_scaler)
183
- {
184
- x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
185
- result = 4;
186
- }
187
- }
188
- }
189
-
190
- /* note: we could try to acquire a different libx265 API here based on
191
- * the profile found during option parsing, but it must be done before
192
- * opening an encoder */
193
-
194
- if (m_param)
195
- m_encoder = m_cliopt.api->encoder_open(m_param);
196
- if (!m_encoder)
197
- {
198
- x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
199
- m_ret = 2;
200
- return -1;
201
- }
202
-
203
- /* get the encoder parameters post-initialization */
204
- m_cliopt.api->encoder_parameters(m_encoder, m_param);
205
-
206
- return 1;
207
- }
208
-
209
- void PassEncoder::setReuseLevel()
210
- {
211
- uint32_t r, padh = 0, padw = 0;
212
-
213
- m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
214
-
215
- m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
216
- m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
217
- m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
218
- m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
219
- m_param->bUseAnalysisFile = 0;
220
-
221
- if (m_cliopt.loadLevel)
222
- {
223
- x265_param *refParam = m_parent->m_passEncm_cliopt.refId->m_param;
224
-
225
- if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
226
- m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
227
- {
228
- m_parent->m_passEncm_id->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
229
- m_parent->m_passEncm_id->m_param->confWinRightOffset = refParam->confWinRightOffset;
230
- }
231
- else
232
- {
233
- int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
234
- int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
235
-
236
- double scaleFactorH = double(m_param->sourceHeight / srcH);
237
- double scaleFactorW = double(m_param->sourceWidth / srcW);
238
-
239
- int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
240
- int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
241
-
242
- if (absScaleFactorH == 20 && absScaleFactorW == 20)
243
- {
244
- m_param->scaleFactor = 2;
245
-
246
- m_parent->m_passEncm_id->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
247
- m_parent->m_passEncm_id->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
248
-
249
- }
250
- }
251
- }
252
-
253
- int h = m_param->sourceHeight + m_param->confWinBottomOffset;
254
- int w = m_param->sourceWidth + m_param->confWinRightOffset;
255
- if (h & (m_param->minCUSize - 1))
256
- {
257
- r = h & (m_param->minCUSize - 1);
258
- padh = m_param->minCUSize - r;
259
- m_param->confWinBottomOffset += padh;
260
-
261
- }
262
-
263
- if (w & (m_param->minCUSize - 1))
264
- {
265
- r = w & (m_param->minCUSize - 1);
266
- padw = m_param->minCUSize - r;
267
- m_param->confWinRightOffset += padw;
268
- }
269
- }
270
-
271
- void PassEncoder::startThreads()
272
- {
273
- /* Start slave worker threads */
274
- m_threadActive = true;
275
- start();
276
- /* Start reader threads*/
277
- if (m_reader != NULL)
278
- {
279
- m_reader->m_threadActive = true;
280
- m_reader->start();
281
- }
282
- /* Start scaling worker threads */
283
- if (m_scaler != NULL)
284
- {
285
- m_scaler->m_threadActive = true;
286
- m_scaler->start();
287
- }
288
- }
289
-
290
- void PassEncoder::copyInfo(x265_analysis_data * src)
291
- {
292
-
293
- uint32_t written = m_parent->m_analysisWriteCntm_id.get();
294
-
295
- int index = written % m_parent->m_queueSize;
296
- //If all streams have read analysis data, reuse that position in Queue
297
-
298
- int read = m_parent->m_analysisReadm_idindex.get();
299
- int write = m_parent->m_analysisWritem_idindex.get();
300
-
301
- int overwrite = written / m_parent->m_queueSize;
302
- bool emptyIdxFound = 0;
303
- while (!emptyIdxFound && overwrite)
304
- {
305
- for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
306
- {
307
- read = m_parent->m_analysisReadm_idi.get();
308
- write = m_parent->m_analysisWritem_idi.get();
309
- write *= m_cliopt.numRefs;
310
-
311
- if (read == write)
312
- {
313
- index = i;
314
- emptyIdxFound = 1;
315
- }
316
- }
317
- }
318
-
319
- x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBufferm_idindex;
320
-
321
- x265_free_analysis_data(m_param, m_analysisInfo);
322
- memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
323
- x265_alloc_analysis_data(m_param, m_analysisInfo);
324
-
325
- bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
326
- if (m_param->bDisableLookahead && isVbv)
327
- {
328
- memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
329
- memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
330
- memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
331
- memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
332
- }
333
-
334
- if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I)
335
- {
336
- if (m_param->analysisSaveReuseLevel < 2)
337
- goto ret;
338
- x265_analysis_intra_data *intraDst, *intraSrc;
339
- intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
340
- intraSrc = (x265_analysis_intra_data*)src->intraData;
341
- memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
342
- memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
343
- memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
344
- memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
345
- if (m_param->rc.cuTree)
346
- memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
347
- }
348
- else
349
- {
350
- bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
351
- int numDir = src->sliceType == X265_TYPE_P ? 1 : 2;
352
- memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
353
- if (m_param->analysisSaveReuseLevel < 2)
354
- goto ret;
355
- x265_analysis_inter_data *interDst, *interSrc;
356
- interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
357
- interSrc = (x265_analysis_inter_data*)src->interData;
358
- memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
359
- memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
360
- if (m_param->rc.cuTree)
361
- memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
362
- if (m_param->analysisSaveReuseLevel > 4)
363
- {
364
- memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
365
- memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
366
- if (m_param->analysisSaveReuseLevel == 10)
367
- {
368
- memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
369
- for (int dir = 0; dir < numDir; dir++)
370
- {
371
- memcpy(interDst->mvpIdxdir, interSrc->mvpIdxdir, sizeof(uint8_t) * src->depthBytes);
372
- memcpy(interDst->refIdxdir, interSrc->refIdxdir, sizeof(int8_t) * src->depthBytes);
373
- memcpy(interDst->mvdir, interSrc->mvdir, sizeof(MV) * src->depthBytes);
374
- }
375
- if (bIntraInInter)
376
- {
377
- x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
378
- x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
379
- memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
380
- memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
381
- }
382
- }
383
- }
384
- if (m_param->analysisSaveReuseLevel != 10)
385
- memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
386
- }
387
-
388
-ret:
389
- //increment analysis Write counter
390
- m_parent->m_analysisWriteCntm_id.incr();
391
- m_parent->m_analysisWritem_idindex.incr();
392
- return;
393
- }
394
-
395
-
396
- bool PassEncoder::readPicture(x265_picture *dstPic)
397
- {
398
- /*Check and wait if there any input frames to read*/
399
- int ipread = m_parent->m_picReadCntm_id.get();
400
- int ipwrite = m_parent->m_picWriteCntm_id.get();
401
-
402
- bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1);
403
- while (!m_inputOver && (ipread == ipwrite))
404
- {
405
- ipwrite = m_parent->m_picWriteCntm_id.waitForChange(ipwrite);
406
- }
407
-
408
- if (m_threadActive && ipread < ipwrite)
409
- {
410
- /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
411
- int readPos = ipread % m_parent->m_queueSize;
412
- x265_analysis_data* analysisData = 0;
413
-
414
- if (isAbrLoad)
415
- {
416
- /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
417
- int analysisQId = m_cliopt.refId;
418
- /*Check and wait if there any analysis Data to read*/
419
- int analysisWrite = m_parent->m_analysisWriteCntanalysisQId.get();
420
- int written = analysisWrite * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
421
- int analysisRead = m_parent->m_analysisReadCntanalysisQId.get();
422
-
423
- while (m_threadActive && written == analysisRead)
424
- {
425
- analysisWrite = m_parent->m_analysisWriteCntanalysisQId.waitForChange(analysisWrite);
426
- written = analysisWrite * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
427
- }
428
-
429
- if (analysisRead < written)
430
- {
431
- int analysisIdx = 0;
432
- if (!m_param->bDisableLookahead)
433
- {
434
- bool analysisdRead = false;
435
- while ((analysisRead < written) && !analysisdRead)
436
- {
437
- while (analysisWrite < ipread)
438
- {
439
- analysisWrite = m_parent->m_analysisWriteCntanalysisQId.waitForChange(analysisWrite);
440
- written = analysisWrite * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
441
- }
442
- for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
443
- {
444
- analysisData = &m_parent->m_analysisBufferanalysisQIdi;
445
- int read = m_parent->m_analysisReadanalysisQIdi.get();
446
- int write = m_parent->m_analysisWriteanalysisQIdi.get() * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
447
- if ((analysisData->poc == (uint32_t)(ipread)) && (read < write))
448
- {
449
- analysisIdx = i;
450
- analysisdRead = true;
451
- break;
452
- }
453
- }
454
- }
455
- }
456
- else
457
- {
458
- analysisIdx = analysisRead % m_parent->m_queueSize;
459
- analysisData = &m_parent->m_analysisBufferanalysisQIdanalysisIdx;
460
- readPos = analysisData->poc % m_parent->m_queueSize;
461
- while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc))
462
- {
463
- ipwrite = m_parent->m_picWriteCntm_id.waitForChange(ipwrite);
464
- }
465
- }
466
-
467
- m_lastIdx = analysisIdx;
468
- }
469
- else
470
- return false;
471
- }
472
-
473
-
474
- x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBufferm_idreadPos);
475
-
476
- x265_picture *pic = (x265_picture*)(dstPic);
477
- pic->colorSpace = srcPic->colorSpace;
478
- pic->bitDepth = srcPic->bitDepth;
479
- pic->framesize = srcPic->framesize;
480
- pic->height = srcPic->height;
481
- pic->pts = srcPic->pts;
482
- pic->dts = srcPic->dts;
483
- pic->reorderedPts = srcPic->reorderedPts;
484
- pic->width = srcPic->width;
485
- pic->analysisData = srcPic->analysisData;
486
- pic->userSEI = srcPic->userSEI;
487
- pic->stride0 = srcPic->stride0;
488
- pic->stride1 = srcPic->stride1;
489
- pic->stride2 = srcPic->stride2;
490
- pic->planes0 = srcPic->planes0;
491
- pic->planes1 = srcPic->planes1;
492
- pic->planes2 = srcPic->planes2;
493
- if (isAbrLoad)
494
- pic->analysisData = *analysisData;
495
- return true;
496
- }
497
- else
498
- return false;
499
- }
500
-
501
- void PassEncoder::threadMain()
502
- {
503
+/*****************************************************************************
504
+* Copyright (C) 2013-2020 MulticoreWare, Inc
505
+*
506
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
507
+* Aruna Matheswaran <aruna@multicorewareinc.com>
508
+*
509
+* This program is free software; you can redistribute it and/or modify
510
+* it under the terms of the GNU General Public License as published by
511
+* the Free Software Foundation; either version 2 of the License, or
512
+* (at your option) any later version.
513
+*
514
+* This program is distributed in the hope that it will be useful,
515
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
516
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
517
+* GNU General Public License for more details.
518
+*
519
+* You should have received a copy of the GNU General Public License
520
+* along with this program; if not, write to the Free Software
521
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
522
+*
523
+* This program is also available under a commercial proprietary license.
524
+* For more information, contact us at license @ x265.com.
525
+*****************************************************************************/
526
+
527
+#include "abrEncApp.h"
528
+#include "mv.h"
529
+#include "slice.h"
530
+#include "param.h"
531
+
532
+#include <signal.h>
533
+#include <errno.h>
534
+
535
+#include <queue>
536
+
537
+using namespace X265_NS;
538
+
539
+/* Ctrl-C handler */
540
+static volatile sig_atomic_t b_ctrl_c /* = 0 */;
541
+static void sigint_handler(int)
542
+{
543
+ b_ctrl_c = 1;
544
+}
545
+
546
+namespace X265_NS {
547
+ // private namespace
548
+#define X265_INPUT_QUEUE_SIZE 250
549
+
550
+ AbrEncoder::AbrEncoder(CLIOptions cliopt, uint8_t numEncodes, int &ret)
551
+ {
552
+ m_numEncodes = numEncodes;
553
+ m_numActiveEncodes.set(numEncodes);
554
+ m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
555
+ m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
556
+
557
+ for (uint8_t i = 0; i < m_numEncodes; i++)
558
+ {
559
+ m_passEnci = new PassEncoder(i, cliopti, this);
560
+ if (!m_passEnci)
561
+ {
562
+ x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
563
+ ret = 4;
564
+ }
565
+ m_passEnci->init(ret);
566
+ }
567
+
568
+ if (!allocBuffers())
569
+ {
570
+ x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
571
+ ret = 4;
572
+ }
573
+
574
+ /* start passEncoder worker threads */
575
+ for (uint8_t pass = 0; pass < m_numEncodes; pass++)
576
+ m_passEncpass->startThreads();
577
+ }
578
+
579
+ bool AbrEncoder::allocBuffers()
580
+ {
581
+ m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
582
+ m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
583
+
584
+ m_picWriteCnt = new ThreadSafeIntegerm_numEncodes;
585
+ m_picReadCnt = new ThreadSafeIntegerm_numEncodes;
586
+ m_analysisWriteCnt = new ThreadSafeIntegerm_numEncodes;
587
+ m_analysisReadCnt = new ThreadSafeIntegerm_numEncodes;
588
+
589
+ m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
590
+ m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
591
+ m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
592
+ m_readFlag = X265_MALLOC(int*, m_numEncodes);
593
+
594
+ for (uint8_t pass = 0; pass < m_numEncodes; pass++)
595
+ {
596
+ m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize);
597
+ for (uint32_t idx = 0; idx < m_queueSize; idx++)
598
+ {
599
+ m_inputPicBufferpassidx = x265_picture_alloc();
600
+ x265_picture_init(m_passEncpass->m_param, m_inputPicBufferpassidx);
601
+ }
602
+
603
+ CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize);
604
+ m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize;
605
+ m_analysisWritepass = new ThreadSafeIntegerm_queueSize;
606
+ m_analysisReadpass = new ThreadSafeIntegerm_queueSize;
607
+ m_readFlagpass = X265_MALLOC(int, m_queueSize);
608
+ }
609
+ return true;
610
+ fail:
611
+ return false;
612
+ }
613
+
614
+ void AbrEncoder::destroy()
615
+ {
616
+ x265_cleanup(); /* Free library singletons */
617
+ for (uint8_t pass = 0; pass < m_numEncodes; pass++)
618
+ {
619
+ for (uint32_t index = 0; index < m_queueSize; index++)
620
+ {
621
+ X265_FREE(m_inputPicBufferpassindex->planes0);
622
+ x265_picture_free(m_inputPicBufferpassindex);
623
+ }
624
+
625
+ X265_FREE(m_inputPicBufferpass);
626
+ X265_FREE(m_analysisBufferpass);
627
+ X265_FREE(m_readFlagpass);
628
+ delete m_picIdxReadCntpass;
629
+ delete m_analysisWritepass;
630
+ delete m_analysisReadpass;
631
+ m_passEncpass->destroy();
632
+ delete m_passEncpass;
633
+ }
634
+ X265_FREE(m_inputPicBuffer);
635
+ X265_FREE(m_analysisBuffer);
636
+ X265_FREE(m_readFlag);
637
+
638
+ delete m_picWriteCnt;
639
+ delete m_picReadCnt;
640
+ delete m_analysisWriteCnt;
641
+ delete m_analysisReadCnt;
642
+
643
+ X265_FREE(m_picIdxReadCnt);
644
+ X265_FREE(m_analysisWrite);
645
+ X265_FREE(m_analysisRead);
646
+
647
+ X265_FREE(m_passEnc);
648
+ }
649
+
650
+ PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
651
+ {
652
+ m_id = id;
653
+ m_cliopt = cliopt;
654
+ m_parent = parent;
655
+ if(!(m_cliopt.enableScaler && m_id))
656
+ m_input = m_cliopt.input;
657
+ m_param = cliopt.param;
658
+ m_inputOver = false;
659
+ m_lastIdx = -1;
660
+ m_encoder = NULL;
661
+ m_scaler = NULL;
662
+ m_reader = NULL;
663
+ m_ret = 0;
664
+ }
665
+
666
+ int PassEncoder::init(int &result)
667
+ {
668
+ if (m_parent->m_numEncodes > 1)
669
+ setReuseLevel();
670
+
671
+ if (!(m_cliopt.enableScaler && m_id))
672
+ m_reader = new Reader(m_id, this);
673
+ else
674
+ {
675
+ VideoDesc *src = NULL, *dst = NULL;
676
+ dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
677
+ int dstW = m_parent->m_passEncm_id - 1->m_param->sourceWidth;
678
+ int dstH = m_parent->m_passEncm_id - 1->m_param->sourceHeight;
679
+ src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
680
+ if (src != NULL && dst != NULL)
681
+ {
682
+ m_scaler = new Scaler(0, 1, m_id, src, dst, this);
683
+ if (!m_scaler)
684
+ {
685
+ x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
686
+ result = 4;
687
+ }
688
+ }
689
+ }
690
+
691
+ if (m_cliopt.zoneFile)
692
+ {
693
+ if (!m_cliopt.parseZoneFile())
694
+ {
695
+ x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n");
696
+ fclose(m_cliopt.zoneFile);
697
+ m_cliopt.zoneFile = NULL;
698
+ }
699
+ }
700
+
701
+ /* note: we could try to acquire a different libx265 API here based on
702
+ * the profile found during option parsing, but it must be done before
703
+ * opening an encoder */
704
+
705
+ if (m_param)
706
+ m_encoder = m_cliopt.api->encoder_open(m_param);
707
+ if (!m_encoder)
708
+ {
709
+ x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
710
+ m_ret = 2;
711
+ return -1;
712
+ }
713
+
714
+ /* get the encoder parameters post-initialization */
715
+ m_cliopt.api->encoder_parameters(m_encoder, m_param);
716
+
717
+ return 1;
718
+ }
719
+
720
+ void PassEncoder::setReuseLevel()
721
+ {
722
+ uint32_t r, padh = 0, padw = 0;
723
+
724
+ m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
725
+
726
+ m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
727
+ m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
728
+ m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
729
+ m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
730
+ m_param->bUseAnalysisFile = 0;
731
+
732
+ if (m_cliopt.loadLevel)
733
+ {
734
+ x265_param *refParam = m_parent->m_passEncm_cliopt.refId->m_param;
735
+
736
+ if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
737
+ m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
738
+ {
739
+ m_parent->m_passEncm_id->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
740
+ m_parent->m_passEncm_id->m_param->confWinRightOffset = refParam->confWinRightOffset;
741
+ }
742
+ else
743
+ {
744
+ int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
745
+ int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
746
+
747
+ double scaleFactorH = double(m_param->sourceHeight / srcH);
748
+ double scaleFactorW = double(m_param->sourceWidth / srcW);
749
+
750
+ int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
751
+ int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
752
+
753
+ if (absScaleFactorH == 20 && absScaleFactorW == 20)
754
+ {
755
+ m_param->scaleFactor = 2;
756
+
757
+ m_parent->m_passEncm_id->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
758
+ m_parent->m_passEncm_id->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
759
+
760
+ }
761
+ }
762
+ }
763
+
764
+ int h = m_param->sourceHeight + m_param->confWinBottomOffset;
765
+ int w = m_param->sourceWidth + m_param->confWinRightOffset;
766
+ if (h & (m_param->minCUSize - 1))
767
+ {
768
+ r = h & (m_param->minCUSize - 1);
769
+ padh = m_param->minCUSize - r;
770
+ m_param->confWinBottomOffset += padh;
771
+
772
+ }
773
+
774
+ if (w & (m_param->minCUSize - 1))
775
+ {
776
+ r = w & (m_param->minCUSize - 1);
777
+ padw = m_param->minCUSize - r;
778
+ m_param->confWinRightOffset += padw;
779
+ }
780
+ }
781
+
782
+ void PassEncoder::startThreads()
783
+ {
784
+ /* Start slave worker threads */
785
+ m_threadActive = true;
786
+ start();
787
+ /* Start reader threads*/
788
+ if (m_reader != NULL)
789
+ {
790
+ m_reader->m_threadActive = true;
791
+ m_reader->start();
792
+ }
793
+ /* Start scaling worker threads */
794
+ if (m_scaler != NULL)
795
+ {
796
+ m_scaler->m_threadActive = true;
797
+ m_scaler->start();
798
+ }
799
+ }
800
+
801
+ void PassEncoder::copyInfo(x265_analysis_data * src)
802
+ {
803
+
804
+ uint32_t written = m_parent->m_analysisWriteCntm_id.get();
805
+
806
+ int index = written % m_parent->m_queueSize;
807
+ //If all streams have read analysis data, reuse that position in Queue
808
+
809
+ int read = m_parent->m_analysisReadm_idindex.get();
810
+ int write = m_parent->m_analysisWritem_idindex.get();
811
+
812
+ int overwrite = written / m_parent->m_queueSize;
813
+ bool emptyIdxFound = 0;
814
+ while (!emptyIdxFound && overwrite)
815
+ {
816
+ for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
817
+ {
818
+ read = m_parent->m_analysisReadm_idi.get();
819
+ write = m_parent->m_analysisWritem_idi.get();
820
+ write *= m_cliopt.numRefs;
821
+
822
+ if (read == write)
823
+ {
824
+ index = i;
825
+ emptyIdxFound = 1;
826
+ }
827
+ }
828
+ }
829
+
830
+ x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBufferm_idindex;
831
+
832
+ x265_free_analysis_data(m_param, m_analysisInfo);
833
+ memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
834
+ x265_alloc_analysis_data(m_param, m_analysisInfo);
835
+
836
+ bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
837
+ if (m_param->bDisableLookahead && isVbv)
838
+ {
839
+ memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
840
+ memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
841
+ memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
842
+ memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
843
+ }
844
+
845
+ if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I)
846
+ {
847
+ if (m_param->analysisSaveReuseLevel < 2)
848
+ goto ret;
849
+ x265_analysis_intra_data *intraDst, *intraSrc;
850
+ intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
851
+ intraSrc = (x265_analysis_intra_data*)src->intraData;
852
+ memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
853
+ memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
854
+ memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
855
+ memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
856
+ if (m_param->rc.cuTree)
857
+ memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
858
+ }
859
+ else
860
+ {
861
+ bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
862
+ int numDir = src->sliceType == X265_TYPE_P ? 1 : 2;
863
+ memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
864
+ if (m_param->analysisSaveReuseLevel < 2)
865
+ goto ret;
866
+ x265_analysis_inter_data *interDst, *interSrc;
867
+ interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
868
+ interSrc = (x265_analysis_inter_data*)src->interData;
869
+ memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
870
+ memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
871
+ if (m_param->rc.cuTree)
872
+ memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
873
+ if (m_param->analysisSaveReuseLevel > 4)
874
+ {
875
+ memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
876
+ memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
877
+ if (m_param->analysisSaveReuseLevel == 10)
878
+ {
879
+ memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
880
+ for (int dir = 0; dir < numDir; dir++)
881
+ {
882
+ memcpy(interDst->mvpIdxdir, interSrc->mvpIdxdir, sizeof(uint8_t) * src->depthBytes);
883
+ memcpy(interDst->refIdxdir, interSrc->refIdxdir, sizeof(int8_t) * src->depthBytes);
884
+ memcpy(interDst->mvdir, interSrc->mvdir, sizeof(MV) * src->depthBytes);
885
+ }
886
+ if (bIntraInInter)
887
+ {
888
+ x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
889
+ x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
890
+ memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
891
+ memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
892
+ }
893
+ }
894
+ }
895
+ if (m_param->analysisSaveReuseLevel != 10)
896
+ memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
897
+ }
898
+
899
+ret:
900
+ //increment analysis Write counter
901
+ m_parent->m_analysisWriteCntm_id.incr();
902
+ m_parent->m_analysisWritem_idindex.incr();
903
+ return;
904
+ }
905
+
906
+
907
+ bool PassEncoder::readPicture(x265_picture *dstPic)
908
+ {
909
+ /*Check and wait if there any input frames to read*/
910
+ int ipread = m_parent->m_picReadCntm_id.get();
911
+ int ipwrite = m_parent->m_picWriteCntm_id.get();
912
+
913
+ bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1);
914
+ while (!m_inputOver && (ipread == ipwrite))
915
+ {
916
+ ipwrite = m_parent->m_picWriteCntm_id.waitForChange(ipwrite);
917
+ }
918
+
919
+ if (m_threadActive && ipread < ipwrite)
920
+ {
921
+ /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
922
+ int readPos = ipread % m_parent->m_queueSize;
923
+ x265_analysis_data* analysisData = 0;
924
+
925
+ if (isAbrLoad)
926
+ {
927
+ /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
928
+ int analysisQId = m_cliopt.refId;
929
+ /*Check and wait if there any analysis Data to read*/
930
+ int analysisWrite = m_parent->m_analysisWriteCntanalysisQId.get();
931
+ int written = analysisWrite * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
932
+ int analysisRead = m_parent->m_analysisReadCntanalysisQId.get();
933
+
934
+ while (m_threadActive && written == analysisRead)
935
+ {
936
+ analysisWrite = m_parent->m_analysisWriteCntanalysisQId.waitForChange(analysisWrite);
937
+ written = analysisWrite * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
938
+ }
939
+
940
+ if (analysisRead < written)
941
+ {
942
+ int analysisIdx = 0;
943
+ if (!m_param->bDisableLookahead)
944
+ {
945
+ bool analysisdRead = false;
946
+ while ((analysisRead < written) && !analysisdRead)
947
+ {
948
+ while (analysisWrite < ipread)
949
+ {
950
+ analysisWrite = m_parent->m_analysisWriteCntanalysisQId.waitForChange(analysisWrite);
951
+ written = analysisWrite * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
952
+ }
953
+ for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
954
+ {
955
+ analysisData = &m_parent->m_analysisBufferanalysisQIdi;
956
+ int read = m_parent->m_analysisReadanalysisQIdi.get();
957
+ int write = m_parent->m_analysisWriteanalysisQIdi.get() * m_parent->m_passEncanalysisQId->m_cliopt.numRefs;
958
+ if ((analysisData->poc == (uint32_t)(ipread)) && (read < write))
959
+ {
960
+ analysisIdx = i;
961
+ analysisdRead = true;
962
+ break;
963
+ }
964
+ }
965
+ }
966
+ }
967
+ else
968
+ {
969
+ analysisIdx = analysisRead % m_parent->m_queueSize;
970
+ analysisData = &m_parent->m_analysisBufferanalysisQIdanalysisIdx;
971
+ readPos = analysisData->poc % m_parent->m_queueSize;
972
+ while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc))
973
+ {
974
+ ipwrite = m_parent->m_picWriteCntm_id.waitForChange(ipwrite);
975
+ }
976
+ }
977
+
978
+ m_lastIdx = analysisIdx;
979
+ }
980
+ else
981
+ return false;
982
+ }
983
+
984
+
985
+ x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBufferm_idreadPos);
986
+
987
+ x265_picture *pic = (x265_picture*)(dstPic);
988
+ pic->colorSpace = srcPic->colorSpace;
989
+ pic->bitDepth = srcPic->bitDepth;
990
+ pic->framesize = srcPic->framesize;
991
+ pic->height = srcPic->height;
992
+ pic->pts = srcPic->pts;
993
+ pic->dts = srcPic->dts;
994
+ pic->reorderedPts = srcPic->reorderedPts;
995
+ pic->width = srcPic->width;
996
+ pic->analysisData = srcPic->analysisData;
997
+ pic->userSEI = srcPic->userSEI;
998
+ pic->stride0 = srcPic->stride0;
999
+ pic->stride1 = srcPic->stride1;
1000
+ pic->stride2 = srcPic->stride2;
1001
+ pic->planes0 = srcPic->planes0;
1002
+ pic->planes1 = srcPic->planes1;
1003
+ pic->planes2 = srcPic->planes2;
1004
+ if (isAbrLoad)
1005
+ pic->analysisData = *analysisData;
1006
+ return true;
1007
+ }
1008
+ else
1009
+ return false;
1010
+ }
1011
+
1012
+ void PassEncoder::threadMain()
1013
+ {
1014
THREAD_NAME("PassEncoder", m_id);
1015
1016
while (m_threadActive)
1017
{
1018
-
1019
-#if ENABLE_LIBVMAF
1020
- x265_vmaf_data* vmafdata = m_cliopt.vmafData;
1021
-#endif
1022
- /* This allows muxers to modify bitstream format */
1023
- m_cliopt.output->setParam(m_param);
1024
- const x265_api* api = m_cliopt.api;
1025
- ReconPlay* reconPlay = NULL;
1026
- if (m_cliopt.reconPlayCmd)
1027
- reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
1028
- char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
1029
-
1030
- if (m_cliopt.zoneFile)
1031
- {
1032
- if (!m_cliopt.parseZoneFile())
1033
- {
1034
- x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n", profileName);
1035
- fclose(m_cliopt.zoneFile);
1036
- m_cliopt.zoneFile = NULL;
1037
- }
1038
- }
1039
-
1040
- if (signal(SIGINT, sigint_handler) == SIG_ERR)
1041
- x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
1042
- strerror(errno), profileName);
1043
-
1044
- x265_picture pic_orig, pic_out;
1045
- x265_picture *pic_in = &pic_orig;
1046
- /* Allocate recon picture if analysis save/load is enabled */
1047
- std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
1048
- x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
1049
- uint32_t inFrameCount = 0;
1050
- uint32_t outFrameCount = 0;
1051
- x265_nal *p_nal;
1052
- x265_stats stats;
1053
- uint32_t nal;
1054
- int16_t *errorBuf = NULL;
1055
- bool bDolbyVisionRPU = false;
1056
- uint8_t *rpuPayload = NULL;
1057
- int inputPicNum = 1;
1058
- x265_picture picField1, picField2;
1059
- x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
1060
- bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
1061
-
1062
- if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
1063
- {
1064
- if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
1065
- {
1066
- x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
1067
- m_ret = 3;
1068
- goto fail;
1069
- }
1070
- else
1071
- m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
1072
- }
1073
-
1074
- if (m_param->bField && m_param->interlaceMode)
1075
- {
1076
- api->picture_init(m_param, &picField1);
1077
- api->picture_init(m_param, &picField2);
1078
- // return back the original height of input
1079
- m_param->sourceHeight *= 2;
1080
- api->picture_init(m_param, &pic_orig);
1081
- }
1082
- else
1083
- api->picture_init(m_param, &pic_orig);
1084
-
1085
- if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
1086
- {
1087
- rpuPayload = X265_MALLOC(uint8_t, 1024);
1088
- pic_in->rpu.payload = rpuPayload;
1089
- if (pic_in->rpu.payload)
1090
- bDolbyVisionRPU = true;
1091
- }
1092
-
1093
- if (m_cliopt.bDither)
1094
- {
1095
- errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
1096
- if (errorBuf)
1097
- memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
1098
- else
1099
- m_cliopt.bDither = false;
1100
- }
1101
-
1102
- // main encoder loop
1103
- while (pic_in && !b_ctrl_c)
1104
- {
1105
- pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
1106
- if (m_cliopt.qpfile)
1107
- {
1108
- if (!m_cliopt.parseQPFile(pic_orig))
1109
- {
1110
- x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
1111
- pic_in->poc, profileName);
1112
- fclose(m_cliopt.qpfile);
1113
- m_cliopt.qpfile = NULL;
1114
- }
1115
- }
1116
-
1117
- if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
1118
- pic_in = NULL;
1119
- else if (readPicture(pic_in))
1120
- inFrameCount++;
1121
- else
1122
- pic_in = NULL;
1123
-
1124
- if (pic_in)
1125
- {
1126
- if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
1127
- {
1128
- x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
1129
- pic_in->bitDepth = m_param->internalBitDepth;
1130
- }
1131
- /* Overwrite PTS */
1132
- pic_in->pts = pic_in->poc;
1133
-
1134
- // convert to field
1135
- if (m_param->bField && m_param->interlaceMode)
1136
- {
1137
- int height = pic_in->height >> 1;
1138
-
1139
- int static bCreated = 0;
1140
- if (bCreated == 0)
1141
- {
1142
- bCreated = 1;
1143
- inputPicNum = 2;
1144
- picField1.fieldNum = 1;
1145
- picField2.fieldNum = 2;
1146
-
1147
- picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
1148
- picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
1149
- picField1.height = picField2.height = pic_in->height >> 1;
1150
- picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
1151
-
1152
- size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
1153
- char* field1Buf = X265_MALLOC(char, fieldFrameSize);
1154
- char* field2Buf = X265_MALLOC(char, fieldFrameSize);
1155
-
1156
- int stride = picField1.stride0 = picField2.stride0 = pic_in->stride0;
1157
- uint64_t framesize = stride * (height >> x265_cli_cspspic_in->colorSpace.height0);
1158
- picField1.planes0 = field1Buf;
1159
- picField2.planes0 = field2Buf;
1160
- for (int i = 1; i < x265_cli_cspspic_in->colorSpace.planes; i++)
1161
- {
1162
- picField1.planesi = field1Buf + framesize;
1163
- picField2.planesi = field2Buf + framesize;
1164
-
1165
- stride = picField1.stridei = picField2.stridei = pic_in->stridei;
1166
- framesize += (stride * (height >> x265_cli_cspspic_in->colorSpace.heighti));
1167
- }
1168
- assert(framesize == picField1.framesize);
1169
- }
1170
-
1171
- picField1.pts = picField1.poc = pic_in->poc;
1172
- picField2.pts = picField2.poc = pic_in->poc + 1;
1173
-
1174
- picField1.userSEI = picField2.userSEI = pic_in->userSEI;
1175
-
1176
- //if (pic_in->userData)
1177
- //{
1178
- // // Have to handle userData here
1179
- //}
1180
-
1181
- if (pic_in->framesize)
1182
- {
1183
- for (int i = 0; i < x265_cli_cspspic_in->colorSpace.planes; i++)
1184
- {
1185
- char* srcP1 = (char*)pic_in->planesi;
1186
- char* srcP2 = (char*)pic_in->planesi + pic_in->stridei;
1187
- char* p1 = (char*)picField1.planesi;
1188
- char* p2 = (char*)picField2.planesi;
1189
-
1190
- int stride = picField1.stridei;
1191
-
1192
- for (int y = 0; y < (height >> x265_cli_cspspic_in->colorSpace.heighti); y++)
1193
- {
1194
- memcpy(p1, srcP1, stride);
1195
- memcpy(p2, srcP2, stride);
1196
- srcP1 += 2 * stride;
1197
- srcP2 += 2 * stride;
1198
- p1 += stride;
1199
- p2 += stride;
1200
- }
1201
- }
1202
- }
1203
- }
1204
-
1205
- if (bDolbyVisionRPU)
1206
- {
1207
- if (m_param->bField && m_param->interlaceMode)
1208
- {
1209
- if (m_cliopt.rpuParser(&picField1) > 0)
1210
- goto fail;
1211
- if (m_cliopt.rpuParser(&picField2) > 0)
1212
- goto fail;
1213
- }
1214
- else
1215
- {
1216
- if (m_cliopt.rpuParser(pic_in) > 0)
1217
- goto fail;
1218
- }
1219
- }
1220
- }
1221
-
1222
- for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
1223
- {
1224
- x265_picture *picInput = NULL;
1225
- if (inputPicNum == 2)
1226
- picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
1227
- else
1228
- picInput = pic_in;
1229
-
1230
- int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
1231
-
1232
- int idx = (inFrameCount - 1) % m_parent->m_queueSize;
1233
- m_parent->m_picIdxReadCntm_ididx.incr();
1234
- m_parent->m_picReadCntm_id.incr();
1235
- if (m_cliopt.loadLevel && picInput)
1236
- {
1237
- m_parent->m_analysisReadCntm_cliopt.refId.incr();
1238
- m_parent->m_analysisReadm_cliopt.refIdm_lastIdx.incr();
1239
- }
1240
-
1241
- if (numEncoded < 0)
1242
- {
1243
- b_ctrl_c = 1;
1244
- m_ret = 4;
1245
- break;
1246
- }
1247
-
1248
- if (reconPlay && numEncoded)
1249
- reconPlay->writePicture(*pic_recon);
1250
-
1251
- outFrameCount += numEncoded;
1252
-
1253
- if (isAbrSave && numEncoded)
1254
- {
1255
- copyInfo(analysisInfo);
1256
- }
1257
-
1258
- if (numEncoded && pic_recon && m_cliopt.recon)
1259
- m_cliopt.recon->writePicture(pic_out);
1260
- if (nal)
1261
- {
1262
- m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1263
- if (pts_queue)
1264
- {
1265
- pts_queue->push(-pic_out.pts);
1266
- if (pts_queue->size() > 2)
1267
- pts_queue->pop();
1268
- }
1269
- }
1270
- m_cliopt.printStatus(outFrameCount);
1271
- }
1272
- }
1273
-
1274
- /* Flush the encoder */
1275
- while (!b_ctrl_c)
1276
- {
1277
- int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
1278
- if (numEncoded < 0)
1279
- {
1280
- m_ret = 4;
1281
- break;
1282
- }
1283
-
1284
- if (reconPlay && numEncoded)
1285
- reconPlay->writePicture(*pic_recon);
1286
-
1287
- outFrameCount += numEncoded;
1288
- if (isAbrSave && numEncoded)
1289
- {
1290
- copyInfo(analysisInfo);
1291
- }
1292
-
1293
- if (numEncoded && pic_recon && m_cliopt.recon)
1294
- m_cliopt.recon->writePicture(pic_out);
1295
- if (nal)
1296
- {
1297
- m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1298
- if (pts_queue)
1299
- {
1300
- pts_queue->push(-pic_out.pts);
1301
- if (pts_queue->size() > 2)
1302
- pts_queue->pop();
1303
- }
1304
- }
1305
-
1306
- m_cliopt.printStatus(outFrameCount);
1307
-
1308
- if (!numEncoded)
1309
- break;
1310
- }
1311
-
1312
- if (bDolbyVisionRPU)
1313
- {
1314
- if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
1315
- x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
1316
- profileName);
1317
- x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
1318
- profileName);
1319
- }
1320
-
1321
- /* clear progress report */
1322
- if (m_cliopt.bProgress)
1323
- fprintf(stderr, "%*s\r", 80, " ");
1324
-
1325
- fail:
1326
-
1327
- delete reconPlay;
1328
-
1329
- api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
1330
- if (m_param->csvfn && !b_ctrl_c)
1331
-#if ENABLE_LIBVMAF
1332
- api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
1333
-#else
1334
- api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
1335
-#endif
1336
- api->encoder_close(m_encoder);
1337
-
1338
- int64_t second_largest_pts = 0;
1339
- int64_t largest_pts = 0;
1340
- if (pts_queue && pts_queue->size() >= 2)
1341
- {
1342
- second_largest_pts = -pts_queue->top();
1343
- pts_queue->pop();
1344
- largest_pts = -pts_queue->top();
1345
- pts_queue->pop();
1346
- delete pts_queue;
1347
- pts_queue = NULL;
1348
- }
1349
- m_cliopt.output->closeFile(largest_pts, second_largest_pts);
1350
-
1351
- if (b_ctrl_c)
1352
- general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
1353
- m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
1354
-
1355
- api->param_free(m_param);
1356
-
1357
- X265_FREE(errorBuf);
1358
- X265_FREE(rpuPayload);
1359
-
1360
- m_threadActive = false;
1361
- m_parent->m_numActiveEncodes.decr();
1362
- }
1363
- }
1364
-
1365
- void PassEncoder::destroy()
1366
- {
1367
- stop();
1368
- if (m_reader)
1369
- {
1370
- m_reader->stop();
1371
- delete m_reader;
1372
- }
1373
- else
1374
- {
1375
- m_scaler->stop();
1376
- m_scaler->destroy();
1377
- delete m_scaler;
1378
- }
1379
- }
1380
-
1381
- Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
1382
- {
1383
- m_parentEnc = parentEnc;
1384
- m_id = id;
1385
- m_srcFormat = src;
1386
- m_dstFormat = dst;
1387
- m_threadActive = false;
1388
- m_scaleFrameSize = 0;
1389
- m_filterManager = NULL;
1390
- m_threadId = threadId;
1391
- m_threadTotal = threadNum;
1392
-
1393
- int csp = dst->m_csp;
1394
- uint32_t pixelbytes = dst->m_inputDepth > 8 ? 2 : 1;
1395
- for (int i = 0; i < x265_cli_cspscsp.planes; i++)
1396
- {
1397
- int w = dst->m_width >> x265_cli_cspscsp.widthi;
1398
- int h = dst->m_height >> x265_cli_cspscsp.heighti;
1399
- m_scalePlanesi = w * h * pixelbytes;
1400
- m_scaleFrameSize += m_scalePlanesi;
1401
- }
1402
-
1403
- if (src->m_height != dst->m_height || src->m_width != dst->m_width)
1404
- {
1405
- m_filterManager = new ScalerFilterManager;
1406
- m_filterManager->init(4, m_srcFormat, m_dstFormat);
1407
- }
1408
- }
1409
-
1410
- bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
1411
- {
1412
- if (!destination || !source)
1413
- return false;
1414
- x265_param* param = m_parentEnc->m_param;
1415
- int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
1416
- if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
1417
- {
1418
- void **srcPlane = NULL, **dstPlane = NULL;
1419
- int srcStride3, dstStride3;
1420
- destination->bitDepth = source->bitDepth;
1421
- destination->colorSpace = source->colorSpace;
1422
- destination->pts = source->pts;
1423
- destination->dts = source->dts;
1424
- destination->reorderedPts = source->reorderedPts;
1425
- destination->poc = source->poc;
1426
- destination->userSEI = source->userSEI;
1427
- srcPlane = source->planes;
1428
- dstPlane = destination->planes;
1429
- srcStride0 = source->stride0;
1430
- destination->stride0 = m_dstFormat->m_width * pixelBytes;
1431
- dstStride0 = destination->stride0;
1432
- if (param->internalCsp != X265_CSP_I400)
1433
- {
1434
- srcStride1 = source->stride1;
1435
- srcStride2 = source->stride2;
1436
- destination->stride1 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width1;
1437
- destination->stride2 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width2;
1438
- dstStride1 = destination->stride1;
1439
- dstStride2 = destination->stride2;
1440
- }
1441
- if (m_scaleFrameSize)
1442
- {
1443
- m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
1444
- return true;
1445
- }
1446
- else
1447
- x265_log(param, X265_LOG_INFO, "Empty frame received\n");
1448
- }
1449
- return false;
1450
- }
1451
-
1452
- void Scaler::threadMain()
1453
- {
1454
- THREAD_NAME("Scaler", m_id);
1455
-
1456
- /* unscaled picture is stored in the last index */
1457
- uint32_t srcId = m_id - 1;
1458
- int QDepth = m_parentEnc->m_parent->m_queueSize;
1459
- while (!m_parentEnc->m_inputOver)
1460
- {
1461
-
1462
- uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
1463
-
1464
- if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
1465
- break;
1466
-
1467
- if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
1468
- {
1469
- continue;
1470
- }
1471
- uint32_t written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
1472
-
1473
- /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
1474
- while (m_threadActive && (scaledWritten == written)) {
1475
- written = m_parentEnc->m_parent->m_picWriteCntsrcId.waitForChange(written);
1476
- }
1477
-
1478
- if (m_threadActive && scaledWritten < written)
1479
- {
1480
-
1481
- int scaledWriteIdx = scaledWritten % QDepth;
1482
- int overWritePicBuffer = scaledWritten / QDepth;
1483
- int read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.get();
1484
-
1485
- while (overWritePicBuffer && read < overWritePicBuffer)
1486
- {
1487
- read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.waitForChange(read);
1488
- }
1489
-
1490
- if (!m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx)
1491
- {
1492
- int framesize = 0;
1493
- int planesize3;
1494
- int csp = m_dstFormat->m_csp;
1495
- int stride3;
1496
- stride0 = m_dstFormat->m_width;
1497
- stride1 = stride0 >> x265_cli_cspscsp.width1;
1498
- stride2 = stride0 >> x265_cli_cspscsp.width2;
1499
- for (int i = 0; i < x265_cli_cspscsp.planes; i++)
1500
- {
1501
- uint32_t h = m_dstFormat->m_height >> x265_cli_cspscsp.heighti;
1502
- planesizei = h * stridei;
1503
- framesize += planesizei;
1504
- }
1505
-
1506
- m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx = x265_picture_alloc();
1507
- x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx);
1508
-
1509
- ((x265_picture*)m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth)->framesize = framesize;
1510
- for (int32_t j = 0; j < x265_cli_cspscsp.planes; j++)
1511
- {
1512
- m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth->planesj = X265_MALLOC(char, planesizej);
1513
- }
1514
- }
1515
-
1516
- x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffersrcIdscaledWritten % QDepth;
1517
- x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx;
1518
-
1519
- // Enqueue this picture up with the current encoder so that it will asynchronously encode
1520
- if (!scalePic(destPic, srcPic))
1521
- x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
1522
- else
1523
- m_parentEnc->m_parent->m_picWriteCntm_id.incr();
1524
- m_scaledWriteCnt.incr();
1525
- m_parentEnc->m_parent->m_picIdxReadCntsrcIdscaledWriteIdx.incr();
1526
- }
1527
- if (m_threadTotal > 1)
1528
- {
1529
- written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
1530
- int totalWrite = written / m_threadTotal;
1531
- if (written % m_threadTotal > m_threadId)
1532
- totalWrite++;
1533
- if (totalWrite == m_scaledWriteCnt.get())
1534
- {
1535
- m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
1536
- m_parentEnc->m_parent->m_picWriteCntm_id.poke();
1537
- break;
1538
- }
1539
- }
1540
- else
1541
- {
1542
- /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
1543
- scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
1544
- written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
1545
- if (written == scaledWritten)
1546
- {
1547
- m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
1548
- m_parentEnc->m_parent->m_picWriteCntm_id.poke();
1549
- break;
1550
- }
1551
- }
1552
-
1553
- }
1554
- m_threadActive = false;
1555
- destroy();
1556
- }
1557
-
1558
- Reader::Reader(int id, PassEncoder *parentEnc)
1559
- {
1560
- m_parentEnc = parentEnc;
1561
- m_id = id;
1562
- m_input = parentEnc->m_input;
1563
- }
1564
-
1565
- void Reader::threadMain()
1566
- {
1567
- THREAD_NAME("Reader", m_id);
1568
-
1569
- int QDepth = m_parentEnc->m_parent->m_queueSize;
1570
- x265_picture* src = x265_picture_alloc();
1571
- x265_picture_init(m_parentEnc->m_param, src);
1572
-
1573
- while (m_threadActive)
1574
- {
1575
- uint32_t written = m_parentEnc->m_parent->m_picWriteCntm_id.get();
1576
- uint32_t writeIdx = written % QDepth;
1577
- uint32_t read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.get();
1578
- uint32_t overWritePicBuffer = written / QDepth;
1579
-
1580
- if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
1581
- break;
1582
-
1583
- while (overWritePicBuffer && read < overWritePicBuffer)
1584
- {
1585
- read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.waitForChange(read);
1586
- }
1587
-
1588
- x265_picture* dest = m_parentEnc->m_parent->m_inputPicBufferm_idwriteIdx;
1589
- if (m_input->readPicture(*src))
1590
- {
1591
- dest->poc = src->poc;
1592
- dest->pts = src->pts;
1593
- dest->userSEI = src->userSEI;
1594
- dest->bitDepth = src->bitDepth;
1595
- dest->framesize = src->framesize;
1596
- dest->height = src->height;
1597
- dest->width = src->width;
1598
- dest->colorSpace = src->colorSpace;
1599
- dest->userSEI = src->userSEI;
1600
- dest->rpu.payload = src->rpu.payload;
1601
- dest->picStruct = src->picStruct;
1602
- dest->stride0 = src->stride0;
1603
- dest->stride1 = src->stride1;
1604
- dest->stride2 = src->stride2;
1605
-
1606
- if (!dest->planes0)
1607
- dest->planes0 = X265_MALLOC(char, dest->framesize);
1608
-
1609
- memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
1610
- dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
1611
- dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
1612
- m_parentEnc->m_parent->m_picWriteCntm_id.incr();
1613
- }
1614
- else
1615
- {
1616
- m_threadActive = false;
1617
- m_parentEnc->m_inputOver = true;
1618
- m_parentEnc->m_parent->m_picWriteCntm_id.poke();
1619
- }
1620
- }
1621
- x265_picture_free(src);
1622
- }
1623
-}
1624
+
1625
+#if ENABLE_LIBVMAF
1626
+ x265_vmaf_data* vmafdata = m_cliopt.vmafData;
1627
+#endif
1628
+ /* This allows muxers to modify bitstream format */
1629
+ m_cliopt.output->setParam(m_param);
1630
+ const x265_api* api = m_cliopt.api;
1631
+ ReconPlay* reconPlay = NULL;
1632
+ if (m_cliopt.reconPlayCmd)
1633
+ reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
1634
+ char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
1635
+
1636
+ if (signal(SIGINT, sigint_handler) == SIG_ERR)
1637
+ x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
1638
+ strerror(errno), profileName);
1639
+
1640
+ x265_picture pic_orig, pic_out;
1641
+ x265_picture *pic_in = &pic_orig;
1642
+ /* Allocate recon picture if analysis save/load is enabled */
1643
+ std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
1644
+ x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
1645
+ uint32_t inFrameCount = 0;
1646
+ uint32_t outFrameCount = 0;
1647
+ x265_nal *p_nal;
1648
+ x265_stats stats;
1649
+ uint32_t nal;
1650
+ int16_t *errorBuf = NULL;
1651
+ bool bDolbyVisionRPU = false;
1652
+ uint8_t *rpuPayload = NULL;
1653
+ int inputPicNum = 1;
1654
+ x265_picture picField1, picField2;
1655
+ x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
1656
+ bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
1657
+
1658
+ if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
1659
+ {
1660
+ if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
1661
+ {
1662
+ x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
1663
+ m_ret = 3;
1664
+ goto fail;
1665
+ }
1666
+ else
1667
+ m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
1668
+ }
1669
+
1670
+ if (m_param->bField && m_param->interlaceMode)
1671
+ {
1672
+ api->picture_init(m_param, &picField1);
1673
+ api->picture_init(m_param, &picField2);
1674
+ // return back the original height of input
1675
+ m_param->sourceHeight *= 2;
1676
+ api->picture_init(m_param, &pic_orig);
1677
+ }
1678
+ else
1679
+ api->picture_init(m_param, &pic_orig);
1680
+
1681
+ if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
1682
+ {
1683
+ rpuPayload = X265_MALLOC(uint8_t, 1024);
1684
+ pic_in->rpu.payload = rpuPayload;
1685
+ if (pic_in->rpu.payload)
1686
+ bDolbyVisionRPU = true;
1687
+ }
1688
+
1689
+ if (m_cliopt.bDither)
1690
+ {
1691
+ errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
1692
+ if (errorBuf)
1693
+ memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
1694
+ else
1695
+ m_cliopt.bDither = false;
1696
+ }
1697
+
1698
+ // main encoder loop
1699
+ while (pic_in && !b_ctrl_c)
1700
+ {
1701
+ pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
1702
+ if (m_cliopt.qpfile)
1703
+ {
1704
+ if (!m_cliopt.parseQPFile(pic_orig))
1705
+ {
1706
+ x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
1707
+ pic_in->poc, profileName);
1708
+ fclose(m_cliopt.qpfile);
1709
+ m_cliopt.qpfile = NULL;
1710
+ }
1711
+ }
1712
+
1713
+ if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
1714
+ pic_in = NULL;
1715
+ else if (readPicture(pic_in))
1716
+ inFrameCount++;
1717
+ else
1718
+ pic_in = NULL;
1719
+
1720
+ if (pic_in)
1721
+ {
1722
+ if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
1723
+ {
1724
+ x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
1725
+ pic_in->bitDepth = m_param->internalBitDepth;
1726
+ }
1727
+ /* Overwrite PTS */
1728
+ pic_in->pts = pic_in->poc;
1729
+
1730
+ // convert to field
1731
+ if (m_param->bField && m_param->interlaceMode)
1732
+ {
1733
+ int height = pic_in->height >> 1;
1734
+
1735
+ int static bCreated = 0;
1736
+ if (bCreated == 0)
1737
+ {
1738
+ bCreated = 1;
1739
+ inputPicNum = 2;
1740
+ picField1.fieldNum = 1;
1741
+ picField2.fieldNum = 2;
1742
+
1743
+ picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
1744
+ picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
1745
+ picField1.height = picField2.height = pic_in->height >> 1;
1746
+ picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
1747
+
1748
+ size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
1749
+ char* field1Buf = X265_MALLOC(char, fieldFrameSize);
1750
+ char* field2Buf = X265_MALLOC(char, fieldFrameSize);
1751
+
1752
+ int stride = picField1.stride0 = picField2.stride0 = pic_in->stride0;
1753
+ uint64_t framesize = stride * (height >> x265_cli_cspspic_in->colorSpace.height0);
1754
+ picField1.planes0 = field1Buf;
1755
+ picField2.planes0 = field2Buf;
1756
+ for (int i = 1; i < x265_cli_cspspic_in->colorSpace.planes; i++)
1757
+ {
1758
+ picField1.planesi = field1Buf + framesize;
1759
+ picField2.planesi = field2Buf + framesize;
1760
+
1761
+ stride = picField1.stridei = picField2.stridei = pic_in->stridei;
1762
+ framesize += (stride * (height >> x265_cli_cspspic_in->colorSpace.heighti));
1763
+ }
1764
+ assert(framesize == picField1.framesize);
1765
+ }
1766
+
1767
+ picField1.pts = picField1.poc = pic_in->poc;
1768
+ picField2.pts = picField2.poc = pic_in->poc + 1;
1769
+
1770
+ picField1.userSEI = picField2.userSEI = pic_in->userSEI;
1771
+
1772
+ //if (pic_in->userData)
1773
+ //{
1774
+ // // Have to handle userData here
1775
+ //}
1776
+
1777
+ if (pic_in->framesize)
1778
+ {
1779
+ for (int i = 0; i < x265_cli_cspspic_in->colorSpace.planes; i++)
1780
+ {
1781
+ char* srcP1 = (char*)pic_in->planesi;
1782
+ char* srcP2 = (char*)pic_in->planesi + pic_in->stridei;
1783
+ char* p1 = (char*)picField1.planesi;
1784
+ char* p2 = (char*)picField2.planesi;
1785
+
1786
+ int stride = picField1.stridei;
1787
+
1788
+ for (int y = 0; y < (height >> x265_cli_cspspic_in->colorSpace.heighti); y++)
1789
+ {
1790
+ memcpy(p1, srcP1, stride);
1791
+ memcpy(p2, srcP2, stride);
1792
+ srcP1 += 2 * stride;
1793
+ srcP2 += 2 * stride;
1794
+ p1 += stride;
1795
+ p2 += stride;
1796
+ }
1797
+ }
1798
+ }
1799
+ }
1800
+
1801
+ if (bDolbyVisionRPU)
1802
+ {
1803
+ if (m_param->bField && m_param->interlaceMode)
1804
+ {
1805
+ if (m_cliopt.rpuParser(&picField1) > 0)
1806
+ goto fail;
1807
+ if (m_cliopt.rpuParser(&picField2) > 0)
1808
+ goto fail;
1809
+ }
1810
+ else
1811
+ {
1812
+ if (m_cliopt.rpuParser(pic_in) > 0)
1813
+ goto fail;
1814
+ }
1815
+ }
1816
+ }
1817
+
1818
+ for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
1819
+ {
1820
+ x265_picture *picInput = NULL;
1821
+ if (inputPicNum == 2)
1822
+ picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
1823
+ else
1824
+ picInput = pic_in;
1825
+
1826
+ int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
1827
+
1828
+ int idx = (inFrameCount - 1) % m_parent->m_queueSize;
1829
+ m_parent->m_picIdxReadCntm_ididx.incr();
1830
+ m_parent->m_picReadCntm_id.incr();
1831
+ if (m_cliopt.loadLevel && picInput)
1832
+ {
1833
+ m_parent->m_analysisReadCntm_cliopt.refId.incr();
1834
+ m_parent->m_analysisReadm_cliopt.refIdm_lastIdx.incr();
1835
+ }
1836
+
1837
+ if (numEncoded < 0)
1838
+ {
1839
+ b_ctrl_c = 1;
1840
+ m_ret = 4;
1841
+ break;
1842
+ }
1843
+
1844
+ if (reconPlay && numEncoded)
1845
+ reconPlay->writePicture(*pic_recon);
1846
+
1847
+ outFrameCount += numEncoded;
1848
+
1849
+ if (isAbrSave && numEncoded)
1850
+ {
1851
+ copyInfo(analysisInfo);
1852
+ }
1853
+
1854
+ if (numEncoded && pic_recon && m_cliopt.recon)
1855
+ m_cliopt.recon->writePicture(pic_out);
1856
+ if (nal)
1857
+ {
1858
+ m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1859
+ if (pts_queue)
1860
+ {
1861
+ pts_queue->push(-pic_out.pts);
1862
+ if (pts_queue->size() > 2)
1863
+ pts_queue->pop();
1864
+ }
1865
+ }
1866
+ m_cliopt.printStatus(outFrameCount);
1867
+ }
1868
+ }
1869
+
1870
+ /* Flush the encoder */
1871
+ while (!b_ctrl_c)
1872
+ {
1873
+ int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
1874
+ if (numEncoded < 0)
1875
+ {
1876
+ m_ret = 4;
1877
+ break;
1878
+ }
1879
+
1880
+ if (reconPlay && numEncoded)
1881
+ reconPlay->writePicture(*pic_recon);
1882
+
1883
+ outFrameCount += numEncoded;
1884
+ if (isAbrSave && numEncoded)
1885
+ {
1886
+ copyInfo(analysisInfo);
1887
+ }
1888
+
1889
+ if (numEncoded && pic_recon && m_cliopt.recon)
1890
+ m_cliopt.recon->writePicture(pic_out);
1891
+ if (nal)
1892
+ {
1893
+ m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
1894
+ if (pts_queue)
1895
+ {
1896
+ pts_queue->push(-pic_out.pts);
1897
+ if (pts_queue->size() > 2)
1898
+ pts_queue->pop();
1899
+ }
1900
+ }
1901
+
1902
+ m_cliopt.printStatus(outFrameCount);
1903
+
1904
+ if (!numEncoded)
1905
+ break;
1906
+ }
1907
+
1908
+ if (bDolbyVisionRPU)
1909
+ {
1910
+ if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
1911
+ x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
1912
+ profileName);
1913
+ x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
1914
+ profileName);
1915
+ }
1916
+
1917
+ /* clear progress report */
1918
+ if (m_cliopt.bProgress)
1919
+ fprintf(stderr, "%*s\r", 80, " ");
1920
+
1921
+ fail:
1922
+
1923
+ delete reconPlay;
1924
+
1925
+ api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
1926
+ if (m_param->csvfn && !b_ctrl_c)
1927
+#if ENABLE_LIBVMAF
1928
+ api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
1929
+#else
1930
+ api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
1931
+#endif
1932
+ api->encoder_close(m_encoder);
1933
+
1934
+ int64_t second_largest_pts = 0;
1935
+ int64_t largest_pts = 0;
1936
+ if (pts_queue && pts_queue->size() >= 2)
1937
+ {
1938
+ second_largest_pts = -pts_queue->top();
1939
+ pts_queue->pop();
1940
+ largest_pts = -pts_queue->top();
1941
+ pts_queue->pop();
1942
+ delete pts_queue;
1943
+ pts_queue = NULL;
1944
+ }
1945
+ m_cliopt.output->closeFile(largest_pts, second_largest_pts);
1946
+
1947
+ if (b_ctrl_c)
1948
+ general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
1949
+ m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
1950
+
1951
+ api->param_free(m_param);
1952
+
1953
+ X265_FREE(errorBuf);
1954
+ X265_FREE(rpuPayload);
1955
+
1956
+ m_threadActive = false;
1957
+ m_parent->m_numActiveEncodes.decr();
1958
+ }
1959
+ }
1960
+
1961
+ void PassEncoder::destroy()
1962
+ {
1963
+ stop();
1964
+ if (m_reader)
1965
+ {
1966
+ m_reader->stop();
1967
+ delete m_reader;
1968
+ }
1969
+ else
1970
+ {
1971
+ m_scaler->stop();
1972
+ m_scaler->destroy();
1973
+ delete m_scaler;
1974
+ }
1975
+ }
1976
+
1977
+ Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
1978
+ {
1979
+ m_parentEnc = parentEnc;
1980
+ m_id = id;
1981
+ m_srcFormat = src;
1982
+ m_dstFormat = dst;
1983
+ m_threadActive = false;
1984
+ m_scaleFrameSize = 0;
1985
+ m_filterManager = NULL;
1986
+ m_threadId = threadId;
1987
+ m_threadTotal = threadNum;
1988
+
1989
+ int csp = dst->m_csp;
1990
+ uint32_t pixelbytes = dst->m_inputDepth > 8 ? 2 : 1;
1991
+ for (int i = 0; i < x265_cli_cspscsp.planes; i++)
1992
+ {
1993
+ int w = dst->m_width >> x265_cli_cspscsp.widthi;
1994
+ int h = dst->m_height >> x265_cli_cspscsp.heighti;
1995
+ m_scalePlanesi = w * h * pixelbytes;
1996
+ m_scaleFrameSize += m_scalePlanesi;
1997
+ }
1998
+
1999
+ if (src->m_height != dst->m_height || src->m_width != dst->m_width)
2000
+ {
2001
+ m_filterManager = new ScalerFilterManager;
2002
+ m_filterManager->init(4, m_srcFormat, m_dstFormat);
2003
+ }
2004
+ }
2005
+
2006
+ bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
2007
+ {
2008
+ if (!destination || !source)
2009
+ return false;
2010
+ x265_param* param = m_parentEnc->m_param;
2011
+ int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
2012
+ if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
2013
+ {
2014
+ void **srcPlane = NULL, **dstPlane = NULL;
2015
+ int srcStride3, dstStride3;
2016
+ destination->bitDepth = source->bitDepth;
2017
+ destination->colorSpace = source->colorSpace;
2018
+ destination->pts = source->pts;
2019
+ destination->dts = source->dts;
2020
+ destination->reorderedPts = source->reorderedPts;
2021
+ destination->poc = source->poc;
2022
+ destination->userSEI = source->userSEI;
2023
+ srcPlane = source->planes;
2024
+ dstPlane = destination->planes;
2025
+ srcStride0 = source->stride0;
2026
+ destination->stride0 = m_dstFormat->m_width * pixelBytes;
2027
+ dstStride0 = destination->stride0;
2028
+ if (param->internalCsp != X265_CSP_I400)
2029
+ {
2030
+ srcStride1 = source->stride1;
2031
+ srcStride2 = source->stride2;
2032
+ destination->stride1 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width1;
2033
+ destination->stride2 = destination->stride0 >> x265_cli_cspsparam->internalCsp.width2;
2034
+ dstStride1 = destination->stride1;
2035
+ dstStride2 = destination->stride2;
2036
+ }
2037
+ if (m_scaleFrameSize)
2038
+ {
2039
+ m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
2040
+ return true;
2041
+ }
2042
+ else
2043
+ x265_log(param, X265_LOG_INFO, "Empty frame received\n");
2044
+ }
2045
+ return false;
2046
+ }
2047
+
2048
+ void Scaler::threadMain()
2049
+ {
2050
+ THREAD_NAME("Scaler", m_id);
2051
+
2052
+ /* unscaled picture is stored in the last index */
2053
+ uint32_t srcId = m_id - 1;
2054
+ int QDepth = m_parentEnc->m_parent->m_queueSize;
2055
+ while (!m_parentEnc->m_inputOver)
2056
+ {
2057
+
2058
+ uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
2059
+
2060
+ if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
2061
+ break;
2062
+
2063
+ if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
2064
+ {
2065
+ continue;
2066
+ }
2067
+ uint32_t written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
2068
+
2069
+ /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
2070
+ while (m_threadActive && (scaledWritten == written)) {
2071
+ written = m_parentEnc->m_parent->m_picWriteCntsrcId.waitForChange(written);
2072
+ }
2073
+
2074
+ if (m_threadActive && scaledWritten < written)
2075
+ {
2076
+
2077
+ int scaledWriteIdx = scaledWritten % QDepth;
2078
+ int overWritePicBuffer = scaledWritten / QDepth;
2079
+ int read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.get();
2080
+
2081
+ while (overWritePicBuffer && read < overWritePicBuffer)
2082
+ {
2083
+ read = m_parentEnc->m_parent->m_picIdxReadCntm_idscaledWriteIdx.waitForChange(read);
2084
+ }
2085
+
2086
+ if (!m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx)
2087
+ {
2088
+ int framesize = 0;
2089
+ int planesize3;
2090
+ int csp = m_dstFormat->m_csp;
2091
+ int stride3;
2092
+ stride0 = m_dstFormat->m_width;
2093
+ stride1 = stride0 >> x265_cli_cspscsp.width1;
2094
+ stride2 = stride0 >> x265_cli_cspscsp.width2;
2095
+ for (int i = 0; i < x265_cli_cspscsp.planes; i++)
2096
+ {
2097
+ uint32_t h = m_dstFormat->m_height >> x265_cli_cspscsp.heighti;
2098
+ planesizei = h * stridei;
2099
+ framesize += planesizei;
2100
+ }
2101
+
2102
+ m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx = x265_picture_alloc();
2103
+ x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx);
2104
+
2105
+ ((x265_picture*)m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth)->framesize = framesize;
2106
+ for (int32_t j = 0; j < x265_cli_cspscsp.planes; j++)
2107
+ {
2108
+ m_parentEnc->m_parent->m_inputPicBufferm_idscaledWritten % QDepth->planesj = X265_MALLOC(char, planesizej);
2109
+ }
2110
+ }
2111
+
2112
+ x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffersrcIdscaledWritten % QDepth;
2113
+ x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBufferm_idscaledWriteIdx;
2114
+
2115
+ // Enqueue this picture up with the current encoder so that it will asynchronously encode
2116
+ if (!scalePic(destPic, srcPic))
2117
+ x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
2118
+ else
2119
+ m_parentEnc->m_parent->m_picWriteCntm_id.incr();
2120
+ m_scaledWriteCnt.incr();
2121
+ m_parentEnc->m_parent->m_picIdxReadCntsrcIdscaledWriteIdx.incr();
2122
+ }
2123
+ if (m_threadTotal > 1)
2124
+ {
2125
+ written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
2126
+ int totalWrite = written / m_threadTotal;
2127
+ if (written % m_threadTotal > m_threadId)
2128
+ totalWrite++;
2129
+ if (totalWrite == m_scaledWriteCnt.get())
2130
+ {
2131
+ m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
2132
+ m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2133
+ break;
2134
+ }
2135
+ }
2136
+ else
2137
+ {
2138
+ /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
2139
+ scaledWritten = m_parentEnc->m_parent->m_picWriteCntm_id.get();
2140
+ written = m_parentEnc->m_parent->m_picWriteCntsrcId.get();
2141
+ if (written == scaledWritten)
2142
+ {
2143
+ m_parentEnc->m_parent->m_picWriteCntsrcId.poke();
2144
+ m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2145
+ break;
2146
+ }
2147
+ }
2148
+
2149
+ }
2150
+ m_threadActive = false;
2151
+ destroy();
2152
+ }
2153
+
2154
+ Reader::Reader(int id, PassEncoder *parentEnc)
2155
+ {
2156
+ m_parentEnc = parentEnc;
2157
+ m_id = id;
2158
+ m_input = parentEnc->m_input;
2159
+ }
2160
+
2161
+ void Reader::threadMain()
2162
+ {
2163
+ THREAD_NAME("Reader", m_id);
2164
+
2165
+ int QDepth = m_parentEnc->m_parent->m_queueSize;
2166
+ x265_picture* src = x265_picture_alloc();
2167
+ x265_picture_init(m_parentEnc->m_param, src);
2168
+
2169
+ while (m_threadActive)
2170
+ {
2171
+ uint32_t written = m_parentEnc->m_parent->m_picWriteCntm_id.get();
2172
+ uint32_t writeIdx = written % QDepth;
2173
+ uint32_t read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.get();
2174
+ uint32_t overWritePicBuffer = written / QDepth;
2175
+
2176
+ if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
2177
+ break;
2178
+
2179
+ while (overWritePicBuffer && read < overWritePicBuffer)
2180
+ {
2181
+ read = m_parentEnc->m_parent->m_picIdxReadCntm_idwriteIdx.waitForChange(read);
2182
+ }
2183
+
2184
+ x265_picture* dest = m_parentEnc->m_parent->m_inputPicBufferm_idwriteIdx;
2185
+ if (m_input->readPicture(*src))
2186
+ {
2187
+ dest->poc = src->poc;
2188
+ dest->pts = src->pts;
2189
+ dest->userSEI = src->userSEI;
2190
+ dest->bitDepth = src->bitDepth;
2191
+ dest->framesize = src->framesize;
2192
+ dest->height = src->height;
2193
+ dest->width = src->width;
2194
+ dest->colorSpace = src->colorSpace;
2195
+ dest->userSEI = src->userSEI;
2196
+ dest->rpu.payload = src->rpu.payload;
2197
+ dest->picStruct = src->picStruct;
2198
+ dest->stride0 = src->stride0;
2199
+ dest->stride1 = src->stride1;
2200
+ dest->stride2 = src->stride2;
2201
+
2202
+ if (!dest->planes0)
2203
+ dest->planes0 = X265_MALLOC(char, dest->framesize);
2204
+
2205
+ memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
2206
+ dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
2207
+ dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
2208
+ m_parentEnc->m_parent->m_picWriteCntm_id.incr();
2209
+ }
2210
+ else
2211
+ {
2212
+ m_threadActive = false;
2213
+ m_parentEnc->m_inputOver = true;
2214
+ m_parentEnc->m_parent->m_picWriteCntm_id.poke();
2215
+ }
2216
+ }
2217
+ x265_picture_free(src);
2218
+ }
2219
+}
2220
x265_3.5.tar.gz/source/abrEncApp.h -> x265_3.6.tar.gz/source/abrEncApp.h
Changed
9
1
2
FILE* m_qpfile;
3
FILE* m_zoneFile;
4
FILE* m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
5
+ FILE* m_scenecutAwareQpConfig;
6
7
int m_ret;
8
9
x265_3.5.tar.gz/source/cmake/FindNeon.cmake -> x265_3.6.tar.gz/source/cmake/FindNeon.cmake
Changed
27
1
2
include(FindPackageHandleStandardArgs)
3
4
# Check the version of neon supported by the ARM CPU
5
-execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
6
- OUTPUT_VARIABLE neon_version
7
- ERROR_QUIET
8
- OUTPUT_STRIP_TRAILING_WHITESPACE)
9
+if(APPLE)
10
+ execute_process(COMMAND sysctl -a
11
+ COMMAND grep "hw.optional.neon: 1"
12
+ OUTPUT_VARIABLE neon_version
13
+ ERROR_QUIET
14
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
15
+else()
16
+ execute_process(COMMAND cat /proc/cpuinfo
17
+ COMMAND grep Features
18
+ COMMAND grep neon
19
+ OUTPUT_VARIABLE neon_version
20
+ ERROR_QUIET
21
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
22
+endif()
23
+
24
if(neon_version)
25
set(CPU_HAS_NEON 1)
26
endif()
27
x265_3.6.tar.gz/source/cmake/FindSVE.cmake
Added
23
1
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE supported by the ARM CPU
5
+if(APPLE)
6
+ execute_process(COMMAND sysctl -a
7
+ COMMAND grep "hw.optional.sve: 1"
8
+ OUTPUT_VARIABLE sve_version
9
+ ERROR_QUIET
10
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+ execute_process(COMMAND cat /proc/cpuinfo
13
+ COMMAND grep Features
14
+ COMMAND grep -e "sve$" -e "sve:space:"
15
+ OUTPUT_VARIABLE sve_version
16
+ ERROR_QUIET
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve_version)
21
+ set(CPU_HAS_SVE 1)
22
+endif()
23
x265_3.6.tar.gz/source/cmake/FindSVE2.cmake
Added
24
1
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE2 supported by the ARM CPU
5
+if(APPLE)
6
+ execute_process(COMMAND sysctl -a
7
+ COMMAND grep "hw.optional.sve2: 1"
8
+ OUTPUT_VARIABLE sve2_version
9
+ ERROR_QUIET
10
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+ execute_process(COMMAND cat /proc/cpuinfo
13
+ COMMAND grep Features
14
+ COMMAND grep sve2
15
+ OUTPUT_VARIABLE sve2_version
16
+ ERROR_QUIET
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve2_version)
21
+ set(CPU_HAS_SVE 1)
22
+ set(CPU_HAS_SVE2 1)
23
+endif()
24
x265_3.5.tar.gz/source/common/CMakeLists.txt -> x265_3.6.tar.gz/source/common/CMakeLists.txt
Changed
76
1
2
endif(ENABLE_ASSEMBLY AND X86)
3
4
if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
5
- if(ARM64)
6
- if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
7
- message(STATUS "Detected CXX compiler using -O3 optimization level")
8
- add_definitions(-DAUTO_VECTORIZE=1)
9
- endif()
10
- set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
11
-
12
- # add ARM assembly/intrinsic files here
13
- set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
14
- set(VEC_PRIMITIVES)
15
+ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
16
17
- set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
18
- foreach(SRC ${C_SRCS})
19
- set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
20
- endforeach()
21
- else()
22
- set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
23
+ # add ARM assembly/intrinsic files here
24
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
25
+ set(VEC_PRIMITIVES)
26
27
- # add ARM assembly/intrinsic files here
28
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
29
- set(VEC_PRIMITIVES)
30
+ set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
31
+ foreach(SRC ${C_SRCS})
32
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
33
+ endforeach()
34
+ source_group(Assembly FILES ${ASM_PRIMITIVES})
35
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
36
37
- set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
38
- foreach(SRC ${C_SRCS})
39
- set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
40
- endforeach()
41
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
42
+ if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
43
+ message(STATUS "Detected CXX compiler using -O3 optimization level")
44
+ add_definitions(-DAUTO_VECTORIZE=1)
45
endif()
46
+
47
+ set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
48
+ enable_language(ASM)
49
+
50
+ # add ARM assembly/intrinsic files here
51
+ set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
52
+ set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
53
+ set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
54
+ set(VEC_PRIMITIVES)
55
+
56
+ set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
57
+ set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
58
+ set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
59
+ foreach(SRC ${C_SRCS})
60
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
61
+ endforeach()
62
source_group(Assembly FILES ${ASM_PRIMITIVES})
63
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
64
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
65
66
if(POWER)
67
set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
68
69
scalinglist.cpp scalinglist.h
70
quant.cpp quant.h contexts.h
71
deblock.cpp deblock.h
72
- scaler.cpp scaler.h)
73
+ scaler.cpp scaler.h
74
+ ringmem.cpp ringmem.h
75
+ temporalfilter.cpp temporalfilter.h)
76
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp
Added
302
1
2
+#include "common.h"
3
+#include "x265.h"
4
+#include "arm64-utils.h"
5
+#include <arm_neon.h>
6
+
7
+#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
8
+namespace X265_NS
9
+{
10
+
11
+
12
+
13
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
14
+{
15
+ uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
16
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
17
+
18
+ a0 = *(uint8x8_t *)(src + 0 * sstride);
19
+ a1 = *(uint8x8_t *)(src + 1 * sstride);
20
+ a2 = *(uint8x8_t *)(src + 2 * sstride);
21
+ a3 = *(uint8x8_t *)(src + 3 * sstride);
22
+ a4 = *(uint8x8_t *)(src + 4 * sstride);
23
+ a5 = *(uint8x8_t *)(src + 5 * sstride);
24
+ a6 = *(uint8x8_t *)(src + 6 * sstride);
25
+ a7 = *(uint8x8_t *)(src + 7 * sstride);
26
+
27
+ b0 = vtrn1_u32(a0, a4);
28
+ b1 = vtrn1_u32(a1, a5);
29
+ b2 = vtrn1_u32(a2, a6);
30
+ b3 = vtrn1_u32(a3, a7);
31
+ b4 = vtrn2_u32(a0, a4);
32
+ b5 = vtrn2_u32(a1, a5);
33
+ b6 = vtrn2_u32(a2, a6);
34
+ b7 = vtrn2_u32(a3, a7);
35
+
36
+ a0 = vtrn1_u16(b0, b2);
37
+ a1 = vtrn1_u16(b1, b3);
38
+ a2 = vtrn2_u16(b0, b2);
39
+ a3 = vtrn2_u16(b1, b3);
40
+ a4 = vtrn1_u16(b4, b6);
41
+ a5 = vtrn1_u16(b5, b7);
42
+ a6 = vtrn2_u16(b4, b6);
43
+ a7 = vtrn2_u16(b5, b7);
44
+
45
+ b0 = vtrn1_u8(a0, a1);
46
+ b1 = vtrn2_u8(a0, a1);
47
+ b2 = vtrn1_u8(a2, a3);
48
+ b3 = vtrn2_u8(a2, a3);
49
+ b4 = vtrn1_u8(a4, a5);
50
+ b5 = vtrn2_u8(a4, a5);
51
+ b6 = vtrn1_u8(a6, a7);
52
+ b7 = vtrn2_u8(a6, a7);
53
+
54
+ *(uint8x8_t *)(dst + 0 * dstride) = b0;
55
+ *(uint8x8_t *)(dst + 1 * dstride) = b1;
56
+ *(uint8x8_t *)(dst + 2 * dstride) = b2;
57
+ *(uint8x8_t *)(dst + 3 * dstride) = b3;
58
+ *(uint8x8_t *)(dst + 4 * dstride) = b4;
59
+ *(uint8x8_t *)(dst + 5 * dstride) = b5;
60
+ *(uint8x8_t *)(dst + 6 * dstride) = b6;
61
+ *(uint8x8_t *)(dst + 7 * dstride) = b7;
62
+}
63
+
64
+
65
+
66
+
67
+
68
+
69
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
70
+{
71
+ uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
72
+ uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
73
+ uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
74
+ uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
75
+
76
+ a0 = *(uint16x8_t *)(src + 0 * sstride);
77
+ a1 = *(uint16x8_t *)(src + 1 * sstride);
78
+ a2 = *(uint16x8_t *)(src + 2 * sstride);
79
+ a3 = *(uint16x8_t *)(src + 3 * sstride);
80
+ a4 = *(uint16x8_t *)(src + 4 * sstride);
81
+ a5 = *(uint16x8_t *)(src + 5 * sstride);
82
+ a6 = *(uint16x8_t *)(src + 6 * sstride);
83
+ a7 = *(uint16x8_t *)(src + 7 * sstride);
84
+ a8 = *(uint16x8_t *)(src + 8 * sstride);
85
+ a9 = *(uint16x8_t *)(src + 9 * sstride);
86
+ aA = *(uint16x8_t *)(src + 10 * sstride);
87
+ aB = *(uint16x8_t *)(src + 11 * sstride);
88
+ aC = *(uint16x8_t *)(src + 12 * sstride);
89
+ aD = *(uint16x8_t *)(src + 13 * sstride);
90
+ aE = *(uint16x8_t *)(src + 14 * sstride);
91
+ aF = *(uint16x8_t *)(src + 15 * sstride);
92
+
93
+ b0 = vtrn1q_u64(a0, a8);
94
+ b1 = vtrn1q_u64(a1, a9);
95
+ b2 = vtrn1q_u64(a2, aA);
96
+ b3 = vtrn1q_u64(a3, aB);
97
+ b4 = vtrn1q_u64(a4, aC);
98
+ b5 = vtrn1q_u64(a5, aD);
99
+ b6 = vtrn1q_u64(a6, aE);
100
+ b7 = vtrn1q_u64(a7, aF);
101
+ b8 = vtrn2q_u64(a0, a8);
102
+ b9 = vtrn2q_u64(a1, a9);
103
+ bA = vtrn2q_u64(a2, aA);
104
+ bB = vtrn2q_u64(a3, aB);
105
+ bC = vtrn2q_u64(a4, aC);
106
+ bD = vtrn2q_u64(a5, aD);
107
+ bE = vtrn2q_u64(a6, aE);
108
+ bF = vtrn2q_u64(a7, aF);
109
+
110
+ c0 = vtrn1q_u32(b0, b4);
111
+ c1 = vtrn1q_u32(b1, b5);
112
+ c2 = vtrn1q_u32(b2, b6);
113
+ c3 = vtrn1q_u32(b3, b7);
114
+ c4 = vtrn2q_u32(b0, b4);
115
+ c5 = vtrn2q_u32(b1, b5);
116
+ c6 = vtrn2q_u32(b2, b6);
117
+ c7 = vtrn2q_u32(b3, b7);
118
+ c8 = vtrn1q_u32(b8, bC);
119
+ c9 = vtrn1q_u32(b9, bD);
120
+ cA = vtrn1q_u32(bA, bE);
121
+ cB = vtrn1q_u32(bB, bF);
122
+ cC = vtrn2q_u32(b8, bC);
123
+ cD = vtrn2q_u32(b9, bD);
124
+ cE = vtrn2q_u32(bA, bE);
125
+ cF = vtrn2q_u32(bB, bF);
126
+
127
+ d0 = vtrn1q_u16(c0, c2);
128
+ d1 = vtrn1q_u16(c1, c3);
129
+ d2 = vtrn2q_u16(c0, c2);
130
+ d3 = vtrn2q_u16(c1, c3);
131
+ d4 = vtrn1q_u16(c4, c6);
132
+ d5 = vtrn1q_u16(c5, c7);
133
+ d6 = vtrn2q_u16(c4, c6);
134
+ d7 = vtrn2q_u16(c5, c7);
135
+ d8 = vtrn1q_u16(c8, cA);
136
+ d9 = vtrn1q_u16(c9, cB);
137
+ dA = vtrn2q_u16(c8, cA);
138
+ dB = vtrn2q_u16(c9, cB);
139
+ dC = vtrn1q_u16(cC, cE);
140
+ dD = vtrn1q_u16(cD, cF);
141
+ dE = vtrn2q_u16(cC, cE);
142
+ dF = vtrn2q_u16(cD, cF);
143
+
144
+ *(uint16x8_t *)(dst + 0 * dstride) = vtrn1q_u8(d0, d1);
145
+ *(uint16x8_t *)(dst + 1 * dstride) = vtrn2q_u8(d0, d1);
146
+ *(uint16x8_t *)(dst + 2 * dstride) = vtrn1q_u8(d2, d3);
147
+ *(uint16x8_t *)(dst + 3 * dstride) = vtrn2q_u8(d2, d3);
148
+ *(uint16x8_t *)(dst + 4 * dstride) = vtrn1q_u8(d4, d5);
149
+ *(uint16x8_t *)(dst + 5 * dstride) = vtrn2q_u8(d4, d5);
150
+ *(uint16x8_t *)(dst + 6 * dstride) = vtrn1q_u8(d6, d7);
151
+ *(uint16x8_t *)(dst + 7 * dstride) = vtrn2q_u8(d6, d7);
152
+ *(uint16x8_t *)(dst + 8 * dstride) = vtrn1q_u8(d8, d9);
153
+ *(uint16x8_t *)(dst + 9 * dstride) = vtrn2q_u8(d8, d9);
154
+ *(uint16x8_t *)(dst + 10 * dstride) = vtrn1q_u8(dA, dB);
155
+ *(uint16x8_t *)(dst + 11 * dstride) = vtrn2q_u8(dA, dB);
156
+ *(uint16x8_t *)(dst + 12 * dstride) = vtrn1q_u8(dC, dD);
157
+ *(uint16x8_t *)(dst + 13 * dstride) = vtrn2q_u8(dC, dD);
158
+ *(uint16x8_t *)(dst + 14 * dstride) = vtrn1q_u8(dE, dF);
159
+ *(uint16x8_t *)(dst + 15 * dstride) = vtrn2q_u8(dE, dF);
160
+
161
+
162
+}
163
+
164
+
165
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
166
+{
167
+ //assumption: there is no partial overlap
168
+ transpose16x16(dst, src, dstride, sstride);
169
+ transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
170
+ if (dst == src)
171
+ {
172
+ uint8_t tmp16 * 16 __attribute__((aligned(64)));
173
+ transpose16x16(tmp, src + 16, 16, sstride);
174
+ transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
175
+ for (int i = 0; i < 16; i++)
176
+ {
177
+ COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
178
+ }
179
+ }
180
+ else
181
+ {
182
+ transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
183
+ transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
184
+ }
185
+
186
+}
187
+
188
+
189
+
190
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
191
+{
192
+ uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
193
+ uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
194
+
195
+ a0 = *(uint16x8_t *)(src + 0 * sstride);
196
+ a1 = *(uint16x8_t *)(src + 1 * sstride);
197
+ a2 = *(uint16x8_t *)(src + 2 * sstride);
198
+ a3 = *(uint16x8_t *)(src + 3 * sstride);
199
+ a4 = *(uint16x8_t *)(src + 4 * sstride);
200
+ a5 = *(uint16x8_t *)(src + 5 * sstride);
201
+ a6 = *(uint16x8_t *)(src + 6 * sstride);
202
+ a7 = *(uint16x8_t *)(src + 7 * sstride);
203
+
204
+ b0 = vtrn1q_u64(a0, a4);
205
+ b1 = vtrn1q_u64(a1, a5);
206
+ b2 = vtrn1q_u64(a2, a6);
207
+ b3 = vtrn1q_u64(a3, a7);
208
+ b4 = vtrn2q_u64(a0, a4);
209
+ b5 = vtrn2q_u64(a1, a5);
210
+ b6 = vtrn2q_u64(a2, a6);
211
+ b7 = vtrn2q_u64(a3, a7);
212
+
213
+ a0 = vtrn1q_u32(b0, b2);
214
+ a1 = vtrn1q_u32(b1, b3);
215
+ a2 = vtrn2q_u32(b0, b2);
216
+ a3 = vtrn2q_u32(b1, b3);
217
+ a4 = vtrn1q_u32(b4, b6);
218
+ a5 = vtrn1q_u32(b5, b7);
219
+ a6 = vtrn2q_u32(b4, b6);
220
+ a7 = vtrn2q_u32(b5, b7);
221
+
222
+ b0 = vtrn1q_u16(a0, a1);
223
+ b1 = vtrn2q_u16(a0, a1);
224
+ b2 = vtrn1q_u16(a2, a3);
225
+ b3 = vtrn2q_u16(a2, a3);
226
+ b4 = vtrn1q_u16(a4, a5);
227
+ b5 = vtrn2q_u16(a4, a5);
228
+ b6 = vtrn1q_u16(a6, a7);
229
+ b7 = vtrn2q_u16(a6, a7);
230
+
231
+ *(uint16x8_t *)(dst + 0 * dstride) = b0;
232
+ *(uint16x8_t *)(dst + 1 * dstride) = b1;
233
+ *(uint16x8_t *)(dst + 2 * dstride) = b2;
234
+ *(uint16x8_t *)(dst + 3 * dstride) = b3;
235
+ *(uint16x8_t *)(dst + 4 * dstride) = b4;
236
+ *(uint16x8_t *)(dst + 5 * dstride) = b5;
237
+ *(uint16x8_t *)(dst + 6 * dstride) = b6;
238
+ *(uint16x8_t *)(dst + 7 * dstride) = b7;
239
+}
240
+
241
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
242
+{
243
+ //assumption: there is no partial overlap
244
+ transpose8x8(dst, src, dstride, sstride);
245
+ transpose8x8(dst + 8 * dstride + 8, src + 8 * sstride + 8, dstride, sstride);
246
+
247
+ if (dst == src)
248
+ {
249
+ uint16_t tmp8 * 8;
250
+ transpose8x8(tmp, src + 8, 8, sstride);
251
+ transpose8x8(dst + 8, src + 8 * sstride, dstride, sstride);
252
+ for (int i = 0; i < 8; i++)
253
+ {
254
+ COPY_16(dst + (8 + i)*dstride, tmp + 8 * i);
255
+ }
256
+ }
257
+ else
258
+ {
259
+ transpose8x8(dst + 8 * dstride, src + 8, dstride, sstride);
260
+ transpose8x8(dst + 8, src + 8 * sstride, dstride, sstride);
261
+ }
262
+
263
+}
264
+
265
+
266
+
267
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
268
+{
269
+ //assumption: there is no partial overlap
270
+ for (int i = 0; i < 4; i++)
271
+ {
272
+ transpose8x8(dst + i * 8 * (1 + dstride), src + i * 8 * (1 + sstride), dstride, sstride);
273
+ for (int j = i + 1; j < 4; j++)
274
+ {
275
+ if (dst == src)
276
+ {
277
+ uint16_t tmp8 * 8 __attribute__((aligned(64)));
278
+ transpose8x8(tmp, src + 8 * i + 8 * j * sstride, 8, sstride);
279
+ transpose8x8(dst + 8 * i + 8 * j * dstride, src + 8 * j + 8 * i * sstride, dstride, sstride);
280
+ for (int k = 0; k < 8; k++)
281
+ {
282
+ COPY_16(dst + 8 * j + (8 * i + k)*dstride, tmp + 8 * k);
283
+ }
284
+ }
285
+ else
286
+ {
287
+ transpose8x8(dst + 8 * (j + i * dstride), src + 8 * (i + j * sstride), dstride, sstride);
288
+ transpose8x8(dst + 8 * (i + j * dstride), src + 8 * (j + i * sstride), dstride, sstride);
289
+ }
290
+
291
+ }
292
+ }
293
+}
294
+
295
+
296
+
297
+
298
+}
299
+
300
+
301
+
302
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h
Added
17
1
2
+#ifndef __ARM64_UTILS_H__
3
+#define __ARM64_UTILS_H__
4
+
5
+
6
+namespace X265_NS
7
+{
8
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
9
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
10
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
11
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
12
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
13
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
14
+}
15
+
16
+#endif
17
x265_3.5.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp
Changed
2102
1
2
*
3
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
4
* Yimeng Su <yimeng.su@huawei.com>
5
+ * Sebastian Pop <spop@amazon.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
10
* For more information, contact us at license @ x265.com.
11
*****************************************************************************/
12
13
+
14
#include "common.h"
15
#include "primitives.h"
16
#include "x265.h"
17
#include "cpu.h"
18
19
+extern "C" {
20
+#include "fun-decls.h"
21
+}
22
+
23
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
24
+ p.cuBLOCK_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
25
+ p.cuBLOCK_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
26
+ p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
27
+ p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
28
+ p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
29
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
30
+ p.cuBLOCK_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
31
+ p.cuBLOCK_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
32
+ p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
33
+ p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
34
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
35
+ p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
36
+#define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu)
37
+#define LUMA_TU_NEON(prim, fname) LUMA_TU_TYPED_NEON(prim, , fname)
38
+#define LUMA_TU_CAN_USE_SVE(prim, fname) LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
39
+
40
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
41
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
42
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
43
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
44
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
45
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
46
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
47
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
48
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
49
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
50
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
51
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
52
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
53
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
54
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
55
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
56
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
57
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu); \
58
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
59
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
60
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
61
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
62
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
63
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
64
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
65
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
66
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
67
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
68
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
69
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu)
70
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
71
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
72
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
73
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
74
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
75
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
76
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
77
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
78
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
79
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
80
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
81
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
82
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
83
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
84
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
85
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
86
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
87
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
88
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
89
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
90
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
91
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
92
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
93
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
94
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
95
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
96
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
97
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
98
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
99
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
100
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
101
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
102
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
103
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
104
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
105
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \
106
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
107
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
108
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
109
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
110
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
111
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
112
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
113
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve); \
114
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve); \
115
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
116
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve); \
117
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## sve); \
118
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
119
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
120
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
121
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
122
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
123
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
124
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
125
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
126
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
127
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
128
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
129
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
130
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \
131
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
132
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
133
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
134
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
135
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
136
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
137
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
138
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
139
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
140
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
141
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
142
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
143
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
144
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
145
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
146
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
147
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
148
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
149
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
150
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon)
151
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
152
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## sve2); \
153
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
154
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
155
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve2); \
156
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## sve2); \
157
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## sve2); \
158
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## sve2); \
159
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
160
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
161
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve2); \
162
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
163
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## sve2); \
164
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## sve2); \
165
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## sve2); \
166
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve2); \
167
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## sve2); \
168
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## sve2); \
169
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## sve2); \
170
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve2); \
171
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
172
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
173
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
174
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
175
+ p.puLUMA_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
176
+ p.puLUMA_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
177
+ p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
178
+ p.puLUMA_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
179
+ p.puLUMA_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
180
+ p.puLUMA_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
181
+ p.puLUMA_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
182
+ p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
183
+ p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
184
+ p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
185
+ p.puLUMA_16x4.prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
186
+ p.puLUMA_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
187
+ p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
188
+ p.puLUMA_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
189
+ p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
190
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
191
+ p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
192
+ p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
193
+ p.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
194
+ p.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
195
+ p.puLUMA_32x8.prim = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
196
+ p.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
197
+ p.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
198
+ p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
199
+ p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
200
+ p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
201
+#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
202
+#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
203
+#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
204
+#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
205
+#define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
206
+#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
207
+#define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
208
+#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
209
+#define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
210
+#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
211
+#define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
212
+
213
+
214
+#define ALL_LUMA_PU_T(prim, fname) \
215
+ p.puLUMA_4x4.prim = fname<LUMA_4x4>; \
216
+ p.puLUMA_8x8.prim = fname<LUMA_8x8>; \
217
+ p.puLUMA_16x16.prim = fname<LUMA_16x16>; \
218
+ p.puLUMA_32x32.prim = fname<LUMA_32x32>; \
219
+ p.puLUMA_64x64.prim = fname<LUMA_64x64>; \
220
+ p.puLUMA_8x4.prim = fname<LUMA_8x4>; \
221
+ p.puLUMA_4x8.prim = fname<LUMA_4x8>; \
222
+ p.puLUMA_16x8.prim = fname<LUMA_16x8>; \
223
+ p.puLUMA_8x16.prim = fname<LUMA_8x16>; \
224
+ p.puLUMA_16x32.prim = fname<LUMA_16x32>; \
225
+ p.puLUMA_32x16.prim = fname<LUMA_32x16>; \
226
+ p.puLUMA_64x32.prim = fname<LUMA_64x32>; \
227
+ p.puLUMA_32x64.prim = fname<LUMA_32x64>; \
228
+ p.puLUMA_16x12.prim = fname<LUMA_16x12>; \
229
+ p.puLUMA_12x16.prim = fname<LUMA_12x16>; \
230
+ p.puLUMA_16x4.prim = fname<LUMA_16x4>; \
231
+ p.puLUMA_4x16.prim = fname<LUMA_4x16>; \
232
+ p.puLUMA_32x24.prim = fname<LUMA_32x24>; \
233
+ p.puLUMA_24x32.prim = fname<LUMA_24x32>; \
234
+ p.puLUMA_32x8.prim = fname<LUMA_32x8>; \
235
+ p.puLUMA_8x32.prim = fname<LUMA_8x32>; \
236
+ p.puLUMA_64x48.prim = fname<LUMA_64x48>; \
237
+ p.puLUMA_48x64.prim = fname<LUMA_48x64>; \
238
+ p.puLUMA_64x16.prim = fname<LUMA_64x16>; \
239
+ p.puLUMA_16x64.prim = fname<LUMA_16x64>
240
+
241
+#define ALL_CHROMA_420_PU_TYPED(prim, fncdef, fname, cpu) \
242
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
243
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
244
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
245
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
246
+ p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## cpu); \
247
+ p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim = fncdef PFX(fname ## _2x4_ ## cpu); \
248
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
249
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
250
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
251
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
252
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
253
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
254
+ p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(fname ## _8x6_ ## cpu); \
255
+ p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim = fncdef PFX(fname ## _6x8_ ## cpu); \
256
+ p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(fname ## _8x2_ ## cpu); \
257
+ p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim = fncdef PFX(fname ## _2x8_ ## cpu); \
258
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
259
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
260
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
261
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu); \
262
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
263
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
264
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
265
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu)
266
+#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname) \
267
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
268
+ p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## neon); \
269
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
270
+ p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim = fncdef PFX(fname ## _6x8_ ## neon); \
271
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
272
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
273
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## neon); \
274
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
275
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## neon); \
276
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
277
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
278
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
279
+ p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim = fncdef PFX(fname ## _2x4_ ## neon); \
280
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
281
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
282
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
283
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
284
+ p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(fname ## _8x6_ ## neon); \
285
+ p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(fname ## _8x2_ ## neon); \
286
+ p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim = fncdef PFX(fname ## _2x8_ ## neon); \
287
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
288
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon)
289
+#define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
290
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
291
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
292
+#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname) \
293
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
294
+ p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## neon); \
295
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
296
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon)
297
+#define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu) \
298
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
299
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
300
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
301
+ p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim = fncdef PFX(fname ## _2x4_ ## cpu); \
302
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
303
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
304
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
305
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
306
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
307
+ p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(fname ## _8x6_ ## cpu); \
308
+ p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim = fncdef PFX(fname ## _6x8_ ## cpu); \
309
+ p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(fname ## _8x2_ ## cpu); \
310
+ p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim = fncdef PFX(fname ## _2x8_ ## cpu); \
311
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
312
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
313
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
314
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
315
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
316
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
317
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu)
318
+#define CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, fncdef) \
319
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
320
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
321
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
322
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
323
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
324
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
325
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
326
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
327
+ p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(filterPixelToShort ## _8x6_ ## neon); \
328
+ p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(filterPixelToShort ## _8x2_ ## neon); \
329
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
330
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
331
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
332
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
333
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
334
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon)
335
+#define CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
336
+ p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim = fncdef PFX(filterPixelToShort ## _2x4_ ## sve); \
337
+ p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
338
+ p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim = fncdef PFX(filterPixelToShort ## _6x8_ ## sve); \
339
+ p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(filterPixelToShort ## _4x2_ ## sve); \
340
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
341
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
342
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
343
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(filterPixelToShort ## _32x8_ ## sve)
344
+#define ALL_CHROMA_420_PU(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
345
+#define CHROMA_420_PU_NEON_1(prim, fname) CHROMA_420_PU_TYPED_NEON_1(prim, , fname)
346
+#define CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
347
+#define CHROMA_420_PU_NEON_2(prim, fname) CHROMA_420_PU_TYPED_NEON_2(prim, , fname)
348
+#define CHROMA_420_PU_MULTIPLE_ARCHS(prim, fname, cpu) CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, , fname, cpu)
349
+#define CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(prim) CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, )
350
+#define CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
351
+
352
+
353
+#define ALL_CHROMA_420_4x4_PU_TYPED(prim, fncdef, fname, cpu) \
354
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
355
+ p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(fname ## _8x2_ ## cpu); \
356
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
357
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
358
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
359
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
360
+ p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(fname ## _8x6_ ## cpu); \
361
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
362
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
363
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
364
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
365
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
366
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
367
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
368
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
369
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu); \
370
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
371
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
372
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
373
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu)
374
+#define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
375
+
376
+#define ALL_CHROMA_422_PU_TYPED(prim, fncdef, fname, cpu) \
377
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
378
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
379
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
380
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
381
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
382
+ p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim = fncdef PFX(fname ## _2x8_ ## cpu); \
383
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
384
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu); \
385
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
386
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
387
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
388
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu); \
389
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim = fncdef PFX(fname ## _8x12_ ## cpu); \
390
+ p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim = fncdef PFX(fname ## _6x16_ ## cpu); \
391
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
392
+ p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim = fncdef PFX(fname ## _2x16_ ## cpu); \
393
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(fname ## _16x24_ ## cpu); \
394
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(fname ## _12x32_ ## cpu); \
395
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
396
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim = fncdef PFX(fname ## _4x32_ ## cpu); \
397
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(fname ## _32x48_ ## cpu); \
398
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(fname ## _24x64_ ## cpu); \
399
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
400
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim = fncdef PFX(fname ## _8x64_ ## cpu)
401
+#define CHROMA_422_PU_TYPED_NEON_1(prim, fncdef, fname) \
402
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
403
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
404
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
405
+ p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim = fncdef PFX(fname ## _6x16_ ## neon); \
406
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(fname ## _12x32_ ## neon); \
407
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim = fncdef PFX(fname ## _4x32_ ## neon); \
408
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
409
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
410
+ p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim = fncdef PFX(fname ## _2x8_ ## neon); \
411
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
412
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
413
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
414
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon); \
415
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim = fncdef PFX(fname ## _8x12_ ## neon); \
416
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
417
+ p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim = fncdef PFX(fname ## _2x16_ ## neon); \
418
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(fname ## _16x24_ ## neon); \
419
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
420
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(fname ## _24x64_ ## neon); \
421
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim = fncdef PFX(fname ## _8x64_ ## neon)
422
+#define CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
423
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
424
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
425
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(fname ## _32x48_ ## sve); \
426
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
427
+#define CHROMA_422_PU_TYPED_NEON_2(prim, fncdef, fname) \
428
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
429
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
430
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
431
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim = fncdef PFX(fname ## _4x32_ ## neon)
432
+#define CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
433
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim = fncdef PFX(fname ## _8x16_ ## sve2); \
434
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
435
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
436
+ p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim = fncdef PFX(fname ## _2x8_ ## sve2); \
437
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim = fncdef PFX(fname ## _8x8_ ## sve2); \
438
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
439
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim = fncdef PFX(fname ## _8x32_ ## sve2); \
440
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
441
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2); \
442
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim = fncdef PFX(fname ## _8x12_ ## sve2); \
443
+ p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim = fncdef PFX(fname ## _6x16_ ## sve2); \
444
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim = fncdef PFX(fname ## _8x4_ ## sve2); \
445
+ p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim = fncdef PFX(fname ## _2x16_ ## sve2); \
446
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(fname ## _16x24_ ## sve2); \
447
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(fname ## _12x32_ ## sve2); \
448
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim = fncdef PFX(fname ## _16x8_ ## sve2); \
449
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(fname ## _32x48_ ## sve2); \
450
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(fname ## _24x64_ ## sve2); \
451
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
452
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim = fncdef PFX(fname ## _8x64_ ## sve2)
453
+#define CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
454
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
455
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
456
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
457
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
458
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
459
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
460
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
461
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
462
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon); \
463
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.prim = fncdef PFX(filterPixelToShort ## _8x12_ ## neon); \
464
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
465
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.prim = fncdef PFX(filterPixelToShort ## _16x24_ ## neon); \
466
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.prim = fncdef PFX(filterPixelToShort ## _12x32_ ## neon); \
467
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
468
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.prim = fncdef PFX(filterPixelToShort ## _4x32_ ## neon); \
469
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.prim = fncdef PFX(filterPixelToShort ## _24x64_ ## neon); \
470
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.prim = fncdef PFX(filterPixelToShort ## _8x64_ ## neon)
471
+#define CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
472
+ p.chromaX265_CSP_I422.puCHROMA_422_2x8.prim = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
473
+ p.chromaX265_CSP_I422.puCHROMA_422_2x16.prim = fncdef PFX(filterPixelToShort ## _2x16_ ## sve); \
474
+ p.chromaX265_CSP_I422.puCHROMA_422_6x16.prim = fncdef PFX(filterPixelToShort ## _6x16_ ## sve); \
475
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
476
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
477
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.prim = fncdef PFX(filterPixelToShort ## _32x48_ ## sve); \
478
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve)
479
+#define ALL_CHROMA_422_PU(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
480
+#define CHROMA_422_PU_NEON_1(prim, fname) CHROMA_422_PU_TYPED_NEON_1(prim, , fname)
481
+#define CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
482
+#define CHROMA_422_PU_NEON_2(prim, fname) CHROMA_422_PU_TYPED_NEON_2(prim, , fname)
483
+#define CHROMA_422_PU_CAN_USE_SVE2(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, , fname)
484
+#define CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
485
+#define CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
486
+
487
+#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
488
+ p.chromaX265_CSP_I444.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
489
+ p.chromaX265_CSP_I444.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
490
+ p.chromaX265_CSP_I444.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
491
+ p.chromaX265_CSP_I444.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
492
+ p.chromaX265_CSP_I444.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
493
+ p.chromaX265_CSP_I444.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
494
+ p.chromaX265_CSP_I444.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
495
+ p.chromaX265_CSP_I444.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
496
+ p.chromaX265_CSP_I444.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
497
+ p.chromaX265_CSP_I444.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
498
+ p.chromaX265_CSP_I444.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
499
+ p.chromaX265_CSP_I444.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
500
+ p.chromaX265_CSP_I444.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
501
+ p.chromaX265_CSP_I444.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
502
+ p.chromaX265_CSP_I444.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
503
+ p.chromaX265_CSP_I444.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
504
+ p.chromaX265_CSP_I444.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu); \
505
+ p.chromaX265_CSP_I444.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
506
+ p.chromaX265_CSP_I444.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
507
+ p.chromaX265_CSP_I444.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
508
+ p.chromaX265_CSP_I444.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
509
+ p.chromaX265_CSP_I444.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
510
+ p.chromaX265_CSP_I444.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
511
+ p.chromaX265_CSP_I444.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
512
+ p.chromaX265_CSP_I444.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
513
+#define CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
514
+ p.chromaX265_CSP_I444.puLUMA_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
515
+ p.chromaX265_CSP_I444.puLUMA_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
516
+ p.chromaX265_CSP_I444.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
517
+ p.chromaX265_CSP_I444.puLUMA_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
518
+ p.chromaX265_CSP_I444.puLUMA_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
519
+ p.chromaX265_CSP_I444.puLUMA_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
520
+ p.chromaX265_CSP_I444.puLUMA_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
521
+ p.chromaX265_CSP_I444.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
522
+ p.chromaX265_CSP_I444.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
523
+ p.chromaX265_CSP_I444.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
524
+ p.chromaX265_CSP_I444.puLUMA_16x4.prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
525
+ p.chromaX265_CSP_I444.puLUMA_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
526
+ p.chromaX265_CSP_I444.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
527
+ p.chromaX265_CSP_I444.puLUMA_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
528
+ p.chromaX265_CSP_I444.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
529
+#define CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
530
+ p.chromaX265_CSP_I444.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
531
+ p.chromaX265_CSP_I444.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
532
+ p.chromaX265_CSP_I444.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
533
+ p.chromaX265_CSP_I444.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
534
+ p.chromaX265_CSP_I444.puLUMA_32x8.prim = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
535
+ p.chromaX265_CSP_I444.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
536
+ p.chromaX265_CSP_I444.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
537
+ p.chromaX265_CSP_I444.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
538
+ p.chromaX265_CSP_I444.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
539
+ p.chromaX265_CSP_I444.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
540
+#define ALL_CHROMA_444_PU(prim, fname, cpu) ALL_CHROMA_444_PU_TYPED(prim, , fname, cpu)
541
+#define CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
542
+#define CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
543
+
544
+#define ALL_CHROMA_420_VERT_FILTERS(cpu) \
545
+ ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
546
+ ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, cpu); \
547
+ ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
548
+ ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, cpu)
549
+
550
+#define CHROMA_420_VERT_FILTERS_NEON() \
551
+ ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, neon)
552
+
553
+#define CHROMA_420_VERT_FILTERS_CAN_USE_SVE2() \
554
+ ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
555
+ ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, sve2); \
556
+ ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, sve2)
557
+
558
+#define SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(W, H) \
559
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vsp = PFX(interp_4tap_vert_sp_ ## W ## x ## H ## _ ## neon)
560
+
561
+#define SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(W, H, cpu) \
562
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vpp = PFX(interp_4tap_vert_pp_ ## W ## x ## H ## _ ## cpu); \
563
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vps = PFX(interp_4tap_vert_ps_ ## W ## x ## H ## _ ## cpu); \
564
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vss = PFX(interp_4tap_vert_ss_ ## W ## x ## H ## _ ## cpu)
565
+
566
+#define CHROMA_422_VERT_FILTERS_NEON() \
567
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 8); \
568
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 16); \
569
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 8); \
570
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 16); \
571
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 12); \
572
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 4); \
573
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 32); \
574
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 16); \
575
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 32); \
576
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 24); \
577
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(12, 32); \
578
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 8); \
579
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(4, 32); \
580
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 64); \
581
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 32); \
582
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(16, 64); \
583
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 48); \
584
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(24, 64); \
585
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(32, 16); \
586
+ SETUP_CHROMA_422_VERT_FUNC_DEF_NEON(8, 64)
587
+
588
+#define CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(cpu) \
589
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 8, cpu); \
590
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 16, cpu); \
591
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 8, cpu); \
592
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 16, cpu); \
593
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 12, cpu); \
594
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 4, cpu); \
595
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 32, cpu); \
596
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 16, cpu); \
597
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 32, cpu); \
598
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 24, cpu); \
599
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(12, 32, cpu); \
600
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 8, cpu); \
601
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(4, 32, cpu); \
602
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 64, cpu); \
603
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 32, cpu); \
604
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(16, 64, cpu); \
605
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 48, cpu); \
606
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(24, 64, cpu); \
607
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(32, 16, cpu); \
608
+ SETUP_CHROMA_422_VERT_FUNC_DEF_CAN_USE_SVE2(8, 64, cpu)
609
+
610
+#define ALL_CHROMA_444_VERT_FILTERS(cpu) \
611
+ ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
612
+ ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu); \
613
+ ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
614
+ ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, cpu)
615
+
616
+#define CHROMA_444_VERT_FILTERS_NEON() \
617
+ ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, neon)
618
+
619
+#define CHROMA_444_VERT_FILTERS_CAN_USE_SVE2() \
620
+ ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
621
+ ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sve2); \
622
+ ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, sve2)
623
+
624
+#define ALL_CHROMA_420_FILTERS(cpu) \
625
+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
626
+ ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
627
+ ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
628
+ ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, cpu)
629
+
630
+#define CHROMA_420_FILTERS_NEON() \
631
+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
632
+ ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, neon)
633
+
634
+#define CHROMA_420_FILTERS_CAN_USE_SVE2() \
635
+ ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
636
+ ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, sve2)
637
+
638
+#define ALL_CHROMA_422_FILTERS(cpu) \
639
+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
640
+ ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
641
+ ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
642
+ ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, cpu)
643
+
644
+#define CHROMA_422_FILTERS_NEON() \
645
+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
646
+ ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, neon)
647
+
648
+#define CHROMA_422_FILTERS_CAN_USE_SVE2() \
649
+ ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
650
+ ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, sve2)
651
+
652
+#define ALL_CHROMA_444_FILTERS(cpu) \
653
+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
654
+ ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
655
+ ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
656
+ ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu)
657
+
658
+#define CHROMA_444_FILTERS_NEON() \
659
+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, neon); \
660
+ ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, neon)
661
+
662
+#define CHROMA_444_FILTERS_CAN_USE_SVE2() \
663
+ ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
664
+ ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sve2)
665
+
666
667
#if defined(__GNUC__)
668
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
669
670
#define GCC_4_9_0 40900
671
#define GCC_5_1_0 50100
672
673
-extern "C" {
674
-#include "pixel.h"
675
-#include "pixel-util.h"
676
-#include "ipfilter8.h"
677
-}
678
+#include "pixel-prim.h"
679
+#include "filter-prim.h"
680
+#include "dct-prim.h"
681
+#include "loopfilter-prim.h"
682
+#include "intrapred-prim.h"
683
684
-namespace X265_NS {
685
+namespace X265_NS
686
+{
687
// private x265 namespace
688
689
690
template<int size>
691
-void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
692
+void interp_8tap_hv_pp_cpu(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
693
{
694
ALIGN_VAR_32(int16_t, immedMAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1));
695
const int halfFilterSize = NTAPS_LUMA >> 1;
696
697
primitives.pusize.luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
698
}
699
700
-
701
-/* Temporary workaround because luma_vsp assembly primitive has not been completed
702
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
703
- * Otherwise, segment fault occurs. */
704
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)
705
+void setupNeonPrimitives(EncoderPrimitives &p)
706
{
707
- if (cpuMask & X265_CPU_NEON)
708
- {
709
- asmp.puLUMA_8x4.luma_vsp = cp.puLUMA_8x4.luma_vsp;
710
- asmp.puLUMA_8x8.luma_vsp = cp.puLUMA_8x8.luma_vsp;
711
- asmp.puLUMA_8x16.luma_vsp = cp.puLUMA_8x16.luma_vsp;
712
- asmp.puLUMA_8x32.luma_vsp = cp.puLUMA_8x32.luma_vsp;
713
- asmp.puLUMA_12x16.luma_vsp = cp.puLUMA_12x16.luma_vsp;
714
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
715
- asmp.puLUMA_16x4.luma_vsp = cp.puLUMA_16x4.luma_vsp;
716
- asmp.puLUMA_16x8.luma_vsp = cp.puLUMA_16x8.luma_vsp;
717
- asmp.puLUMA_16x12.luma_vsp = cp.puLUMA_16x12.luma_vsp;
718
- asmp.puLUMA_16x16.luma_vsp = cp.puLUMA_16x16.luma_vsp;
719
- asmp.puLUMA_16x32.luma_vsp = cp.puLUMA_16x32.luma_vsp;
720
- asmp.puLUMA_16x64.luma_vsp = cp.puLUMA_16x64.luma_vsp;
721
- asmp.puLUMA_32x16.luma_vsp = cp.puLUMA_32x16.luma_vsp;
722
- asmp.puLUMA_32x24.luma_vsp = cp.puLUMA_32x24.luma_vsp;
723
- asmp.puLUMA_32x32.luma_vsp = cp.puLUMA_32x32.luma_vsp;
724
- asmp.puLUMA_32x64.luma_vsp = cp.puLUMA_32x64.luma_vsp;
725
- asmp.puLUMA_48x64.luma_vsp = cp.puLUMA_48x64.luma_vsp;
726
- asmp.puLUMA_64x16.luma_vsp = cp.puLUMA_64x16.luma_vsp;
727
- asmp.puLUMA_64x32.luma_vsp = cp.puLUMA_64x32.luma_vsp;
728
- asmp.puLUMA_64x48.luma_vsp = cp.puLUMA_64x48.luma_vsp;
729
- asmp.puLUMA_64x64.luma_vsp = cp.puLUMA_64x64.luma_vsp;
730
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
731
- asmp.puLUMA_4x4.luma_vsp = cp.puLUMA_4x4.luma_vsp;
732
- asmp.puLUMA_4x8.luma_vsp = cp.puLUMA_4x8.luma_vsp;
733
- asmp.puLUMA_4x16.luma_vsp = cp.puLUMA_4x16.luma_vsp;
734
- asmp.puLUMA_24x32.luma_vsp = cp.puLUMA_24x32.luma_vsp;
735
- asmp.puLUMA_32x8.luma_vsp = cp.puLUMA_32x8.luma_vsp;
736
+ setupPixelPrimitives_neon(p);
737
+ setupFilterPrimitives_neon(p);
738
+ setupDCTPrimitives_neon(p);
739
+ setupLoopFilterPrimitives_neon(p);
740
+ setupIntraPrimitives_neon(p);
741
+
742
+ ALL_CHROMA_420_PU(p2sNONALIGNED, filterPixelToShort, neon);
743
+ ALL_CHROMA_422_PU(p2sALIGNED, filterPixelToShort, neon);
744
+ ALL_CHROMA_444_PU(p2sALIGNED, filterPixelToShort, neon);
745
+ ALL_LUMA_PU(convert_p2sALIGNED, filterPixelToShort, neon);
746
+ ALL_CHROMA_420_PU(p2sALIGNED, filterPixelToShort, neon);
747
+ ALL_CHROMA_422_PU(p2sNONALIGNED, filterPixelToShort, neon);
748
+ ALL_CHROMA_444_PU(p2sNONALIGNED, filterPixelToShort, neon);
749
+ ALL_LUMA_PU(convert_p2sNONALIGNED, filterPixelToShort, neon);
750
+
751
+#if !HIGH_BIT_DEPTH
752
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
753
+ ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
754
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
755
+ ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
756
+ ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
757
+ ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
758
+ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
759
+ ALL_CHROMA_420_VERT_FILTERS(neon);
760
+ CHROMA_422_VERT_FILTERS_NEON();
761
+ CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
762
+ ALL_CHROMA_444_VERT_FILTERS(neon);
763
+ ALL_CHROMA_420_FILTERS(neon);
764
+ ALL_CHROMA_422_FILTERS(neon);
765
+ ALL_CHROMA_444_FILTERS(neon);
766
+
767
+ // Blockcopy_pp
768
+ ALL_LUMA_PU(copy_pp, blockcopy_pp, neon);
769
+ ALL_CHROMA_420_PU(copy_pp, blockcopy_pp, neon);
770
+ ALL_CHROMA_422_PU(copy_pp, blockcopy_pp, neon);
771
+ p.cuBLOCK_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
772
+ p.cuBLOCK_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
773
+ p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
774
+ p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_neon);
775
+ p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_neon);
776
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
777
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
778
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
779
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_neon);
780
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
781
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
782
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
783
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_neon);
784
+
785
+#endif // !HIGH_BIT_DEPTH
786
+
787
+ // Blockcopy_ss
788
+ p.cuBLOCK_4x4.copy_ss = PFX(blockcopy_ss_4x4_neon);
789
+ p.cuBLOCK_8x8.copy_ss = PFX(blockcopy_ss_8x8_neon);
790
+ p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_neon);
791
+ p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_neon);
792
+ p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_neon);
793
+
794
+ // Blockcopy_ps
795
+ p.cuBLOCK_4x4.copy_ps = PFX(blockcopy_ps_4x4_neon);
796
+ p.cuBLOCK_8x8.copy_ps = PFX(blockcopy_ps_8x8_neon);
797
+ p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_neon);
798
+ p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_neon);
799
+ p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_neon);
800
+
801
+ // Blockcopy_sp
802
+ p.cuBLOCK_4x4.copy_sp = PFX(blockcopy_sp_4x4_neon);
803
+ p.cuBLOCK_8x8.copy_sp = PFX(blockcopy_sp_8x8_neon);
804
+ p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_neon);
805
+ p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_neon);
806
+ p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
807
+
808
+ // chroma blockcopy_ss
809
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss = PFX(blockcopy_ss_4x4_neon);
810
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss = PFX(blockcopy_ss_8x8_neon);
811
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_neon);
812
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_neon);
813
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss = PFX(blockcopy_ss_4x8_neon);
814
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss = PFX(blockcopy_ss_8x16_neon);
815
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_neon);
816
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_neon);
817
+
818
+ // chroma blockcopy_ps
819
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps = PFX(blockcopy_ps_4x4_neon);
820
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps = PFX(blockcopy_ps_8x8_neon);
821
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_neon);
822
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_neon);
823
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps = PFX(blockcopy_ps_4x8_neon);
824
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps = PFX(blockcopy_ps_8x16_neon);
825
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_neon);
826
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_neon);
827
+
828
+ // chroma blockcopy_sp
829
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp = PFX(blockcopy_sp_4x4_neon);
830
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp = PFX(blockcopy_sp_8x8_neon);
831
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_neon);
832
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_neon);
833
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp = PFX(blockcopy_sp_4x8_neon);
834
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp = PFX(blockcopy_sp_8x16_neon);
835
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_neon);
836
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_neon);
837
+
838
+ // Block_fill
839
+ ALL_LUMA_TU(blockfill_sALIGNED, blockfill_s, neon);
840
+ ALL_LUMA_TU(blockfill_sNONALIGNED, blockfill_s, neon);
841
+
842
+ // copy_count
843
+ p.cuBLOCK_4x4.copy_cnt = PFX(copy_cnt_4_neon);
844
+ p.cuBLOCK_8x8.copy_cnt = PFX(copy_cnt_8_neon);
845
+ p.cuBLOCK_16x16.copy_cnt = PFX(copy_cnt_16_neon);
846
+ p.cuBLOCK_32x32.copy_cnt = PFX(copy_cnt_32_neon);
847
+
848
+ // count nonzero
849
+ p.cuBLOCK_4x4.count_nonzero = PFX(count_nonzero_4_neon);
850
+ p.cuBLOCK_8x8.count_nonzero = PFX(count_nonzero_8_neon);
851
+ p.cuBLOCK_16x16.count_nonzero = PFX(count_nonzero_16_neon);
852
+ p.cuBLOCK_32x32.count_nonzero = PFX(count_nonzero_32_neon);
853
+
854
+ // cpy2Dto1D_shl
855
+ p.cuBLOCK_4x4.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
856
+ p.cuBLOCK_8x8.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
857
+ p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon);
858
+ p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon);
859
+ p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon);
860
+
861
+ // cpy2Dto1D_shr
862
+ p.cuBLOCK_4x4.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
863
+ p.cuBLOCK_8x8.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
864
+ p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
865
+ p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
866
+
867
+ // cpy1Dto2D_shl
868
+ p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_4x4_neon);
869
+ p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_8x8_neon);
870
+ p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_16x16_neon);
871
+ p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_32x32_neon);
872
+ p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_64x64_neon);
873
+
874
+ p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_4x4_neon);
875
+ p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_8x8_neon);
876
+ p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_neon);
877
+ p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_neon);
878
+ p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_neon);
879
+
880
+ // cpy1Dto2D_shr
881
+ p.cuBLOCK_4x4.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
882
+ p.cuBLOCK_8x8.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
883
+ p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon);
884
+ p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
885
+ p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
886
+
887
+#if !HIGH_BIT_DEPTH
888
+ // pixel_avg_pp
889
+ ALL_LUMA_PU(pixelavg_ppNONALIGNED, pixel_avg_pp, neon);
890
+ ALL_LUMA_PU(pixelavg_ppALIGNED, pixel_avg_pp, neon);
891
+
892
+ // addAvg
893
+ ALL_LUMA_PU(addAvgNONALIGNED, addAvg, neon);
894
+ ALL_LUMA_PU(addAvgALIGNED, addAvg, neon);
895
+ ALL_CHROMA_420_PU(addAvgNONALIGNED, addAvg, neon);
896
+ ALL_CHROMA_422_PU(addAvgNONALIGNED, addAvg, neon);
897
+ ALL_CHROMA_420_PU(addAvgALIGNED, addAvg, neon);
898
+ ALL_CHROMA_422_PU(addAvgALIGNED, addAvg, neon);
899
+
900
+ // sad
901
+ ALL_LUMA_PU(sad, pixel_sad, neon);
902
+ ALL_LUMA_PU(sad_x3, sad_x3, neon);
903
+ ALL_LUMA_PU(sad_x4, sad_x4, neon);
904
+
905
+ // sse_pp
906
+ p.cuBLOCK_4x4.sse_pp = PFX(pixel_sse_pp_4x4_neon);
907
+ p.cuBLOCK_8x8.sse_pp = PFX(pixel_sse_pp_8x8_neon);
908
+ p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
909
+ p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
910
+ p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_neon);
911
+
912
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp = PFX(pixel_sse_pp_4x4_neon);
913
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp = PFX(pixel_sse_pp_8x8_neon);
914
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
915
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
916
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp = PFX(pixel_sse_pp_4x8_neon);
917
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp = PFX(pixel_sse_pp_8x16_neon);
918
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
919
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_neon);
920
+
921
+ // sse_ss
922
+ p.cuBLOCK_4x4.sse_ss = PFX(pixel_sse_ss_4x4_neon);
923
+ p.cuBLOCK_8x8.sse_ss = PFX(pixel_sse_ss_8x8_neon);
924
+ p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_neon);
925
+ p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_neon);
926
+ p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_neon);
927
+
928
+ // ssd_s
929
+ p.cuBLOCK_4x4.ssd_sNONALIGNED = PFX(pixel_ssd_s_4x4_neon);
930
+ p.cuBLOCK_8x8.ssd_sNONALIGNED = PFX(pixel_ssd_s_8x8_neon);
931
+ p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_neon);
932
+ p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_neon);
933
+
934
+ p.cuBLOCK_4x4.ssd_sALIGNED = PFX(pixel_ssd_s_4x4_neon);
935
+ p.cuBLOCK_8x8.ssd_sALIGNED = PFX(pixel_ssd_s_8x8_neon);
936
+ p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_neon);
937
+ p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_neon);
938
+
939
+ // pixel_var
940
+ p.cuBLOCK_8x8.var = PFX(pixel_var_8x8_neon);
941
+ p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_neon);
942
+ p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_neon);
943
+ p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_neon);
944
+
945
+ // calc_Residual
946
+ p.cuBLOCK_4x4.calcresidualNONALIGNED = PFX(getResidual4_neon);
947
+ p.cuBLOCK_8x8.calcresidualNONALIGNED = PFX(getResidual8_neon);
948
+ p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_neon);
949
+ p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_neon);
950
+
951
+ p.cuBLOCK_4x4.calcresidualALIGNED = PFX(getResidual4_neon);
952
+ p.cuBLOCK_8x8.calcresidualALIGNED = PFX(getResidual8_neon);
953
+ p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_neon);
954
+ p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_neon);
955
+
956
+ // pixel_sub_ps
957
+ p.cuBLOCK_4x4.sub_ps = PFX(pixel_sub_ps_4x4_neon);
958
+ p.cuBLOCK_8x8.sub_ps = PFX(pixel_sub_ps_8x8_neon);
959
+ p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
960
+ p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
961
+ p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_neon);
962
+
963
+ // chroma sub_ps
964
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps = PFX(pixel_sub_ps_4x4_neon);
965
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps = PFX(pixel_sub_ps_8x8_neon);
966
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
967
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
968
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps = PFX(pixel_sub_ps_4x8_neon);
969
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps = PFX(pixel_sub_ps_8x16_neon);
970
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
971
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_neon);
972
+
973
+ // pixel_add_ps
974
+ p.cuBLOCK_4x4.add_psNONALIGNED = PFX(pixel_add_ps_4x4_neon);
975
+ p.cuBLOCK_8x8.add_psNONALIGNED = PFX(pixel_add_ps_8x8_neon);
976
+ p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
977
+ p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
978
+ p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_neon);
979
+
980
+ p.cuBLOCK_4x4.add_psALIGNED = PFX(pixel_add_ps_4x4_neon);
981
+ p.cuBLOCK_8x8.add_psALIGNED = PFX(pixel_add_ps_8x8_neon);
982
+ p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
983
+ p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
984
+ p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_neon);
985
+
986
+ // chroma add_ps
987
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED = PFX(pixel_add_ps_4x4_neon);
988
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED = PFX(pixel_add_ps_8x8_neon);
989
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
990
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
991
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED = PFX(pixel_add_ps_4x8_neon);
992
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED = PFX(pixel_add_ps_8x16_neon);
993
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_neon);
994
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_neon);
995
+
996
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED = PFX(pixel_add_ps_4x4_neon);
997
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED = PFX(pixel_add_ps_8x8_neon);
998
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
999
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
1000
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED = PFX(pixel_add_ps_4x8_neon);
1001
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED = PFX(pixel_add_ps_8x16_neon);
1002
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_neon);
1003
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_neon);
1004
+
1005
+ //scale2D_64to32
1006
+ p.scale2D_64to32 = PFX(scale2D_64to32_neon);
1007
+
1008
+ // scale1D_128to64
1009
+ p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_neon);
1010
+ p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_neon);
1011
+
1012
+ // planecopy
1013
+ p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
1014
+
1015
+ // satd
1016
+ ALL_LUMA_PU(satd, pixel_satd, neon);
1017
+
1018
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd = PFX(pixel_satd_4x4_neon);
1019
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd = PFX(pixel_satd_8x8_neon);
1020
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
1021
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
1022
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd = PFX(pixel_satd_8x4_neon);
1023
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd = PFX(pixel_satd_4x8_neon);
1024
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd = PFX(pixel_satd_16x8_neon);
1025
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd = PFX(pixel_satd_8x16_neon);
1026
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
1027
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
1028
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
1029
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
1030
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd = PFX(pixel_satd_16x4_neon);
1031
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd = PFX(pixel_satd_4x16_neon);
1032
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
1033
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
1034
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd = PFX(pixel_satd_32x8_neon);
1035
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd = PFX(pixel_satd_8x32_neon);
1036
+
1037
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd = PFX(pixel_satd_4x8_neon);
1038
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd = PFX(pixel_satd_8x16_neon);
1039
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
1040
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
1041
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd = PFX(pixel_satd_4x4_neon);
1042
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd = PFX(pixel_satd_8x8_neon);
1043
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd = PFX(pixel_satd_4x16_neon);
1044
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
1045
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd = PFX(pixel_satd_8x32_neon);
1046
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
1047
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
1048
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd = PFX(pixel_satd_8x12_neon);
1049
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd = PFX(pixel_satd_8x4_neon);
1050
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
1051
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
1052
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd = PFX(pixel_satd_16x8_neon);
1053
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd = PFX(pixel_satd_4x32_neon);
1054
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
1055
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
1056
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
1057
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd = PFX(pixel_satd_8x64_neon);
1058
+
1059
+ // sa8d
1060
+ p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_neon);
1061
+ p.cuBLOCK_8x8.sa8d = PFX(pixel_sa8d_8x8_neon);
1062
+ p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1063
+ p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1064
+ p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1065
+ p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_neon);
1066
+ p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1067
+ p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1068
+ p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1069
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
1070
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
1071
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
1072
+
1073
+ // dequant_scaling
1074
+ p.dequant_scaling = PFX(dequant_scaling_neon);
1075
+ p.dequant_normal = PFX(dequant_normal_neon);
1076
+
1077
+ // ssim_4x4x2_core
1078
+ p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
1079
+
1080
+ // ssimDist
1081
+ p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_neon);
1082
+ p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_neon);
1083
+ p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_neon);
1084
+ p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_neon);
1085
+ p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_neon);
1086
+
1087
+ // normFact
1088
+ p.cuBLOCK_8x8.normFact = PFX(normFact8_neon);
1089
+ p.cuBLOCK_16x16.normFact = PFX(normFact16_neon);
1090
+ p.cuBLOCK_32x32.normFact = PFX(normFact32_neon);
1091
+ p.cuBLOCK_64x64.normFact = PFX(normFact64_neon);
1092
+
1093
+ // psy_cost_pp
1094
+ p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1095
+
1096
+ p.weight_pp = PFX(weight_pp_neon);
1097
+#if !defined(__APPLE__)
1098
+ p.scanPosLast = PFX(scanPosLast_neon);
1099
#endif
1100
+ p.costCoeffNxN = PFX(costCoeffNxN_neon);
1101
#endif
1102
- }
1103
-}
1104
1105
+ // quant
1106
+ p.quant = PFX(quant_neon);
1107
+ p.nquant = PFX(nquant_neon);
1108
+}
1109
1110
-void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
1111
+#if defined(HAVE_SVE2) || defined(HAVE_SVE)
1112
+void setupSvePrimitives(EncoderPrimitives &p)
1113
{
1114
- if (cpuMask & X265_CPU_NEON)
1115
- {
1116
- p.puLUMA_4x4.satd = PFX(pixel_satd_4x4_neon);
1117
- p.puLUMA_4x8.satd = PFX(pixel_satd_4x8_neon);
1118
- p.puLUMA_4x16.satd = PFX(pixel_satd_4x16_neon);
1119
- p.puLUMA_8x4.satd = PFX(pixel_satd_8x4_neon);
1120
- p.puLUMA_8x8.satd = PFX(pixel_satd_8x8_neon);
1121
- p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1122
-
1123
- p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd = PFX(pixel_satd_4x4_neon);
1124
- p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd = PFX(pixel_satd_4x8_neon);
1125
- p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd = PFX(pixel_satd_4x16_neon);
1126
- p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd = PFX(pixel_satd_8x4_neon);
1127
- p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd = PFX(pixel_satd_8x8_neon);
1128
- p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
1129
-
1130
- p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd = PFX(pixel_satd_4x4_neon);
1131
- p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd = PFX(pixel_satd_4x8_neon);
1132
- p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd = PFX(pixel_satd_4x16_neon);
1133
- p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd = PFX(pixel_satd_4x32_neon);
1134
- p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd = PFX(pixel_satd_8x4_neon);
1135
- p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd = PFX(pixel_satd_8x8_neon);
1136
- p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
1137
-
1138
- p.puLUMA_4x4.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_4x4_neon);
1139
- p.puLUMA_4x8.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_4x8_neon);
1140
- p.puLUMA_4x16.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_4x16_neon);
1141
- p.puLUMA_8x4.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_8x4_neon);
1142
- p.puLUMA_8x8.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_8x8_neon);
1143
- p.puLUMA_8x16.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_8x16_neon);
1144
- p.puLUMA_8x32.pixelavg_ppNONALIGNED = PFX(pixel_avg_pp_8x32_neon);
1145
-
1146
- p.puLUMA_4x4.pixelavg_ppALIGNED = PFX(pixel_avg_pp_4x4_neon);
1147
- p.puLUMA_4x8.pixelavg_ppALIGNED = PFX(pixel_avg_pp_4x8_neon);
1148
- p.puLUMA_4x16.pixelavg_ppALIGNED = PFX(pixel_avg_pp_4x16_neon);
1149
- p.puLUMA_8x4.pixelavg_ppALIGNED = PFX(pixel_avg_pp_8x4_neon);
1150
- p.puLUMA_8x8.pixelavg_ppALIGNED = PFX(pixel_avg_pp_8x8_neon);
1151
- p.puLUMA_8x16.pixelavg_ppALIGNED = PFX(pixel_avg_pp_8x16_neon);
1152
- p.puLUMA_8x32.pixelavg_ppALIGNED = PFX(pixel_avg_pp_8x32_neon);
1153
-
1154
- p.puLUMA_8x4.sad_x3 = PFX(sad_x3_8x4_neon);
1155
- p.puLUMA_8x8.sad_x3 = PFX(sad_x3_8x8_neon);
1156
- p.puLUMA_8x16.sad_x3 = PFX(sad_x3_8x16_neon);
1157
- p.puLUMA_8x32.sad_x3 = PFX(sad_x3_8x32_neon);
1158
-
1159
- p.puLUMA_8x4.sad_x4 = PFX(sad_x4_8x4_neon);
1160
- p.puLUMA_8x8.sad_x4 = PFX(sad_x4_8x8_neon);
1161
- p.puLUMA_8x16.sad_x4 = PFX(sad_x4_8x16_neon);
1162
- p.puLUMA_8x32.sad_x4 = PFX(sad_x4_8x32_neon);
1163
-
1164
- // quant
1165
- p.quant = PFX(quant_neon);
1166
- // luma_hps
1167
- p.puLUMA_4x4.luma_hps = PFX(interp_8tap_horiz_ps_4x4_neon);
1168
- p.puLUMA_4x8.luma_hps = PFX(interp_8tap_horiz_ps_4x8_neon);
1169
- p.puLUMA_4x16.luma_hps = PFX(interp_8tap_horiz_ps_4x16_neon);
1170
- p.puLUMA_8x4.luma_hps = PFX(interp_8tap_horiz_ps_8x4_neon);
1171
- p.puLUMA_8x8.luma_hps = PFX(interp_8tap_horiz_ps_8x8_neon);
1172
- p.puLUMA_8x16.luma_hps = PFX(interp_8tap_horiz_ps_8x16_neon);
1173
- p.puLUMA_8x32.luma_hps = PFX(interp_8tap_horiz_ps_8x32_neon);
1174
- p.puLUMA_12x16.luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
1175
- p.puLUMA_24x32.luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
1176
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
1177
- p.puLUMA_16x4.luma_hps = PFX(interp_8tap_horiz_ps_16x4_neon);
1178
- p.puLUMA_16x8.luma_hps = PFX(interp_8tap_horiz_ps_16x8_neon);
1179
- p.puLUMA_16x12.luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
1180
- p.puLUMA_16x16.luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
1181
- p.puLUMA_16x32.luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
1182
- p.puLUMA_16x64.luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
1183
- p.puLUMA_32x8.luma_hps = PFX(interp_8tap_horiz_ps_32x8_neon);
1184
- p.puLUMA_32x16.luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
1185
- p.puLUMA_32x24.luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
1186
- p.puLUMA_32x32.luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
1187
- p.puLUMA_32x64.luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
1188
- p.puLUMA_48x64.luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
1189
- p.puLUMA_64x16.luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
1190
- p.puLUMA_64x32.luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
1191
- p.puLUMA_64x48.luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
1192
- p.puLUMA_64x64.luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
1193
-#endif
1194
-
1195
- p.puLUMA_8x4.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x4>;
1196
- p.puLUMA_8x8.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x8>;
1197
- p.puLUMA_8x16.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x16>;
1198
- p.puLUMA_8x32.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x32>;
1199
- p.puLUMA_12x16.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_12x16>;
1200
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
1201
- p.puLUMA_16x4.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
1202
- p.puLUMA_16x8.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
1203
- p.puLUMA_16x12.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
1204
- p.puLUMA_16x16.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
1205
- p.puLUMA_16x32.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
1206
- p.puLUMA_16x64.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
1207
- p.puLUMA_32x16.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
1208
- p.puLUMA_32x24.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
1209
- p.puLUMA_32x32.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
1210
- p.puLUMA_32x64.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
1211
- p.puLUMA_48x64.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
1212
- p.puLUMA_64x16.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
1213
- p.puLUMA_64x32.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
1214
- p.puLUMA_64x48.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
1215
- p.puLUMA_64x64.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
1216
-#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
1217
- p.puLUMA_4x4.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
1218
- p.puLUMA_4x8.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x8>;
1219
- p.puLUMA_4x16.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x16>;
1220
- p.puLUMA_24x32.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_24x32>;
1221
- p.puLUMA_32x8.luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
1222
+ // When these primitives will use SVE/SVE2 instructions set,
1223
+ // change the following definitions to point to the SVE/SVE2 implementation
1224
+ setupPixelPrimitives_neon(p);
1225
+ setupFilterPrimitives_neon(p);
1226
+ setupDCTPrimitives_neon(p);
1227
+ setupLoopFilterPrimitives_neon(p);
1228
+ setupIntraPrimitives_neon(p);
1229
+
1230
+ CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sNONALIGNED);
1231
+ CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1232
+ CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1233
+ CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1234
+ CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1235
+ CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1236
+ LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1237
+ LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1238
+ CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sALIGNED);
1239
+ CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1240
+ CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1241
+ CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1242
+ CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1243
+ CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1244
+ LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1245
+ LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1246
+
1247
+#if !HIGH_BIT_DEPTH
1248
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
1249
+ ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
1250
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
1251
+ ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
1252
+ ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
1253
+ ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
1254
+ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
1255
+ ALL_CHROMA_420_VERT_FILTERS(neon);
1256
+ CHROMA_422_VERT_FILTERS_NEON();
1257
+ CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
1258
+ ALL_CHROMA_444_VERT_FILTERS(neon);
1259
+ ALL_CHROMA_420_FILTERS(neon);
1260
+ ALL_CHROMA_422_FILTERS(neon);
1261
+ ALL_CHROMA_444_FILTERS(neon);
1262
+
1263
+
1264
+ // Blockcopy_pp
1265
+ LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
1266
+ LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1267
+ CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
1268
+ CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1269
+ CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
1270
+ CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1271
+ p.cuBLOCK_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
1272
+ p.cuBLOCK_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
1273
+ p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1274
+ p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1275
+ p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_sve);
1276
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
1277
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
1278
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1279
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1280
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
1281
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
1282
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
1283
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_sve);
1284
+
1285
+#endif // !HIGH_BIT_DEPTH
1286
+
1287
+ // Blockcopy_ss
1288
+ p.cuBLOCK_4x4.copy_ss = PFX(blockcopy_ss_4x4_neon);
1289
+ p.cuBLOCK_8x8.copy_ss = PFX(blockcopy_ss_8x8_neon);
1290
+ p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1291
+ p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1292
+ p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_sve);
1293
+
1294
+ // Blockcopy_ps
1295
+ p.cuBLOCK_4x4.copy_ps = PFX(blockcopy_ps_4x4_neon);
1296
+ p.cuBLOCK_8x8.copy_ps = PFX(blockcopy_ps_8x8_neon);
1297
+ p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1298
+ p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1299
+ p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_sve);
1300
+
1301
+ // Blockcopy_sp
1302
+ p.cuBLOCK_4x4.copy_sp = PFX(blockcopy_sp_4x4_sve);
1303
+ p.cuBLOCK_8x8.copy_sp = PFX(blockcopy_sp_8x8_sve);
1304
+ p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1305
+ p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1306
+ p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
1307
+
1308
+ // chroma blockcopy_ss
1309
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss = PFX(blockcopy_ss_4x4_neon);
1310
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss = PFX(blockcopy_ss_8x8_neon);
1311
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1312
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1313
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss = PFX(blockcopy_ss_4x8_neon);
1314
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss = PFX(blockcopy_ss_8x16_neon);
1315
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_sve);
1316
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_sve);
1317
+
1318
+ // chroma blockcopy_ps
1319
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps = PFX(blockcopy_ps_4x4_neon);
1320
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps = PFX(blockcopy_ps_8x8_neon);
1321
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1322
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1323
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps = PFX(blockcopy_ps_4x8_sve);
1324
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps = PFX(blockcopy_ps_8x16_sve);
1325
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_sve);
1326
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_sve);
1327
+
1328
+ // chroma blockcopy_sp
1329
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp = PFX(blockcopy_sp_4x4_sve);
1330
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp = PFX(blockcopy_sp_8x8_sve);
1331
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1332
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1333
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp = PFX(blockcopy_sp_4x8_sve);
1334
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp = PFX(blockcopy_sp_8x16_sve);
1335
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_sve);
1336
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_sve);
1337
+
1338
+ // Block_fill
1339
+ LUMA_TU_NEON(blockfill_sALIGNED, blockfill_s);
1340
+ LUMA_TU_CAN_USE_SVE(blockfill_sALIGNED, blockfill_s);
1341
+ LUMA_TU_NEON(blockfill_sNONALIGNED, blockfill_s);
1342
+ LUMA_TU_CAN_USE_SVE(blockfill_sNONALIGNED, blockfill_s);
1343
+
1344
+ // copy_count
1345
+ p.cuBLOCK_4x4.copy_cnt = PFX(copy_cnt_4_neon);
1346
+ p.cuBLOCK_8x8.copy_cnt = PFX(copy_cnt_8_neon);
1347
+ p.cuBLOCK_16x16.copy_cnt = PFX(copy_cnt_16_neon);
1348
+ p.cuBLOCK_32x32.copy_cnt = PFX(copy_cnt_32_neon);
1349
+
1350
+ // count nonzero
1351
+ p.cuBLOCK_4x4.count_nonzero = PFX(count_nonzero_4_neon);
1352
+ p.cuBLOCK_8x8.count_nonzero = PFX(count_nonzero_8_neon);
1353
+ p.cuBLOCK_16x16.count_nonzero = PFX(count_nonzero_16_neon);
1354
+ p.cuBLOCK_32x32.count_nonzero = PFX(count_nonzero_32_neon);
1355
+
1356
+ // cpy2Dto1D_shl
1357
+ p.cuBLOCK_4x4.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
1358
+ p.cuBLOCK_8x8.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
1359
+ p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
1360
+ p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
1361
+ p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
1362
+
1363
+ // cpy2Dto1D_shr
1364
+ p.cuBLOCK_4x4.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
1365
+ p.cuBLOCK_8x8.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
1366
+ p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
1367
+ p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
1368
+
1369
+ // cpy1Dto2D_shl
1370
+ p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_4x4_neon);
1371
+ p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_8x8_neon);
1372
+ p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
1373
+ p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
1374
+ p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
1375
+
1376
+ p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_4x4_neon);
1377
+ p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_8x8_neon);
1378
+ p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
1379
+ p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
1380
+ p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
1381
+
1382
+ // cpy1Dto2D_shr
1383
+ p.cuBLOCK_4x4.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
1384
+ p.cuBLOCK_8x8.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
1385
+ p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
1386
+ p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
1387
+ p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
1388
+
1389
+#if !HIGH_BIT_DEPTH
1390
+ // pixel_avg_pp
1391
+ ALL_LUMA_PU(pixelavg_ppNONALIGNED, pixel_avg_pp, neon);
1392
+ ALL_LUMA_PU(pixelavg_ppALIGNED, pixel_avg_pp, neon);
1393
+
1394
+ // addAvg
1395
+ ALL_LUMA_PU(addAvgNONALIGNED, addAvg, neon);
1396
+ ALL_LUMA_PU(addAvgALIGNED, addAvg, neon);
1397
+ ALL_CHROMA_420_PU(addAvgNONALIGNED, addAvg, neon);
1398
+ ALL_CHROMA_422_PU(addAvgNONALIGNED, addAvg, neon);
1399
+ ALL_CHROMA_420_PU(addAvgALIGNED, addAvg, neon);
1400
+ ALL_CHROMA_422_PU(addAvgALIGNED, addAvg, neon);
1401
+
1402
+ // sad
1403
+ ALL_LUMA_PU(sad, pixel_sad, neon);
1404
+ ALL_LUMA_PU(sad_x3, sad_x3, neon);
1405
+ ALL_LUMA_PU(sad_x4, sad_x4, neon);
1406
+
1407
+ // sse_pp
1408
+ p.cuBLOCK_4x4.sse_pp = PFX(pixel_sse_pp_4x4_sve);
1409
+ p.cuBLOCK_8x8.sse_pp = PFX(pixel_sse_pp_8x8_neon);
1410
+ p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1411
+ p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
1412
+ p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_neon);
1413
+
1414
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp = PFX(pixel_sse_pp_4x4_sve);
1415
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp = PFX(pixel_sse_pp_8x8_neon);
1416
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1417
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_neon);
1418
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp = PFX(pixel_sse_pp_4x8_sve);
1419
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp = PFX(pixel_sse_pp_8x16_neon);
1420
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
1421
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_neon);
1422
+
1423
+ // sse_ss
1424
+ p.cuBLOCK_4x4.sse_ss = PFX(pixel_sse_ss_4x4_neon);
1425
+ p.cuBLOCK_8x8.sse_ss = PFX(pixel_sse_ss_8x8_neon);
1426
+ p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_neon);
1427
+ p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_neon);
1428
+ p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_neon);
1429
+
1430
+ // ssd_s
1431
+ p.cuBLOCK_4x4.ssd_sNONALIGNED = PFX(pixel_ssd_s_4x4_neon);
1432
+ p.cuBLOCK_8x8.ssd_sNONALIGNED = PFX(pixel_ssd_s_8x8_neon);
1433
+ p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_neon);
1434
+ p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_neon);
1435
+
1436
+ p.cuBLOCK_4x4.ssd_sALIGNED = PFX(pixel_ssd_s_4x4_neon);
1437
+ p.cuBLOCK_8x8.ssd_sALIGNED = PFX(pixel_ssd_s_8x8_neon);
1438
+ p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_neon);
1439
+ p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_neon);
1440
+
1441
+ // pixel_var
1442
+ p.cuBLOCK_8x8.var = PFX(pixel_var_8x8_neon);
1443
+ p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_neon);
1444
+ p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_neon);
1445
+ p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_neon);
1446
+
1447
+ // calc_Residual
1448
+ p.cuBLOCK_4x4.calcresidualNONALIGNED = PFX(getResidual4_neon);
1449
+ p.cuBLOCK_8x8.calcresidualNONALIGNED = PFX(getResidual8_neon);
1450
+ p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_neon);
1451
+ p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_neon);
1452
+
1453
+ p.cuBLOCK_4x4.calcresidualALIGNED = PFX(getResidual4_neon);
1454
+ p.cuBLOCK_8x8.calcresidualALIGNED = PFX(getResidual8_neon);
1455
+ p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_neon);
1456
+ p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_neon);
1457
+
1458
+ // pixel_sub_ps
1459
+ p.cuBLOCK_4x4.sub_ps = PFX(pixel_sub_ps_4x4_neon);
1460
+ p.cuBLOCK_8x8.sub_ps = PFX(pixel_sub_ps_8x8_neon);
1461
+ p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1462
+ p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
1463
+ p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_neon);
1464
+
1465
+ // chroma sub_ps
1466
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps = PFX(pixel_sub_ps_4x4_neon);
1467
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps = PFX(pixel_sub_ps_8x8_neon);
1468
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1469
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_neon);
1470
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps = PFX(pixel_sub_ps_4x8_neon);
1471
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps = PFX(pixel_sub_ps_8x16_sve);
1472
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
1473
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_neon);
1474
+
1475
+ // pixel_add_ps
1476
+ p.cuBLOCK_4x4.add_psNONALIGNED = PFX(pixel_add_ps_4x4_neon);
1477
+ p.cuBLOCK_8x8.add_psNONALIGNED = PFX(pixel_add_ps_8x8_neon);
1478
+ p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
1479
+ p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
1480
+ p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_neon);
1481
+
1482
+ p.cuBLOCK_4x4.add_psALIGNED = PFX(pixel_add_ps_4x4_neon);
1483
+ p.cuBLOCK_8x8.add_psALIGNED = PFX(pixel_add_ps_8x8_neon);
1484
+ p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
1485
+ p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
1486
+ p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_neon);
1487
+
1488
+ // chroma add_ps
1489
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED = PFX(pixel_add_ps_4x4_neon);
1490
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED = PFX(pixel_add_ps_8x8_neon);
1491
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_neon);
1492
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_neon);
1493
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED = PFX(pixel_add_ps_4x8_neon);
1494
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED = PFX(pixel_add_ps_8x16_neon);
1495
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_neon);
1496
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_neon);
1497
+
1498
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED = PFX(pixel_add_ps_4x4_neon);
1499
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED = PFX(pixel_add_ps_8x8_neon);
1500
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_neon);
1501
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_neon);
1502
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED = PFX(pixel_add_ps_4x8_neon);
1503
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED = PFX(pixel_add_ps_8x16_neon);
1504
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_neon);
1505
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_neon);
1506
+
1507
+ //scale2D_64to32
1508
+ p.scale2D_64to32 = PFX(scale2D_64to32_neon);
1509
+
1510
+ // scale1D_128to64
1511
+ p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_neon);
1512
+ p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_neon);
1513
+
1514
+ // planecopy
1515
+ p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
1516
+
1517
+ // satd
1518
+ p.puLUMA_4x4.satd = PFX(pixel_satd_4x4_sve);
1519
+ p.puLUMA_8x8.satd = PFX(pixel_satd_8x8_neon);
1520
+ p.puLUMA_16x16.satd = PFX(pixel_satd_16x16_neon);
1521
+ p.puLUMA_32x32.satd = PFX(pixel_satd_32x32_sve);
1522
+ p.puLUMA_64x64.satd = PFX(pixel_satd_64x64_neon);
1523
+ p.puLUMA_8x4.satd = PFX(pixel_satd_8x4_sve);
1524
+ p.puLUMA_4x8.satd = PFX(pixel_satd_4x8_neon);
1525
+ p.puLUMA_16x8.satd = PFX(pixel_satd_16x8_neon);
1526
+ p.puLUMA_8x16.satd = PFX(pixel_satd_8x16_neon);
1527
+ p.puLUMA_16x32.satd = PFX(pixel_satd_16x32_neon);
1528
+ p.puLUMA_32x16.satd = PFX(pixel_satd_32x16_sve);
1529
+ p.puLUMA_64x32.satd = PFX(pixel_satd_64x32_neon);
1530
+ p.puLUMA_32x64.satd = PFX(pixel_satd_32x64_neon);
1531
+ p.puLUMA_16x12.satd = PFX(pixel_satd_16x12_neon);
1532
+ p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1533
+ p.puLUMA_16x4.satd = PFX(pixel_satd_16x4_neon);
1534
+ p.puLUMA_4x16.satd = PFX(pixel_satd_4x16_neon);
1535
+ p.puLUMA_32x24.satd = PFX(pixel_satd_32x24_neon);
1536
+ p.puLUMA_24x32.satd = PFX(pixel_satd_24x32_neon);
1537
+ p.puLUMA_32x8.satd = PFX(pixel_satd_32x8_neon);
1538
+ p.puLUMA_8x32.satd = PFX(pixel_satd_8x32_neon);
1539
+ p.puLUMA_64x48.satd = PFX(pixel_satd_64x48_sve);
1540
+ p.puLUMA_48x64.satd = PFX(pixel_satd_48x64_neon);
1541
+ p.puLUMA_64x16.satd = PFX(pixel_satd_64x16_neon);
1542
+ p.puLUMA_16x64.satd = PFX(pixel_satd_16x64_neon);
1543
+
1544
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd = PFX(pixel_satd_4x4_sve);
1545
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd = PFX(pixel_satd_8x8_neon);
1546
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
1547
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
1548
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd = PFX(pixel_satd_8x4_sve);
1549
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd = PFX(pixel_satd_4x8_neon);
1550
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd = PFX(pixel_satd_16x8_neon);
1551
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd = PFX(pixel_satd_8x16_neon);
1552
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
1553
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
1554
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
1555
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
1556
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd = PFX(pixel_satd_16x4_neon);
1557
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd = PFX(pixel_satd_4x16_neon);
1558
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
1559
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
1560
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd = PFX(pixel_satd_32x8_neon);
1561
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd = PFX(pixel_satd_8x32_neon);
1562
+
1563
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd = PFX(pixel_satd_4x8_neon);
1564
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd = PFX(pixel_satd_8x16_neon);
1565
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
1566
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
1567
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd = PFX(pixel_satd_4x4_sve);
1568
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd = PFX(pixel_satd_8x8_neon);
1569
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd = PFX(pixel_satd_4x16_neon);
1570
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
1571
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd = PFX(pixel_satd_8x32_neon);
1572
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
1573
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
1574
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd = PFX(pixel_satd_8x12_sve);
1575
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd = PFX(pixel_satd_8x4_sve);
1576
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
1577
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
1578
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd = PFX(pixel_satd_16x8_neon);
1579
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd = PFX(pixel_satd_4x32_neon);
1580
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
1581
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
1582
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
1583
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd = PFX(pixel_satd_8x64_neon);
1584
+
1585
+ // sa8d
1586
+ p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_sve);
1587
+ p.cuBLOCK_8x8.sa8d = PFX(pixel_sa8d_8x8_neon);
1588
+ p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1589
+ p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1590
+ p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1591
+ p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_sve);
1592
+ p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
1593
+ p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
1594
+ p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
1595
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
1596
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
1597
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
1598
+
1599
+ // dequant_scaling
1600
+ p.dequant_scaling = PFX(dequant_scaling_neon);
1601
+ p.dequant_normal = PFX(dequant_normal_neon);
1602
+
1603
+ // ssim_4x4x2_core
1604
+ p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
1605
+
1606
+ // ssimDist
1607
+ p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_neon);
1608
+ p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_neon);
1609
+ p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_neon);
1610
+ p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_neon);
1611
+ p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_neon);
1612
+
1613
+ // normFact
1614
+ p.cuBLOCK_8x8.normFact = PFX(normFact8_neon);
1615
+ p.cuBLOCK_16x16.normFact = PFX(normFact16_neon);
1616
+ p.cuBLOCK_32x32.normFact = PFX(normFact32_neon);
1617
+ p.cuBLOCK_64x64.normFact = PFX(normFact64_neon);
1618
+
1619
+ // psy_cost_pp
1620
+ p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1621
+
1622
+ p.weight_pp = PFX(weight_pp_neon);
1623
+#if !defined(__APPLE__)
1624
+ p.scanPosLast = PFX(scanPosLast_neon);
1625
+#endif
1626
+ p.costCoeffNxN = PFX(costCoeffNxN_neon);
1627
#endif
1628
+
1629
+ // quant
1630
+ p.quant = PFX(quant_sve);
1631
+ p.nquant = PFX(nquant_neon);
1632
+}
1633
#endif
1634
1635
+#if defined(HAVE_SVE2)
1636
+void setupSve2Primitives(EncoderPrimitives &p)
1637
+{
1638
+ // When these primitives will use SVE/SVE2 instructions set,
1639
+ // change the following definitions to point to the SVE/SVE2 implementation
1640
+ setupPixelPrimitives_neon(p);
1641
+ setupFilterPrimitives_neon(p);
1642
+ setupDCTPrimitives_neon(p);
1643
+ setupLoopFilterPrimitives_neon(p);
1644
+ setupIntraPrimitives_neon(p);
1645
+
1646
+ CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sNONALIGNED);
1647
+ CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1648
+ CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1649
+ CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1650
+ CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1651
+ CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1652
+ LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1653
+ LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sALIGNED);
1654
+ CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2sALIGNED);
1655
+ CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sALIGNED);
1656
+ CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1657
+ CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1658
+ CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1659
+ CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2sNONALIGNED);
1660
+ LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1661
+ LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2sNONALIGNED);
1662
+
1663
#if !HIGH_BIT_DEPTH
1664
- p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
1665
+ LUMA_PU_MULTIPLE_ARCHS_1(luma_vpp, interp_8tap_vert_pp, neon);
1666
+ LUMA_PU_MULTIPLE_ARCHS_2(luma_vpp, interp_8tap_vert_pp, sve2);
1667
+ LUMA_PU_MULTIPLE_ARCHS_1(luma_vsp, interp_8tap_vert_sp, sve2);
1668
+ LUMA_PU_MULTIPLE_ARCHS_2(luma_vsp, interp_8tap_vert_sp, neon);
1669
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sve2);
1670
+ ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
1671
+ ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
1672
+ ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, sve2);
1673
+ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
1674
+ CHROMA_420_VERT_FILTERS_NEON();
1675
+ CHROMA_420_VERT_FILTERS_CAN_USE_SVE2();
1676
+ CHROMA_422_VERT_FILTERS_NEON();
1677
+ CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(sve2);
1678
+ CHROMA_444_VERT_FILTERS_NEON();
1679
+ CHROMA_444_VERT_FILTERS_CAN_USE_SVE2();
1680
+ CHROMA_420_FILTERS_NEON();
1681
+ CHROMA_420_FILTERS_CAN_USE_SVE2();
1682
+ CHROMA_422_FILTERS_NEON();
1683
+ CHROMA_422_FILTERS_CAN_USE_SVE2();
1684
+ CHROMA_444_FILTERS_NEON();
1685
+ CHROMA_444_FILTERS_CAN_USE_SVE2();
1686
+
1687
+ // Blockcopy_pp
1688
+ LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
1689
+ LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1690
+ CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
1691
+ CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1692
+ CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
1693
+ CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
1694
+ p.cuBLOCK_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
1695
+ p.cuBLOCK_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
1696
+ p.cuBLOCK_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1697
+ p.cuBLOCK_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1698
+ p.cuBLOCK_64x64.copy_pp = PFX(blockcopy_pp_64x64_sve);
1699
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_pp = PFX(blockcopy_pp_4x4_neon);
1700
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_pp = PFX(blockcopy_pp_8x8_neon);
1701
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_pp = PFX(blockcopy_pp_16x16_neon);
1702
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_pp = PFX(blockcopy_pp_32x32_sve);
1703
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_pp = PFX(blockcopy_pp_4x8_neon);
1704
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_pp = PFX(blockcopy_pp_8x16_neon);
1705
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_pp = PFX(blockcopy_pp_16x32_neon);
1706
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_pp = PFX(blockcopy_pp_32x64_sve);
1707
+
1708
#endif // !HIGH_BIT_DEPTH
1709
1710
+ // Blockcopy_ss
1711
+ p.cuBLOCK_4x4.copy_ss = PFX(blockcopy_ss_4x4_neon);
1712
+ p.cuBLOCK_8x8.copy_ss = PFX(blockcopy_ss_8x8_neon);
1713
+ p.cuBLOCK_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1714
+ p.cuBLOCK_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1715
+ p.cuBLOCK_64x64.copy_ss = PFX(blockcopy_ss_64x64_sve);
1716
+
1717
+ // Blockcopy_ps
1718
+ p.cuBLOCK_4x4.copy_ps = PFX(blockcopy_ps_4x4_neon);
1719
+ p.cuBLOCK_8x8.copy_ps = PFX(blockcopy_ps_8x8_neon);
1720
+ p.cuBLOCK_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1721
+ p.cuBLOCK_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1722
+ p.cuBLOCK_64x64.copy_ps = PFX(blockcopy_ps_64x64_sve);
1723
+
1724
+ // Blockcopy_sp
1725
+ p.cuBLOCK_4x4.copy_sp = PFX(blockcopy_sp_4x4_sve);
1726
+ p.cuBLOCK_8x8.copy_sp = PFX(blockcopy_sp_8x8_sve);
1727
+ p.cuBLOCK_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1728
+ p.cuBLOCK_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1729
+ p.cuBLOCK_64x64.copy_sp = PFX(blockcopy_sp_64x64_neon);
1730
+
1731
+ // chroma blockcopy_ss
1732
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ss = PFX(blockcopy_ss_4x4_neon);
1733
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ss = PFX(blockcopy_ss_8x8_neon);
1734
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ss = PFX(blockcopy_ss_16x16_sve);
1735
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ss = PFX(blockcopy_ss_32x32_sve);
1736
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ss = PFX(blockcopy_ss_4x8_neon);
1737
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ss = PFX(blockcopy_ss_8x16_neon);
1738
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ss = PFX(blockcopy_ss_16x32_sve);
1739
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ss = PFX(blockcopy_ss_32x64_sve);
1740
+
1741
+ // chroma blockcopy_ps
1742
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_ps = PFX(blockcopy_ps_4x4_neon);
1743
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_ps = PFX(blockcopy_ps_8x8_neon);
1744
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_ps = PFX(blockcopy_ps_16x16_sve);
1745
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_ps = PFX(blockcopy_ps_32x32_sve);
1746
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_ps = PFX(blockcopy_ps_4x8_sve);
1747
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_ps = PFX(blockcopy_ps_8x16_sve);
1748
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_ps = PFX(blockcopy_ps_16x32_sve);
1749
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_ps = PFX(blockcopy_ps_32x64_sve);
1750
+
1751
+ // chroma blockcopy_sp
1752
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.copy_sp = PFX(blockcopy_sp_4x4_sve);
1753
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.copy_sp = PFX(blockcopy_sp_8x8_sve);
1754
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.copy_sp = PFX(blockcopy_sp_16x16_sve);
1755
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.copy_sp = PFX(blockcopy_sp_32x32_sve);
1756
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.copy_sp = PFX(blockcopy_sp_4x8_sve);
1757
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.copy_sp = PFX(blockcopy_sp_8x16_sve);
1758
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.copy_sp = PFX(blockcopy_sp_16x32_sve);
1759
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.copy_sp = PFX(blockcopy_sp_32x64_sve);
1760
+
1761
+ // Block_fill
1762
+ LUMA_TU_NEON(blockfill_sALIGNED, blockfill_s);
1763
+ LUMA_TU_CAN_USE_SVE(blockfill_sALIGNED, blockfill_s);
1764
+ LUMA_TU_NEON(blockfill_sNONALIGNED, blockfill_s);
1765
+ LUMA_TU_CAN_USE_SVE(blockfill_sNONALIGNED, blockfill_s);
1766
+
1767
+ // copy_count
1768
+ p.cuBLOCK_4x4.copy_cnt = PFX(copy_cnt_4_neon);
1769
+ p.cuBLOCK_8x8.copy_cnt = PFX(copy_cnt_8_neon);
1770
+ p.cuBLOCK_16x16.copy_cnt = PFX(copy_cnt_16_neon);
1771
+ p.cuBLOCK_32x32.copy_cnt = PFX(copy_cnt_32_neon);
1772
+
1773
+ // count nonzero
1774
+ p.cuBLOCK_4x4.count_nonzero = PFX(count_nonzero_4_neon);
1775
+ p.cuBLOCK_8x8.count_nonzero = PFX(count_nonzero_8_neon);
1776
+ p.cuBLOCK_16x16.count_nonzero = PFX(count_nonzero_16_neon);
1777
+ p.cuBLOCK_32x32.count_nonzero = PFX(count_nonzero_32_neon);
1778
+
1779
+ // cpy2Dto1D_shl
1780
+ p.cuBLOCK_4x4.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
1781
+ p.cuBLOCK_8x8.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
1782
+ p.cuBLOCK_16x16.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
1783
+ p.cuBLOCK_32x32.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
1784
+ p.cuBLOCK_64x64.cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
1785
+
1786
+ // cpy2Dto1D_shr
1787
+ p.cuBLOCK_4x4.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
1788
+ p.cuBLOCK_8x8.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
1789
+ p.cuBLOCK_16x16.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
1790
+ p.cuBLOCK_32x32.cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
1791
+
1792
+ // cpy1Dto2D_shl
1793
+ p.cuBLOCK_4x4.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_4x4_neon);
1794
+ p.cuBLOCK_8x8.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_8x8_neon);
1795
+ p.cuBLOCK_16x16.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
1796
+ p.cuBLOCK_32x32.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
1797
+ p.cuBLOCK_64x64.cpy1Dto2D_shlALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
1798
+
1799
+ p.cuBLOCK_4x4.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_4x4_neon);
1800
+ p.cuBLOCK_8x8.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_8x8_neon);
1801
+ p.cuBLOCK_16x16.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_16x16_sve);
1802
+ p.cuBLOCK_32x32.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_32x32_sve);
1803
+ p.cuBLOCK_64x64.cpy1Dto2D_shlNONALIGNED = PFX(cpy1Dto2D_shl_64x64_sve);
1804
+
1805
+ // cpy1Dto2D_shr
1806
+ p.cuBLOCK_4x4.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
1807
+ p.cuBLOCK_8x8.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
1808
+ p.cuBLOCK_16x16.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
1809
+ p.cuBLOCK_32x32.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
1810
+ p.cuBLOCK_64x64.cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
1811
+
1812
+#if !HIGH_BIT_DEPTH
1813
+ // pixel_avg_pp
1814
+ LUMA_PU_NEON_2(pixelavg_ppNONALIGNED, pixel_avg_pp);
1815
+ LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_ppNONALIGNED, pixel_avg_pp, sve2);
1816
+ LUMA_PU_NEON_2(pixelavg_ppALIGNED, pixel_avg_pp);
1817
+ LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_ppALIGNED, pixel_avg_pp, sve2);
1818
+
1819
+ // addAvg
1820
+ LUMA_PU_NEON_3(addAvgNONALIGNED, addAvg);
1821
+ LUMA_PU_CAN_USE_SVE2(addAvgNONALIGNED, addAvg);
1822
+ LUMA_PU_NEON_3(addAvgALIGNED, addAvg);
1823
+ LUMA_PU_CAN_USE_SVE2(addAvgALIGNED, addAvg);
1824
+ CHROMA_420_PU_NEON_2(addAvgNONALIGNED, addAvg);
1825
+ CHROMA_420_PU_MULTIPLE_ARCHS(addAvgNONALIGNED, addAvg, sve2);
1826
+ CHROMA_420_PU_NEON_2(addAvgALIGNED, addAvg);
1827
+ CHROMA_420_PU_MULTIPLE_ARCHS(addAvgALIGNED, addAvg, sve2);
1828
+ CHROMA_422_PU_NEON_2(addAvgNONALIGNED, addAvg);
1829
+ CHROMA_422_PU_CAN_USE_SVE2(addAvgNONALIGNED, addAvg);
1830
+ CHROMA_422_PU_NEON_2(addAvgALIGNED, addAvg);
1831
+ CHROMA_422_PU_CAN_USE_SVE2(addAvgALIGNED, addAvg);
1832
+
1833
+ // sad
1834
+ ALL_LUMA_PU(sad, pixel_sad, sve2);
1835
+ ALL_LUMA_PU(sad_x3, sad_x3, sve2);
1836
+ ALL_LUMA_PU(sad_x4, sad_x4, sve2);
1837
+
1838
+ // sse_pp
1839
+ p.cuBLOCK_4x4.sse_pp = PFX(pixel_sse_pp_4x4_sve);
1840
+ p.cuBLOCK_8x8.sse_pp = PFX(pixel_sse_pp_8x8_neon);
1841
+ p.cuBLOCK_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1842
+ p.cuBLOCK_32x32.sse_pp = PFX(pixel_sse_pp_32x32_sve2);
1843
+ p.cuBLOCK_64x64.sse_pp = PFX(pixel_sse_pp_64x64_sve2);
1844
+
1845
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sse_pp = PFX(pixel_sse_pp_4x4_sve);
1846
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sse_pp = PFX(pixel_sse_pp_8x8_neon);
1847
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sse_pp = PFX(pixel_sse_pp_16x16_neon);
1848
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sse_pp = PFX(pixel_sse_pp_32x32_sve2);
1849
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sse_pp = PFX(pixel_sse_pp_4x8_sve);
1850
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sse_pp = PFX(pixel_sse_pp_8x16_neon);
1851
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sse_pp = PFX(pixel_sse_pp_16x32_neon);
1852
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sse_pp = PFX(pixel_sse_pp_32x64_sve2);
1853
+
1854
+ // sse_ss
1855
+ p.cuBLOCK_4x4.sse_ss = PFX(pixel_sse_ss_4x4_sve2);
1856
+ p.cuBLOCK_8x8.sse_ss = PFX(pixel_sse_ss_8x8_sve2);
1857
+ p.cuBLOCK_16x16.sse_ss = PFX(pixel_sse_ss_16x16_sve2);
1858
+ p.cuBLOCK_32x32.sse_ss = PFX(pixel_sse_ss_32x32_sve2);
1859
+ p.cuBLOCK_64x64.sse_ss = PFX(pixel_sse_ss_64x64_sve2);
1860
+
1861
+ // ssd_s
1862
+ p.cuBLOCK_4x4.ssd_sNONALIGNED = PFX(pixel_ssd_s_4x4_sve2);
1863
+ p.cuBLOCK_8x8.ssd_sNONALIGNED = PFX(pixel_ssd_s_8x8_sve2);
1864
+ p.cuBLOCK_16x16.ssd_sNONALIGNED = PFX(pixel_ssd_s_16x16_sve2);
1865
+ p.cuBLOCK_32x32.ssd_sNONALIGNED = PFX(pixel_ssd_s_32x32_sve2);
1866
+
1867
+ p.cuBLOCK_4x4.ssd_sALIGNED = PFX(pixel_ssd_s_4x4_sve2);
1868
+ p.cuBLOCK_8x8.ssd_sALIGNED = PFX(pixel_ssd_s_8x8_sve2);
1869
+ p.cuBLOCK_16x16.ssd_sALIGNED = PFX(pixel_ssd_s_16x16_sve2);
1870
+ p.cuBLOCK_32x32.ssd_sALIGNED = PFX(pixel_ssd_s_32x32_sve2);
1871
+
1872
+ // pixel_var
1873
+ p.cuBLOCK_8x8.var = PFX(pixel_var_8x8_sve2);
1874
+ p.cuBLOCK_16x16.var = PFX(pixel_var_16x16_sve2);
1875
+ p.cuBLOCK_32x32.var = PFX(pixel_var_32x32_sve2);
1876
+ p.cuBLOCK_64x64.var = PFX(pixel_var_64x64_sve2);
1877
+
1878
+ // calc_Residual
1879
+ p.cuBLOCK_4x4.calcresidualNONALIGNED = PFX(getResidual4_neon);
1880
+ p.cuBLOCK_8x8.calcresidualNONALIGNED = PFX(getResidual8_neon);
1881
+ p.cuBLOCK_16x16.calcresidualNONALIGNED = PFX(getResidual16_sve2);
1882
+ p.cuBLOCK_32x32.calcresidualNONALIGNED = PFX(getResidual32_sve2);
1883
+
1884
+ p.cuBLOCK_4x4.calcresidualALIGNED = PFX(getResidual4_neon);
1885
+ p.cuBLOCK_8x8.calcresidualALIGNED = PFX(getResidual8_neon);
1886
+ p.cuBLOCK_16x16.calcresidualALIGNED = PFX(getResidual16_sve2);
1887
+ p.cuBLOCK_32x32.calcresidualALIGNED = PFX(getResidual32_sve2);
1888
+
1889
+ // pixel_sub_ps
1890
+ p.cuBLOCK_4x4.sub_ps = PFX(pixel_sub_ps_4x4_neon);
1891
+ p.cuBLOCK_8x8.sub_ps = PFX(pixel_sub_ps_8x8_neon);
1892
+ p.cuBLOCK_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1893
+ p.cuBLOCK_32x32.sub_ps = PFX(pixel_sub_ps_32x32_sve2);
1894
+ p.cuBLOCK_64x64.sub_ps = PFX(pixel_sub_ps_64x64_sve2);
1895
+
1896
+ // chroma sub_ps
1897
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.sub_ps = PFX(pixel_sub_ps_4x4_neon);
1898
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.sub_ps = PFX(pixel_sub_ps_8x8_neon);
1899
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.sub_ps = PFX(pixel_sub_ps_16x16_neon);
1900
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.sub_ps = PFX(pixel_sub_ps_32x32_sve2);
1901
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.sub_ps = PFX(pixel_sub_ps_4x8_neon);
1902
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sub_ps = PFX(pixel_sub_ps_8x16_sve);
1903
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sub_ps = PFX(pixel_sub_ps_16x32_neon);
1904
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sub_ps = PFX(pixel_sub_ps_32x64_sve2);
1905
+
1906
+ // pixel_add_ps
1907
+ p.cuBLOCK_4x4.add_psNONALIGNED = PFX(pixel_add_ps_4x4_sve2);
1908
+ p.cuBLOCK_8x8.add_psNONALIGNED = PFX(pixel_add_ps_8x8_sve2);
1909
+ p.cuBLOCK_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_sve2);
1910
+ p.cuBLOCK_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_sve2);
1911
+ p.cuBLOCK_64x64.add_psNONALIGNED = PFX(pixel_add_ps_64x64_sve2);
1912
+
1913
+ p.cuBLOCK_4x4.add_psALIGNED = PFX(pixel_add_ps_4x4_sve2);
1914
+ p.cuBLOCK_8x8.add_psALIGNED = PFX(pixel_add_ps_8x8_sve2);
1915
+ p.cuBLOCK_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_sve2);
1916
+ p.cuBLOCK_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_sve2);
1917
+ p.cuBLOCK_64x64.add_psALIGNED = PFX(pixel_add_ps_64x64_sve2);
1918
+
1919
+ // chroma add_ps
1920
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psNONALIGNED = PFX(pixel_add_ps_4x4_sve2);
1921
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psNONALIGNED = PFX(pixel_add_ps_8x8_sve2);
1922
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psNONALIGNED = PFX(pixel_add_ps_16x16_sve2);
1923
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psNONALIGNED = PFX(pixel_add_ps_32x32_sve2);
1924
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psNONALIGNED = PFX(pixel_add_ps_4x8_sve2);
1925
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psNONALIGNED = PFX(pixel_add_ps_8x16_sve2);
1926
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psNONALIGNED = PFX(pixel_add_ps_16x32_sve2);
1927
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psNONALIGNED = PFX(pixel_add_ps_32x64_sve2);
1928
+
1929
+ p.chromaX265_CSP_I420.cuBLOCK_420_4x4.add_psALIGNED = PFX(pixel_add_ps_4x4_sve2);
1930
+ p.chromaX265_CSP_I420.cuBLOCK_420_8x8.add_psALIGNED = PFX(pixel_add_ps_8x8_sve2);
1931
+ p.chromaX265_CSP_I420.cuBLOCK_420_16x16.add_psALIGNED = PFX(pixel_add_ps_16x16_sve2);
1932
+ p.chromaX265_CSP_I420.cuBLOCK_420_32x32.add_psALIGNED = PFX(pixel_add_ps_32x32_sve2);
1933
+ p.chromaX265_CSP_I422.cuBLOCK_422_4x8.add_psALIGNED = PFX(pixel_add_ps_4x8_sve2);
1934
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.add_psALIGNED = PFX(pixel_add_ps_8x16_sve2);
1935
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.add_psALIGNED = PFX(pixel_add_ps_16x32_sve2);
1936
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.add_psALIGNED = PFX(pixel_add_ps_32x64_sve2);
1937
+
1938
+ //scale2D_64to32
1939
+ p.scale2D_64to32 = PFX(scale2D_64to32_neon);
1940
+
1941
+ // scale1D_128to64
1942
+ p.scale1D_128to64NONALIGNED = PFX(scale1D_128to64_sve2);
1943
+ p.scale1D_128to64ALIGNED = PFX(scale1D_128to64_sve2);
1944
+
1945
+ // planecopy
1946
+ p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
1947
+
1948
+ // satd
1949
+ p.puLUMA_4x4.satd = PFX(pixel_satd_4x4_sve);
1950
+ p.puLUMA_8x8.satd = PFX(pixel_satd_8x8_neon);
1951
+ p.puLUMA_16x16.satd = PFX(pixel_satd_16x16_neon);
1952
+ p.puLUMA_32x32.satd = PFX(pixel_satd_32x32_sve);
1953
+ p.puLUMA_64x64.satd = PFX(pixel_satd_64x64_neon);
1954
+ p.puLUMA_8x4.satd = PFX(pixel_satd_8x4_sve);
1955
+ p.puLUMA_4x8.satd = PFX(pixel_satd_4x8_neon);
1956
+ p.puLUMA_16x8.satd = PFX(pixel_satd_16x8_neon);
1957
+ p.puLUMA_8x16.satd = PFX(pixel_satd_8x16_neon);
1958
+ p.puLUMA_16x32.satd = PFX(pixel_satd_16x32_neon);
1959
+ p.puLUMA_32x16.satd = PFX(pixel_satd_32x16_sve);
1960
+ p.puLUMA_64x32.satd = PFX(pixel_satd_64x32_neon);
1961
+ p.puLUMA_32x64.satd = PFX(pixel_satd_32x64_neon);
1962
+ p.puLUMA_16x12.satd = PFX(pixel_satd_16x12_neon);
1963
+ p.puLUMA_12x16.satd = PFX(pixel_satd_12x16_neon);
1964
+ p.puLUMA_16x4.satd = PFX(pixel_satd_16x4_neon);
1965
+ p.puLUMA_4x16.satd = PFX(pixel_satd_4x16_neon);
1966
+ p.puLUMA_32x24.satd = PFX(pixel_satd_32x24_neon);
1967
+ p.puLUMA_24x32.satd = PFX(pixel_satd_24x32_neon);
1968
+ p.puLUMA_32x8.satd = PFX(pixel_satd_32x8_neon);
1969
+ p.puLUMA_8x32.satd = PFX(pixel_satd_8x32_neon);
1970
+ p.puLUMA_64x48.satd = PFX(pixel_satd_64x48_sve);
1971
+ p.puLUMA_48x64.satd = PFX(pixel_satd_48x64_neon);
1972
+ p.puLUMA_64x16.satd = PFX(pixel_satd_64x16_neon);
1973
+ p.puLUMA_16x64.satd = PFX(pixel_satd_16x64_neon);
1974
+
1975
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd = PFX(pixel_satd_4x4_sve);
1976
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd = PFX(pixel_satd_8x8_neon);
1977
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = PFX(pixel_satd_16x16_neon);
1978
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = PFX(pixel_satd_32x32_neon);
1979
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd = PFX(pixel_satd_8x4_sve);
1980
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd = PFX(pixel_satd_4x8_neon);
1981
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd = PFX(pixel_satd_16x8_neon);
1982
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd = PFX(pixel_satd_8x16_neon);
1983
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = PFX(pixel_satd_32x16_neon);
1984
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = PFX(pixel_satd_16x32_neon);
1985
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = PFX(pixel_satd_16x12_neon);
1986
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = PFX(pixel_satd_12x16_neon);
1987
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd = PFX(pixel_satd_16x4_neon);
1988
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd = PFX(pixel_satd_4x16_neon);
1989
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = PFX(pixel_satd_32x24_neon);
1990
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = PFX(pixel_satd_24x32_neon);
1991
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd = PFX(pixel_satd_32x8_neon);
1992
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd = PFX(pixel_satd_8x32_neon);
1993
+
1994
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd = PFX(pixel_satd_4x8_neon);
1995
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd = PFX(pixel_satd_8x16_neon);
1996
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = PFX(pixel_satd_16x32_neon);
1997
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = PFX(pixel_satd_32x64_neon);
1998
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd = PFX(pixel_satd_4x4_sve);
1999
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd = PFX(pixel_satd_8x8_neon);
2000
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd = PFX(pixel_satd_4x16_neon);
2001
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = PFX(pixel_satd_16x16_neon);
2002
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd = PFX(pixel_satd_8x32_neon);
2003
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = PFX(pixel_satd_32x32_neon);
2004
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = PFX(pixel_satd_16x64_neon);
2005
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd = PFX(pixel_satd_8x12_sve);
2006
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd = PFX(pixel_satd_8x4_sve);
2007
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = PFX(pixel_satd_16x24_neon);
2008
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = PFX(pixel_satd_12x32_neon);
2009
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd = PFX(pixel_satd_16x8_neon);
2010
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd = PFX(pixel_satd_4x32_neon);
2011
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = PFX(pixel_satd_32x48_neon);
2012
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = PFX(pixel_satd_24x64_neon);
2013
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = PFX(pixel_satd_32x16_neon);
2014
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd = PFX(pixel_satd_8x64_neon);
2015
+
2016
+ // sa8d
2017
+ p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_sve);
2018
+ p.cuBLOCK_8x8.sa8d = PFX(pixel_sa8d_8x8_neon);
2019
+ p.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
2020
+ p.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
2021
+ p.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
2022
+ p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = PFX(pixel_satd_4x4_sve);
2023
+ p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = PFX(pixel_sa8d_16x16_neon);
2024
+ p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = PFX(pixel_sa8d_32x32_neon);
2025
+ p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = PFX(pixel_sa8d_64x64_neon);
2026
+ p.chromaX265_CSP_I422.cuBLOCK_422_8x16.sa8d = PFX(pixel_sa8d_8x16_neon);
2027
+ p.chromaX265_CSP_I422.cuBLOCK_422_16x32.sa8d = PFX(pixel_sa8d_16x32_neon);
2028
+ p.chromaX265_CSP_I422.cuBLOCK_422_32x64.sa8d = PFX(pixel_sa8d_32x64_neon);
2029
+
2030
+ // dequant_scaling
2031
+ p.dequant_scaling = PFX(dequant_scaling_sve2);
2032
+ p.dequant_normal = PFX(dequant_normal_sve2);
2033
+
2034
+ // ssim_4x4x2_core
2035
+ p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_sve2);
2036
+
2037
+ // ssimDist
2038
+ p.cuBLOCK_4x4.ssimDist = PFX(ssimDist4_sve2);
2039
+ p.cuBLOCK_8x8.ssimDist = PFX(ssimDist8_sve2);
2040
+ p.cuBLOCK_16x16.ssimDist = PFX(ssimDist16_sve2);
2041
+ p.cuBLOCK_32x32.ssimDist = PFX(ssimDist32_sve2);
2042
+ p.cuBLOCK_64x64.ssimDist = PFX(ssimDist64_sve2);
2043
+
2044
+ // normFact
2045
+ p.cuBLOCK_8x8.normFact = PFX(normFact8_sve2);
2046
+ p.cuBLOCK_16x16.normFact = PFX(normFact16_sve2);
2047
+ p.cuBLOCK_32x32.normFact = PFX(normFact32_sve2);
2048
+ p.cuBLOCK_64x64.normFact = PFX(normFact64_sve2);
2049
+
2050
+ // psy_cost_pp
2051
+ p.cuBLOCK_4x4.psy_cost_pp = PFX(psyCost_4x4_neon);
2052
+
2053
+ p.weight_pp = PFX(weight_pp_neon);
2054
+#if !defined(__APPLE__)
2055
+ p.scanPosLast = PFX(scanPosLast_neon);
2056
+#endif
2057
+ p.costCoeffNxN = PFX(costCoeffNxN_neon);
2058
+#endif
2059
+
2060
+ // quant
2061
+ p.quant = PFX(quant_sve);
2062
+ p.nquant = PFX(nquant_neon);
2063
+}
2064
+#endif
2065
+
2066
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
2067
+{
2068
+
2069
+#ifdef HAVE_SVE2
2070
+ if (cpuMask & X265_CPU_SVE2)
2071
+ {
2072
+ setupSve2Primitives(p);
2073
}
2074
+ else if (cpuMask & X265_CPU_SVE)
2075
+ {
2076
+ setupSvePrimitives(p);
2077
+ }
2078
+ else if (cpuMask & X265_CPU_NEON)
2079
+ {
2080
+ setupNeonPrimitives(p);
2081
+ }
2082
+
2083
+#elif defined(HAVE_SVE)
2084
+ if (cpuMask & X265_CPU_SVE)
2085
+ {
2086
+ setupSvePrimitives(p);
2087
+ }
2088
+ else if (cpuMask & X265_CPU_NEON)
2089
+ {
2090
+ setupNeonPrimitives(p);
2091
+ }
2092
+
2093
+#else
2094
+ if (cpuMask & X265_CPU_NEON)
2095
+ {
2096
+ setupNeonPrimitives(p);
2097
+ }
2098
+#endif
2099
+
2100
}
2101
} // namespace X265_NS
2102
x265_3.6.tar.gz/source/common/aarch64/asm-sve.S
Added
41
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+.macro ABS2_SVE a b c
30
+ abs \a, \c\()/m, \a
31
+ abs \b, \c\()/m, \b
32
+.endm
33
+
34
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
35
+ ABS2_SVE \z0, \z1, p0
36
+ ABS2_SVE \z2, \z3, p0
37
+ ABS2_SVE \z4, \z5, p0
38
+ ABS2_SVE \z6, \z7, p0
39
+.endm
40
+
41
x265_3.5.tar.gz/source/common/aarch64/asm.S -> x265_3.6.tar.gz/source/common/aarch64/asm.S
Changed
173
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ * Sebastian Pop <spop@amazon.com>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
12
* For more information, contact us at license @ x265.com.
13
*****************************************************************************/
14
15
+#ifndef ASM_S_ // #include guards
16
+#define ASM_S_
17
+
18
.arch armv8-a
19
20
+#define PFX3(prefix, name) prefix ## _ ## name
21
+#define PFX2(prefix, name) PFX3(prefix, name)
22
+#define PFX(name) PFX2(X265_NS, name)
23
+
24
+#ifdef __APPLE__
25
+#define PREFIX 1
26
+#endif
27
+
28
#ifdef PREFIX
29
#define EXTERN_ASM _
30
+#define HAVE_AS_FUNC 0
31
+#elif defined __clang__
32
+#define EXTERN_ASM
33
+#define HAVE_AS_FUNC 0
34
+#define PREFIX 1
35
#else
36
#define EXTERN_ASM
37
+#define HAVE_AS_FUNC 1
38
#endif
39
40
#ifdef __ELF__
41
#define ELF
42
#else
43
+#ifdef PREFIX
44
+#define ELF #
45
+#else
46
#define ELF @
47
#endif
48
-
49
-#define HAVE_AS_FUNC 1
50
+#endif
51
52
#if HAVE_AS_FUNC
53
#define FUNC
54
#else
55
+#ifdef PREFIX
56
+#define FUNC #
57
+#else
58
#define FUNC @
59
#endif
60
+#endif
61
+
62
+#define GLUE(a, b) a ## b
63
+#define JOIN(a, b) GLUE(a, b)
64
+
65
+#define PFX_C(name) JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
66
+
67
+#ifdef __APPLE__
68
+.macro endfunc
69
+ELF .size \name, . - \name
70
+FUNC .endfunc
71
+.endm
72
+#endif
73
74
.macro function name, export=1
75
+#ifdef __APPLE__
76
+ .global \name
77
+ endfunc
78
+#else
79
.macro endfunc
80
ELF .size \name, . - \name
81
FUNC .endfunc
82
.purgem endfunc
83
.endm
84
+#endif
85
.align 2
86
.if \export == 1
87
.global EXTERN_ASM\name
88
89
.endif
90
.endm
91
92
+.macro const name, align=2
93
+ .macro endconst
94
+ELF .size \name, . - \name
95
+ .purgem endconst
96
+ .endm
97
+#ifdef __MACH__
98
+ .const_data
99
+#else
100
+ .section .rodata
101
+#endif
102
+ .align \align
103
+\name:
104
+.endm
105
+
106
+.macro movrel rd, val, offset=0
107
+#if defined(__APPLE__)
108
+ .if \offset < 0
109
+ adrp \rd, \val@PAGE
110
+ add \rd, \rd, \val@PAGEOFF
111
+ sub \rd, \rd, -(\offset)
112
+ .else
113
+ adrp \rd, \val+(\offset)@PAGE
114
+ add \rd, \rd, \val+(\offset)@PAGEOFF
115
+ .endif
116
+#elif defined(PIC) && defined(_WIN32)
117
+ .if \offset < 0
118
+ adrp \rd, \val
119
+ add \rd, \rd, :lo12:\val
120
+ sub \rd, \rd, -(\offset)
121
+ .else
122
+ adrp \rd, \val+(\offset)
123
+ add \rd, \rd, :lo12:\val+(\offset)
124
+ .endif
125
+#else
126
+ adrp \rd, \val+(\offset)
127
+ add \rd, \rd, :lo12:\val+(\offset)
128
+#endif
129
+.endm
130
131
#define FENC_STRIDE 64
132
#define FDEC_STRIDE 32
133
+
134
+.macro SUMSUB_AB sum, diff, a, b
135
+ add \sum, \a, \b
136
+ sub \diff, \a, \b
137
+.endm
138
+
139
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
140
+ SUMSUB_AB \s1, \d1, \a, \b
141
+ SUMSUB_AB \s2, \d2, \c, \d
142
+.endm
143
+
144
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
145
+ SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
146
+ SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
147
+.endm
148
+
149
+.macro ABS2 a b
150
+ abs \a, \a
151
+ abs \b, \b
152
+.endm
153
+
154
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
155
+ ABS2 \v0, \v1
156
+ ABS2 \v2, \v3
157
+ ABS2 \v4, \v5
158
+ ABS2 \v6, \v7
159
+.endm
160
+
161
+.macro vtrn t1, t2, s1, s2
162
+ trn1 \t1, \s1, \s2
163
+ trn2 \t2, \s1, \s2
164
+.endm
165
+
166
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
167
+ vtrn \t1, \t2, \s1, \s2
168
+ vtrn \t3, \t4, \s3, \s4
169
+.endm
170
+
171
+#endif
172
\ No newline at end of file
173
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-common.S
Added
56
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch armv8-a
31
+
32
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
33
+.macro cpy1Dto2D_shr_start
34
+ add x2, x2, x2
35
+ dup v0.8h, w3
36
+ cmeq v1.8h, v1.8h, v1.8h
37
+ sshl v1.8h, v1.8h, v0.8h
38
+ sri v1.8h, v1.8h, #1
39
+ neg v0.8h, v0.8h
40
+.endm
41
+
42
+.macro cpy2Dto1D_shr_start
43
+ add x2, x2, x2
44
+ dup v0.8h, w3
45
+ cmeq v1.8h, v1.8h, v1.8h
46
+ sshl v1.8h, v1.8h, v0.8h
47
+ sri v1.8h, v1.8h, #1
48
+ neg v0.8h, v0.8h
49
+.endm
50
+
51
+const xtn_xtn2_table, align=4
52
+.byte 0, 2, 4, 6, 8, 10, 12, 14
53
+.byte 16, 18, 20, 22, 24, 26, 28, 30
54
+endconst
55
+
56
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S
Added
1418
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "blockcopy8-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
41
+ *
42
+ * r0 - a
43
+ * r1 - stridea
44
+ * r2 - b
45
+ * r3 - strideb */
46
+
47
+function PFX(blockcopy_sp_4x4_sve)
48
+ ptrue p0.h, vl4
49
+.rept 2
50
+ ld1h {z0.h}, p0/z, x2
51
+ add x2, x2, x3, lsl #1
52
+ st1b {z0.h}, p0, x0
53
+ add x0, x0, x1
54
+ ld1h {z1.h}, p0/z, x2
55
+ add x2, x2, x3, lsl #1
56
+ st1b {z1.h}, p0, x0
57
+ add x0, x0, x1
58
+.endr
59
+ ret
60
+endfunc
61
+
62
+function PFX(blockcopy_sp_8x8_sve)
63
+ ptrue p0.h, vl8
64
+.rept 4
65
+ ld1h {z0.h}, p0/z, x2
66
+ add x2, x2, x3, lsl #1
67
+ st1b {z0.h}, p0, x0
68
+ add x0, x0, x1
69
+ ld1h {z1.h}, p0/z, x2
70
+ add x2, x2, x3, lsl #1
71
+ st1b {z1.h}, p0, x0
72
+ add x0, x0, x1
73
+.endr
74
+ ret
75
+endfunc
76
+
77
+function PFX(blockcopy_sp_16x16_sve)
78
+ rdvl x9, #1
79
+ cmp x9, #16
80
+ bgt .vl_gt_16_blockcopy_sp_16_16
81
+ lsl x3, x3, #1
82
+ movrel x11, xtn_xtn2_table
83
+ ld1 {v31.16b}, x11
84
+.rept 8
85
+ ld1 {v0.8h-v1.8h}, x2, x3
86
+ ld1 {v2.8h-v3.8h}, x2, x3
87
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
88
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
89
+ st1 {v0.16b}, x0, x1
90
+ st1 {v1.16b}, x0, x1
91
+.endr
92
+ ret
93
+.vl_gt_16_blockcopy_sp_16_16:
94
+ ptrue p0.h, vl16
95
+.rept 8
96
+ ld1h {z0.h}, p0/z, x2
97
+ st1b {z0.h}, p0, x0
98
+ add x2, x2, x3, lsl #1
99
+ add x0, x0, x1
100
+ ld1h {z1.h}, p0/z, x2
101
+ st1b {z1.h}, p0, x0
102
+ add x2, x2, x3, lsl #1
103
+ add x0, x0, x1
104
+.endr
105
+ ret
106
+endfunc
107
+
108
+function PFX(blockcopy_sp_32x32_sve)
109
+ mov w12, #4
110
+ rdvl x9, #1
111
+ cmp x9, #16
112
+ bgt .vl_gt_16_blockcopy_sp_32_32
113
+ lsl x3, x3, #1
114
+ movrel x11, xtn_xtn2_table
115
+ ld1 {v31.16b}, x11
116
+.loop_csp32_sve:
117
+ sub w12, w12, #1
118
+.rept 4
119
+ ld1 {v0.8h-v3.8h}, x2, x3
120
+ ld1 {v4.8h-v7.8h}, x2, x3
121
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
122
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
123
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
124
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
125
+ st1 {v0.16b-v1.16b}, x0, x1
126
+ st1 {v2.16b-v3.16b}, x0, x1
127
+.endr
128
+ cbnz w12, .loop_csp32_sve
129
+ ret
130
+.vl_gt_16_blockcopy_sp_32_32:
131
+ cmp x9, #48
132
+ bgt .vl_gt_48_blockcopy_sp_32_32
133
+ ptrue p0.h, vl16
134
+.vl_gt_16_loop_csp32_sve:
135
+ sub w12, w12, #1
136
+.rept 4
137
+ ld1h {z0.h}, p0/z, x2
138
+ ld1h {z1.h}, p0/z, x2, #1, mul vl
139
+ st1b {z0.h}, p0, x0
140
+ st1b {z1.h}, p0, x0, #1, mul vl
141
+ add x2, x2, x3, lsl #1
142
+ add x0, x0, x1
143
+ ld1h {z2.h}, p0/z, x2
144
+ ld1h {z3.h}, p0/z, x2, #1, mul vl
145
+ st1b {z2.h}, p0, x0
146
+ st1b {z3.h}, p0, x0, #1, mul vl
147
+ add x2, x2, x3, lsl #1
148
+ add x0, x0, x1
149
+.endr
150
+ cbnz w12, .vl_gt_16_loop_csp32_sve
151
+ ret
152
+.vl_gt_48_blockcopy_sp_32_32:
153
+ ptrue p0.h, vl32
154
+.vl_gt_48_loop_csp32_sve:
155
+ sub w12, w12, #1
156
+.rept 4
157
+ ld1h {z0.h}, p0/z, x2
158
+ st1b {z0.h}, p0, x0
159
+ add x2, x2, x3, lsl #1
160
+ add x0, x0, x1
161
+ ld1h {z1.h}, p0/z, x2
162
+ st1b {z1.h}, p0, x0
163
+ add x2, x2, x3, lsl #1
164
+ add x0, x0, x1
165
+.endr
166
+ cbnz w12, .vl_gt_48_loop_csp32_sve
167
+ ret
168
+endfunc
169
+
170
+function PFX(blockcopy_ps_16x16_sve)
171
+ rdvl x9, #1
172
+ cmp x9, #16
173
+ bgt .vl_gt_16_blockcopy_ps_16_16
174
+ lsl x1, x1, #1
175
+.rept 8
176
+ ld1 {v4.16b}, x2, x3
177
+ ld1 {v5.16b}, x2, x3
178
+ uxtl v0.8h, v4.8b
179
+ uxtl2 v1.8h, v4.16b
180
+ uxtl v2.8h, v5.8b
181
+ uxtl2 v3.8h, v5.16b
182
+ st1 {v0.8h-v1.8h}, x0, x1
183
+ st1 {v2.8h-v3.8h}, x0, x1
184
+.endr
185
+ ret
186
+.vl_gt_16_blockcopy_ps_16_16:
187
+ ptrue p0.b, vl32
188
+.rept 16
189
+ ld1b {z1.h}, p0/z, x2
190
+ st1h {z1.h}, p0, x0
191
+ add x0, x0, x1, lsl #1
192
+ add x2, x2, x3
193
+.endr
194
+ ret
195
+endfunc
196
+
197
+function PFX(blockcopy_ps_32x32_sve)
198
+ rdvl x9, #1
199
+ cmp x9, #16
200
+ bgt .vl_gt_16_blockcopy_ps_32_32
201
+ lsl x1, x1, #1
202
+ mov w12, #4
203
+.loop_cps32_sve:
204
+ sub w12, w12, #1
205
+.rept 4
206
+ ld1 {v16.16b-v17.16b}, x2, x3
207
+ ld1 {v18.16b-v19.16b}, x2, x3
208
+ uxtl v0.8h, v16.8b
209
+ uxtl2 v1.8h, v16.16b
210
+ uxtl v2.8h, v17.8b
211
+ uxtl2 v3.8h, v17.16b
212
+ uxtl v4.8h, v18.8b
213
+ uxtl2 v5.8h, v18.16b
214
+ uxtl v6.8h, v19.8b
215
+ uxtl2 v7.8h, v19.16b
216
+ st1 {v0.8h-v3.8h}, x0, x1
217
+ st1 {v4.8h-v7.8h}, x0, x1
218
+.endr
219
+ cbnz w12, .loop_cps32_sve
220
+ ret
221
+.vl_gt_16_blockcopy_ps_32_32:
222
+ cmp x9, #48
223
+ bgt .vl_gt_48_blockcopy_ps_32_32
224
+ ptrue p0.b, vl32
225
+.rept 32
226
+ ld1b {z2.h}, p0/z, x2
227
+ ld1b {z3.h}, p0/z, x2, #1, mul vl
228
+ st1h {z2.h}, p0, x0
229
+ st1h {z3.h}, p0, x0, #1, mul vl
230
+ add x0, x0, x1, lsl #1
231
+ add x2, x2, x3
232
+.endr
233
+ ret
234
+.vl_gt_48_blockcopy_ps_32_32:
235
+ ptrue p0.b, vl64
236
+.rept 32
237
+ ld1b {z2.h}, p0/z, x2
238
+ st1h {z2.h}, p0, x0
239
+ add x0, x0, x1, lsl #1
240
+ add x2, x2, x3
241
+.endr
242
+ ret
243
+endfunc
244
+
245
+function PFX(blockcopy_ps_64x64_sve)
246
+ rdvl x9, #1
247
+ cmp x9, #16
248
+ bgt .vl_gt_16_blockcopy_ps_64_64
249
+ lsl x1, x1, #1
250
+ sub x1, x1, #64
251
+ mov w12, #16
252
+.loop_cps64_sve:
253
+ sub w12, w12, #1
254
+.rept 4
255
+ ld1 {v16.16b-v19.16b}, x2, x3
256
+ uxtl v0.8h, v16.8b
257
+ uxtl2 v1.8h, v16.16b
258
+ uxtl v2.8h, v17.8b
259
+ uxtl2 v3.8h, v17.16b
260
+ uxtl v4.8h, v18.8b
261
+ uxtl2 v5.8h, v18.16b
262
+ uxtl v6.8h, v19.8b
263
+ uxtl2 v7.8h, v19.16b
264
+ st1 {v0.8h-v3.8h}, x0, #64
265
+ st1 {v4.8h-v7.8h}, x0, x1
266
+.endr
267
+ cbnz w12, .loop_cps64_sve
268
+ ret
269
+.vl_gt_16_blockcopy_ps_64_64:
270
+ cmp x9, #48
271
+ bgt .vl_gt_48_blockcopy_ps_64_64
272
+ ptrue p0.b, vl32
273
+.rept 64
274
+ ld1b {z4.h}, p0/z, x2
275
+ ld1b {z5.h}, p0/z, x2, #1, mul vl
276
+ ld1b {z6.h}, p0/z, x2, #2, mul vl
277
+ ld1b {z7.h}, p0/z, x2, #3, mul vl
278
+ st1h {z4.h}, p0, x0
279
+ st1h {z5.h}, p0, x0, #1, mul vl
280
+ st1h {z6.h}, p0, x0, #2, mul vl
281
+ st1h {z7.h}, p0, x0, #3, mul vl
282
+ add x0, x0, x1, lsl #1
283
+ add x2, x2, x3
284
+.endr
285
+ ret
286
+.vl_gt_48_blockcopy_ps_64_64:
287
+ cmp x9, #112
288
+ bgt .vl_gt_112_blockcopy_ps_64_64
289
+ ptrue p0.b, vl64
290
+.rept 64
291
+ ld1b {z4.h}, p0/z, x2
292
+ ld1b {z5.h}, p0/z, x2, #1, mul vl
293
+ st1h {z4.h}, p0, x0
294
+ st1h {z5.h}, p0, x0, #1, mul vl
295
+ add x0, x0, x1, lsl #1
296
+ add x2, x2, x3
297
+.endr
298
+ ret
299
+.vl_gt_112_blockcopy_ps_64_64:
300
+ ptrue p0.b, vl128
301
+.rept 64
302
+ ld1b {z4.h}, p0/z, x2
303
+ st1h {z4.h}, p0, x0
304
+ add x0, x0, x1, lsl #1
305
+ add x2, x2, x3
306
+.endr
307
+ ret
308
+
309
+endfunc
310
+
311
+function PFX(blockcopy_ss_16x16_sve)
312
+ rdvl x9, #1
313
+ cmp x9, #16
314
+ bgt .vl_gt_16_blockcopy_ss_16_16
315
+ lsl x1, x1, #1
316
+ lsl x3, x3, #1
317
+.rept 8
318
+ ld1 {v0.8h-v1.8h}, x2, x3
319
+ ld1 {v2.8h-v3.8h}, x2, x3
320
+ st1 {v0.8h-v1.8h}, x0, x1
321
+ st1 {v2.8h-v3.8h}, x0, x1
322
+.endr
323
+ ret
324
+.vl_gt_16_blockcopy_ss_16_16:
325
+ ptrue p0.h, vl16
326
+.rept 16
327
+ ld1h {z0.h}, p0/z, x2
328
+ st1h {z0.h}, p0, x0
329
+ add x2, x2, x3, lsl #1
330
+ add x0, x0, x1, lsl #1
331
+.endr
332
+ ret
333
+endfunc
334
+
335
+function PFX(blockcopy_ss_32x32_sve)
336
+ rdvl x9, #1
337
+ cmp x9, #16
338
+ bgt .vl_gt_16_blockcopy_ss_32_32
339
+ lsl x1, x1, #1
340
+ lsl x3, x3, #1
341
+ mov w12, #4
342
+.loop_css32_sve:
343
+ sub w12, w12, #1
344
+.rept 8
345
+ ld1 {v0.8h-v3.8h}, x2, x3
346
+ st1 {v0.8h-v3.8h}, x0, x1
347
+.endr
348
+ cbnz w12, .loop_css32_sve
349
+ ret
350
+.vl_gt_16_blockcopy_ss_32_32:
351
+ cmp x9, #48
352
+ bgt .vl_gt_48_blockcopy_ss_32_32
353
+ ptrue p0.h, vl16
354
+.rept 32
355
+ ld1h {z0.h}, p0/z, x2
356
+ ld1h {z1.h}, p0/z, x2, #1, mul vl
357
+ st1h {z0.h}, p0, x0
358
+ st1h {z1.h}, p0, x0, #1, mul vl
359
+ add x2, x2, x3, lsl #1
360
+ add x0, x0, x1, lsl #1
361
+.endr
362
+ ret
363
+.vl_gt_48_blockcopy_ss_32_32:
364
+ ptrue p0.h, vl32
365
+.rept 32
366
+ ld1h {z0.h}, p0/z, x2
367
+ st1h {z0.h}, p0, x0
368
+ add x2, x2, x3, lsl #1
369
+ add x0, x0, x1, lsl #1
370
+.endr
371
+ ret
372
+endfunc
373
+
374
+function PFX(blockcopy_ss_64x64_sve)
375
+ rdvl x9, #1
376
+ cmp x9, #16
377
+ bgt .vl_gt_16_blockcopy_ss_64_64
378
+ lsl x1, x1, #1
379
+ sub x1, x1, #64
380
+ lsl x3, x3, #1
381
+ sub x3, x3, #64
382
+ mov w12, #8
383
+.loop_css64_sve:
384
+ sub w12, w12, #1
385
+.rept 8
386
+ ld1 {v0.8h-v3.8h}, x2, #64
387
+ ld1 {v4.8h-v7.8h}, x2, x3
388
+ st1 {v0.8h-v3.8h}, x0, #64
389
+ st1 {v4.8h-v7.8h}, x0, x1
390
+.endr
391
+ cbnz w12, .loop_css64_sve
392
+ ret
393
+.vl_gt_16_blockcopy_ss_64_64:
394
+ cmp x9, #48
395
+ bgt .vl_gt_48_blockcopy_ss_64_64
396
+ mov w12, #8
397
+ ptrue p0.b, vl32
398
+.vl_gt_16_loop_css64_sve:
399
+ sub w12, w12, #1
400
+.rept 8
401
+ ld1b {z0.b}, p0/z, x2
402
+ ld1b {z1.b}, p0/z, x2, #1, mul vl
403
+ ld1b {z2.b}, p0/z, x2, #2, mul vl
404
+ ld1b {z3.b}, p0/z, x2, #3, mul vl
405
+ st1b {z0.b}, p0, x0
406
+ st1b {z1.b}, p0, x0, #1, mul vl
407
+ st1b {z2.b}, p0, x0, #2, mul vl
408
+ st1b {z3.b}, p0, x0, #3, mul vl
409
+ add x2, x2, x3, lsl #1
410
+ add x0, x0, x1, lsl #1
411
+.endr
412
+ cbnz w12, .vl_gt_16_loop_css64_sve
413
+ ret
414
+.vl_gt_48_blockcopy_ss_64_64:
415
+ cmp x9, #112
416
+ bgt .vl_gt_112_blockcopy_ss_64_64
417
+ mov w12, #8
418
+ ptrue p0.b, vl64
419
+.vl_gt_48_loop_css64_sve:
420
+ sub w12, w12, #1
421
+.rept 8
422
+ ld1b {z0.b}, p0/z, x2
423
+ ld1b {z1.b}, p0/z, x2, #1, mul vl
424
+ st1b {z0.b}, p0, x0
425
+ st1b {z1.b}, p0, x0, #1, mul vl
426
+ add x2, x2, x3, lsl #1
427
+ add x0, x0, x1, lsl #1
428
+.endr
429
+ cbnz w12, .vl_gt_48_loop_css64_sve
430
+ ret
431
+.vl_gt_112_blockcopy_ss_64_64:
432
+ mov w12, #8
433
+ ptrue p0.b, vl128
434
+.vl_gt_112_loop_css64_sve:
435
+ sub w12, w12, #1
436
+.rept 8
437
+ ld1b {z0.b}, p0/z, x2
438
+ st1b {z0.b}, p0, x0
439
+ add x2, x2, x3, lsl #1
440
+ add x0, x0, x1, lsl #1
441
+.endr
442
+ cbnz w12, .vl_gt_112_loop_css64_sve
443
+ ret
444
+endfunc
445
+
446
+/******** Chroma blockcopy********/
447
+function PFX(blockcopy_ss_16x32_sve)
448
+ rdvl x9, #1
449
+ cmp x9, #16
450
+ bgt .vl_gt_16_blockcopy_ss_16_32
451
+ lsl x1, x1, #1
452
+ lsl x3, x3, #1
453
+.rept 16
454
+ ld1 {v0.8h-v1.8h}, x2, x3
455
+ ld1 {v2.8h-v3.8h}, x2, x3
456
+ st1 {v0.8h-v1.8h}, x0, x1
457
+ st1 {v2.8h-v3.8h}, x0, x1
458
+.endr
459
+ ret
460
+.vl_gt_16_blockcopy_ss_16_32:
461
+ ptrue p0.h, vl16
462
+.rept 32
463
+ ld1h {z0.h}, p0/z, x2
464
+ st1h {z0.h}, p0, x0
465
+ add x2, x2, x3, lsl #1
466
+ add x0, x0, x1, lsl #1
467
+.endr
468
+ ret
469
+endfunc
470
+
471
+function PFX(blockcopy_ss_32x64_sve)
472
+ rdvl x9, #1
473
+ cmp x9, #16
474
+ bgt .vl_gt_16_blockcopy_ss_32_64
475
+ lsl x1, x1, #1
476
+ lsl x3, x3, #1
477
+ mov w12, #8
478
+.loop_css32x64_sve:
479
+ sub w12, w12, #1
480
+.rept 8
481
+ ld1 {v0.8h-v3.8h}, x2, x3
482
+ st1 {v0.8h-v3.8h}, x0, x1
483
+.endr
484
+ cbnz w12, .loop_css32x64_sve
485
+ ret
486
+.vl_gt_16_blockcopy_ss_32_64:
487
+ cmp x9, #48
488
+ bgt .vl_gt_48_blockcopy_ss_32_64
489
+ mov w12, #8
490
+ ptrue p0.b, vl32
491
+.vl_gt_32_loop_css32x64_sve:
492
+ sub w12, w12, #1
493
+.rept 8
494
+ ld1b {z0.b}, p0/z, x2
495
+ ld1b {z1.b}, p0/z, x2, #1, mul vl
496
+ st1b {z0.b}, p0, x0
497
+ st1b {z1.b}, p0, x0, #1, mul vl
498
+ add x2, x2, x3, lsl #1
499
+ add x0, x0, x1, lsl #1
500
+.endr
501
+ cbnz w12, .vl_gt_32_loop_css32x64_sve
502
+ ret
503
+.vl_gt_48_blockcopy_ss_32_64:
504
+ mov w12, #8
505
+ ptrue p0.b, vl64
506
+.vl_gt_48_loop_css32x64_sve:
507
+ sub w12, w12, #1
508
+.rept 8
509
+ ld1b {z0.b}, p0/z, x2
510
+ st1b {z0.b}, p0, x0
511
+ add x2, x2, x3, lsl #1
512
+ add x0, x0, x1, lsl #1
513
+.endr
514
+ cbnz w12, .vl_gt_48_loop_css32x64_sve
515
+ ret
516
+endfunc
517
+
518
+// chroma blockcopy_ps
519
+function PFX(blockcopy_ps_4x8_sve)
520
+ ptrue p0.h, vl4
521
+.rept 8
522
+ ld1b {z0.h}, p0/z, x2
523
+ st1h {z0.h}, p0, x0
524
+ add x0, x0, x1, lsl #1
525
+ add x2, x2, x3
526
+.endr
527
+ ret
528
+endfunc
529
+
530
+function PFX(blockcopy_ps_8x16_sve)
531
+ ptrue p0.h, vl8
532
+.rept 16
533
+ ld1b {z0.h}, p0/z, x2
534
+ st1h {z0.h}, p0, x0
535
+ add x0, x0, x1, lsl #1
536
+ add x2, x2, x3
537
+.endr
538
+ ret
539
+endfunc
540
+
541
+function PFX(blockcopy_ps_16x32_sve)
542
+ rdvl x9, #1
543
+ cmp x9, #16
544
+ bgt .vl_gt_16_blockcopy_ps_16_32
545
+ lsl x1, x1, #1
546
+.rept 16
547
+ ld1 {v4.16b}, x2, x3
548
+ ld1 {v5.16b}, x2, x3
549
+ uxtl v0.8h, v4.8b
550
+ uxtl2 v1.8h, v4.16b
551
+ uxtl v2.8h, v5.8b
552
+ uxtl2 v3.8h, v5.16b
553
+ st1 {v0.8h-v1.8h}, x0, x1
554
+ st1 {v2.8h-v3.8h}, x0, x1
555
+.endr
556
+ ret
557
+.vl_gt_16_blockcopy_ps_16_32:
558
+ ptrue p0.b, vl32
559
+.rept 32
560
+ ld1b {z1.h}, p0/z, x2
561
+ st1h {z1.h}, p0, x0
562
+ add x0, x0, x1, lsl #1
563
+ add x2, x2, x3
564
+.endr
565
+ ret
566
+endfunc
567
+
568
+function PFX(blockcopy_ps_32x64_sve)
569
+ rdvl x9, #1
570
+ cmp x9, #16
571
+ bgt .vl_gt_16_blockcopy_ps_32_64
572
+ lsl x1, x1, #1
573
+ mov w12, #8
574
+.loop_cps32x64_sve:
575
+ sub w12, w12, #1
576
+.rept 4
577
+ ld1 {v16.16b-v17.16b}, x2, x3
578
+ ld1 {v18.16b-v19.16b}, x2, x3
579
+ uxtl v0.8h, v16.8b
580
+ uxtl2 v1.8h, v16.16b
581
+ uxtl v2.8h, v17.8b
582
+ uxtl2 v3.8h, v17.16b
583
+ uxtl v4.8h, v18.8b
584
+ uxtl2 v5.8h, v18.16b
585
+ uxtl v6.8h, v19.8b
586
+ uxtl2 v7.8h, v19.16b
587
+ st1 {v0.8h-v3.8h}, x0, x1
588
+ st1 {v4.8h-v7.8h}, x0, x1
589
+.endr
590
+ cbnz w12, .loop_cps32x64_sve
591
+ ret
592
+.vl_gt_16_blockcopy_ps_32_64:
593
+ cmp x9, #48
594
+ bgt .vl_gt_48_blockcopy_ps_32_64
595
+ ptrue p0.b, vl32
596
+.rept 64
597
+ ld1b {z2.h}, p0/z, x2
598
+ ld1b {z3.h}, p0/z, x2, #1, mul vl
599
+ st1h {z2.h}, p0, x0
600
+ st1h {z3.h}, p0, x0, #1, mul vl
601
+ add x0, x0, x1, lsl #1
602
+ add x2, x2, x3
603
+.endr
604
+ ret
605
+.vl_gt_48_blockcopy_ps_32_64:
606
+ ptrue p0.b, vl64
607
+.rept 64
608
+ ld1b {z2.h}, p0/z, x2
609
+ st1h {z2.h}, p0, x0
610
+ add x0, x0, x1, lsl #1
611
+ add x2, x2, x3
612
+.endr
613
+ ret
614
+endfunc
615
+
616
+// chroma blockcopy_sp
617
+function PFX(blockcopy_sp_4x8_sve)
618
+ ptrue p0.h, vl4
619
+.rept 8
620
+ ld1h {z0.h}, p0/z, x2
621
+ st1b {z0.h}, p0, x0
622
+ add x2, x2, x3, lsl #1
623
+ add x0, x0, x1
624
+.endr
625
+ ret
626
+endfunc
627
+
628
+function PFX(blockcopy_sp_8x16_sve)
629
+ ptrue p0.h, vl8
630
+.rept 16
631
+ ld1h {z0.h}, p0/z, x2
632
+ st1b {z0.h}, p0, x0
633
+ add x2, x2, x3, lsl #1
634
+ add x0, x0, x1
635
+.endr
636
+ ret
637
+endfunc
638
+
639
+function PFX(blockcopy_sp_16x32_sve)
640
+ rdvl x9, #1
641
+ cmp x9, #16
642
+ bgt .vl_gt_16_blockcopy_sp_16_32
643
+ ptrue p0.h, vl8
644
+.rept 32
645
+ ld1h {z0.h}, p0/z, x2
646
+ ld1h {z1.h}, p0/z, x2, #1, mul vl
647
+ st1b {z0.h}, p0, x0
648
+ st1b {z1.h}, p0, x0, #1, mul vl
649
+ add x2, x2, x3, lsl #1
650
+ add x0, x0, x1
651
+.endr
652
+ ret
653
+.vl_gt_16_blockcopy_sp_16_32:
654
+ ptrue p0.h, vl16
655
+.rept 32
656
+ ld1h {z0.h}, p0/z, x2
657
+ st1b {z0.h}, p0, x0
658
+ add x2, x2, x3, lsl #1
659
+ add x0, x0, x1
660
+.endr
661
+ ret
662
+endfunc
663
+
664
+function PFX(blockcopy_sp_32x64_sve)
665
+ rdvl x9, #1
666
+ cmp x9, #16
667
+ bgt .vl_gt_16_blockcopy_sp_32_64
668
+ ptrue p0.h, vl8
669
+.rept 64
670
+ ld1h {z0.h}, p0/z, x2
671
+ ld1h {z1.h}, p0/z, x2, #1, mul vl
672
+ ld1h {z2.h}, p0/z, x2, #2, mul vl
673
+ ld1h {z3.h}, p0/z, x2, #3, mul vl
674
+ st1b {z0.h}, p0, x0
675
+ st1b {z1.h}, p0, x0, #1, mul vl
676
+ st1b {z2.h}, p0, x0, #2, mul vl
677
+ st1b {z3.h}, p0, x0, #3, mul vl
678
+ add x2, x2, x3, lsl #1
679
+ add x0, x0, x1
680
+.endr
681
+ ret
682
+.vl_gt_16_blockcopy_sp_32_64:
683
+ cmp x9, #48
684
+ bgt .vl_gt_48_blockcopy_sp_32_64
685
+ ptrue p0.h, vl16
686
+.rept 64
687
+ ld1h {z0.h}, p0/z, x2
688
+ ld1h {z1.h}, p0/z, x2, #1, mul vl
689
+ st1b {z0.h}, p0, x0
690
+ st1b {z1.h}, p0, x0, #1, mul vl
691
+ add x2, x2, x3, lsl #1
692
+ add x0, x0, x1
693
+.endr
694
+ ret
695
+.vl_gt_48_blockcopy_sp_32_64:
696
+ ptrue p0.h, vl32
697
+.rept 64
698
+ ld1h {z0.h}, p0/z, x2
699
+ st1b {z0.h}, p0, x0
700
+ add x2, x2, x3, lsl #1
701
+ add x0, x0, x1
702
+.endr
703
+ ret
704
+endfunc
705
+
706
+/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
707
+
708
+function PFX(blockcopy_pp_32x8_sve)
709
+ rdvl x9, #1
710
+ cmp x9, #16
711
+ bgt .vl_gt_16_blockcopy_pp_32_8
712
+.rept 8
713
+ ld1 {v0.16b-v1.16b}, x2, x3
714
+ st1 {v0.16b-v1.16b}, x0, x1
715
+.endr
716
+ ret
717
+.vl_gt_16_blockcopy_pp_32_8:
718
+ ptrue p0.b, vl32
719
+.rept 8
720
+ ld1b {z0.b}, p0/z, x2
721
+ st1b {z0.b}, p0, x0
722
+ add x2, x2, x3
723
+ add x0, x0, x1
724
+.endr
725
+ ret
726
+endfunc
727
+
728
+.macro blockcopy_pp_32xN_sve h
729
+function PFX(blockcopy_pp_32x\h\()_sve)
730
+ mov w12, #\h / 8
731
+ rdvl x9, #1
732
+ cmp x9, #16
733
+ bgt .vl_gt_16_blockcopy_pp_32xN_\h
734
+.loop_sve_32x\h\():
735
+ sub w12, w12, #1
736
+.rept 8
737
+ ld1 {v0.16b-v1.16b}, x2, x3
738
+ st1 {v0.16b-v1.16b}, x0, x1
739
+.endr
740
+ cbnz w12, .loop_sve_32x\h
741
+ ret
742
+.vl_gt_16_blockcopy_pp_32xN_\h:
743
+ ptrue p0.b, vl32
744
+.L_gt_16_blockcopy_pp_32xN_\h:
745
+ sub w12, w12, #1
746
+.rept 8
747
+ ld1b {z0.b}, p0/z, x2
748
+ st1b {z0.b}, p0, x0
749
+ add x2, x2, x3
750
+ add x0, x0, x1
751
+.endr
752
+ cbnz w12, .L_gt_16_blockcopy_pp_32xN_\h
753
+ ret
754
+endfunc
755
+.endm
756
+
757
+blockcopy_pp_32xN_sve 16
758
+blockcopy_pp_32xN_sve 24
759
+blockcopy_pp_32xN_sve 32
760
+blockcopy_pp_32xN_sve 64
761
+blockcopy_pp_32xN_sve 48
762
+
763
+.macro blockcopy_pp_64xN_sve h
764
+function PFX(blockcopy_pp_64x\h\()_sve)
765
+ mov w12, #\h / 4
766
+ rdvl x9, #1
767
+ cmp x9, #16
768
+ bgt .vl_gt_16_blockcopy_pp_64xN_\h
769
+.loop_sve_64x\h\():
770
+ sub w12, w12, #1
771
+.rept 4
772
+ ld1 {v0.16b-v3.16b}, x2, x3
773
+ st1 {v0.16b-v3.16b}, x0, x1
774
+.endr
775
+ cbnz w12, .loop_sve_64x\h
776
+ ret
777
+.vl_gt_16_blockcopy_pp_64xN_\h:
778
+ cmp x9, #48
779
+ bgt .vl_gt_48_blockcopy_pp_64xN_\h
780
+ ptrue p0.b, vl32
781
+.L_le_32_blockcopy_pp_64xN_\h:
782
+ sub w12, w12, #1
783
+.rept 4
784
+ ld1b {z0.b}, p0/z, x2
785
+ ld1b {z1.b}, p0/z, x2, #1, mul vl
786
+ st1b {z0.b}, p0, x0
787
+ st1b {z1.b}, p0, x0, #1, mul vl
788
+ add x2, x2, x3
789
+ add x0, x0, x1
790
+.endr
791
+ cbnz w12, .L_le_32_blockcopy_pp_64xN_\h
792
+ ret
793
+.vl_gt_48_blockcopy_pp_64xN_\h:
794
+ ptrue p0.b, vl64
795
+.L_blockcopy_pp_64xN_\h:
796
+ sub w12, w12, #1
797
+.rept 4
798
+ ld1b {z0.b}, p0/z, x2
799
+ st1b {z0.b}, p0, x0
800
+ add x2, x2, x3
801
+ add x0, x0, x1
802
+.endr
803
+ cbnz w12, .L_blockcopy_pp_64xN_\h
804
+ ret
805
+endfunc
806
+.endm
807
+
808
+blockcopy_pp_64xN_sve 16
809
+blockcopy_pp_64xN_sve 32
810
+blockcopy_pp_64xN_sve 48
811
+blockcopy_pp_64xN_sve 64
812
+
813
+function PFX(blockfill_s_32x32_sve)
814
+ rdvl x9, #1
815
+ cmp x9, #16
816
+ bgt .vl_gt_16_blockfill_s_32_32
817
+ dup v0.8h, w2
818
+ mov v1.16b, v0.16b
819
+ mov v2.16b, v0.16b
820
+ mov v3.16b, v0.16b
821
+ lsl x1, x1, #1
822
+.rept 32
823
+ st1 {v0.8h-v3.8h}, x0, x1
824
+.endr
825
+ ret
826
+.vl_gt_16_blockfill_s_32_32:
827
+ cmp x9, #48
828
+ bgt .vl_gt_48_blockfill_s_32_32
829
+ dup z0.h, w2
830
+ ptrue p0.h, vl16
831
+.rept 32
832
+ st1h {z0.h}, p0, x0
833
+ st1h {z0.h}, p0, x0, #1, mul vl
834
+ add x0, x0, x1, lsl #1
835
+.endr
836
+ ret
837
+.vl_gt_48_blockfill_s_32_32:
838
+ dup z0.h, w2
839
+ ptrue p0.h, vl32
840
+.rept 32
841
+ st1h {z0.h}, p0, x0
842
+ add x0, x0, x1, lsl #1
843
+.endr
844
+ ret
845
+endfunc
846
+
847
+// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
848
+.macro cpy2Dto1D_shl_start_sve
849
+ add x2, x2, x2
850
+ mov z0.h, w3
851
+.endm
852
+
853
+function PFX(cpy2Dto1D_shl_16x16_sve)
854
+ dup z0.h, w3
855
+ rdvl x9, #1
856
+ cmp x9, #16
857
+ bgt .vl_gt_16_cpy2Dto1D_shl_16x16
858
+ cpy2Dto1D_shl_start_sve
859
+ mov w12, #4
860
+.loop_cpy2Dto1D_shl_16_sve:
861
+ sub w12, w12, #1
862
+.rept 4
863
+ ld1 {v2.16b-v3.16b}, x1, x2
864
+ sshl v2.8h, v2.8h, v0.8h
865
+ sshl v3.8h, v3.8h, v0.8h
866
+ st1 {v2.16b-v3.16b}, x0, #32
867
+.endr
868
+ cbnz w12, .loop_cpy2Dto1D_shl_16_sve
869
+ ret
870
+.vl_gt_16_cpy2Dto1D_shl_16x16:
871
+ ptrue p0.h, vl16
872
+.rept 16
873
+ ld1h {z1.h}, p0/z, x1
874
+ lsl z1.h, p0/m, z1.h, z0.h
875
+ st1h {z1.h}, p0, x0
876
+ add x1, x1, x2, lsl #1
877
+ add x0, x0, #32
878
+.endr
879
+ ret
880
+endfunc
881
+
882
+function PFX(cpy2Dto1D_shl_32x32_sve)
883
+ dup z0.h, w3
884
+ rdvl x9, #1
885
+ cmp x9, #16
886
+ bgt .vl_gt_16_cpy2Dto1D_shl_32x32
887
+ cpy2Dto1D_shl_start_sve
888
+ mov w12, #16
889
+.loop_cpy2Dto1D_shl_32_sve:
890
+ sub w12, w12, #1
891
+.rept 2
892
+ ld1 {v2.16b-v5.16b}, x1, x2
893
+ sshl v2.8h, v2.8h, v0.8h
894
+ sshl v3.8h, v3.8h, v0.8h
895
+ sshl v4.8h, v4.8h, v0.8h
896
+ sshl v5.8h, v5.8h, v0.8h
897
+ st1 {v2.16b-v5.16b}, x0, #64
898
+.endr
899
+ cbnz w12, .loop_cpy2Dto1D_shl_32_sve
900
+ ret
901
+.vl_gt_16_cpy2Dto1D_shl_32x32:
902
+ cmp x9, #48
903
+ bgt .vl_gt_48_cpy2Dto1D_shl_32x32
904
+ ptrue p0.h, vl16
905
+.rept 32
906
+ ld1h {z1.h}, p0/z, x1
907
+ ld1h {z2.h}, p0/z, x1, #1, mul vl
908
+ lsl z1.h, p0/m, z1.h, z0.h
909
+ lsl z2.h, p0/m, z2.h, z0.h
910
+ st1h {z1.h}, p0, x0
911
+ st1h {z2.h}, p0, x0, #1, mul vl
912
+ add x1, x1, x2, lsl #1
913
+ add x0, x0, #64
914
+.endr
915
+ ret
916
+.vl_gt_48_cpy2Dto1D_shl_32x32:
917
+ ptrue p0.h, vl32
918
+.rept 32
919
+ ld1h {z1.h}, p0/z, x1
920
+ lsl z1.h, p0/m, z1.h, z0.h
921
+ st1h {z1.h}, p0, x0
922
+ add x1, x1, x2, lsl #1
923
+ add x0, x0, #64
924
+.endr
925
+ ret
926
+endfunc
927
+
928
+function PFX(cpy2Dto1D_shl_64x64_sve)
929
+ rdvl x9, #1
930
+ cmp x9, #16
931
+ bgt .vl_gt_16_cpy2Dto1D_shl_64x64
932
+ cpy2Dto1D_shl_start_sve
933
+ mov w12, #32
934
+ sub x2, x2, #64
935
+.loop_cpy2Dto1D_shl_64_sve:
936
+ sub w12, w12, #1
937
+.rept 2
938
+ ld1 {v2.16b-v5.16b}, x1, #64
939
+ ld1 {v16.16b-v19.16b}, x1, x2
940
+ sshl v2.8h, v2.8h, v0.8h
941
+ sshl v3.8h, v3.8h, v0.8h
942
+ sshl v4.8h, v4.8h, v0.8h
943
+ sshl v5.8h, v5.8h, v0.8h
944
+ sshl v16.8h, v16.8h, v0.8h
945
+ sshl v17.8h, v17.8h, v0.8h
946
+ sshl v18.8h, v18.8h, v0.8h
947
+ sshl v19.8h, v19.8h, v0.8h
948
+ st1 {v2.16b-v5.16b}, x0, #64
949
+ st1 {v16.16b-v19.16b}, x0, #64
950
+.endr
951
+ cbnz w12, .loop_cpy2Dto1D_shl_64_sve
952
+ ret
953
+.vl_gt_16_cpy2Dto1D_shl_64x64:
954
+ dup z0.h, w3
955
+ mov x8, #64
956
+ mov w12, #64
957
+.L_init_cpy2Dto1D_shl_64x64:
958
+ sub w12, w12, 1
959
+ mov x9, #0
960
+ whilelt p0.h, x9, x8
961
+.L_cpy2Dto1D_shl_64x64:
962
+ ld1h {z1.h}, p0/z, x1, x9, lsl #1
963
+ lsl z1.h, p0/m, z1.h, z0.h
964
+ st1h {z1.h}, p0, x0, x9, lsl #1
965
+ inch x9
966
+ whilelt p0.h, x9, x8
967
+ b.first .L_cpy2Dto1D_shl_64x64
968
+ add x1, x1, x2, lsl #1
969
+ addvl x0, x0, #1
970
+ cbnz w12, .L_init_cpy2Dto1D_shl_64x64
971
+ ret
972
+endfunc
973
+
974
+// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
975
+
976
+function PFX(cpy2Dto1D_shr_4x4_sve)
977
+ dup z0.h, w3
978
+ sub w4, w3, #1
979
+ dup z1.h, w4
980
+ ptrue p0.h, vl8
981
+ mov z2.h, #1
982
+ lsl z2.h, p0/m, z2.h, z1.h
983
+ lsl x2, x2, #1
984
+ index z3.d, #0, x2
985
+ index z4.d, #0, #8
986
+.rept 2
987
+ ld1d {z5.d}, p0/z, x1, z3.d
988
+ add x1, x1, x2, lsl #1
989
+ add z5.h, p0/m, z5.h, z2.h
990
+ asr z5.h, p0/m, z5.h, z0.h
991
+ st1d {z5.d}, p0, x0, z4.d
992
+ add x0, x0, #16
993
+.endr
994
+ ret
995
+endfunc
996
+
997
+function PFX(cpy2Dto1D_shr_8x8_sve)
998
+ dup z0.h, w3
999
+ sub w4, w3, #1
1000
+ dup z1.h, w4
1001
+ ptrue p0.h, vl8
1002
+ mov z2.h, #1
1003
+ lsl z2.h, p0/m, z2.h, z1.h
1004
+.rept 8
1005
+ ld1d {z5.d}, p0/z, x1
1006
+ add x1, x1, x2, lsl #1
1007
+ add z5.h, p0/m, z5.h, z2.h
1008
+ asr z5.h, p0/m, z5.h, z0.h
1009
+ st1d {z5.d}, p0, x0
1010
+ add x0, x0, #16
1011
+.endr
1012
+ ret
1013
+endfunc
1014
+
1015
+function PFX(cpy2Dto1D_shr_16x16_sve)
1016
+ dup z0.h, w3
1017
+ sub w4, w3, #1
1018
+ dup z1.h, w4
1019
+ rdvl x9, #1
1020
+ cmp x9, #16
1021
+ bgt .vl_gt_16_cpy2Dto1D_shr_16x16
1022
+ ptrue p0.h, vl8
1023
+ mov z2.h, #1
1024
+ lsl z2.h, p0/m, z2.h, z1.h
1025
+.rept 16
1026
+ ld1d {z5.d}, p0/z, x1
1027
+ ld1d {z6.d}, p0/z, x1, #1, mul vl
1028
+ add x1, x1, x2, lsl #1
1029
+ add z5.h, p0/m, z5.h, z2.h
1030
+ add z6.h, p0/m, z6.h, z2.h
1031
+ asr z5.h, p0/m, z5.h, z0.h
1032
+ asr z6.h, p0/m, z6.h, z0.h
1033
+ st1d {z5.d}, p0, x0
1034
+ st1d {z6.d}, p0, x0, #1, mul vl
1035
+ add x0, x0, #32
1036
+.endr
1037
+ ret
1038
+.vl_gt_16_cpy2Dto1D_shr_16x16:
1039
+ ptrue p0.h, vl16
1040
+ mov z2.h, #1
1041
+ lsl z2.h, p0/m, z2.h, z1.h
1042
+.rept 16
1043
+ ld1d {z5.d}, p0/z, x1
1044
+ add x1, x1, x2, lsl #1
1045
+ add z5.h, p0/m, z5.h, z2.h
1046
+ asr z5.h, p0/m, z5.h, z0.h
1047
+ st1d {z5.d}, p0, x0
1048
+ add x0, x0, #32
1049
+.endr
1050
+ ret
1051
+endfunc
1052
+
1053
+function PFX(cpy2Dto1D_shr_32x32_sve)
1054
+ rdvl x9, #1
1055
+ cmp x9, #16
1056
+ bgt .vl_gt_16_cpy2Dto1D_shr_32x32
1057
+ cpy2Dto1D_shr_start
1058
+ mov w12, #16
1059
+.loop_cpy2Dto1D_shr_32_sve:
1060
+ sub w12, w12, #1
1061
+.rept 2
1062
+ ld1 {v2.8h-v5.8h}, x1, x2
1063
+ sub v2.8h, v2.8h, v1.8h
1064
+ sub v3.8h, v3.8h, v1.8h
1065
+ sub v4.8h, v4.8h, v1.8h
1066
+ sub v5.8h, v5.8h, v1.8h
1067
+ sshl v2.8h, v2.8h, v0.8h
1068
+ sshl v3.8h, v3.8h, v0.8h
1069
+ sshl v4.8h, v4.8h, v0.8h
1070
+ sshl v5.8h, v5.8h, v0.8h
1071
+ st1 {v2.8h-v5.8h}, x0, #64
1072
+.endr
1073
+ cbnz w12, .loop_cpy2Dto1D_shr_32_sve
1074
+ ret
1075
+.vl_gt_16_cpy2Dto1D_shr_32x32:
1076
+ dup z0.h, w3
1077
+ sub w4, w3, #1
1078
+ dup z1.h, w4
1079
+ cmp x9, #48
1080
+ bgt .vl_gt_48_cpy2Dto1D_shr_32x32
1081
+ ptrue p0.h, vl16
1082
+ mov z2.h, #1
1083
+ lsl z2.h, p0/m, z2.h, z1.h
1084
+.rept 32
1085
+ ld1d {z5.d}, p0/z, x1
1086
+ ld1d {z6.d}, p0/z, x1, #1, mul vl
1087
+ add x1, x1, x2, lsl #1
1088
+ add z5.h, p0/m, z5.h, z2.h
1089
+ add z6.h, p0/m, z6.h, z2.h
1090
+ asr z5.h, p0/m, z5.h, z0.h
1091
+ asr z6.h, p0/m, z6.h, z0.h
1092
+ st1d {z5.d}, p0, x0
1093
+ st1d {z6.d}, p0, x0, #1, mul vl
1094
+ add x0, x0, #64
1095
+.endr
1096
+ ret
1097
+.vl_gt_48_cpy2Dto1D_shr_32x32:
1098
+ ptrue p0.h, vl32
1099
+ mov z2.h, #1
1100
+ lsl z2.h, p0/m, z2.h, z1.h
1101
+.rept 32
1102
+ ld1d {z5.d}, p0/z, x1
1103
+ add x1, x1, x2, lsl #1
1104
+ add z5.h, p0/m, z5.h, z2.h
1105
+ asr z5.h, p0/m, z5.h, z0.h
1106
+ st1d {z5.d}, p0, x0
1107
+ add x0, x0, #64
1108
+.endr
1109
+ ret
1110
+endfunc
1111
+
1112
+// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1113
+
1114
+function PFX(cpy1Dto2D_shl_16x16_sve)
1115
+ dup z0.h, w3
1116
+ rdvl x9, #1
1117
+ cmp x9, #16
1118
+ bgt .vl_gt_16_cpy1Dto2D_shl_16x16
1119
+ ptrue p0.h, vl8
1120
+.rept 16
1121
+ ld1h {z1.h}, p0/z, x1
1122
+ ld1h {z2.h}, p0/z, x1, #1, mul vl
1123
+ lsl z1.h, p0/m, z1.h, z0.h
1124
+ lsl z2.h, p0/m, z2.h, z0.h
1125
+ st1h {z1.h}, p0, x0
1126
+ st1h {z2.h}, p0, x0, #1, mul vl
1127
+ add x1, x1, #32
1128
+ add x0, x0, x2, lsl #1
1129
+.endr
1130
+ ret
1131
+.vl_gt_16_cpy1Dto2D_shl_16x16:
1132
+ ptrue p0.h, vl16
1133
+.rept 16
1134
+ ld1h {z1.h}, p0/z, x1
1135
+ lsl z1.h, p0/m, z1.h, z0.h
1136
+ st1h {z1.h}, p0, x0
1137
+ add x1, x1, #32
1138
+ add x0, x0, x2, lsl #1
1139
+.endr
1140
+ ret
1141
+endfunc
1142
+
1143
+function PFX(cpy1Dto2D_shl_32x32_sve)
1144
+ dup z0.h, w3
1145
+ rdvl x9, #1
1146
+ cmp x9, #16
1147
+ bgt .vl_gt_16_cpy1Dto2D_shl_32x32
1148
+ ptrue p0.h, vl8
1149
+.rept 32
1150
+ ld1h {z1.h}, p0/z, x1
1151
+ ld1h {z2.h}, p0/z, x1, #1, mul vl
1152
+ ld1h {z3.h}, p0/z, x1, #2, mul vl
1153
+ ld1h {z4.h}, p0/z, x1, #3, mul vl
1154
+ lsl z1.h, p0/m, z1.h, z0.h
1155
+ lsl z2.h, p0/m, z2.h, z0.h
1156
+ lsl z3.h, p0/m, z3.h, z0.h
1157
+ lsl z4.h, p0/m, z4.h, z0.h
1158
+ st1h {z1.h}, p0, x0
1159
+ st1h {z2.h}, p0, x0, #1, mul vl
1160
+ st1h {z3.h}, p0, x0, #2, mul vl
1161
+ st1h {z4.h}, p0, x0, #3, mul vl
1162
+ add x1, x1, #64
1163
+ add x0, x0, x2, lsl #1
1164
+.endr
1165
+ ret
1166
+.vl_gt_16_cpy1Dto2D_shl_32x32:
1167
+ cmp x9, #48
1168
+ bgt .vl_gt_48_cpy1Dto2D_shl_32x32
1169
+ ptrue p0.h, vl16
1170
+.rept 32
1171
+ ld1h {z1.h}, p0/z, x1
1172
+ ld1h {z2.h}, p0/z, x1, #1, mul vl
1173
+ lsl z1.h, p0/m, z1.h, z0.h
1174
+ lsl z2.h, p0/m, z2.h, z0.h
1175
+ st1h {z1.h}, p0, x0
1176
+ st1h {z2.h}, p0, x0, #1, mul vl
1177
+ add x1, x1, #64
1178
+ add x0, x0, x2, lsl #1
1179
+.endr
1180
+ ret
1181
+.vl_gt_48_cpy1Dto2D_shl_32x32:
1182
+ ptrue p0.h, vl32
1183
+.rept 32
1184
+ ld1h {z1.h}, p0/z, x1
1185
+ lsl z1.h, p0/m, z1.h, z0.h
1186
+ st1h {z1.h}, p0, x0
1187
+ add x1, x1, #64
1188
+ add x0, x0, x2, lsl #1
1189
+.endr
1190
+ ret
1191
+endfunc
1192
+
1193
+function PFX(cpy1Dto2D_shl_64x64_sve)
1194
+ dup z0.h, w3
1195
+ mov x8, #64
1196
+ mov w12, #64
1197
+.L_init_cpy1Dto2D_shl_64x64:
1198
+ sub w12, w12, 1
1199
+ mov x9, #0
1200
+ whilelt p0.h, x9, x8
1201
+.L_cpy1Dto2D_shl_64x64:
1202
+ ld1h {z1.h}, p0/z, x1, x9, lsl #1
1203
+ lsl z1.h, p0/m, z1.h, z0.h
1204
+ st1h {z1.h}, p0, x0, x9, lsl #1
1205
+ inch x9
1206
+ whilelt p0.h, x9, x8
1207
+ b.first .L_cpy1Dto2D_shl_64x64
1208
+ addvl x1, x1, #1
1209
+ add x0, x0, x2, lsl #1
1210
+ cbnz w12, .L_init_cpy1Dto2D_shl_64x64
1211
+ ret
1212
+endfunc
1213
+
1214
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1215
+
1216
+function PFX(cpy1Dto2D_shr_16x16_sve)
1217
+ rdvl x9, #1
1218
+ cmp x9, #16
1219
+ bgt .vl_gt_16_cpy1Dto2D_shr_16x16
1220
+ cpy1Dto2D_shr_start
1221
+ mov w12, #4
1222
+.loop_cpy1Dto2D_shr_16:
1223
+ sub w12, w12, #1
1224
+.rept 4
1225
+ ld1 {v2.8h-v3.8h}, x1, #32
1226
+ sub v2.8h, v2.8h, v1.8h
1227
+ sub v3.8h, v3.8h, v1.8h
1228
+ sshl v2.8h, v2.8h, v0.8h
1229
+ sshl v3.8h, v3.8h, v0.8h
1230
+ st1 {v2.8h-v3.8h}, x0, x2
1231
+.endr
1232
+ cbnz w12, .loop_cpy1Dto2D_shr_16
1233
+ ret
1234
+.vl_gt_16_cpy1Dto2D_shr_16x16:
1235
+ dup z0.h, w3
1236
+ sub w4, w3, #1
1237
+ dup z1.h, w4
1238
+ ptrue p0.h, vl16
1239
+ mov z2.h, #1
1240
+ lsl z2.h, p0/m, z2.h, z1.h
1241
+.rept 16
1242
+ ld1d {z5.d}, p0/z, x1
1243
+ add x1, x1, #32
1244
+ add z5.h, p0/m, z5.h, z2.h
1245
+ asr z5.h, p0/m, z5.h, z0.h
1246
+ st1d {z5.d}, p0, x0
1247
+ add x0, x0, x2, lsl #1
1248
+.endr
1249
+ ret
1250
+endfunc
1251
+
1252
+function PFX(cpy1Dto2D_shr_32x32_sve)
1253
+ rdvl x9, #1
1254
+ cmp x9, #16
1255
+ bgt .vl_gt_16_cpy1Dto2D_shr_32x32
1256
+ cpy1Dto2D_shr_start
1257
+ mov w12, #16
1258
+.loop_cpy1Dto2D_shr_32_sve:
1259
+ sub w12, w12, #1
1260
+.rept 2
1261
+ ld1 {v2.16b-v5.16b}, x1, #64
1262
+ sub v2.8h, v2.8h, v1.8h
1263
+ sub v3.8h, v3.8h, v1.8h
1264
+ sub v4.8h, v4.8h, v1.8h
1265
+ sub v5.8h, v5.8h, v1.8h
1266
+ sshl v2.8h, v2.8h, v0.8h
1267
+ sshl v3.8h, v3.8h, v0.8h
1268
+ sshl v4.8h, v4.8h, v0.8h
1269
+ sshl v5.8h, v5.8h, v0.8h
1270
+ st1 {v2.16b-v5.16b}, x0, x2
1271
+.endr
1272
+ cbnz w12, .loop_cpy1Dto2D_shr_32_sve
1273
+ ret
1274
+.vl_gt_16_cpy1Dto2D_shr_32x32:
1275
+ dup z0.h, w3
1276
+ sub w4, w3, #1
1277
+ dup z1.h, w4
1278
+ cmp x9, #48
1279
+ bgt .vl_gt_48_cpy2Dto1D_shr_32x32
1280
+ ptrue p0.h, vl16
1281
+ mov z2.h, #1
1282
+ lsl z2.h, p0/m, z2.h, z1.h
1283
+.rept 32
1284
+ ld1d {z5.d}, p0/z, x1
1285
+ ld1d {z6.d}, p0/z, x1, #1, mul vl
1286
+ add x1, x1, #64
1287
+ add z5.h, p0/m, z5.h, z2.h
1288
+ add z6.h, p0/m, z6.h, z2.h
1289
+ asr z5.h, p0/m, z5.h, z0.h
1290
+ asr z6.h, p0/m, z6.h, z0.h
1291
+ st1d {z5.d}, p0, x0
1292
+ st1d {z6.d}, p0, x0, #1, mul vl
1293
+ add x0, x0, x2, lsl #1
1294
+.endr
1295
+ ret
1296
+.vl_gt_48_cpy1Dto2D_shr_32x32:
1297
+ ptrue p0.h, vl32
1298
+ mov z2.h, #1
1299
+ lsl z2.h, p0/m, z2.h, z1.h
1300
+.rept 32
1301
+ ld1d {z5.d}, p0/z, x1
1302
+ add x1, x1, #64
1303
+ add z5.h, p0/m, z5.h, z2.h
1304
+ asr z5.h, p0/m, z5.h, z0.h
1305
+ st1d {z5.d}, p0, x0
1306
+ add x0, x0, x2, lsl #1
1307
+.endr
1308
+ ret
1309
+endfunc
1310
+
1311
+function PFX(cpy1Dto2D_shr_64x64_sve)
1312
+ dup z0.h, w3
1313
+ sub w4, w3, #1
1314
+ dup z1.h, w4
1315
+ rdvl x9, #1
1316
+ cmp x9, #16
1317
+ bgt .vl_gt_16_cpy1Dto2D_shr_64x64
1318
+ ptrue p0.h, vl8
1319
+ mov z2.h, #1
1320
+ lsl z2.h, p0/m, z2.h, z1.h
1321
+.rept 128
1322
+ ld1d {z5.d}, p0/z, x1
1323
+ ld1d {z6.d}, p0/z, x1, #1, mul vl
1324
+ ld1d {z7.d}, p0/z, x1, #2, mul vl
1325
+ ld1d {z8.d}, p0/z, x1, #3, mul vl
1326
+ ld1d {z9.d}, p0/z, x1, #4, mul vl
1327
+ ld1d {z10.d}, p0/z, x1, #5, mul vl
1328
+ ld1d {z11.d}, p0/z, x1, #6, mul vl
1329
+ ld1d {z12.d}, p0/z, x1, #7, mul vl
1330
+ add x1, x1, #128
1331
+ add z5.h, p0/m, z5.h, z2.h
1332
+ add z6.h, p0/m, z6.h, z2.h
1333
+ add z7.h, p0/m, z7.h, z2.h
1334
+ add z8.h, p0/m, z8.h, z2.h
1335
+ add z9.h, p0/m, z9.h, z2.h
1336
+ add z10.h, p0/m, z10.h, z2.h
1337
+ add z11.h, p0/m, z11.h, z2.h
1338
+ add z12.h, p0/m, z12.h, z2.h
1339
+ asr z5.h, p0/m, z5.h, z0.h
1340
+ asr z6.h, p0/m, z6.h, z0.h
1341
+ asr z7.h, p0/m, z7.h, z0.h
1342
+ asr z8.h, p0/m, z8.h, z0.h
1343
+ asr z9.h, p0/m, z9.h, z0.h
1344
+ asr z10.h, p0/m, z10.h, z0.h
1345
+ asr z11.h, p0/m, z11.h, z0.h
1346
+ asr z12.h, p0/m, z12.h, z0.h
1347
+ st1d {z5.d}, p0, x0
1348
+ st1d {z6.d}, p0, x0, #1, mul vl
1349
+ st1d {z7.d}, p0, x0, #2, mul vl
1350
+ st1d {z8.d}, p0, x0, #3, mul vl
1351
+ st1d {z9.d}, p0, x0, #4, mul vl
1352
+ st1d {z10.d}, p0, x0, #5, mul vl
1353
+ st1d {z11.d}, p0, x0, #6, mul vl
1354
+ st1d {z12.d}, p0, x0, #7, mul vl
1355
+ add x0, x0, x2, lsl #1
1356
+.endr
1357
+ ret
1358
+.vl_gt_16_cpy1Dto2D_shr_64x64:
1359
+ cmp x9, #48
1360
+ bgt .vl_gt_48_cpy1Dto2D_shr_64x64
1361
+ ptrue p0.h, vl16
1362
+ mov z2.h, #1
1363
+ lsl z2.h, p0/m, z2.h, z1.h
1364
+.rept 128
1365
+ ld1d {z5.d}, p0/z, x1
1366
+ ld1d {z6.d}, p0/z, x1, #1, mul vl
1367
+ ld1d {z7.d}, p0/z, x1, #2, mul vl
1368
+ ld1d {z8.d}, p0/z, x1, #3, mul vl
1369
+ add x1, x1, #128
1370
+ add z5.h, p0/m, z5.h, z2.h
1371
+ add z6.h, p0/m, z6.h, z2.h
1372
+ add z7.h, p0/m, z7.h, z2.h
1373
+ add z8.h, p0/m, z8.h, z2.h
1374
+ asr z5.h, p0/m, z5.h, z0.h
1375
+ asr z6.h, p0/m, z6.h, z0.h
1376
+ asr z7.h, p0/m, z7.h, z0.h
1377
+ asr z8.h, p0/m, z8.h, z0.h
1378
+ st1d {z5.d}, p0, x0
1379
+ st1d {z6.d}, p0, x0, #1, mul vl
1380
+ st1d {z7.d}, p0, x0, #2, mul vl
1381
+ st1d {z8.d}, p0, x0, #3, mul vl
1382
+ add x0, x0, x2, lsl #1
1383
+.endr
1384
+ ret
1385
+.vl_gt_48_cpy1Dto2D_shr_64x64:
1386
+ cmp x9, #112
1387
+ bgt .vl_gt_112_cpy1Dto2D_shr_64x64
1388
+ ptrue p0.h, vl32
1389
+ mov z2.h, #1
1390
+ lsl z2.h, p0/m, z2.h, z1.h
1391
+.rept 128
1392
+ ld1d {z5.d}, p0/z, x1
1393
+ ld1d {z6.d}, p0/z, x1, #1, mul vl
1394
+ add x1, x1, #128
1395
+ add z5.h, p0/m, z5.h, z2.h
1396
+ add z6.h, p0/m, z6.h, z2.h
1397
+ asr z5.h, p0/m, z5.h, z0.h
1398
+ asr z6.h, p0/m, z6.h, z0.h
1399
+ st1d {z5.d}, p0, x0
1400
+ st1d {z6.d}, p0, x0, #1, mul vl
1401
+ add x0, x0, x2, lsl #1
1402
+.endr
1403
+ ret
1404
+.vl_gt_112_cpy1Dto2D_shr_64x64:
1405
+ ptrue p0.h, vl64
1406
+ mov z2.h, #1
1407
+ lsl z2.h, p0/m, z2.h, z1.h
1408
+.rept 128
1409
+ ld1d {z5.d}, p0/z, x1
1410
+ add x1, x1, #128
1411
+ add z5.h, p0/m, z5.h, z2.h
1412
+ asr z5.h, p0/m, z5.h, z0.h
1413
+ st1d {z5.d}, p0, x0
1414
+ add x0, x0, x2, lsl #1
1415
+.endr
1416
+ ret
1417
+endfunc
1418
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S
Added
1301
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "blockcopy8-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
39
+ *
40
+ * r0 - a
41
+ * r1 - stridea
42
+ * r2 - b
43
+ * r3 - strideb */
44
+function PFX(blockcopy_sp_4x4_neon)
45
+ lsl x3, x3, #1
46
+.rept 2
47
+ ld1 {v0.8h}, x2, x3
48
+ ld1 {v1.8h}, x2, x3
49
+ xtn v0.8b, v0.8h
50
+ xtn v1.8b, v1.8h
51
+ st1 {v0.s}0, x0, x1
52
+ st1 {v1.s}0, x0, x1
53
+.endr
54
+ ret
55
+endfunc
56
+
57
+function PFX(blockcopy_sp_8x8_neon)
58
+ lsl x3, x3, #1
59
+.rept 4
60
+ ld1 {v0.8h}, x2, x3
61
+ ld1 {v1.8h}, x2, x3
62
+ xtn v0.8b, v0.8h
63
+ xtn v1.8b, v1.8h
64
+ st1 {v0.d}0, x0, x1
65
+ st1 {v1.d}0, x0, x1
66
+.endr
67
+ ret
68
+endfunc
69
+
70
+function PFX(blockcopy_sp_16x16_neon)
71
+ lsl x3, x3, #1
72
+ movrel x11, xtn_xtn2_table
73
+ ld1 {v31.16b}, x11
74
+.rept 8
75
+ ld1 {v0.8h-v1.8h}, x2, x3
76
+ ld1 {v2.8h-v3.8h}, x2, x3
77
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
78
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
79
+ st1 {v0.16b}, x0, x1
80
+ st1 {v1.16b}, x0, x1
81
+.endr
82
+ ret
83
+endfunc
84
+
85
+function PFX(blockcopy_sp_32x32_neon)
86
+ mov w12, #4
87
+ lsl x3, x3, #1
88
+ movrel x11, xtn_xtn2_table
89
+ ld1 {v31.16b}, x11
90
+.loop_csp32:
91
+ sub w12, w12, #1
92
+.rept 4
93
+ ld1 {v0.8h-v3.8h}, x2, x3
94
+ ld1 {v4.8h-v7.8h}, x2, x3
95
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
96
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
97
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
98
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
99
+ st1 {v0.16b-v1.16b}, x0, x1
100
+ st1 {v2.16b-v3.16b}, x0, x1
101
+.endr
102
+ cbnz w12, .loop_csp32
103
+ ret
104
+endfunc
105
+
106
+function PFX(blockcopy_sp_64x64_neon)
107
+ mov w12, #16
108
+ lsl x3, x3, #1
109
+ sub x3, x3, #64
110
+ movrel x11, xtn_xtn2_table
111
+ ld1 {v31.16b}, x11
112
+.loop_csp64:
113
+ sub w12, w12, #1
114
+.rept 4
115
+ ld1 {v0.8h-v3.8h}, x2, #64
116
+ ld1 {v4.8h-v7.8h}, x2, x3
117
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
118
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
119
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
120
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
121
+ st1 {v0.16b-v3.16b}, x0, x1
122
+.endr
123
+ cbnz w12, .loop_csp64
124
+ ret
125
+endfunc
126
+
127
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
128
+function PFX(blockcopy_ps_4x4_neon)
129
+ lsl x1, x1, #1
130
+.rept 2
131
+ ld1 {v0.8b}, x2, x3
132
+ ld1 {v1.8b}, x2, x3
133
+ uxtl v0.8h, v0.8b
134
+ uxtl v1.8h, v1.8b
135
+ st1 {v0.4h}, x0, x1
136
+ st1 {v1.4h}, x0, x1
137
+.endr
138
+ ret
139
+endfunc
140
+
141
+function PFX(blockcopy_ps_8x8_neon)
142
+ lsl x1, x1, #1
143
+.rept 4
144
+ ld1 {v0.8b}, x2, x3
145
+ ld1 {v1.8b}, x2, x3
146
+ uxtl v0.8h, v0.8b
147
+ uxtl v1.8h, v1.8b
148
+ st1 {v0.8h}, x0, x1
149
+ st1 {v1.8h}, x0, x1
150
+.endr
151
+ ret
152
+endfunc
153
+
154
+function PFX(blockcopy_ps_16x16_neon)
155
+ lsl x1, x1, #1
156
+.rept 8
157
+ ld1 {v4.16b}, x2, x3
158
+ ld1 {v5.16b}, x2, x3
159
+ uxtl v0.8h, v4.8b
160
+ uxtl2 v1.8h, v4.16b
161
+ uxtl v2.8h, v5.8b
162
+ uxtl2 v3.8h, v5.16b
163
+ st1 {v0.8h-v1.8h}, x0, x1
164
+ st1 {v2.8h-v3.8h}, x0, x1
165
+.endr
166
+ ret
167
+endfunc
168
+
169
+function PFX(blockcopy_ps_32x32_neon)
170
+ lsl x1, x1, #1
171
+ mov w12, #4
172
+.loop_cps32:
173
+ sub w12, w12, #1
174
+.rept 4
175
+ ld1 {v16.16b-v17.16b}, x2, x3
176
+ ld1 {v18.16b-v19.16b}, x2, x3
177
+ uxtl v0.8h, v16.8b
178
+ uxtl2 v1.8h, v16.16b
179
+ uxtl v2.8h, v17.8b
180
+ uxtl2 v3.8h, v17.16b
181
+ uxtl v4.8h, v18.8b
182
+ uxtl2 v5.8h, v18.16b
183
+ uxtl v6.8h, v19.8b
184
+ uxtl2 v7.8h, v19.16b
185
+ st1 {v0.8h-v3.8h}, x0, x1
186
+ st1 {v4.8h-v7.8h}, x0, x1
187
+.endr
188
+ cbnz w12, .loop_cps32
189
+ ret
190
+endfunc
191
+
192
+function PFX(blockcopy_ps_64x64_neon)
193
+ lsl x1, x1, #1
194
+ sub x1, x1, #64
195
+ mov w12, #16
196
+.loop_cps64:
197
+ sub w12, w12, #1
198
+.rept 4
199
+ ld1 {v16.16b-v19.16b}, x2, x3
200
+ uxtl v0.8h, v16.8b
201
+ uxtl2 v1.8h, v16.16b
202
+ uxtl v2.8h, v17.8b
203
+ uxtl2 v3.8h, v17.16b
204
+ uxtl v4.8h, v18.8b
205
+ uxtl2 v5.8h, v18.16b
206
+ uxtl v6.8h, v19.8b
207
+ uxtl2 v7.8h, v19.16b
208
+ st1 {v0.8h-v3.8h}, x0, #64
209
+ st1 {v4.8h-v7.8h}, x0, x1
210
+.endr
211
+ cbnz w12, .loop_cps64
212
+ ret
213
+endfunc
214
+
215
+// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
216
+function PFX(blockcopy_ss_4x4_neon)
217
+ lsl x1, x1, #1
218
+ lsl x3, x3, #1
219
+.rept 2
220
+ ld1 {v0.8b}, x2, x3
221
+ ld1 {v1.8b}, x2, x3
222
+ st1 {v0.8b}, x0, x1
223
+ st1 {v1.8b}, x0, x1
224
+.endr
225
+ ret
226
+endfunc
227
+
228
+function PFX(blockcopy_ss_8x8_neon)
229
+ lsl x1, x1, #1
230
+ lsl x3, x3, #1
231
+.rept 4
232
+ ld1 {v0.8h}, x2, x3
233
+ ld1 {v1.8h}, x2, x3
234
+ st1 {v0.8h}, x0, x1
235
+ st1 {v1.8h}, x0, x1
236
+.endr
237
+ ret
238
+endfunc
239
+
240
+function PFX(blockcopy_ss_16x16_neon)
241
+ lsl x1, x1, #1
242
+ lsl x3, x3, #1
243
+.rept 8
244
+ ld1 {v0.8h-v1.8h}, x2, x3
245
+ ld1 {v2.8h-v3.8h}, x2, x3
246
+ st1 {v0.8h-v1.8h}, x0, x1
247
+ st1 {v2.8h-v3.8h}, x0, x1
248
+.endr
249
+ ret
250
+endfunc
251
+
252
+function PFX(blockcopy_ss_32x32_neon)
253
+ lsl x1, x1, #1
254
+ lsl x3, x3, #1
255
+ mov w12, #4
256
+.loop_css32:
257
+ sub w12, w12, #1
258
+.rept 8
259
+ ld1 {v0.8h-v3.8h}, x2, x3
260
+ st1 {v0.8h-v3.8h}, x0, x1
261
+.endr
262
+ cbnz w12, .loop_css32
263
+ ret
264
+endfunc
265
+
266
+function PFX(blockcopy_ss_64x64_neon)
267
+ lsl x1, x1, #1
268
+ sub x1, x1, #64
269
+ lsl x3, x3, #1
270
+ sub x3, x3, #64
271
+ mov w12, #8
272
+.loop_css64:
273
+ sub w12, w12, #1
274
+.rept 8
275
+ ld1 {v0.8h-v3.8h}, x2, #64
276
+ ld1 {v4.8h-v7.8h}, x2, x3
277
+ st1 {v0.8h-v3.8h}, x0, #64
278
+ st1 {v4.8h-v7.8h}, x0, x1
279
+.endr
280
+ cbnz w12, .loop_css64
281
+ ret
282
+endfunc
283
+
284
+/******** Chroma blockcopy********/
285
+function PFX(blockcopy_ss_4x8_neon)
286
+ lsl x1, x1, #1
287
+ lsl x3, x3, #1
288
+.rept 4
289
+ ld1 {v0.8b}, x2, x3
290
+ ld1 {v1.8b}, x2, x3
291
+ st1 {v0.8b}, x0, x1
292
+ st1 {v1.8b}, x0, x1
293
+.endr
294
+ ret
295
+endfunc
296
+
297
+function PFX(blockcopy_ss_8x16_neon)
298
+ lsl x1, x1, #1
299
+ lsl x3, x3, #1
300
+.rept 8
301
+ ld1 {v0.8h}, x2, x3
302
+ ld1 {v1.8h}, x2, x3
303
+ st1 {v0.8h}, x0, x1
304
+ st1 {v1.8h}, x0, x1
305
+.endr
306
+ ret
307
+endfunc
308
+
309
+function PFX(blockcopy_ss_16x32_neon)
310
+ lsl x1, x1, #1
311
+ lsl x3, x3, #1
312
+.rept 16
313
+ ld1 {v0.8h-v1.8h}, x2, x3
314
+ ld1 {v2.8h-v3.8h}, x2, x3
315
+ st1 {v0.8h-v1.8h}, x0, x1
316
+ st1 {v2.8h-v3.8h}, x0, x1
317
+.endr
318
+ ret
319
+endfunc
320
+
321
+function PFX(blockcopy_ss_32x64_neon)
322
+ lsl x1, x1, #1
323
+ lsl x3, x3, #1
324
+ mov w12, #8
325
+.loop_css32x64:
326
+ sub w12, w12, #1
327
+.rept 8
328
+ ld1 {v0.8h-v3.8h}, x2, x3
329
+ st1 {v0.8h-v3.8h}, x0, x1
330
+.endr
331
+ cbnz w12, .loop_css32x64
332
+ ret
333
+endfunc
334
+
335
+// chroma blockcopy_ps
336
+function PFX(blockcopy_ps_4x8_neon)
337
+ lsl x1, x1, #1
338
+.rept 4
339
+ ld1 {v0.8b}, x2, x3
340
+ ld1 {v1.8b}, x2, x3
341
+ uxtl v0.8h, v0.8b
342
+ uxtl v1.8h, v1.8b
343
+ st1 {v0.4h}, x0, x1
344
+ st1 {v1.4h}, x0, x1
345
+.endr
346
+ ret
347
+endfunc
348
+
349
+function PFX(blockcopy_ps_8x16_neon)
350
+ lsl x1, x1, #1
351
+.rept 8
352
+ ld1 {v0.8b}, x2, x3
353
+ ld1 {v1.8b}, x2, x3
354
+ uxtl v0.8h, v0.8b
355
+ uxtl v1.8h, v1.8b
356
+ st1 {v0.8h}, x0, x1
357
+ st1 {v1.8h}, x0, x1
358
+.endr
359
+ ret
360
+endfunc
361
+
362
+function PFX(blockcopy_ps_16x32_neon)
363
+ lsl x1, x1, #1
364
+.rept 16
365
+ ld1 {v4.16b}, x2, x3
366
+ ld1 {v5.16b}, x2, x3
367
+ uxtl v0.8h, v4.8b
368
+ uxtl2 v1.8h, v4.16b
369
+ uxtl v2.8h, v5.8b
370
+ uxtl2 v3.8h, v5.16b
371
+ st1 {v0.8h-v1.8h}, x0, x1
372
+ st1 {v2.8h-v3.8h}, x0, x1
373
+.endr
374
+ ret
375
+endfunc
376
+
377
+function PFX(blockcopy_ps_32x64_neon)
378
+ lsl x1, x1, #1
379
+ mov w12, #8
380
+.loop_cps32x64:
381
+ sub w12, w12, #1
382
+.rept 4
383
+ ld1 {v16.16b-v17.16b}, x2, x3
384
+ ld1 {v18.16b-v19.16b}, x2, x3
385
+ uxtl v0.8h, v16.8b
386
+ uxtl2 v1.8h, v16.16b
387
+ uxtl v2.8h, v17.8b
388
+ uxtl2 v3.8h, v17.16b
389
+ uxtl v4.8h, v18.8b
390
+ uxtl2 v5.8h, v18.16b
391
+ uxtl v6.8h, v19.8b
392
+ uxtl2 v7.8h, v19.16b
393
+ st1 {v0.8h-v3.8h}, x0, x1
394
+ st1 {v4.8h-v7.8h}, x0, x1
395
+.endr
396
+ cbnz w12, .loop_cps32x64
397
+ ret
398
+endfunc
399
+
400
+// chroma blockcopy_sp
401
+function PFX(blockcopy_sp_4x8_neon)
402
+ lsl x3, x3, #1
403
+.rept 4
404
+ ld1 {v0.8h}, x2, x3
405
+ ld1 {v1.8h}, x2, x3
406
+ xtn v0.8b, v0.8h
407
+ xtn v1.8b, v1.8h
408
+ st1 {v0.s}0, x0, x1
409
+ st1 {v1.s}0, x0, x1
410
+.endr
411
+ ret
412
+endfunc
413
+
414
+function PFX(blockcopy_sp_8x16_neon)
415
+ lsl x3, x3, #1
416
+.rept 8
417
+ ld1 {v0.8h}, x2, x3
418
+ ld1 {v1.8h}, x2, x3
419
+ xtn v0.8b, v0.8h
420
+ xtn v1.8b, v1.8h
421
+ st1 {v0.d}0, x0, x1
422
+ st1 {v1.d}0, x0, x1
423
+.endr
424
+ ret
425
+endfunc
426
+
427
+function PFX(blockcopy_sp_16x32_neon)
428
+ lsl x3, x3, #1
429
+ movrel x11, xtn_xtn2_table
430
+ ld1 {v31.16b}, x11
431
+.rept 16
432
+ ld1 {v0.8h-v1.8h}, x2, x3
433
+ ld1 {v2.8h-v3.8h}, x2, x3
434
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
435
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
436
+ st1 {v0.16b}, x0, x1
437
+ st1 {v1.16b}, x0, x1
438
+.endr
439
+ ret
440
+endfunc
441
+
442
+function PFX(blockcopy_sp_32x64_neon)
443
+ mov w12, #8
444
+ lsl x3, x3, #1
445
+ movrel x11, xtn_xtn2_table
446
+ ld1 {v31.16b}, x11
447
+.loop_csp32x64:
448
+ sub w12, w12, #1
449
+.rept 4
450
+ ld1 {v0.8h-v3.8h}, x2, x3
451
+ ld1 {v4.8h-v7.8h}, x2, x3
452
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
453
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
454
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
455
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
456
+ st1 {v0.16b-v1.16b}, x0, x1
457
+ st1 {v2.16b-v3.16b}, x0, x1
458
+.endr
459
+ cbnz w12, .loop_csp32x64
460
+ ret
461
+endfunc
462
+
463
+/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
464
+
465
+function PFX(blockcopy_pp_2x4_neon)
466
+ ldrh w9, x2
467
+ add x4, x1, x1
468
+ add x14, x3, x3
469
+ strh w9, x0
470
+ ldrh w10, x2, x3
471
+ add x5, x4, x1
472
+ add x15, x14, x3
473
+ strh w10, x0, x1
474
+ ldrh w11, x2, x14
475
+ strh w11, x0, x4
476
+ ldrh w12, x2, x15
477
+ strh w12, x0, x5
478
+ ret
479
+endfunc
480
+
481
+.macro blockcopy_pp_2xN_neon h
482
+function PFX(blockcopy_pp_2x\h\()_neon)
483
+ add x4, x1, x1
484
+ add x5, x4, x1
485
+ add x6, x5, x1
486
+
487
+ add x14, x3, x3
488
+ add x15, x14, x3
489
+ add x16, x15, x3
490
+
491
+.rept \h / 4
492
+ ldrh w9, x2
493
+ strh w9, x0
494
+ ldrh w10, x2, x3
495
+ strh w10, x0, x1
496
+ ldrh w11, x2, x14
497
+ strh w11, x0, x4
498
+ ldrh w12, x2, x15
499
+ strh w12, x0, x5
500
+ add x2, x2, x16
501
+ add x0, x0, x6
502
+.endr
503
+ ret
504
+endfunc
505
+.endm
506
+
507
+blockcopy_pp_2xN_neon 8
508
+blockcopy_pp_2xN_neon 16
509
+
510
+function PFX(blockcopy_pp_4x2_neon)
511
+ ldr w9, x2
512
+ str w9, x0
513
+ ldr w10, x2, x3
514
+ str w10, x0, x1
515
+ ret
516
+endfunc
517
+
518
+function PFX(blockcopy_pp_4x4_neon)
519
+ ldr w9, x2
520
+ add x4, x1, x1
521
+ add x14, x3, x3
522
+ str w9, x0
523
+ ldr w10, x2, x3
524
+ add x5, x4, x1
525
+ add x15, x14, x3
526
+ str w10, x0, x1
527
+ ldr w11, x2, x14
528
+ str w11, x0, x4
529
+ ldr w12, x2, x15
530
+ str w12, x0, x5
531
+ ret
532
+endfunc
533
+
534
+.macro blockcopy_pp_4xN_neon h
535
+function PFX(blockcopy_pp_4x\h\()_neon)
536
+ add x4, x1, x1
537
+ add x5, x4, x1
538
+ add x6, x5, x1
539
+
540
+ add x14, x3, x3
541
+ add x15, x14, x3
542
+ add x16, x15, x3
543
+
544
+.rept \h / 4
545
+ ldr w9, x2
546
+ str w9, x0
547
+ ldr w10, x2, x3
548
+ str w10, x0, x1
549
+ ldr w11, x2, x14
550
+ str w11, x0, x4
551
+ ldr w12, x2, x15
552
+ str w12, x0, x5
553
+ add x2, x2, x16
554
+ add x0, x0, x6
555
+.endr
556
+ ret
557
+endfunc
558
+.endm
559
+
560
+blockcopy_pp_4xN_neon 8
561
+blockcopy_pp_4xN_neon 16
562
+blockcopy_pp_4xN_neon 32
563
+
564
+.macro blockcopy_pp_6xN_neon h
565
+function PFX(blockcopy_pp_6x\h\()_neon)
566
+ sub x1, x1, #4
567
+.rept \h
568
+ ld1 {v0.8b}, x2, x3
569
+ st1 {v0.s}0, x0, #4
570
+ st1 {v0.h}2, x0, x1
571
+.endr
572
+ ret
573
+endfunc
574
+.endm
575
+
576
+blockcopy_pp_6xN_neon 8
577
+blockcopy_pp_6xN_neon 16
578
+
579
+.macro blockcopy_pp_8xN_neon h
580
+function PFX(blockcopy_pp_8x\h\()_neon)
581
+.rept \h
582
+ ld1 {v0.4h}, x2, x3
583
+ st1 {v0.4h}, x0, x1
584
+.endr
585
+ ret
586
+endfunc
587
+.endm
588
+
589
+blockcopy_pp_8xN_neon 2
590
+blockcopy_pp_8xN_neon 4
591
+blockcopy_pp_8xN_neon 6
592
+blockcopy_pp_8xN_neon 8
593
+blockcopy_pp_8xN_neon 12
594
+blockcopy_pp_8xN_neon 16
595
+blockcopy_pp_8xN_neon 32
596
+
597
+function PFX(blockcopy_pp_8x64_neon)
598
+ mov w12, #4
599
+.loop_pp_8x64:
600
+ sub w12, w12, #1
601
+.rept 16
602
+ ld1 {v0.4h}, x2, x3
603
+ st1 {v0.4h}, x0, x1
604
+.endr
605
+ cbnz w12, .loop_pp_8x64
606
+ ret
607
+endfunc
608
+
609
+.macro blockcopy_pp_16xN_neon h
610
+function PFX(blockcopy_pp_16x\h\()_neon)
611
+.rept \h
612
+ ld1 {v0.8h}, x2, x3
613
+ st1 {v0.8h}, x0, x1
614
+.endr
615
+ ret
616
+endfunc
617
+.endm
618
+
619
+blockcopy_pp_16xN_neon 4
620
+blockcopy_pp_16xN_neon 8
621
+blockcopy_pp_16xN_neon 12
622
+blockcopy_pp_16xN_neon 16
623
+
624
+.macro blockcopy_pp_16xN1_neon h
625
+function PFX(blockcopy_pp_16x\h\()_neon)
626
+ mov w12, #\h / 8
627
+.loop_16x\h\():
628
+.rept 8
629
+ ld1 {v0.8h}, x2, x3
630
+ st1 {v0.8h}, x0, x1
631
+.endr
632
+ sub w12, w12, #1
633
+ cbnz w12, .loop_16x\h
634
+ ret
635
+endfunc
636
+.endm
637
+
638
+blockcopy_pp_16xN1_neon 24
639
+blockcopy_pp_16xN1_neon 32
640
+blockcopy_pp_16xN1_neon 64
641
+
642
+function PFX(blockcopy_pp_12x16_neon)
643
+ sub x1, x1, #8
644
+.rept 16
645
+ ld1 {v0.16b}, x2, x3
646
+ str d0, x0, #8
647
+ st1 {v0.s}2, x0, x1
648
+.endr
649
+ ret
650
+endfunc
651
+
652
+function PFX(blockcopy_pp_12x32_neon)
653
+ sub x1, x1, #8
654
+ mov w12, #4
655
+.loop_pp_12x32:
656
+ sub w12, w12, #1
657
+.rept 8
658
+ ld1 {v0.16b}, x2, x3
659
+ str d0, x0, #8
660
+ st1 {v0.s}2, x0, x1
661
+.endr
662
+ cbnz w12, .loop_pp_12x32
663
+ ret
664
+endfunc
665
+
666
+function PFX(blockcopy_pp_24x32_neon)
667
+ mov w12, #4
668
+.loop_24x32:
669
+ sub w12, w12, #1
670
+.rept 8
671
+ ld1 {v0.8b-v2.8b}, x2, x3
672
+ st1 {v0.8b-v2.8b}, x0, x1
673
+.endr
674
+ cbnz w12, .loop_24x32
675
+ ret
676
+endfunc
677
+
678
+function PFX(blockcopy_pp_24x64_neon)
679
+ mov w12, #4
680
+.loop_24x64:
681
+ sub w12, w12, #1
682
+.rept 16
683
+ ld1 {v0.8b-v2.8b}, x2, x3
684
+ st1 {v0.8b-v2.8b}, x0, x1
685
+.endr
686
+ cbnz w12, .loop_24x64
687
+ ret
688
+endfunc
689
+
690
+function PFX(blockcopy_pp_32x8_neon)
691
+.rept 8
692
+ ld1 {v0.16b-v1.16b}, x2, x3
693
+ st1 {v0.16b-v1.16b}, x0, x1
694
+.endr
695
+ ret
696
+endfunc
697
+
698
+.macro blockcopy_pp_32xN_neon h
699
+function PFX(blockcopy_pp_32x\h\()_neon)
700
+ mov w12, #\h / 8
701
+.loop_32x\h\():
702
+ sub w12, w12, #1
703
+.rept 8
704
+ ld1 {v0.16b-v1.16b}, x2, x3
705
+ st1 {v0.16b-v1.16b}, x0, x1
706
+.endr
707
+ cbnz w12, .loop_32x\h
708
+ ret
709
+endfunc
710
+.endm
711
+
712
+blockcopy_pp_32xN_neon 16
713
+blockcopy_pp_32xN_neon 24
714
+blockcopy_pp_32xN_neon 32
715
+blockcopy_pp_32xN_neon 64
716
+blockcopy_pp_32xN_neon 48
717
+
718
+function PFX(blockcopy_pp_48x64_neon)
719
+ mov w12, #8
720
+.loop_48x64:
721
+ sub w12, w12, #1
722
+.rept 8
723
+ ld1 {v0.16b-v2.16b}, x2, x3
724
+ st1 {v0.16b-v2.16b}, x0, x1
725
+.endr
726
+ cbnz w12, .loop_48x64
727
+ ret
728
+endfunc
729
+
730
+.macro blockcopy_pp_64xN_neon h
731
+function PFX(blockcopy_pp_64x\h\()_neon)
732
+ mov w12, #\h / 4
733
+.loop_64x\h\():
734
+ sub w12, w12, #1
735
+.rept 4
736
+ ld1 {v0.16b-v3.16b}, x2, x3
737
+ st1 {v0.16b-v3.16b}, x0, x1
738
+.endr
739
+ cbnz w12, .loop_64x\h
740
+ ret
741
+endfunc
742
+.endm
743
+
744
+blockcopy_pp_64xN_neon 16
745
+blockcopy_pp_64xN_neon 32
746
+blockcopy_pp_64xN_neon 48
747
+blockcopy_pp_64xN_neon 64
748
+
749
+// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
750
+function PFX(blockfill_s_4x4_neon)
751
+ dup v0.4h, w2
752
+ lsl x1, x1, #1
753
+.rept 4
754
+ st1 {v0.4h}, x0, x1
755
+.endr
756
+ ret
757
+endfunc
758
+
759
+function PFX(blockfill_s_8x8_neon)
760
+ dup v0.8h, w2
761
+ lsl x1, x1, #1
762
+.rept 8
763
+ st1 {v0.8h}, x0, x1
764
+.endr
765
+ ret
766
+endfunc
767
+
768
+function PFX(blockfill_s_16x16_neon)
769
+ dup v0.8h, w2
770
+ mov v1.16b, v0.16b
771
+ lsl x1, x1, #1
772
+.rept 16
773
+ stp q0, q1, x0
774
+ add x0, x0, x1
775
+.endr
776
+ ret
777
+endfunc
778
+
779
+function PFX(blockfill_s_32x32_neon)
780
+ dup v0.8h, w2
781
+ mov v1.16b, v0.16b
782
+ mov v2.16b, v0.16b
783
+ mov v3.16b, v0.16b
784
+ lsl x1, x1, #1
785
+.rept 32
786
+ st1 {v0.8h-v3.8h}, x0, x1
787
+.endr
788
+ ret
789
+endfunc
790
+
791
+function PFX(blockfill_s_64x64_neon)
792
+ dup v0.8h, w2
793
+ mov v1.16b, v0.16b
794
+ mov v2.16b, v0.16b
795
+ mov v3.16b, v0.16b
796
+ lsl x1, x1, #1
797
+ sub x1, x1, #64
798
+.rept 64
799
+ st1 {v0.8h-v3.8h}, x0, #64
800
+ st1 {v0.8h-v3.8h}, x0, x1
801
+.endr
802
+ ret
803
+endfunc
804
+
805
+// uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
806
+function PFX(copy_cnt_4_neon)
807
+ lsl x2, x2, #1
808
+ movi v4.8b, #0
809
+.rept 2
810
+ ld1 {v0.8b}, x1, x2
811
+ ld1 {v1.8b}, x1, x2
812
+ stp d0, d1, x0, #16
813
+ cmeq v0.4h, v0.4h, #0
814
+ cmeq v1.4h, v1.4h, #0
815
+ add v4.4h, v4.4h, v0.4h
816
+ add v4.4h, v4.4h, v1.4h
817
+.endr
818
+ saddlv s4, v4.4h
819
+ fmov w12, s4
820
+ add w0, w12, #16
821
+ ret
822
+endfunc
823
+
824
+function PFX(copy_cnt_8_neon)
825
+ lsl x2, x2, #1
826
+ movi v4.8b, #0
827
+.rept 4
828
+ ld1 {v0.16b}, x1, x2
829
+ ld1 {v1.16b}, x1, x2
830
+ stp q0, q1, x0, #32
831
+ cmeq v0.8h, v0.8h, #0
832
+ cmeq v1.8h, v1.8h, #0
833
+ add v4.8h, v4.8h, v0.8h
834
+ add v4.8h, v4.8h, v1.8h
835
+.endr
836
+ saddlv s4, v4.8h
837
+ fmov w12, s4
838
+ add w0, w12, #64
839
+ ret
840
+endfunc
841
+
842
+function PFX(copy_cnt_16_neon)
843
+ lsl x2, x2, #1
844
+ movi v4.8b, #0
845
+.rept 16
846
+ ld1 {v0.16b-v1.16b}, x1, x2
847
+ st1 {v0.16b-v1.16b}, x0, #32
848
+ cmeq v0.8h, v0.8h, #0
849
+ cmeq v1.8h, v1.8h, #0
850
+ add v4.8h, v4.8h, v0.8h
851
+ add v4.8h, v4.8h, v1.8h
852
+.endr
853
+ saddlv s4, v4.8h
854
+ fmov w12, s4
855
+ add w0, w12, #256
856
+ ret
857
+endfunc
858
+
859
+function PFX(copy_cnt_32_neon)
860
+ lsl x2, x2, #1
861
+ movi v4.8b, #0
862
+.rept 32
863
+ ld1 {v0.16b-v3.16b}, x1, x2
864
+ st1 {v0.16b-v3.16b}, x0, #64
865
+ cmeq v0.8h, v0.8h, #0
866
+ cmeq v1.8h, v1.8h, #0
867
+ cmeq v2.8h, v2.8h, #0
868
+ cmeq v3.8h, v3.8h, #0
869
+ add v0.8h, v0.8h, v1.8h
870
+ add v2.8h, v2.8h, v3.8h
871
+ add v4.8h, v4.8h, v0.8h
872
+ add v4.8h, v4.8h, v2.8h
873
+.endr
874
+ saddlv s4, v4.8h
875
+ fmov w12, s4
876
+ add w0, w12, #1024
877
+ ret
878
+endfunc
879
+
880
+// int count_nonzero_c(const int16_t* quantCoeff)
881
+function PFX(count_nonzero_4_neon)
882
+ movi v16.16b, #1
883
+ movi v17.16b, #0
884
+ trn1 v16.16b, v16.16b, v17.16b
885
+ ldp q0, q1, x0
886
+ cmhi v0.8h, v0.8h, v17.8h
887
+ cmhi v1.8h, v1.8h, v17.8h
888
+ and v0.16b, v0.16b, v16.16b
889
+ and v1.16b, v1.16b, v16.16b
890
+ add v0.8h, v0.8h, v1.8h
891
+ uaddlv s0, v0.8h
892
+ fmov w0, s0
893
+ ret
894
+endfunc
895
+
896
+.macro COUNT_NONZERO_8
897
+ ld1 {v0.16b-v3.16b}, x0, #64
898
+ ld1 {v4.16b-v7.16b}, x0, #64
899
+ cmhi v0.8h, v0.8h, v17.8h
900
+ cmhi v1.8h, v1.8h, v17.8h
901
+ cmhi v2.8h, v2.8h, v17.8h
902
+ cmhi v3.8h, v3.8h, v17.8h
903
+ cmhi v4.8h, v4.8h, v17.8h
904
+ cmhi v5.8h, v5.8h, v17.8h
905
+ cmhi v6.8h, v6.8h, v17.8h
906
+ cmhi v7.8h, v7.8h, v17.8h
907
+ and v0.16b, v0.16b, v16.16b
908
+ and v1.16b, v1.16b, v16.16b
909
+ and v2.16b, v2.16b, v16.16b
910
+ and v3.16b, v3.16b, v16.16b
911
+ and v4.16b, v4.16b, v16.16b
912
+ and v5.16b, v5.16b, v16.16b
913
+ and v6.16b, v6.16b, v16.16b
914
+ and v7.16b, v7.16b, v16.16b
915
+ add v0.8h, v0.8h, v1.8h
916
+ add v2.8h, v2.8h, v3.8h
917
+ add v4.8h, v4.8h, v5.8h
918
+ add v6.8h, v6.8h, v7.8h
919
+ add v0.8h, v0.8h, v2.8h
920
+ add v4.8h, v4.8h, v6.8h
921
+ add v0.8h, v0.8h, v4.8h
922
+.endm
923
+
924
+function PFX(count_nonzero_8_neon)
925
+ movi v16.16b, #1
926
+ movi v17.16b, #0
927
+ trn1 v16.16b, v16.16b, v17.16b
928
+ COUNT_NONZERO_8
929
+ uaddlv s0, v0.8h
930
+ fmov w0, s0
931
+ ret
932
+endfunc
933
+
934
+function PFX(count_nonzero_16_neon)
935
+ movi v16.16b, #1
936
+ movi v17.16b, #0
937
+ trn1 v16.16b, v16.16b, v17.16b
938
+ movi v18.16b, #0
939
+.rept 4
940
+ COUNT_NONZERO_8
941
+ add v18.16b, v18.16b, v0.16b
942
+.endr
943
+ uaddlv s0, v18.8h
944
+ fmov w0, s0
945
+ ret
946
+endfunc
947
+
948
+function PFX(count_nonzero_32_neon)
949
+ movi v16.16b, #1
950
+ movi v17.16b, #0
951
+ trn1 v16.16b, v16.16b, v17.16b
952
+ movi v18.16b, #0
953
+ mov w12, #16
954
+.loop_count_nonzero_32:
955
+ sub w12, w12, #1
956
+ COUNT_NONZERO_8
957
+ add v18.16b, v18.16b, v0.16b
958
+ cbnz w12, .loop_count_nonzero_32
959
+
960
+ uaddlv s0, v18.8h
961
+ fmov w0, s0
962
+ ret
963
+endfunc
964
+
965
+// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
966
+.macro cpy2Dto1D_shl_start
967
+ add x2, x2, x2
968
+ dup v0.8h, w3
969
+.endm
970
+
971
+function PFX(cpy2Dto1D_shl_4x4_neon)
972
+ cpy2Dto1D_shl_start
973
+ ld1 {v2.d}0, x1, x2
974
+ ld1 {v2.d}1, x1, x2
975
+ ld1 {v3.d}0, x1, x2
976
+ ld1 {v3.d}1, x1, x2
977
+ sshl v2.8h, v2.8h, v0.8h
978
+ sshl v3.8h, v3.8h, v0.8h
979
+ st1 {v2.16b-v3.16b}, x0
980
+ ret
981
+endfunc
982
+
983
+function PFX(cpy2Dto1D_shl_8x8_neon)
984
+ cpy2Dto1D_shl_start
985
+.rept 4
986
+ ld1 {v2.16b}, x1, x2
987
+ ld1 {v3.16b}, x1, x2
988
+ sshl v2.8h, v2.8h, v0.8h
989
+ sshl v3.8h, v3.8h, v0.8h
990
+ st1 {v2.16b-v3.16b}, x0, #32
991
+.endr
992
+ ret
993
+endfunc
994
+
995
+function PFX(cpy2Dto1D_shl_16x16_neon)
996
+ cpy2Dto1D_shl_start
997
+ mov w12, #4
998
+.loop_cpy2Dto1D_shl_16:
999
+ sub w12, w12, #1
1000
+.rept 4
1001
+ ld1 {v2.16b-v3.16b}, x1, x2
1002
+ sshl v2.8h, v2.8h, v0.8h
1003
+ sshl v3.8h, v3.8h, v0.8h
1004
+ st1 {v2.16b-v3.16b}, x0, #32
1005
+.endr
1006
+ cbnz w12, .loop_cpy2Dto1D_shl_16
1007
+ ret
1008
+endfunc
1009
+
1010
+function PFX(cpy2Dto1D_shl_32x32_neon)
1011
+ cpy2Dto1D_shl_start
1012
+ mov w12, #16
1013
+.loop_cpy2Dto1D_shl_32:
1014
+ sub w12, w12, #1
1015
+.rept 2
1016
+ ld1 {v2.16b-v5.16b}, x1, x2
1017
+ sshl v2.8h, v2.8h, v0.8h
1018
+ sshl v3.8h, v3.8h, v0.8h
1019
+ sshl v4.8h, v4.8h, v0.8h
1020
+ sshl v5.8h, v5.8h, v0.8h
1021
+ st1 {v2.16b-v5.16b}, x0, #64
1022
+.endr
1023
+ cbnz w12, .loop_cpy2Dto1D_shl_32
1024
+ ret
1025
+endfunc
1026
+
1027
+function PFX(cpy2Dto1D_shl_64x64_neon)
1028
+ cpy2Dto1D_shl_start
1029
+ mov w12, #32
1030
+ sub x2, x2, #64
1031
+.loop_cpy2Dto1D_shl_64:
1032
+ sub w12, w12, #1
1033
+.rept 2
1034
+ ld1 {v2.16b-v5.16b}, x1, #64
1035
+ ld1 {v16.16b-v19.16b}, x1, x2
1036
+ sshl v2.8h, v2.8h, v0.8h
1037
+ sshl v3.8h, v3.8h, v0.8h
1038
+ sshl v4.8h, v4.8h, v0.8h
1039
+ sshl v5.8h, v5.8h, v0.8h
1040
+ sshl v16.8h, v16.8h, v0.8h
1041
+ sshl v17.8h, v17.8h, v0.8h
1042
+ sshl v18.8h, v18.8h, v0.8h
1043
+ sshl v19.8h, v19.8h, v0.8h
1044
+ st1 {v2.16b-v5.16b}, x0, #64
1045
+ st1 {v16.16b-v19.16b}, x0, #64
1046
+.endr
1047
+ cbnz w12, .loop_cpy2Dto1D_shl_64
1048
+ ret
1049
+endfunc
1050
+
1051
+// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
1052
+function PFX(cpy2Dto1D_shr_4x4_neon)
1053
+ cpy2Dto1D_shr_start
1054
+ ld1 {v2.d}0, x1, x2
1055
+ ld1 {v2.d}1, x1, x2
1056
+ ld1 {v3.d}0, x1, x2
1057
+ ld1 {v3.d}1, x1, x2
1058
+ sub v2.8h, v2.8h, v1.8h
1059
+ sub v3.8h, v3.8h, v1.8h
1060
+ sshl v2.8h, v2.8h, v0.8h
1061
+ sshl v3.8h, v3.8h, v0.8h
1062
+ stp q2, q3, x0
1063
+ ret
1064
+endfunc
1065
+
1066
+function PFX(cpy2Dto1D_shr_8x8_neon)
1067
+ cpy2Dto1D_shr_start
1068
+.rept 4
1069
+ ld1 {v2.16b}, x1, x2
1070
+ ld1 {v3.16b}, x1, x2
1071
+ sub v2.8h, v2.8h, v1.8h
1072
+ sub v3.8h, v3.8h, v1.8h
1073
+ sshl v2.8h, v2.8h, v0.8h
1074
+ sshl v3.8h, v3.8h, v0.8h
1075
+ stp q2, q3, x0, #32
1076
+.endr
1077
+ ret
1078
+endfunc
1079
+
1080
+function PFX(cpy2Dto1D_shr_16x16_neon)
1081
+ cpy2Dto1D_shr_start
1082
+ mov w12, #4
1083
+.loop_cpy2Dto1D_shr_16:
1084
+ sub w12, w12, #1
1085
+.rept 4
1086
+ ld1 {v2.8h-v3.8h}, x1, x2
1087
+ sub v2.8h, v2.8h, v1.8h
1088
+ sub v3.8h, v3.8h, v1.8h
1089
+ sshl v2.8h, v2.8h, v0.8h
1090
+ sshl v3.8h, v3.8h, v0.8h
1091
+ st1 {v2.8h-v3.8h}, x0, #32
1092
+.endr
1093
+ cbnz w12, .loop_cpy2Dto1D_shr_16
1094
+ ret
1095
+endfunc
1096
+
1097
+function PFX(cpy2Dto1D_shr_32x32_neon)
1098
+ cpy2Dto1D_shr_start
1099
+ mov w12, #16
1100
+.loop_cpy2Dto1D_shr_32:
1101
+ sub w12, w12, #1
1102
+.rept 2
1103
+ ld1 {v2.8h-v5.8h}, x1, x2
1104
+ sub v2.8h, v2.8h, v1.8h
1105
+ sub v3.8h, v3.8h, v1.8h
1106
+ sub v4.8h, v4.8h, v1.8h
1107
+ sub v5.8h, v5.8h, v1.8h
1108
+ sshl v2.8h, v2.8h, v0.8h
1109
+ sshl v3.8h, v3.8h, v0.8h
1110
+ sshl v4.8h, v4.8h, v0.8h
1111
+ sshl v5.8h, v5.8h, v0.8h
1112
+ st1 {v2.8h-v5.8h}, x0, #64
1113
+.endr
1114
+ cbnz w12, .loop_cpy2Dto1D_shr_32
1115
+ ret
1116
+endfunc
1117
+
1118
+// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1119
+.macro cpy1Dto2D_shl_start
1120
+ add x2, x2, x2
1121
+ dup v0.8h, w3
1122
+.endm
1123
+
1124
+function PFX(cpy1Dto2D_shl_4x4_neon)
1125
+ cpy1Dto2D_shl_start
1126
+ ld1 {v2.16b-v3.16b}, x1
1127
+ sshl v2.8h, v2.8h, v0.8h
1128
+ sshl v3.8h, v3.8h, v0.8h
1129
+ st1 {v2.d}0, x0, x2
1130
+ st1 {v2.d}1, x0, x2
1131
+ st1 {v3.d}0, x0, x2
1132
+ st1 {v3.d}1, x0, x2
1133
+ ret
1134
+endfunc
1135
+
1136
+function PFX(cpy1Dto2D_shl_8x8_neon)
1137
+ cpy1Dto2D_shl_start
1138
+.rept 4
1139
+ ld1 {v2.16b-v3.16b}, x1, #32
1140
+ sshl v2.8h, v2.8h, v0.8h
1141
+ sshl v3.8h, v3.8h, v0.8h
1142
+ st1 {v2.16b}, x0, x2
1143
+ st1 {v3.16b}, x0, x2
1144
+.endr
1145
+ ret
1146
+endfunc
1147
+
1148
+function PFX(cpy1Dto2D_shl_16x16_neon)
1149
+ cpy1Dto2D_shl_start
1150
+ mov w12, #4
1151
+.loop_cpy1Dto2D_shl_16:
1152
+ sub w12, w12, #1
1153
+.rept 4
1154
+ ld1 {v2.16b-v3.16b}, x1, #32
1155
+ sshl v2.8h, v2.8h, v0.8h
1156
+ sshl v3.8h, v3.8h, v0.8h
1157
+ st1 {v2.16b-v3.16b}, x0, x2
1158
+.endr
1159
+ cbnz w12, .loop_cpy1Dto2D_shl_16
1160
+ ret
1161
+endfunc
1162
+
1163
+function PFX(cpy1Dto2D_shl_32x32_neon)
1164
+ cpy1Dto2D_shl_start
1165
+ mov w12, #16
1166
+.loop_cpy1Dto2D_shl_32:
1167
+ sub w12, w12, #1
1168
+.rept 2
1169
+ ld1 {v2.16b-v5.16b}, x1, #64
1170
+ sshl v2.8h, v2.8h, v0.8h
1171
+ sshl v3.8h, v3.8h, v0.8h
1172
+ sshl v4.8h, v4.8h, v0.8h
1173
+ sshl v5.8h, v5.8h, v0.8h
1174
+ st1 {v2.16b-v5.16b}, x0, x2
1175
+.endr
1176
+ cbnz w12, .loop_cpy1Dto2D_shl_32
1177
+ ret
1178
+endfunc
1179
+
1180
+function PFX(cpy1Dto2D_shl_64x64_neon)
1181
+ cpy1Dto2D_shl_start
1182
+ mov w12, #32
1183
+ sub x2, x2, #64
1184
+.loop_cpy1Dto2D_shl_64:
1185
+ sub w12, w12, #1
1186
+.rept 2
1187
+ ld1 {v2.16b-v5.16b}, x1, #64
1188
+ ld1 {v16.16b-v19.16b}, x1, #64
1189
+ sshl v2.8h, v2.8h, v0.8h
1190
+ sshl v3.8h, v3.8h, v0.8h
1191
+ sshl v4.8h, v4.8h, v0.8h
1192
+ sshl v5.8h, v5.8h, v0.8h
1193
+ sshl v16.8h, v16.8h, v0.8h
1194
+ sshl v17.8h, v17.8h, v0.8h
1195
+ sshl v18.8h, v18.8h, v0.8h
1196
+ sshl v19.8h, v19.8h, v0.8h
1197
+ st1 {v2.16b-v5.16b}, x0, #64
1198
+ st1 {v16.16b-v19.16b}, x0, x2
1199
+.endr
1200
+ cbnz w12, .loop_cpy1Dto2D_shl_64
1201
+ ret
1202
+endfunc
1203
+
1204
+function PFX(cpy1Dto2D_shr_4x4_neon)
1205
+ cpy1Dto2D_shr_start
1206
+ ld1 {v2.16b-v3.16b}, x1
1207
+ sub v2.8h, v2.8h, v1.8h
1208
+ sub v3.8h, v3.8h, v1.8h
1209
+ sshl v2.8h, v2.8h, v0.8h
1210
+ sshl v3.8h, v3.8h, v0.8h
1211
+ st1 {v2.d}0, x0, x2
1212
+ st1 {v2.d}1, x0, x2
1213
+ st1 {v3.d}0, x0, x2
1214
+ st1 {v3.d}1, x0, x2
1215
+ ret
1216
+endfunc
1217
+
1218
+function PFX(cpy1Dto2D_shr_8x8_neon)
1219
+ cpy1Dto2D_shr_start
1220
+.rept 4
1221
+ ld1 {v2.16b-v3.16b}, x1, #32
1222
+ sub v2.8h, v2.8h, v1.8h
1223
+ sub v3.8h, v3.8h, v1.8h
1224
+ sshl v2.8h, v2.8h, v0.8h
1225
+ sshl v3.8h, v3.8h, v0.8h
1226
+ st1 {v2.16b}, x0, x2
1227
+ st1 {v3.16b}, x0, x2
1228
+.endr
1229
+ ret
1230
+endfunc
1231
+
1232
+function PFX(cpy1Dto2D_shr_16x16_neon)
1233
+ cpy1Dto2D_shr_start
1234
+ mov w12, #4
1235
+.loop_cpy1Dto2D_shr_16:
1236
+ sub w12, w12, #1
1237
+.rept 4
1238
+ ld1 {v2.8h-v3.8h}, x1, #32
1239
+ sub v2.8h, v2.8h, v1.8h
1240
+ sub v3.8h, v3.8h, v1.8h
1241
+ sshl v2.8h, v2.8h, v0.8h
1242
+ sshl v3.8h, v3.8h, v0.8h
1243
+ st1 {v2.8h-v3.8h}, x0, x2
1244
+.endr
1245
+ cbnz w12, .loop_cpy1Dto2D_shr_16
1246
+ ret
1247
+endfunc
1248
+
1249
+function PFX(cpy1Dto2D_shr_32x32_neon)
1250
+ cpy1Dto2D_shr_start
1251
+ mov w12, #16
1252
+.loop_cpy1Dto2D_shr_32:
1253
+ sub w12, w12, #1
1254
+.rept 2
1255
+ ld1 {v2.16b-v5.16b}, x1, #64
1256
+ sub v2.8h, v2.8h, v1.8h
1257
+ sub v3.8h, v3.8h, v1.8h
1258
+ sub v4.8h, v4.8h, v1.8h
1259
+ sub v5.8h, v5.8h, v1.8h
1260
+ sshl v2.8h, v2.8h, v0.8h
1261
+ sshl v3.8h, v3.8h, v0.8h
1262
+ sshl v4.8h, v4.8h, v0.8h
1263
+ sshl v5.8h, v5.8h, v0.8h
1264
+ st1 {v2.16b-v5.16b}, x0, x2
1265
+.endr
1266
+ cbnz w12, .loop_cpy1Dto2D_shr_32
1267
+ ret
1268
+endfunc
1269
+
1270
+function PFX(cpy1Dto2D_shr_64x64_neon)
1271
+ cpy1Dto2D_shr_start
1272
+ mov w12, #32
1273
+ sub x2, x2, #64
1274
+.loop_cpy1Dto2D_shr_64:
1275
+ sub w12, w12, #1
1276
+.rept 2
1277
+ ld1 {v2.16b-v5.16b}, x1, #64
1278
+ ld1 {v16.16b-v19.16b}, x1, #64
1279
+ sub v2.8h, v2.8h, v1.8h
1280
+ sub v3.8h, v3.8h, v1.8h
1281
+ sub v4.8h, v4.8h, v1.8h
1282
+ sub v5.8h, v5.8h, v1.8h
1283
+ sub v16.8h, v16.8h, v1.8h
1284
+ sub v17.8h, v17.8h, v1.8h
1285
+ sub v18.8h, v18.8h, v1.8h
1286
+ sub v19.8h, v19.8h, v1.8h
1287
+ sshl v2.8h, v2.8h, v0.8h
1288
+ sshl v3.8h, v3.8h, v0.8h
1289
+ sshl v4.8h, v4.8h, v0.8h
1290
+ sshl v5.8h, v5.8h, v0.8h
1291
+ sshl v16.8h, v16.8h, v0.8h
1292
+ sshl v17.8h, v17.8h, v0.8h
1293
+ sshl v18.8h, v18.8h, v0.8h
1294
+ sshl v19.8h, v19.8h, v0.8h
1295
+ st1 {v2.16b-v5.16b}, x0, #64
1296
+ st1 {v16.16b-v19.16b}, x0, x2
1297
+.endr
1298
+ cbnz w12, .loop_cpy1Dto2D_shr_64
1299
+ ret
1300
+endfunc
1301
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp
Added
950
1
2
+#include "dct-prim.h"
3
+
4
+
5
+#if HAVE_NEON
6
+
7
+#include <arm_neon.h>
8
+
9
+
10
+namespace
11
+{
12
+using namespace X265_NS;
13
+
14
+
15
+static int16x8_t rev16(const int16x8_t a)
16
+{
17
+ static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
18
+ return vqtbx1q_u8(a, a, tbl);
19
+}
20
+
21
+static int32x4_t rev32(const int32x4_t a)
22
+{
23
+ static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
24
+ return vqtbx1q_u8(a, a, tbl);
25
+}
26
+
27
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
28
+{
29
+ int16x4_t s0, s1, s2, s3;
30
+ s0 = vtrn1_s32(x0, x2);
31
+ s1 = vtrn1_s32(x1, x3);
32
+ s2 = vtrn2_s32(x0, x2);
33
+ s3 = vtrn2_s32(x1, x3);
34
+
35
+ x0 = vtrn1_s16(s0, s1);
36
+ x1 = vtrn2_s16(s0, s1);
37
+ x2 = vtrn1_s16(s2, s3);
38
+ x3 = vtrn2_s16(s2, s3);
39
+}
40
+
41
+
42
+
43
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
44
+ uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
45
+{
46
+
47
+ // This is an optimized function for scanPosLast, which removes the rmw dependency, once integrated into mainline x265, should replace reference implementation
48
+ // For clarity, left the original reference code in comments
49
+ int scanPosLast = 0;
50
+
51
+ uint16_t cSign = 0;
52
+ uint16_t cFlag = 0;
53
+ uint8_t cNum = 0;
54
+
55
+ uint32_t prevcgIdx = 0;
56
+ do
57
+ {
58
+ const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
59
+
60
+ const uint32_t posLast = scanscanPosLast;
61
+
62
+ const int curCoeff = coeffposLast;
63
+ const uint32_t isNZCoeff = (curCoeff != 0);
64
+ /*
65
+ NOTE: the new algorithm is complicated, so I keep reference code here
66
+ uint32_t posy = posLast >> log2TrSize;
67
+ uint32_t posx = posLast - (posy << log2TrSize);
68
+ uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
69
+ const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
70
+ sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
71
+ */
72
+
73
+ // get L1 sig map
74
+ numSig -= isNZCoeff;
75
+
76
+ if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
77
+ {
78
+ coeffSignprevcgIdx = cSign;
79
+ coeffFlagprevcgIdx = cFlag;
80
+ coeffNumprevcgIdx = cNum;
81
+ cSign = 0;
82
+ cFlag = 0;
83
+ cNum = 0;
84
+ }
85
+ // TODO: optimize by instruction BTS
86
+ cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
87
+ cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
88
+ cNum += (uint8_t)isNZCoeff;
89
+ prevcgIdx = cgIdx;
90
+ scanPosLast++;
91
+ }
92
+ while (numSig > 0);
93
+
94
+ coeffSignprevcgIdx = cSign;
95
+ coeffFlagprevcgIdx = cFlag;
96
+ coeffNumprevcgIdx = cNum;
97
+ return scanPosLast - 1;
98
+}
99
+
100
+
101
+#if (MLS_CG_SIZE == 4)
102
+template<int log2TrSize>
103
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
104
+ int64_t *totalRdCost, uint32_t blkPos)
105
+{
106
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
107
+ log2TrSize; /* Represents scaling through forward transform */
108
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
109
+ const uint32_t trSize = 1 << log2TrSize;
110
+
111
+ int64x2_t vcost_sum_0 = vdupq_n_s64(0);
112
+ int64x2_t vcost_sum_1 = vdupq_n_s64(0);
113
+ for (int y = 0; y < MLS_CG_SIZE; y++)
114
+ {
115
+ int16x4_t in = *(int16x4_t *)&m_resiDctCoeffblkPos;
116
+ int32x4_t mul = vmull_s16(in, in);
117
+ int64x2_t cost0, cost1;
118
+ cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
119
+ cost1 = vshll_high_n_s32(mul, scaleBits);
120
+ *(int64x2_t *)&costUncodedblkPos + 0 = cost0;
121
+ *(int64x2_t *)&costUncodedblkPos + 2 = cost1;
122
+ vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
123
+ vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
124
+ blkPos += trSize;
125
+ }
126
+ int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
127
+ *totalUncodedCost += sum;
128
+ *totalRdCost += sum;
129
+}
130
+
131
+template<int log2TrSize>
132
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
133
+ int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
134
+{
135
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
136
+ log2TrSize; /* Represents scaling through forward transform */
137
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
138
+ const uint32_t trSize = 1 << log2TrSize;
139
+ //using preprocessor to bypass clang bug
140
+ const int max = X265_MAX(0, (2 * transformShift + 1));
141
+
142
+ int64x2_t vcost_sum_0 = vdupq_n_s64(0);
143
+ int64x2_t vcost_sum_1 = vdupq_n_s64(0);
144
+ int32x4_t vpsy = vdupq_n_s32(*psyScale);
145
+ for (int y = 0; y < MLS_CG_SIZE; y++)
146
+ {
147
+ int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeffblkPos);
148
+ int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeffblkPos), signCoef);
149
+ int64x2_t cost0, cost1;
150
+ cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
151
+ cost1 = vmull_high_s32(signCoef, signCoef);
152
+ cost0 = vshlq_n_s64(cost0, scaleBits);
153
+ cost1 = vshlq_n_s64(cost1, scaleBits);
154
+ int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
155
+ int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
156
+ if (max > 0)
157
+ {
158
+ int64x2_t shift = vdupq_n_s64(-max);
159
+ neg0 = vshlq_s64(neg0, shift);
160
+ neg1 = vshlq_s64(neg1, shift);
161
+ }
162
+ cost0 = vsubq_s64(cost0, neg0);
163
+ cost1 = vsubq_s64(cost1, neg1);
164
+ *(int64x2_t *)&costUncodedblkPos + 0 = cost0;
165
+ *(int64x2_t *)&costUncodedblkPos + 2 = cost1;
166
+ vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
167
+ vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
168
+
169
+ blkPos += trSize;
170
+ }
171
+ int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
172
+ *totalUncodedCost += sum;
173
+ *totalRdCost += sum;
174
+}
175
+
176
+#else
177
+#error "MLS_CG_SIZE must be 4 for neon version"
178
+#endif
179
+
180
+
181
+
182
+template<int trSize>
183
+int count_nonzero_neon(const int16_t *quantCoeff)
184
+{
185
+ X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
186
+ int count = 0;
187
+ int16x8_t vcount = vdupq_n_s16(0);
188
+ const int numCoeff = trSize * trSize;
189
+ int i = 0;
190
+ for (; (i + 8) <= numCoeff; i += 8)
191
+ {
192
+ int16x8_t in = *(int16x8_t *)&quantCoeffi;
193
+ vcount = vaddq_s16(vcount, vtstq_s16(in, in));
194
+ }
195
+ for (; i < numCoeff; i++)
196
+ {
197
+ count += quantCoeffi != 0;
198
+ }
199
+
200
+ return count - vaddvq_s16(vcount);
201
+}
202
+
203
+template<int trSize>
204
+uint32_t copy_count_neon(int16_t *coeff, const int16_t *residual, intptr_t resiStride)
205
+{
206
+ uint32_t numSig = 0;
207
+ int16x8_t vcount = vdupq_n_s16(0);
208
+ for (int k = 0; k < trSize; k++)
209
+ {
210
+ int j = 0;
211
+ for (; (j + 8) <= trSize; j += 8)
212
+ {
213
+ int16x8_t in = *(int16x8_t *)&residualj;
214
+ *(int16x8_t *)&coeffj = in;
215
+ vcount = vaddq_s16(vcount, vtstq_s16(in, in));
216
+ }
217
+ for (; j < trSize; j++)
218
+ {
219
+ coeffj = residualj;
220
+ numSig += (residualj != 0);
221
+ }
222
+ residual += resiStride;
223
+ coeff += trSize;
224
+ }
225
+
226
+ return numSig - vaddvq_s16(vcount);
227
+}
228
+
229
+
230
+static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line)
231
+{
232
+ int j, k;
233
+ int32x4_t E2, O2;
234
+ int32x4_t EE, EO;
235
+ int32x2_t EEE, EEO;
236
+ const int add = 1 << (shift - 1);
237
+ const int32x4_t _vadd = {add, 0};
238
+
239
+ for (j = 0; j < line; j++)
240
+ {
241
+ int16x8_t in0 = *(int16x8_t *)src;
242
+ int16x8_t in1 = rev16(*(int16x8_t *)&src8);
243
+
244
+ E0 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
245
+ O0 = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1));
246
+ E1 = vaddl_high_s16(in0, in1);
247
+ O1 = vsubl_high_s16(in0, in1);
248
+
249
+ for (k = 1; k < 16; k += 2)
250
+ {
251
+ int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
252
+ int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16k4);
253
+
254
+ int32x4_t res = _vadd;
255
+ res = vmlaq_s32(res, c0, O0);
256
+ res = vmlaq_s32(res, c1, O1);
257
+ dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
258
+ }
259
+
260
+ /* EE and EO */
261
+ EE = vaddq_s32(E0, rev32(E1));
262
+ EO = vsubq_s32(E0, rev32(E1));
263
+
264
+ for (k = 2; k < 16; k += 4)
265
+ {
266
+ int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
267
+ int32x4_t res = _vadd;
268
+ res = vmlaq_s32(res, c0, EO);
269
+ dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
270
+ }
271
+
272
+ /* EEE and EEO */
273
+ EEE0 = EE0 + EE3;
274
+ EEO0 = EE0 - EE3;
275
+ EEE1 = EE1 + EE2;
276
+ EEO1 = EE1 - EE2;
277
+
278
+ dst0 = (int16_t)((g_t1600 * EEE0 + g_t1601 * EEE1 + add) >> shift);
279
+ dst8 * line = (int16_t)((g_t1680 * EEE0 + g_t1681 * EEE1 + add) >> shift);
280
+ dst4 * line = (int16_t)((g_t1640 * EEO0 + g_t1641 * EEO1 + add) >> shift);
281
+ dst12 * line = (int16_t)((g_t16120 * EEO0 + g_t16121 * EEO1 + add) >> shift);
282
+
283
+
284
+ src += 16;
285
+ dst++;
286
+ }
287
+}
288
+
289
+
290
+static void partialButterfly32(const int16_t *src, int16_t *dst, int shift, int line)
291
+{
292
+ int j, k;
293
+ const int add = 1 << (shift - 1);
294
+
295
+
296
+ for (j = 0; j < line; j++)
297
+ {
298
+ int32x4_t VE4, VO0, VO1, VO2, VO3;
299
+ int32x4_t VEE2, VEO2;
300
+ int32x4_t VEEE, VEEO;
301
+ int EEEE2, EEEO2;
302
+
303
+ int16x8x4_t inputs;
304
+ inputs = *(int16x8x4_t *)&src0;
305
+ int16x8x4_t in_rev;
306
+
307
+ in_rev.val1 = rev16(inputs.val2);
308
+ in_rev.val0 = rev16(inputs.val3);
309
+
310
+ VE0 = vaddl_s16(vget_low_s16(inputs.val0), vget_low_s16(in_rev.val0));
311
+ VE1 = vaddl_high_s16(inputs.val0, in_rev.val0);
312
+ VO0 = vsubl_s16(vget_low_s16(inputs.val0), vget_low_s16(in_rev.val0));
313
+ VO1 = vsubl_high_s16(inputs.val0, in_rev.val0);
314
+ VE2 = vaddl_s16(vget_low_s16(inputs.val1), vget_low_s16(in_rev.val1));
315
+ VE3 = vaddl_high_s16(inputs.val1, in_rev.val1);
316
+ VO2 = vsubl_s16(vget_low_s16(inputs.val1), vget_low_s16(in_rev.val1));
317
+ VO3 = vsubl_high_s16(inputs.val1, in_rev.val1);
318
+
319
+ for (k = 1; k < 32; k += 2)
320
+ {
321
+ int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32k0);
322
+ int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32k4);
323
+ int32x4_t c2 = vmovl_s16(*(int16x4_t *)&g_t32k8);
324
+ int32x4_t c3 = vmovl_s16(*(int16x4_t *)&g_t32k12);
325
+ int32x4_t s = vmulq_s32(c0, VO0);
326
+ s = vmlaq_s32(s, c1, VO1);
327
+ s = vmlaq_s32(s, c2, VO2);
328
+ s = vmlaq_s32(s, c3, VO3);
329
+
330
+ dstk * line = (int16_t)((vaddvq_s32(s) + add) >> shift);
331
+
332
+ }
333
+
334
+ int32x4_t rev_VE2;
335
+
336
+
337
+ rev_VE0 = rev32(VE3);
338
+ rev_VE1 = rev32(VE2);
339
+
340
+ /* EE and EO */
341
+ for (k = 0; k < 2; k++)
342
+ {
343
+ VEEk = vaddq_s32(VEk, rev_VEk);
344
+ VEOk = vsubq_s32(VEk, rev_VEk);
345
+ }
346
+ for (k = 2; k < 32; k += 4)
347
+ {
348
+ int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t32k0);
349
+ int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t32k4);
350
+ int32x4_t s = vmulq_s32(c0, VEO0);
351
+ s = vmlaq_s32(s, c1, VEO1);
352
+
353
+ dstk * line = (int16_t)((vaddvq_s32(s) + add) >> shift);
354
+
355
+ }
356
+
357
+ int32x4_t tmp = rev32(VEE1);
358
+ VEEE = vaddq_s32(VEE0, tmp);
359
+ VEEO = vsubq_s32(VEE0, tmp);
360
+ for (k = 4; k < 32; k += 8)
361
+ {
362
+ int32x4_t c = vmovl_s16(*(int16x4_t *)&g_t32k0);
363
+ int32x4_t s = vmulq_s32(c, VEEO);
364
+
365
+ dstk * line = (int16_t)((vaddvq_s32(s) + add) >> shift);
366
+ }
367
+
368
+ /* EEEE and EEEO */
369
+ EEEE0 = VEEE0 + VEEE3;
370
+ EEEO0 = VEEE0 - VEEE3;
371
+ EEEE1 = VEEE1 + VEEE2;
372
+ EEEO1 = VEEE1 - VEEE2;
373
+
374
+ dst0 = (int16_t)((g_t3200 * EEEE0 + g_t3201 * EEEE1 + add) >> shift);
375
+ dst16 * line = (int16_t)((g_t32160 * EEEE0 + g_t32161 * EEEE1 + add) >> shift);
376
+ dst8 * line = (int16_t)((g_t3280 * EEEO0 + g_t3281 * EEEO1 + add) >> shift);
377
+ dst24 * line = (int16_t)((g_t32240 * EEEO0 + g_t32241 * EEEO1 + add) >> shift);
378
+
379
+
380
+
381
+ src += 32;
382
+ dst++;
383
+ }
384
+}
385
+
386
+static void partialButterfly8(const int16_t *src, int16_t *dst, int shift, int line)
387
+{
388
+ int j, k;
389
+ int E4, O4;
390
+ int EE2, EO2;
391
+ int add = 1 << (shift - 1);
392
+
393
+ for (j = 0; j < line; j++)
394
+ {
395
+ /* E and O*/
396
+ for (k = 0; k < 4; k++)
397
+ {
398
+ Ek = srck + src7 - k;
399
+ Ok = srck - src7 - k;
400
+ }
401
+
402
+ /* EE and EO */
403
+ EE0 = E0 + E3;
404
+ EO0 = E0 - E3;
405
+ EE1 = E1 + E2;
406
+ EO1 = E1 - E2;
407
+
408
+ dst0 = (int16_t)((g_t800 * EE0 + g_t801 * EE1 + add) >> shift);
409
+ dst4 * line = (int16_t)((g_t840 * EE0 + g_t841 * EE1 + add) >> shift);
410
+ dst2 * line = (int16_t)((g_t820 * EO0 + g_t821 * EO1 + add) >> shift);
411
+ dst6 * line = (int16_t)((g_t860 * EO0 + g_t861 * EO1 + add) >> shift);
412
+
413
+ dstline = (int16_t)((g_t810 * O0 + g_t811 * O1 + g_t812 * O2 + g_t813 * O3 + add) >> shift);
414
+ dst3 * line = (int16_t)((g_t830 * O0 + g_t831 * O1 + g_t832 * O2 + g_t833 * O3 + add) >>
415
+ shift);
416
+ dst5 * line = (int16_t)((g_t850 * O0 + g_t851 * O1 + g_t852 * O2 + g_t853 * O3 + add) >>
417
+ shift);
418
+ dst7 * line = (int16_t)((g_t870 * O0 + g_t871 * O1 + g_t872 * O2 + g_t873 * O3 + add) >>
419
+ shift);
420
+
421
+ src += 8;
422
+ dst++;
423
+ }
424
+}
425
+
426
+static void partialButterflyInverse4(const int16_t *src, int16_t *dst, int shift, int line)
427
+{
428
+ int j;
429
+ int E2, O2;
430
+ int add = 1 << (shift - 1);
431
+
432
+ for (j = 0; j < line; j++)
433
+ {
434
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
435
+ O0 = g_t410 * srcline + g_t430 * src3 * line;
436
+ O1 = g_t411 * srcline + g_t431 * src3 * line;
437
+ E0 = g_t400 * src0 + g_t420 * src2 * line;
438
+ E1 = g_t401 * src0 + g_t421 * src2 * line;
439
+
440
+ /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
441
+ dst0 = (int16_t)(x265_clip3(-32768, 32767, (E0 + O0 + add) >> shift));
442
+ dst1 = (int16_t)(x265_clip3(-32768, 32767, (E1 + O1 + add) >> shift));
443
+ dst2 = (int16_t)(x265_clip3(-32768, 32767, (E1 - O1 + add) >> shift));
444
+ dst3 = (int16_t)(x265_clip3(-32768, 32767, (E0 - O0 + add) >> shift));
445
+
446
+ src++;
447
+ dst += 4;
448
+ }
449
+}
450
+
451
+
452
+
453
+static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
454
+{
455
+#define FMAK(x,l) sl = vmlal_lane_s16(sl,*(int16x4_t*)&src(x)*line,*(int16x4_t *)&g_t16xk,l)
456
+#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&srcx*line,*(int16x4_t *)&g_t16xk,l);
457
+#define ODD3_15(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);
458
+#define EVEN6_14_STEP4(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);
459
+
460
+
461
+ int j, k;
462
+ int32x4_t E8, O8;
463
+ int32x4_t EE4, EO4;
464
+ int32x4_t EEE2, EEO2;
465
+ const int add = 1 << (shift - 1);
466
+
467
+
468
+#pragma unroll(4)
469
+ for (j = 0; j < line; j += 4)
470
+ {
471
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
472
+
473
+#pragma unroll(2)
474
+ for (k = 0; k < 2; k++)
475
+ {
476
+ int32x4_t s;
477
+ s = vmull_s16(vdup_n_s16(g_t164k), *(int16x4_t *)&src4 * line);;
478
+ EEOk = vmlal_s16(s, vdup_n_s16(g_t1612k), *(int16x4_t *)&src(12) * line);
479
+ s = vmull_s16(vdup_n_s16(g_t160k), *(int16x4_t *)&src0 * line);;
480
+ EEEk = vmlal_s16(s, vdup_n_s16(g_t168k), *(int16x4_t *)&src(8) * line);
481
+ }
482
+
483
+ /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
484
+ EE0 = vaddq_s32(EEE0 , EEO0);
485
+ EE2 = vsubq_s32(EEE1 , EEO1);
486
+ EE1 = vaddq_s32(EEE1 , EEO1);
487
+ EE3 = vsubq_s32(EEE0 , EEO0);
488
+
489
+
490
+#pragma unroll(1)
491
+ for (k = 0; k < 4; k += 4)
492
+ {
493
+ int32x4_t s4;
494
+ s0 = MULK(2, 0);
495
+ s1 = MULK(2, 1);
496
+ s2 = MULK(2, 2);
497
+ s3 = MULK(2, 3);
498
+
499
+ EVEN6_14_STEP4(0);
500
+ EVEN6_14_STEP4(1);
501
+ EVEN6_14_STEP4(2);
502
+ EVEN6_14_STEP4(3);
503
+
504
+ EOk = s0;
505
+ EOk + 1 = s1;
506
+ EOk + 2 = s2;
507
+ EOk + 3 = s3;
508
+ }
509
+
510
+
511
+
512
+ static const int32x4_t min = vdupq_n_s32(-32768);
513
+ static const int32x4_t max = vdupq_n_s32(32767);
514
+ const int32x4_t minus_shift = vdupq_n_s32(-shift);
515
+
516
+#pragma unroll(4)
517
+ for (k = 0; k < 4; k++)
518
+ {
519
+ Ek = vaddq_s32(EEk , EOk);
520
+ Ek + 4 = vsubq_s32(EE3 - k , EO3 - k);
521
+ }
522
+
523
+#pragma unroll(2)
524
+ for (k = 0; k < 8; k += 4)
525
+ {
526
+ int32x4_t s4;
527
+ s0 = MULK(1, 0);
528
+ s1 = MULK(1, 1);
529
+ s2 = MULK(1, 2);
530
+ s3 = MULK(1, 3);
531
+ ODD3_15(0);
532
+ ODD3_15(1);
533
+ ODD3_15(2);
534
+ ODD3_15(3);
535
+ Ok = s0;
536
+ Ok + 1 = s1;
537
+ Ok + 2 = s2;
538
+ Ok + 3 = s3;
539
+ int32x4_t t;
540
+ int16x4_t x0, x1, x2, x3;
541
+
542
+ Ek = vaddq_s32(vdupq_n_s32(add), Ek);
543
+ t = vaddq_s32(Ek, Ok);
544
+ t = vshlq_s32(t, minus_shift);
545
+ t = vmaxq_s32(t, min);
546
+ t = vminq_s32(t, max);
547
+ x0 = vmovn_s32(t);
548
+
549
+ Ek + 1 = vaddq_s32(vdupq_n_s32(add), Ek + 1);
550
+ t = vaddq_s32(Ek + 1, Ok + 1);
551
+ t = vshlq_s32(t, minus_shift);
552
+ t = vmaxq_s32(t, min);
553
+ t = vminq_s32(t, max);
554
+ x1 = vmovn_s32(t);
555
+
556
+ Ek + 2 = vaddq_s32(vdupq_n_s32(add), Ek + 2);
557
+ t = vaddq_s32(Ek + 2, Ok + 2);
558
+ t = vshlq_s32(t, minus_shift);
559
+ t = vmaxq_s32(t, min);
560
+ t = vminq_s32(t, max);
561
+ x2 = vmovn_s32(t);
562
+
563
+ Ek + 3 = vaddq_s32(vdupq_n_s32(add), Ek + 3);
564
+ t = vaddq_s32(Ek + 3, Ok + 3);
565
+ t = vshlq_s32(t, minus_shift);
566
+ t = vmaxq_s32(t, min);
567
+ t = vminq_s32(t, max);
568
+ x3 = vmovn_s32(t);
569
+
570
+ transpose_4x4x16(x0, x1, x2, x3);
571
+ *(int16x4_t *)&orig_dst0 * 16 + k = x0;
572
+ *(int16x4_t *)&orig_dst1 * 16 + k = x1;
573
+ *(int16x4_t *)&orig_dst2 * 16 + k = x2;
574
+ *(int16x4_t *)&orig_dst3 * 16 + k = x3;
575
+ }
576
+
577
+
578
+#pragma unroll(2)
579
+ for (k = 0; k < 8; k += 4)
580
+ {
581
+ int32x4_t t;
582
+ int16x4_t x0, x1, x2, x3;
583
+
584
+ t = vsubq_s32(E7 - k, O7 - k);
585
+ t = vshlq_s32(t, minus_shift);
586
+ t = vmaxq_s32(t, min);
587
+ t = vminq_s32(t, max);
588
+ x0 = vmovn_s32(t);
589
+
590
+ t = vsubq_s32(E6 - k, O6 - k);
591
+ t = vshlq_s32(t, minus_shift);
592
+ t = vmaxq_s32(t, min);
593
+ t = vminq_s32(t, max);
594
+ x1 = vmovn_s32(t);
595
+
596
+ t = vsubq_s32(E5 - k, O5 - k);
597
+
598
+ t = vshlq_s32(t, minus_shift);
599
+ t = vmaxq_s32(t, min);
600
+ t = vminq_s32(t, max);
601
+ x2 = vmovn_s32(t);
602
+
603
+ t = vsubq_s32(E4 - k, O4 - k);
604
+ t = vshlq_s32(t, minus_shift);
605
+ t = vmaxq_s32(t, min);
606
+ t = vminq_s32(t, max);
607
+ x3 = vmovn_s32(t);
608
+
609
+ transpose_4x4x16(x0, x1, x2, x3);
610
+ *(int16x4_t *)&orig_dst0 * 16 + k + 8 = x0;
611
+ *(int16x4_t *)&orig_dst1 * 16 + k + 8 = x1;
612
+ *(int16x4_t *)&orig_dst2 * 16 + k + 8 = x2;
613
+ *(int16x4_t *)&orig_dst3 * 16 + k + 8 = x3;
614
+ }
615
+ orig_dst += 4 * 16;
616
+ src += 4;
617
+ }
618
+
619
+#undef MUL
620
+#undef FMA
621
+#undef FMAK
622
+#undef MULK
623
+#undef ODD3_15
624
+#undef EVEN6_14_STEP4
625
+
626
+
627
+}
628
+
629
+
630
+
631
+static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
632
+{
633
+#define MUL(x) vmull_s16(vdup_n_s16(g_t32xk),*(int16x4_t*)&srcx*line);
634
+#define FMA(x) s = vmlal_s16(s,vdup_n_s16(g_t32xk),*(int16x4_t*)&src(x)*line)
635
+#define FMAK(x,l) sl = vmlal_lane_s16(sl,*(int16x4_t*)&src(x)*line,*(int16x4_t *)&g_t32xk,l)
636
+#define MULK(x,l) vmull_lane_s16(*(int16x4_t*)&srcx*line,*(int16x4_t *)&g_t32xk,l);
637
+#define ODD31(k) FMAK(3,k);FMAK(5,k);FMAK(7,k);FMAK(9,k);FMAK(11,k);FMAK(13,k);FMAK(15,k);FMAK(17,k);FMAK(19,k);FMAK(21,k);FMAK(23,k);FMAK(25,k);FMAK(27,k);FMAK(29,k);FMAK(31,k);
638
+
639
+#define ODD15(k) FMAK(6,k);FMAK(10,k);FMAK(14,k);FMAK(18,k);FMAK(22,k);FMAK(26,k);FMAK(30,k);
640
+#define ODD7(k) FMAK(12,k);FMAK(20,k);FMAK(28,k);
641
+
642
+
643
+ int j, k;
644
+ int32x4_t E16, O16;
645
+ int32x4_t EE8, EO8;
646
+ int32x4_t EEE4, EEO4;
647
+ int32x4_t EEEE2, EEEO2;
648
+ int16x4_t dst32;
649
+ int add = 1 << (shift - 1);
650
+
651
+#pragma unroll (8)
652
+ for (j = 0; j < line; j += 4)
653
+ {
654
+#pragma unroll (4)
655
+ for (k = 0; k < 16; k += 4)
656
+ {
657
+ int32x4_t s4;
658
+ s0 = MULK(1, 0);
659
+ s1 = MULK(1, 1);
660
+ s2 = MULK(1, 2);
661
+ s3 = MULK(1, 3);
662
+ ODD31(0);
663
+ ODD31(1);
664
+ ODD31(2);
665
+ ODD31(3);
666
+ Ok = s0;
667
+ Ok + 1 = s1;
668
+ Ok + 2 = s2;
669
+ Ok + 3 = s3;
670
+
671
+
672
+ }
673
+
674
+
675
+#pragma unroll (2)
676
+ for (k = 0; k < 8; k += 4)
677
+ {
678
+ int32x4_t s4;
679
+ s0 = MULK(2, 0);
680
+ s1 = MULK(2, 1);
681
+ s2 = MULK(2, 2);
682
+ s3 = MULK(2, 3);
683
+
684
+ ODD15(0);
685
+ ODD15(1);
686
+ ODD15(2);
687
+ ODD15(3);
688
+
689
+ EOk = s0;
690
+ EOk + 1 = s1;
691
+ EOk + 2 = s2;
692
+ EOk + 3 = s3;
693
+ }
694
+
695
+
696
+ for (k = 0; k < 4; k += 4)
697
+ {
698
+ int32x4_t s4;
699
+ s0 = MULK(4, 0);
700
+ s1 = MULK(4, 1);
701
+ s2 = MULK(4, 2);
702
+ s3 = MULK(4, 3);
703
+
704
+ ODD7(0);
705
+ ODD7(1);
706
+ ODD7(2);
707
+ ODD7(3);
708
+
709
+ EEOk = s0;
710
+ EEOk + 1 = s1;
711
+ EEOk + 2 = s2;
712
+ EEOk + 3 = s3;
713
+ }
714
+
715
+#pragma unroll (2)
716
+ for (k = 0; k < 2; k++)
717
+ {
718
+ int32x4_t s;
719
+ s = MUL(8);
720
+ EEEOk = FMA(24);
721
+ s = MUL(0);
722
+ EEEEk = FMA(16);
723
+ }
724
+ /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
725
+ EEE0 = vaddq_s32(EEEE0, EEEO0);
726
+ EEE3 = vsubq_s32(EEEE0, EEEO0);
727
+ EEE1 = vaddq_s32(EEEE1, EEEO1);
728
+ EEE2 = vsubq_s32(EEEE1, EEEO1);
729
+
730
+#pragma unroll (4)
731
+ for (k = 0; k < 4; k++)
732
+ {
733
+ EEk = vaddq_s32(EEEk, EEOk);
734
+ EEk + 4 = vsubq_s32((EEE3 - k), (EEO3 - k));
735
+ }
736
+
737
+#pragma unroll (8)
738
+ for (k = 0; k < 8; k++)
739
+ {
740
+ Ek = vaddq_s32(EEk, EOk);
741
+ Ek + 8 = vsubq_s32((EE7 - k), (EO7 - k));
742
+ }
743
+
744
+ static const int32x4_t min = vdupq_n_s32(-32768);
745
+ static const int32x4_t max = vdupq_n_s32(32767);
746
+
747
+
748
+
749
+#pragma unroll (16)
750
+ for (k = 0; k < 16; k++)
751
+ {
752
+ int32x4_t adde = vaddq_s32(vdupq_n_s32(add), Ek);
753
+ int32x4_t s = vaddq_s32(adde, Ok);
754
+ s = vshlq_s32(s, vdupq_n_s32(-shift));
755
+ s = vmaxq_s32(s, min);
756
+ s = vminq_s32(s, max);
757
+
758
+
759
+
760
+ dstk = vmovn_s32(s);
761
+ adde = vaddq_s32(vdupq_n_s32(add), (E15 - k));
762
+ s = vsubq_s32(adde, (O15 - k));
763
+ s = vshlq_s32(s, vdupq_n_s32(-shift));
764
+ s = vmaxq_s32(s, min);
765
+ s = vminq_s32(s, max);
766
+
767
+ dstk + 16 = vmovn_s32(s);
768
+ }
769
+
770
+
771
+#pragma unroll (8)
772
+ for (k = 0; k < 32; k += 4)
773
+ {
774
+ int16x4_t x0 = dstk + 0;
775
+ int16x4_t x1 = dstk + 1;
776
+ int16x4_t x2 = dstk + 2;
777
+ int16x4_t x3 = dstk + 3;
778
+ transpose_4x4x16(x0, x1, x2, x3);
779
+ *(int16x4_t *)&orig_dst0 * 32 + k = x0;
780
+ *(int16x4_t *)&orig_dst1 * 32 + k = x1;
781
+ *(int16x4_t *)&orig_dst2 * 32 + k = x2;
782
+ *(int16x4_t *)&orig_dst3 * 32 + k = x3;
783
+ }
784
+ orig_dst += 4 * 32;
785
+ src += 4;
786
+ }
787
+#undef MUL
788
+#undef FMA
789
+#undef FMAK
790
+#undef MULK
791
+#undef ODD31
792
+#undef ODD15
793
+#undef ODD7
794
+
795
+}
796
+
797
+
798
+static void dct8_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
799
+{
800
+ const int shift_1st = 2 + X265_DEPTH - 8;
801
+ const int shift_2nd = 9;
802
+
803
+ ALIGN_VAR_32(int16_t, coef8 * 8);
804
+ ALIGN_VAR_32(int16_t, block8 * 8);
805
+
806
+ for (int i = 0; i < 8; i++)
807
+ {
808
+ memcpy(&blocki * 8, &srci * srcStride, 8 * sizeof(int16_t));
809
+ }
810
+
811
+ partialButterfly8(block, coef, shift_1st, 8);
812
+ partialButterfly8(coef, dst, shift_2nd, 8);
813
+}
814
+
815
+static void dct16_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
816
+{
817
+ const int shift_1st = 3 + X265_DEPTH - 8;
818
+ const int shift_2nd = 10;
819
+
820
+ ALIGN_VAR_32(int16_t, coef16 * 16);
821
+ ALIGN_VAR_32(int16_t, block16 * 16);
822
+
823
+ for (int i = 0; i < 16; i++)
824
+ {
825
+ memcpy(&blocki * 16, &srci * srcStride, 16 * sizeof(int16_t));
826
+ }
827
+
828
+ partialButterfly16(block, coef, shift_1st, 16);
829
+ partialButterfly16(coef, dst, shift_2nd, 16);
830
+}
831
+
832
+static void dct32_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)
833
+{
834
+ const int shift_1st = 4 + X265_DEPTH - 8;
835
+ const int shift_2nd = 11;
836
+
837
+ ALIGN_VAR_32(int16_t, coef32 * 32);
838
+ ALIGN_VAR_32(int16_t, block32 * 32);
839
+
840
+ for (int i = 0; i < 32; i++)
841
+ {
842
+ memcpy(&blocki * 32, &srci * srcStride, 32 * sizeof(int16_t));
843
+ }
844
+
845
+ partialButterfly32(block, coef, shift_1st, 32);
846
+ partialButterfly32(coef, dst, shift_2nd, 32);
847
+}
848
+
849
+static void idct4_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
850
+{
851
+ const int shift_1st = 7;
852
+ const int shift_2nd = 12 - (X265_DEPTH - 8);
853
+
854
+ ALIGN_VAR_32(int16_t, coef4 * 4);
855
+ ALIGN_VAR_32(int16_t, block4 * 4);
856
+
857
+ partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
858
+ partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
859
+
860
+ for (int i = 0; i < 4; i++)
861
+ {
862
+ memcpy(&dsti * dstStride, &blocki * 4, 4 * sizeof(int16_t));
863
+ }
864
+}
865
+
866
+static void idct16_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
867
+{
868
+ const int shift_1st = 7;
869
+ const int shift_2nd = 12 - (X265_DEPTH - 8);
870
+
871
+ ALIGN_VAR_32(int16_t, coef16 * 16);
872
+ ALIGN_VAR_32(int16_t, block16 * 16);
873
+
874
+ partialButterflyInverse16_neon(src, coef, shift_1st, 16);
875
+ partialButterflyInverse16_neon(coef, block, shift_2nd, 16);
876
+
877
+ for (int i = 0; i < 16; i++)
878
+ {
879
+ memcpy(&dsti * dstStride, &blocki * 16, 16 * sizeof(int16_t));
880
+ }
881
+}
882
+
883
+static void idct32_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
884
+{
885
+ const int shift_1st = 7;
886
+ const int shift_2nd = 12 - (X265_DEPTH - 8);
887
+
888
+ ALIGN_VAR_32(int16_t, coef32 * 32);
889
+ ALIGN_VAR_32(int16_t, block32 * 32);
890
+
891
+ partialButterflyInverse32_neon(src, coef, shift_1st, 32);
892
+ partialButterflyInverse32_neon(coef, block, shift_2nd, 32);
893
+
894
+ for (int i = 0; i < 32; i++)
895
+ {
896
+ memcpy(&dsti * dstStride, &blocki * 32, 32 * sizeof(int16_t));
897
+ }
898
+}
899
+
900
+
901
+
902
+}
903
+
904
+namespace X265_NS
905
+{
906
+// x265 private namespace
907
+void setupDCTPrimitives_neon(EncoderPrimitives &p)
908
+{
909
+ p.cuBLOCK_4x4.nonPsyRdoQuant = nonPsyRdoQuant_neon<2>;
910
+ p.cuBLOCK_8x8.nonPsyRdoQuant = nonPsyRdoQuant_neon<3>;
911
+ p.cuBLOCK_16x16.nonPsyRdoQuant = nonPsyRdoQuant_neon<4>;
912
+ p.cuBLOCK_32x32.nonPsyRdoQuant = nonPsyRdoQuant_neon<5>;
913
+ p.cuBLOCK_4x4.psyRdoQuant = psyRdoQuant_neon<2>;
914
+ p.cuBLOCK_8x8.psyRdoQuant = psyRdoQuant_neon<3>;
915
+ p.cuBLOCK_16x16.psyRdoQuant = psyRdoQuant_neon<4>;
916
+ p.cuBLOCK_32x32.psyRdoQuant = psyRdoQuant_neon<5>;
917
+ p.cuBLOCK_8x8.dct = dct8_neon;
918
+ p.cuBLOCK_16x16.dct = dct16_neon;
919
+ p.cuBLOCK_32x32.dct = dct32_neon;
920
+ p.cuBLOCK_4x4.idct = idct4_neon;
921
+ p.cuBLOCK_16x16.idct = idct16_neon;
922
+ p.cuBLOCK_32x32.idct = idct32_neon;
923
+ p.cuBLOCK_4x4.count_nonzero = count_nonzero_neon<4>;
924
+ p.cuBLOCK_8x8.count_nonzero = count_nonzero_neon<8>;
925
+ p.cuBLOCK_16x16.count_nonzero = count_nonzero_neon<16>;
926
+ p.cuBLOCK_32x32.count_nonzero = count_nonzero_neon<32>;
927
+
928
+ p.cuBLOCK_4x4.copy_cnt = copy_count_neon<4>;
929
+ p.cuBLOCK_8x8.copy_cnt = copy_count_neon<8>;
930
+ p.cuBLOCK_16x16.copy_cnt = copy_count_neon<16>;
931
+ p.cuBLOCK_32x32.copy_cnt = copy_count_neon<32>;
932
+ p.cuBLOCK_4x4.psyRdoQuant_1p = nonPsyRdoQuant_neon<2>;
933
+ p.cuBLOCK_4x4.psyRdoQuant_2p = psyRdoQuant_neon<2>;
934
+ p.cuBLOCK_8x8.psyRdoQuant_1p = nonPsyRdoQuant_neon<3>;
935
+ p.cuBLOCK_8x8.psyRdoQuant_2p = psyRdoQuant_neon<3>;
936
+ p.cuBLOCK_16x16.psyRdoQuant_1p = nonPsyRdoQuant_neon<4>;
937
+ p.cuBLOCK_16x16.psyRdoQuant_2p = psyRdoQuant_neon<4>;
938
+ p.cuBLOCK_32x32.psyRdoQuant_1p = nonPsyRdoQuant_neon<5>;
939
+ p.cuBLOCK_32x32.psyRdoQuant_2p = psyRdoQuant_neon<5>;
940
+
941
+ p.scanPosLast = scanPosLast_opt;
942
+
943
+}
944
+
945
+};
946
+
947
+
948
+
949
+#endif
950
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h
Added
21
1
2
+#ifndef __DCT_PRIM_NEON_H__
3
+#define __DCT_PRIM_NEON_H__
4
+
5
+
6
+#include "common.h"
7
+#include "primitives.h"
8
+#include "contexts.h" // costCoeffNxN_c
9
+#include "threading.h" // CLZ
10
+
11
+namespace X265_NS
12
+{
13
+// x265 private namespace
14
+void setupDCTPrimitives_neon(EncoderPrimitives &p);
15
+};
16
+
17
+
18
+
19
+#endif
20
+
21
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp
Added
997
1
2
+#if HAVE_NEON
3
+
4
+#include "filter-prim.h"
5
+#include <arm_neon.h>
6
+
7
+namespace
8
+{
9
+
10
+using namespace X265_NS;
11
+
12
+
13
+template<int width, int height>
14
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
15
+{
16
+ const int shift = IF_INTERNAL_PREC - X265_DEPTH;
17
+ int row, col;
18
+ const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
19
+ for (row = 0; row < height; row++)
20
+ {
21
+
22
+ for (col = 0; col < width; col += 8)
23
+ {
24
+ int16x8_t in;
25
+
26
+#if HIGH_BIT_DEPTH
27
+ in = *(int16x8_t *)&srccol;
28
+#else
29
+ in = vmovl_u8(*(uint8x8_t *)&srccol);
30
+#endif
31
+
32
+ int16x8_t tmp = vshlq_n_s16(in, shift);
33
+ tmp = vsubq_s16(tmp, off);
34
+ *(int16x8_t *)&dstcol = tmp;
35
+
36
+ }
37
+
38
+ src += srcStride;
39
+ dst += dstStride;
40
+ }
41
+}
42
+
43
+
44
+template<int N, int width, int height>
45
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
46
+{
47
+ const int16_t *coeff = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
48
+ int headRoom = IF_FILTER_PREC;
49
+ int offset = (1 << (headRoom - 1));
50
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
51
+ int cStride = 1;
52
+
53
+ src -= (N / 2 - 1) * cStride;
54
+ int16x8_t vc;
55
+ vc = *(int16x8_t *)coeff;
56
+ int16x4_t low_vc = vget_low_s16(vc);
57
+ int16x4_t high_vc = vget_high_s16(vc);
58
+
59
+ const int32x4_t voffset = vdupq_n_s32(offset);
60
+ const int32x4_t vhr = vdupq_n_s32(-headRoom);
61
+
62
+ int row, col;
63
+ for (row = 0; row < height; row++)
64
+ {
65
+ for (col = 0; col < width; col += 8)
66
+ {
67
+ int32x4_t vsum1, vsum2;
68
+
69
+ int16x8_t inputN;
70
+
71
+ for (int i = 0; i < N; i++)
72
+ {
73
+#if HIGH_BIT_DEPTH
74
+ inputi = *(int16x8_t *)&srccol + i;
75
+#else
76
+ inputi = vmovl_u8(*(uint8x8_t *)&srccol + i);
77
+#endif
78
+ }
79
+ vsum1 = voffset;
80
+ vsum2 = voffset;
81
+
82
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input0), low_vc, 0);
83
+ vsum2 = vmlal_high_lane_s16(vsum2, input0, low_vc, 0);
84
+
85
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input1), low_vc, 1);
86
+ vsum2 = vmlal_high_lane_s16(vsum2, input1, low_vc, 1);
87
+
88
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input2), low_vc, 2);
89
+ vsum2 = vmlal_high_lane_s16(vsum2, input2, low_vc, 2);
90
+
91
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input3), low_vc, 3);
92
+ vsum2 = vmlal_high_lane_s16(vsum2, input3, low_vc, 3);
93
+
94
+ if (N == 8)
95
+ {
96
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input4), high_vc, 0);
97
+ vsum2 = vmlal_high_lane_s16(vsum2, input4, high_vc, 0);
98
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input5), high_vc, 1);
99
+ vsum2 = vmlal_high_lane_s16(vsum2, input5, high_vc, 1);
100
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input6), high_vc, 2);
101
+ vsum2 = vmlal_high_lane_s16(vsum2, input6, high_vc, 2);
102
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input7), high_vc, 3);
103
+ vsum2 = vmlal_high_lane_s16(vsum2, input7, high_vc, 3);
104
+
105
+ }
106
+
107
+ vsum1 = vshlq_s32(vsum1, vhr);
108
+ vsum2 = vshlq_s32(vsum2, vhr);
109
+
110
+ int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
111
+ vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
112
+ vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
113
+#if HIGH_BIT_DEPTH
114
+ *(int16x8_t *)&dstcol = vsum;
115
+#else
116
+ uint8x16_t usum = vuzp1q_u8(vsum, vsum);
117
+ *(uint8x8_t *)&dstcol = vget_low_u8(usum);
118
+#endif
119
+
120
+ }
121
+
122
+ src += srcStride;
123
+ dst += dstStride;
124
+ }
125
+}
126
+
127
+#if HIGH_BIT_DEPTH
128
+
129
+template<int N, int width, int height>
130
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
131
+ int isRowExt)
132
+{
133
+ const int16_t *coeff = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
134
+ const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
135
+ const int shift = IF_FILTER_PREC - headRoom;
136
+ const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
137
+
138
+ int blkheight = height;
139
+ src -= N / 2 - 1;
140
+
141
+ if (isRowExt)
142
+ {
143
+ src -= (N / 2 - 1) * srcStride;
144
+ blkheight += N - 1;
145
+ }
146
+ int16x8_t vc3 = vld1q_s16(coeff);
147
+ const int32x4_t voffset = vdupq_n_s32(offset);
148
+ const int32x4_t vhr = vdupq_n_s32(-shift);
149
+
150
+ int row, col;
151
+ for (row = 0; row < blkheight; row++)
152
+ {
153
+ for (col = 0; col < width; col += 8)
154
+ {
155
+ int32x4_t vsum, vsum2;
156
+
157
+ int16x8_t inputN;
158
+ for (int i = 0; i < N; i++)
159
+ {
160
+ inputi = vld1q_s16((int16_t *)&srccol + i);
161
+ }
162
+
163
+ vsum = voffset;
164
+ vsum2 = voffset;
165
+
166
+ vsum = vmlal_lane_s16(vsum, vget_low_u16(input0), vget_low_s16(vc3), 0);
167
+ vsum2 = vmlal_high_lane_s16(vsum2, input0, vget_low_s16(vc3), 0);
168
+
169
+ vsum = vmlal_lane_s16(vsum, vget_low_u16(input1), vget_low_s16(vc3), 1);
170
+ vsum2 = vmlal_high_lane_s16(vsum2, input1, vget_low_s16(vc3), 1);
171
+
172
+ vsum = vmlal_lane_s16(vsum, vget_low_u16(input2), vget_low_s16(vc3), 2);
173
+ vsum2 = vmlal_high_lane_s16(vsum2, input2, vget_low_s16(vc3), 2);
174
+
175
+ vsum = vmlal_lane_s16(vsum, vget_low_u16(input3), vget_low_s16(vc3), 3);
176
+ vsum2 = vmlal_high_lane_s16(vsum2, input3, vget_low_s16(vc3), 3);
177
+
178
+ if (N == 8)
179
+ {
180
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input4), vget_high_s16(vc3), 0);
181
+ vsum2 = vmlal_high_lane_s16(vsum2, input4, vget_high_s16(vc3), 0);
182
+
183
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input5), vget_high_s16(vc3), 1);
184
+ vsum2 = vmlal_high_lane_s16(vsum2, input5, vget_high_s16(vc3), 1);
185
+
186
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input6), vget_high_s16(vc3), 2);
187
+ vsum2 = vmlal_high_lane_s16(vsum2, input6, vget_high_s16(vc3), 2);
188
+
189
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input7), vget_high_s16(vc3), 3);
190
+ vsum2 = vmlal_high_lane_s16(vsum2, input7, vget_high_s16(vc3), 3);
191
+ }
192
+
193
+ vsum = vshlq_s32(vsum, vhr);
194
+ vsum2 = vshlq_s32(vsum2, vhr);
195
+ *(int16x4_t *)&dstcol = vmovn_u32(vsum);
196
+ *(int16x4_t *)&dstcol+4 = vmovn_u32(vsum2);
197
+ }
198
+
199
+ src += srcStride;
200
+ dst += dstStride;
201
+ }
202
+}
203
+
204
+
205
+#else
206
+
207
+template<int N, int width, int height>
208
+void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
209
+ int isRowExt)
210
+{
211
+ const int16_t *coeff = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
212
+ const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
213
+ const int shift = IF_FILTER_PREC - headRoom;
214
+ const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
215
+
216
+ int blkheight = height;
217
+ src -= N / 2 - 1;
218
+
219
+ if (isRowExt)
220
+ {
221
+ src -= (N / 2 - 1) * srcStride;
222
+ blkheight += N - 1;
223
+ }
224
+ int16x8_t vc;
225
+ vc = *(int16x8_t *)coeff;
226
+
227
+ const int16x8_t voffset = vdupq_n_s16(offset);
228
+ const int16x8_t vhr = vdupq_n_s16(-shift);
229
+
230
+ int row, col;
231
+ for (row = 0; row < blkheight; row++)
232
+ {
233
+ for (col = 0; col < width; col += 8)
234
+ {
235
+ int16x8_t vsum;
236
+
237
+ int16x8_t inputN;
238
+
239
+ for (int i = 0; i < N; i++)
240
+ {
241
+ inputi = vmovl_u8(*(uint8x8_t *)&srccol + i);
242
+ }
243
+ vsum = voffset;
244
+ vsum = vmlaq_laneq_s16(vsum, (input0), vc, 0);
245
+ vsum = vmlaq_laneq_s16(vsum, (input1), vc, 1);
246
+ vsum = vmlaq_laneq_s16(vsum, (input2), vc, 2);
247
+ vsum = vmlaq_laneq_s16(vsum, (input3), vc, 3);
248
+
249
+
250
+ if (N == 8)
251
+ {
252
+ vsum = vmlaq_laneq_s16(vsum, (input4), vc, 4);
253
+ vsum = vmlaq_laneq_s16(vsum, (input5), vc, 5);
254
+ vsum = vmlaq_laneq_s16(vsum, (input6), vc, 6);
255
+ vsum = vmlaq_laneq_s16(vsum, (input7), vc, 7);
256
+
257
+ }
258
+
259
+ vsum = vshlq_s16(vsum, vhr);
260
+ *(int16x8_t *)&dstcol = vsum;
261
+ }
262
+
263
+ src += srcStride;
264
+ dst += dstStride;
265
+ }
266
+}
267
+
268
+#endif
269
+
270
+
271
+template<int N, int width, int height>
272
+void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
273
+{
274
+ const int16_t *c = (N == 8 ? g_lumaFiltercoeffIdx : g_chromaFiltercoeffIdx);
275
+ int shift = IF_FILTER_PREC;
276
+ src -= (N / 2 - 1) * srcStride;
277
+ int16x8_t vc;
278
+ vc = *(int16x8_t *)c;
279
+ int16x4_t low_vc = vget_low_s16(vc);
280
+ int16x4_t high_vc = vget_high_s16(vc);
281
+
282
+ const int32x4_t vhr = vdupq_n_s32(-shift);
283
+
284
+ int row, col;
285
+ for (row = 0; row < height; row++)
286
+ {
287
+ for (col = 0; col < width; col += 8)
288
+ {
289
+ int32x4_t vsum1, vsum2;
290
+
291
+ int16x8_t inputN;
292
+
293
+ for (int i = 0; i < N; i++)
294
+ {
295
+ inputi = *(int16x8_t *)&srccol + i * srcStride;
296
+ }
297
+
298
+ vsum1 = vmull_lane_s16(vget_low_s16(input0), low_vc, 0);
299
+ vsum2 = vmull_high_lane_s16(input0, low_vc, 0);
300
+
301
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input1), low_vc, 1);
302
+ vsum2 = vmlal_high_lane_s16(vsum2, input1, low_vc, 1);
303
+
304
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input2), low_vc, 2);
305
+ vsum2 = vmlal_high_lane_s16(vsum2, input2, low_vc, 2);
306
+
307
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input3), low_vc, 3);
308
+ vsum2 = vmlal_high_lane_s16(vsum2, input3, low_vc, 3);
309
+
310
+ if (N == 8)
311
+ {
312
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input4), high_vc, 0);
313
+ vsum2 = vmlal_high_lane_s16(vsum2, input4, high_vc, 0);
314
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input5), high_vc, 1);
315
+ vsum2 = vmlal_high_lane_s16(vsum2, input5, high_vc, 1);
316
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input6), high_vc, 2);
317
+ vsum2 = vmlal_high_lane_s16(vsum2, input6, high_vc, 2);
318
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input7), high_vc, 3);
319
+ vsum2 = vmlal_high_lane_s16(vsum2, input7, high_vc, 3);
320
+
321
+ }
322
+
323
+ vsum1 = vshlq_s32(vsum1, vhr);
324
+ vsum2 = vshlq_s32(vsum2, vhr);
325
+
326
+ int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
327
+ *(int16x8_t *)&dstcol = vsum;
328
+ }
329
+
330
+ src += srcStride;
331
+ dst += dstStride;
332
+ }
333
+
334
+}
335
+
336
+
337
+#if HIGH_BIT_DEPTH
338
+
339
+template<int N, int width, int height>
340
+void interp_vert_pp_neon(const uint16_t *src, intptr_t srcStride, uint16_t *dst, intptr_t dstStride, int coeffIdx)
341
+{
342
+
343
+ const int16_t *c = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
344
+ int shift = IF_FILTER_PREC;
345
+ int offset = 1 << (shift - 1);
346
+ const uint16_t maxVal = (1 << X265_DEPTH) - 1;
347
+
348
+ src -= (N / 2 - 1) * srcStride;
349
+ int16x8_t vc;
350
+ vc = *(int16x8_t *)c;
351
+ int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
352
+ int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
353
+
354
+ const int32x4_t voffset = vdupq_n_s32(offset);
355
+ const int32x4_t vhr = vdupq_n_s32(-shift);
356
+
357
+ int row, col;
358
+ for (row = 0; row < height; row++)
359
+ {
360
+ for (col = 0; col < width; col += 4)
361
+ {
362
+ int32x4_t vsum;
363
+
364
+ int32x4_t inputN;
365
+
366
+ for (int i = 0; i < N; i++)
367
+ {
368
+ inputi = vmovl_u16(*(uint16x4_t *)&srccol + i * srcStride);
369
+ }
370
+ vsum = voffset;
371
+
372
+ vsum = vmlaq_laneq_s32(vsum, (input0), low_vc, 0);
373
+ vsum = vmlaq_laneq_s32(vsum, (input1), low_vc, 1);
374
+ vsum = vmlaq_laneq_s32(vsum, (input2), low_vc, 2);
375
+ vsum = vmlaq_laneq_s32(vsum, (input3), low_vc, 3);
376
+
377
+ if (N == 8)
378
+ {
379
+ vsum = vmlaq_laneq_s32(vsum, (input4), high_vc, 0);
380
+ vsum = vmlaq_laneq_s32(vsum, (input5), high_vc, 1);
381
+ vsum = vmlaq_laneq_s32(vsum, (input6), high_vc, 2);
382
+ vsum = vmlaq_laneq_s32(vsum, (input7), high_vc, 3);
383
+ }
384
+
385
+ vsum = vshlq_s32(vsum, vhr);
386
+ vsum = vminq_s32(vsum, vdupq_n_s32(maxVal));
387
+ vsum = vmaxq_s32(vsum, vdupq_n_s32(0));
388
+ *(uint16x4_t *)&dstcol = vmovn_u32(vsum);
389
+ }
390
+ src += srcStride;
391
+ dst += dstStride;
392
+ }
393
+}
394
+
395
+
396
+
397
+
398
+#else
399
+
400
+template<int N, int width, int height>
401
+void interp_vert_pp_neon(const uint8_t *src, intptr_t srcStride, uint8_t *dst, intptr_t dstStride, int coeffIdx)
402
+{
403
+
404
+ const int16_t *c = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
405
+ int shift = IF_FILTER_PREC;
406
+ int offset = 1 << (shift - 1);
407
+ const uint16_t maxVal = (1 << X265_DEPTH) - 1;
408
+
409
+ src -= (N / 2 - 1) * srcStride;
410
+ int16x8_t vc;
411
+ vc = *(int16x8_t *)c;
412
+
413
+ const int16x8_t voffset = vdupq_n_s16(offset);
414
+ const int16x8_t vhr = vdupq_n_s16(-shift);
415
+
416
+ int row, col;
417
+ for (row = 0; row < height; row++)
418
+ {
419
+ for (col = 0; col < width; col += 8)
420
+ {
421
+ int16x8_t vsum;
422
+
423
+ int16x8_t inputN;
424
+
425
+ for (int i = 0; i < N; i++)
426
+ {
427
+ inputi = vmovl_u8(*(uint8x8_t *)&srccol + i * srcStride);
428
+ }
429
+ vsum = voffset;
430
+
431
+ vsum = vmlaq_laneq_s16(vsum, (input0), vc, 0);
432
+ vsum = vmlaq_laneq_s16(vsum, (input1), vc, 1);
433
+ vsum = vmlaq_laneq_s16(vsum, (input2), vc, 2);
434
+ vsum = vmlaq_laneq_s16(vsum, (input3), vc, 3);
435
+
436
+ if (N == 8)
437
+ {
438
+ vsum = vmlaq_laneq_s16(vsum, (input4), vc, 4);
439
+ vsum = vmlaq_laneq_s16(vsum, (input5), vc, 5);
440
+ vsum = vmlaq_laneq_s16(vsum, (input6), vc, 6);
441
+ vsum = vmlaq_laneq_s16(vsum, (input7), vc, 7);
442
+
443
+ }
444
+
445
+ vsum = vshlq_s16(vsum, vhr);
446
+
447
+ vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
448
+ vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
449
+ uint8x16_t usum = vuzp1q_u8(vsum, vsum);
450
+ *(uint8x8_t *)&dstcol = vget_low_u8(usum);
451
+
452
+ }
453
+
454
+ src += srcStride;
455
+ dst += dstStride;
456
+ }
457
+}
458
+
459
+
460
+#endif
461
+
462
+
463
+#if HIGH_BIT_DEPTH
464
+
465
+template<int N, int width, int height>
466
+void interp_vert_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
467
+{
468
+ const int16_t *c = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
469
+ int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
470
+ int shift = IF_FILTER_PREC - headRoom;
471
+ int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
472
+ src -= (N / 2 - 1) * srcStride;
473
+
474
+ int16x8_t vc;
475
+ vc = *(int16x8_t *)c;
476
+ int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
477
+ int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
478
+
479
+ const int32x4_t voffset = vdupq_n_s32(offset);
480
+ const int32x4_t vhr = vdupq_n_s32(-shift);
481
+
482
+ int row, col;
483
+ for (row = 0; row < height; row++)
484
+ {
485
+ for (col = 0; col < width; col += 4)
486
+ {
487
+ int16x8_t vsum;
488
+
489
+ int16x8_t inputN;
490
+
491
+ for (int i = 0; i < N; i++)
492
+ {
493
+ inputi = vmovl_u16(*(uint16x4_t *)&srccol + i * srcStride);
494
+ }
495
+ vsum = voffset;
496
+
497
+ vsum = vmlaq_laneq_s32(vsum, (input0), low_vc, 0);
498
+ vsum = vmlaq_laneq_s32(vsum, (input1), low_vc, 1);
499
+ vsum = vmlaq_laneq_s32(vsum, (input2), low_vc, 2);
500
+ vsum = vmlaq_laneq_s32(vsum, (input3), low_vc, 3);
501
+
502
+ if (N == 8)
503
+ {
504
+ int16x8_t vsum1 = vmulq_laneq_s32((input4), high_vc, 0);
505
+ vsum1 = vmlaq_laneq_s32(vsum1, (input5), high_vc, 1);
506
+ vsum1 = vmlaq_laneq_s32(vsum1, (input6), high_vc, 2);
507
+ vsum1 = vmlaq_laneq_s32(vsum1, (input7), high_vc, 3);
508
+ vsum = vaddq_s32(vsum, vsum1);
509
+ }
510
+
511
+ vsum = vshlq_s32(vsum, vhr);
512
+
513
+ *(uint16x4_t *)&dstcol = vmovn_s32(vsum);
514
+ }
515
+
516
+ src += srcStride;
517
+ dst += dstStride;
518
+ }
519
+}
520
+
521
+#else
522
+
523
+template<int N, int width, int height>
524
+void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
525
+{
526
+ const int16_t *c = (N == 4) ? g_chromaFiltercoeffIdx : g_lumaFiltercoeffIdx;
527
+ int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
528
+ int shift = IF_FILTER_PREC - headRoom;
529
+ int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
530
+ src -= (N / 2 - 1) * srcStride;
531
+
532
+ int16x8_t vc;
533
+ vc = *(int16x8_t *)c;
534
+
535
+ const int16x8_t voffset = vdupq_n_s16(offset);
536
+ const int16x8_t vhr = vdupq_n_s16(-shift);
537
+
538
+ int row, col;
539
+ for (row = 0; row < height; row++)
540
+ {
541
+ for (col = 0; col < width; col += 8)
542
+ {
543
+ int16x8_t vsum;
544
+
545
+ int16x8_t inputN;
546
+
547
+ for (int i = 0; i < N; i++)
548
+ {
549
+ inputi = vmovl_u8(*(uint8x8_t *)&srccol + i * srcStride);
550
+ }
551
+ vsum = voffset;
552
+
553
+ vsum = vmlaq_laneq_s16(vsum, (input0), vc, 0);
554
+ vsum = vmlaq_laneq_s16(vsum, (input1), vc, 1);
555
+ vsum = vmlaq_laneq_s16(vsum, (input2), vc, 2);
556
+ vsum = vmlaq_laneq_s16(vsum, (input3), vc, 3);
557
+
558
+ if (N == 8)
559
+ {
560
+ int16x8_t vsum1 = vmulq_laneq_s16((input4), vc, 4);
561
+ vsum1 = vmlaq_laneq_s16(vsum1, (input5), vc, 5);
562
+ vsum1 = vmlaq_laneq_s16(vsum1, (input6), vc, 6);
563
+ vsum1 = vmlaq_laneq_s16(vsum1, (input7), vc, 7);
564
+ vsum = vaddq_s16(vsum, vsum1);
565
+ }
566
+
567
+ vsum = vshlq_s32(vsum, vhr);
568
+ *(int16x8_t *)&dstcol = vsum;
569
+ }
570
+
571
+ src += srcStride;
572
+ dst += dstStride;
573
+ }
574
+}
575
+
576
+#endif
577
+
578
+
579
+
580
+template<int N, int width, int height>
581
+void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
582
+{
583
+ int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
584
+ int shift = IF_FILTER_PREC + headRoom;
585
+ int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
586
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
587
+ const int16_t *coeff = (N == 8 ? g_lumaFiltercoeffIdx : g_chromaFiltercoeffIdx);
588
+
589
+ src -= (N / 2 - 1) * srcStride;
590
+
591
+ int16x8_t vc;
592
+ vc = *(int16x8_t *)coeff;
593
+ int16x4_t low_vc = vget_low_s16(vc);
594
+ int16x4_t high_vc = vget_high_s16(vc);
595
+
596
+ const int32x4_t voffset = vdupq_n_s32(offset);
597
+ const int32x4_t vhr = vdupq_n_s32(-shift);
598
+
599
+ int row, col;
600
+ for (row = 0; row < height; row++)
601
+ {
602
+ for (col = 0; col < width; col += 8)
603
+ {
604
+ int32x4_t vsum1, vsum2;
605
+
606
+ int16x8_t inputN;
607
+
608
+ for (int i = 0; i < N; i++)
609
+ {
610
+ inputi = *(int16x8_t *)&srccol + i * srcStride;
611
+ }
612
+ vsum1 = voffset;
613
+ vsum2 = voffset;
614
+
615
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input0), low_vc, 0);
616
+ vsum2 = vmlal_high_lane_s16(vsum2, input0, low_vc, 0);
617
+
618
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input1), low_vc, 1);
619
+ vsum2 = vmlal_high_lane_s16(vsum2, input1, low_vc, 1);
620
+
621
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input2), low_vc, 2);
622
+ vsum2 = vmlal_high_lane_s16(vsum2, input2, low_vc, 2);
623
+
624
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input3), low_vc, 3);
625
+ vsum2 = vmlal_high_lane_s16(vsum2, input3, low_vc, 3);
626
+
627
+ if (N == 8)
628
+ {
629
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input4), high_vc, 0);
630
+ vsum2 = vmlal_high_lane_s16(vsum2, input4, high_vc, 0);
631
+
632
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input5), high_vc, 1);
633
+ vsum2 = vmlal_high_lane_s16(vsum2, input5, high_vc, 1);
634
+
635
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input6), high_vc, 2);
636
+ vsum2 = vmlal_high_lane_s16(vsum2, input6, high_vc, 2);
637
+
638
+ vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input7), high_vc, 3);
639
+ vsum2 = vmlal_high_lane_s16(vsum2, input7, high_vc, 3);
640
+ }
641
+
642
+ vsum1 = vshlq_s32(vsum1, vhr);
643
+ vsum2 = vshlq_s32(vsum2, vhr);
644
+
645
+ int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
646
+ vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
647
+ vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
648
+#if HIGH_BIT_DEPTH
649
+ *(int16x8_t *)&dstcol = vsum;
650
+#else
651
+ uint8x16_t usum = vuzp1q_u8(vsum, vsum);
652
+ *(uint8x8_t *)&dstcol = vget_low_u8(usum);
653
+#endif
654
+
655
+ }
656
+
657
+ src += srcStride;
658
+ dst += dstStride;
659
+ }
660
+}
661
+
662
+
663
+
664
+
665
+
666
+
667
+template<int N, int width, int height>
668
+void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
669
+{
670
+ ALIGN_VAR_32(int16_t, immedwidth * (height + N - 1));
671
+
672
+ interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
673
+ interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
674
+}
675
+
676
+
677
+
678
+}
679
+
680
+
681
+
682
+
683
+namespace X265_NS
684
+{
685
+#if defined(__APPLE__)
686
+#define CHROMA_420(W, H) \
687
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
688
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>; \
689
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>; \
690
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>; \
691
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
692
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
693
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
694
+
695
+#define CHROMA_FILTER_420(W, H) \
696
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>;
697
+
698
+#else // defined(__APPLE__)
699
+#define CHROMA_420(W, H) \
700
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
701
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
702
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
703
+
704
+#define CHROMA_FILTER_420(W, H) \
705
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
706
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>; \
707
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>; \
708
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>; \
709
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;
710
+#endif // defined(__APPLE__)
711
+
712
+#if defined(__APPLE__)
713
+#define CHROMA_422(W, H) \
714
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
715
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>; \
716
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>; \
717
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>; \
718
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
719
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
720
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
721
+
722
+#define CHROMA_FILTER_422(W, H) \
723
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>;
724
+
725
+#else // defined(__APPLE__)
726
+#define CHROMA_422(W, H) \
727
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
728
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
729
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
730
+
731
+#define CHROMA_FILTER_422(W, H) \
732
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
733
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>; \
734
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>; \
735
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>; \
736
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>;
737
+#endif // defined(__APPLE__)
738
+
739
+#if defined(__APPLE__)
740
+#define CHROMA_444(W, H) \
741
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
742
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>; \
743
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>; \
744
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>; \
745
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>; \
746
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
747
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
748
+
749
+#define CHROMA_FILTER_444(W, H) \
750
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>;
751
+
752
+#else // defined(__APPLE__)
753
+#define CHROMA_444(W, H) \
754
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
755
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.p2sALIGNED = filterPixelToShort_neon<W, H>;
756
+
757
+#define CHROMA_FILTER_444(W, H) \
758
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hpp = interp_horiz_pp_neon<4, W, H>; \
759
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_hps = interp_horiz_ps_neon<4, W, H>; \
760
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vpp = interp_vert_pp_neon<4, W, H>; \
761
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vps = interp_vert_ps_neon<4, W, H>; \
762
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vsp = interp_vert_sp_neon<4, W, H>; \
763
+ p.chromaX265_CSP_I444.puLUMA_ ## W ## x ## H.filter_vss = interp_vert_ss_neon<4, W, H>;
764
+#endif // defined(__APPLE__)
765
+
766
+#if defined(__APPLE__)
767
+#define LUMA(W, H) \
768
+ p.puLUMA_ ## W ## x ## H.luma_hpp = interp_horiz_pp_neon<8, W, H>; \
769
+ p.puLUMA_ ## W ## x ## H.luma_vpp = interp_vert_pp_neon<8, W, H>; \
770
+ p.puLUMA_ ## W ## x ## H.luma_vps = interp_vert_ps_neon<8, W, H>; \
771
+ p.puLUMA_ ## W ## x ## H.luma_vsp = interp_vert_sp_neon<8, W, H>; \
772
+ p.puLUMA_ ## W ## x ## H.luma_vss = interp_vert_ss_neon<8, W, H>; \
773
+ p.puLUMA_ ## W ## x ## H.luma_hvpp = interp_hv_pp_neon<8, W, H>; \
774
+ p.puLUMA_ ## W ## x ## H.convert_p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
775
+ p.puLUMA_ ## W ## x ## H.convert_p2sALIGNED = filterPixelToShort_neon<W, H>;
776
+
777
+#else // defined(__APPLE__)
778
+#define LUMA(W, H) \
779
+ p.puLUMA_ ## W ## x ## H.luma_vss = interp_vert_ss_neon<8, W, H>; \
780
+ p.puLUMA_ ## W ## x ## H.convert_p2sNONALIGNED = filterPixelToShort_neon<W, H>;\
781
+ p.puLUMA_ ## W ## x ## H.convert_p2sALIGNED = filterPixelToShort_neon<W, H>;
782
+
783
+#define LUMA_FILTER(W, H) \
784
+ p.puLUMA_ ## W ## x ## H.luma_hpp = interp_horiz_pp_neon<8, W, H>; \
785
+ p.puLUMA_ ## W ## x ## H.luma_vpp = interp_vert_pp_neon<8, W, H>; \
786
+ p.puLUMA_ ## W ## x ## H.luma_vps = interp_vert_ps_neon<8, W, H>; \
787
+ p.puLUMA_ ## W ## x ## H.luma_vsp = interp_vert_sp_neon<8, W, H>; \
788
+ p.puLUMA_ ## W ## x ## H.luma_hvpp = interp_hv_pp_neon<8, W, H>;
789
+#endif // defined(__APPLE__)
790
+
791
+void setupFilterPrimitives_neon(EncoderPrimitives &p)
792
+{
793
+
794
+ // All neon functions assume width of multiple of 8, (2,4,12 variants are not optimized)
795
+
796
+ LUMA(8, 8);
797
+ LUMA(8, 4);
798
+ LUMA(16, 16);
799
+ CHROMA_420(8, 8);
800
+ LUMA(16, 8);
801
+ CHROMA_420(8, 4);
802
+ LUMA(8, 16);
803
+ LUMA(16, 12);
804
+ CHROMA_420(8, 6);
805
+ LUMA(16, 4);
806
+ CHROMA_420(8, 2);
807
+ LUMA(32, 32);
808
+ CHROMA_420(16, 16);
809
+ LUMA(32, 16);
810
+ CHROMA_420(16, 8);
811
+ LUMA(16, 32);
812
+ CHROMA_420(8, 16);
813
+ LUMA(32, 24);
814
+ CHROMA_420(16, 12);
815
+ LUMA(24, 32);
816
+ LUMA(32, 8);
817
+ CHROMA_420(16, 4);
818
+ LUMA(8, 32);
819
+ LUMA(64, 64);
820
+ CHROMA_420(32, 32);
821
+ LUMA(64, 32);
822
+ CHROMA_420(32, 16);
823
+ LUMA(32, 64);
824
+ CHROMA_420(16, 32);
825
+ LUMA(64, 48);
826
+ CHROMA_420(32, 24);
827
+ LUMA(48, 64);
828
+ CHROMA_420(24, 32);
829
+ LUMA(64, 16);
830
+ CHROMA_420(32, 8);
831
+ LUMA(16, 64);
832
+ CHROMA_420(8, 32);
833
+ CHROMA_422(8, 16);
834
+ CHROMA_422(8, 8);
835
+ CHROMA_422(8, 12);
836
+ CHROMA_422(8, 4);
837
+ CHROMA_422(16, 32);
838
+ CHROMA_422(16, 16);
839
+ CHROMA_422(8, 32);
840
+ CHROMA_422(16, 24);
841
+ CHROMA_422(16, 8);
842
+ CHROMA_422(32, 64);
843
+ CHROMA_422(32, 32);
844
+ CHROMA_422(16, 64);
845
+ CHROMA_422(32, 48);
846
+ CHROMA_422(24, 64);
847
+ CHROMA_422(32, 16);
848
+ CHROMA_422(8, 64);
849
+ CHROMA_444(8, 8);
850
+ CHROMA_444(8, 4);
851
+ CHROMA_444(16, 16);
852
+ CHROMA_444(16, 8);
853
+ CHROMA_444(8, 16);
854
+ CHROMA_444(16, 12);
855
+ CHROMA_444(16, 4);
856
+ CHROMA_444(32, 32);
857
+ CHROMA_444(32, 16);
858
+ CHROMA_444(16, 32);
859
+ CHROMA_444(32, 24);
860
+ CHROMA_444(24, 32);
861
+ CHROMA_444(32, 8);
862
+ CHROMA_444(8, 32);
863
+ CHROMA_444(64, 64);
864
+ CHROMA_444(64, 32);
865
+ CHROMA_444(32, 64);
866
+ CHROMA_444(64, 48);
867
+ CHROMA_444(48, 64);
868
+ CHROMA_444(64, 16);
869
+ CHROMA_444(16, 64);
870
+
871
+#if defined(__APPLE__) || HIGH_BIT_DEPTH
872
+ p.puLUMA_8x4.luma_hps = interp_horiz_ps_neon<8, 8, 4>;
873
+ p.puLUMA_8x8.luma_hps = interp_horiz_ps_neon<8, 8, 8>;
874
+ p.puLUMA_8x16.luma_hps = interp_horiz_ps_neon<8, 8, 16>;
875
+ p.puLUMA_8x32.luma_hps = interp_horiz_ps_neon<8, 8, 32>;
876
+#endif // HIGH_BIT_DEPTH
877
+
878
+#if !defined(__APPLE__) && HIGH_BIT_DEPTH
879
+ p.puLUMA_24x32.luma_hps = interp_horiz_ps_neon<8, 24, 32>;
880
+#endif // !defined(__APPLE__)
881
+
882
+#if !defined(__APPLE__)
883
+ p.puLUMA_32x8.luma_hpp = interp_horiz_pp_neon<8, 32, 8>;
884
+ p.puLUMA_32x16.luma_hpp = interp_horiz_pp_neon<8, 32, 16>;
885
+ p.puLUMA_32x24.luma_hpp = interp_horiz_pp_neon<8, 32, 24>;
886
+ p.puLUMA_32x32.luma_hpp = interp_horiz_pp_neon<8, 32, 32>;
887
+ p.puLUMA_32x64.luma_hpp = interp_horiz_pp_neon<8, 32, 64>;
888
+ p.puLUMA_48x64.luma_hpp = interp_horiz_pp_neon<8, 48, 64>;
889
+ p.puLUMA_64x16.luma_hpp = interp_horiz_pp_neon<8, 64, 16>;
890
+ p.puLUMA_64x32.luma_hpp = interp_horiz_pp_neon<8, 64, 32>;
891
+ p.puLUMA_64x48.luma_hpp = interp_horiz_pp_neon<8, 64, 48>;
892
+ p.puLUMA_64x64.luma_hpp = interp_horiz_pp_neon<8, 64, 64>;
893
+
894
+ LUMA_FILTER(8, 4);
895
+ LUMA_FILTER(8, 8);
896
+ LUMA_FILTER(8, 16);
897
+ LUMA_FILTER(8, 32);
898
+ LUMA_FILTER(24, 32);
899
+
900
+ LUMA_FILTER(16, 32);
901
+ LUMA_FILTER(32, 16);
902
+ LUMA_FILTER(32, 24);
903
+ LUMA_FILTER(32, 32);
904
+ LUMA_FILTER(32, 64);
905
+ LUMA_FILTER(48, 64);
906
+ LUMA_FILTER(64, 32);
907
+ LUMA_FILTER(64, 48);
908
+ LUMA_FILTER(64, 64);
909
+
910
+ CHROMA_FILTER_420(24, 32);
911
+
912
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.filter_hpp = interp_horiz_pp_neon<4, 32, 8>;
913
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
914
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.filter_hpp = interp_horiz_pp_neon<4, 32, 24>;
915
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
916
+
917
+ CHROMA_FILTER_422(24, 64);
918
+
919
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
920
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
921
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.filter_hpp = interp_horiz_pp_neon<4, 32, 48>;
922
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.filter_hpp = interp_horiz_pp_neon<4, 32, 64>;
923
+
924
+ CHROMA_FILTER_444(24, 32);
925
+
926
+ p.chromaX265_CSP_I444.puLUMA_32x8.filter_hpp = interp_horiz_pp_neon<4, 32, 8>;
927
+ p.chromaX265_CSP_I444.puLUMA_32x16.filter_hpp = interp_horiz_pp_neon<4, 32, 16>;
928
+ p.chromaX265_CSP_I444.puLUMA_32x24.filter_hpp = interp_horiz_pp_neon<4, 32, 24>;
929
+ p.chromaX265_CSP_I444.puLUMA_32x32.filter_hpp = interp_horiz_pp_neon<4, 32, 32>;
930
+ p.chromaX265_CSP_I444.puLUMA_32x64.filter_hpp = interp_horiz_pp_neon<4, 32, 64>;
931
+ p.chromaX265_CSP_I444.puLUMA_48x64.filter_hpp = interp_horiz_pp_neon<4, 48, 64>;
932
+ p.chromaX265_CSP_I444.puLUMA_64x16.filter_hpp = interp_horiz_pp_neon<4, 64, 16>;
933
+ p.chromaX265_CSP_I444.puLUMA_64x32.filter_hpp = interp_horiz_pp_neon<4, 64, 32>;
934
+ p.chromaX265_CSP_I444.puLUMA_64x48.filter_hpp = interp_horiz_pp_neon<4, 64, 48>;
935
+ p.chromaX265_CSP_I444.puLUMA_64x64.filter_hpp = interp_horiz_pp_neon<4, 64, 64>;
936
+
937
+ p.chromaX265_CSP_I444.puLUMA_16x4.filter_vss = interp_vert_ss_neon<4, 16, 4>;
938
+ p.chromaX265_CSP_I444.puLUMA_16x8.filter_vss = interp_vert_ss_neon<4, 16, 8>;
939
+ p.chromaX265_CSP_I444.puLUMA_16x12.filter_vss = interp_vert_ss_neon<4, 16, 12>;
940
+ p.chromaX265_CSP_I444.puLUMA_16x16.filter_vss = interp_vert_ss_neon<4, 16, 16>;
941
+ p.chromaX265_CSP_I444.puLUMA_16x32.filter_vss = interp_vert_ss_neon<4, 16, 32>;
942
+ p.chromaX265_CSP_I444.puLUMA_16x64.filter_vss = interp_vert_ss_neon<4, 16, 64>;
943
+ p.chromaX265_CSP_I444.puLUMA_32x8.filter_vss = interp_vert_ss_neon<4, 32, 8>;
944
+ p.chromaX265_CSP_I444.puLUMA_32x16.filter_vss = interp_vert_ss_neon<4, 32, 16>;
945
+ p.chromaX265_CSP_I444.puLUMA_32x24.filter_vss = interp_vert_ss_neon<4, 32, 24>;
946
+ p.chromaX265_CSP_I444.puLUMA_32x32.filter_vss = interp_vert_ss_neon<4, 32, 32>;
947
+ p.chromaX265_CSP_I444.puLUMA_32x64.filter_vss = interp_vert_ss_neon<4, 32, 64>;
948
+#endif // !defined(__APPLE__)
949
+
950
+ CHROMA_FILTER_420(8, 2);
951
+ CHROMA_FILTER_420(8, 4);
952
+ CHROMA_FILTER_420(8, 6);
953
+ CHROMA_FILTER_420(8, 8);
954
+ CHROMA_FILTER_420(8, 16);
955
+ CHROMA_FILTER_420(8, 32);
956
+
957
+ CHROMA_FILTER_422(8, 4);
958
+ CHROMA_FILTER_422(8, 8);
959
+ CHROMA_FILTER_422(8, 12);
960
+ CHROMA_FILTER_422(8, 16);
961
+ CHROMA_FILTER_422(8, 32);
962
+ CHROMA_FILTER_422(8, 64);
963
+
964
+ CHROMA_FILTER_444(8, 4);
965
+ CHROMA_FILTER_444(8, 8);
966
+ CHROMA_FILTER_444(8, 16);
967
+ CHROMA_FILTER_444(8, 32);
968
+
969
+#if defined(__APPLE__)
970
+ CHROMA_FILTER_420(16, 4);
971
+ CHROMA_FILTER_420(16, 8);
972
+ CHROMA_FILTER_420(16, 12);
973
+ CHROMA_FILTER_420(16, 16);
974
+ CHROMA_FILTER_420(16, 32);
975
+
976
+ CHROMA_FILTER_422(16, 8);
977
+ CHROMA_FILTER_422(16, 16);
978
+ CHROMA_FILTER_422(16, 24);
979
+ CHROMA_FILTER_422(16, 32);
980
+ CHROMA_FILTER_422(16, 64);
981
+
982
+ CHROMA_FILTER_444(16, 4);
983
+ CHROMA_FILTER_444(16, 8);
984
+ CHROMA_FILTER_444(16, 12);
985
+ CHROMA_FILTER_444(16, 16);
986
+ CHROMA_FILTER_444(16, 32);
987
+ CHROMA_FILTER_444(16, 64);
988
+#endif // defined(__APPLE__)
989
+}
990
+
991
+};
992
+
993
+
994
+#endif
995
+
996
+
997
x265_3.6.tar.gz/source/common/aarch64/filter-prim.h
Added
23
1
2
+#ifndef _FILTER_PRIM_ARM64_H__
3
+#define _FILTER_PRIM_ARM64_H__
4
+
5
+
6
+#include "common.h"
7
+#include "slicetype.h" // LOWRES_COST_MASK
8
+#include "primitives.h"
9
+#include "x265.h"
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+void setupFilterPrimitives_neon(EncoderPrimitives &p);
17
+
18
+};
19
+
20
+
21
+#endif
22
+
23
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h
Added
258
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#define FUNCDEF_TU(ret, name, cpu, ...) \
26
+ ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
27
+ ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
28
+ ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
29
+ ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
30
+ ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
31
+
32
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
33
+ ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
34
+ ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
35
+ ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
36
+ ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
37
+ ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
38
+
39
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
40
+ ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
41
+ ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
42
+ ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
43
+ ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
44
+ ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
45
+
46
+#define FUNCDEF_PU(ret, name, cpu, ...) \
47
+ ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
48
+ ret PFX(name ## _8x8_ ## cpu)(__VA_ARGS__); \
49
+ ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
50
+ ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
51
+ ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
52
+ ret PFX(name ## _8x4_ ## cpu)(__VA_ARGS__); \
53
+ ret PFX(name ## _4x8_ ## cpu)(__VA_ARGS__); \
54
+ ret PFX(name ## _16x8_ ## cpu)(__VA_ARGS__); \
55
+ ret PFX(name ## _8x16_ ## cpu)(__VA_ARGS__); \
56
+ ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
57
+ ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
58
+ ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
59
+ ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
60
+ ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
61
+ ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
62
+ ret PFX(name ## _16x4_ ## cpu)(__VA_ARGS__); \
63
+ ret PFX(name ## _4x16_ ## cpu)(__VA_ARGS__); \
64
+ ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
65
+ ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
66
+ ret PFX(name ## _32x8_ ## cpu)(__VA_ARGS__); \
67
+ ret PFX(name ## _8x32_ ## cpu)(__VA_ARGS__); \
68
+ ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
69
+ ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
70
+ ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
71
+ ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
72
+
73
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
74
+ FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
75
+ ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
76
+ ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
77
+ ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
78
+ ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
79
+ ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
80
+ ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
81
+ ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
82
+ ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
83
+ ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
84
+ ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
85
+ ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
86
+ ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
87
+ ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
88
+ ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
89
+ ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
90
+ ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
91
+ ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
92
+ ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
93
+ ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
94
+ ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
95
+ ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
96
+ ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
97
+ ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
98
+ ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
99
+ ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
100
+ ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
101
+ ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
102
+
103
+#define DECLS(cpu) \
104
+ FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
105
+ FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
106
+ FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
107
+ FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
108
+ FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
109
+ FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
110
+ FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
111
+ FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
112
+ FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
113
+ FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
114
+ FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
115
+ FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
116
+ FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
117
+ FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
118
+ FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
119
+ FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
120
+ FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
121
+ FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
122
+ FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
123
+ FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
124
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
125
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
126
+ FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
127
+ FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
128
+ FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
129
+ FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
130
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
131
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
132
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
133
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
134
+ FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
135
+ FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
136
+ FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
137
+ FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
138
+ FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
139
+ FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
140
+ FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
141
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
142
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
143
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
144
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
145
+ FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
146
+ FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
147
+ FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
148
+ FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
149
+ FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
150
+ FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
151
+ FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
152
+ FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
153
+
154
+DECLS(neon);
155
+DECLS(sve);
156
+DECLS(sve2);
157
+
158
+
159
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
160
+
161
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
162
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
163
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
164
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
165
+
166
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
167
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
168
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
169
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
170
+
171
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
172
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
173
+
174
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
175
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
176
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
177
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
178
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
179
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
180
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
181
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
182
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
183
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
184
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
185
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
186
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
187
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
188
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
189
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
190
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
191
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
192
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
193
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
194
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
195
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
196
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
197
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
198
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
199
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
202
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
203
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
204
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
205
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
206
+
207
+int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
208
+int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
209
+int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
210
+int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
211
+int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
212
+int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
213
+int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
214
+
215
+uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
216
+uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
217
+
218
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
219
+void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
220
+
221
+void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
222
+
223
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
224
+int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
225
+void PFX(weight_pp_neon)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
226
+void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
227
+int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
228
+uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
229
+
230
+uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
231
+uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
232
+uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
233
+uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
234
+
235
+void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
236
+void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
237
+
238
+void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
239
+void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
240
+
241
+int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
242
+int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
243
+int x265_pixel_satd_8x12_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
244
+int x265_pixel_satd_32x16_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
245
+int x265_pixel_satd_32x32_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
246
+int x265_pixel_satd_64x48_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
247
+
248
+uint32_t PFX(quant_sve)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
249
+
250
+void x265_dequant_scaling_sve2(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
251
+void x265_dequant_normal_sve2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
252
+
253
+void x265_ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
254
+
255
+int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
256
+void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
257
+int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
258
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp
Added
267
1
2
+#include "common.h"
3
+#include "primitives.h"
4
+
5
+
6
+#if 1
7
+#include "arm64-utils.h"
8
+#include <arm_neon.h>
9
+
10
+using namespace X265_NS;
11
+
12
+namespace
13
+{
14
+
15
+
16
+
17
+template<int width>
18
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
19
+{
20
+ int width2 = width << 1;
21
+ // Flip the neighbours in the horizontal case.
22
+ int horMode = dirMode < 18;
23
+ pixel neighbourBuf129;
24
+ const pixel *srcPix = srcPix0;
25
+
26
+ if (horMode)
27
+ {
28
+ neighbourBuf0 = srcPix0;
29
+ //for (int i = 0; i < width << 1; i++)
30
+ //{
31
+ // neighbourBuf1 + i = srcPixwidth2 + 1 + i;
32
+ // neighbourBufwidth2 + 1 + i = srcPix1 + i;
33
+ //}
34
+ memcpy(&neighbourBuf1, &srcPixwidth2 + 1, sizeof(pixel) * (width << 1));
35
+ memcpy(&neighbourBufwidth2 + 1, &srcPix1, sizeof(pixel) * (width << 1));
36
+ srcPix = neighbourBuf;
37
+ }
38
+
39
+ // Intra prediction angle and inverse angle tables.
40
+ const int8_t angleTable17 = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
41
+ const int16_t invAngleTable8 = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
42
+
43
+ // Get the prediction angle.
44
+ int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
45
+ int angle = angleTable8 + angleOffset;
46
+
47
+ // Vertical Prediction.
48
+ if (!angle)
49
+ {
50
+ for (int y = 0; y < width; y++)
51
+ {
52
+ memcpy(&dsty * dstStride, srcPix + 1, sizeof(pixel)*width);
53
+ }
54
+ if (bFilter)
55
+ {
56
+ int topLeft = srcPix0, top = srcPix1;
57
+ for (int y = 0; y < width; y++)
58
+ {
59
+ dsty * dstStride = x265_clip((int16_t)(top + ((srcPixwidth2 + 1 + y - topLeft) >> 1)));
60
+ }
61
+ }
62
+ }
63
+ else // Angular prediction.
64
+ {
65
+ // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf1).
66
+ pixel refBuf64;
67
+ const pixel *ref;
68
+
69
+ // Use the projected left neighbours and the top neighbours.
70
+ if (angle < 0)
71
+ {
72
+ // Number of neighbours projected.
73
+ int nbProjected = -((width * angle) >> 5) - 1;
74
+ pixel *ref_pix = refBuf + nbProjected + 1;
75
+
76
+ // Project the neighbours.
77
+ int invAngle = invAngleTable- angleOffset - 1;
78
+ int invAngleSum = 128;
79
+ for (int i = 0; i < nbProjected; i++)
80
+ {
81
+ invAngleSum += invAngle;
82
+ ref_pix- 2 - i = srcPixwidth2 + (invAngleSum >> 8);
83
+ }
84
+
85
+ // Copy the top-left and top pixels.
86
+ //for (int i = 0; i < width + 1; i++)
87
+ //ref_pix-1 + i = srcPixi;
88
+
89
+ memcpy(&ref_pix-1, srcPix, (width + 1)*sizeof(pixel));
90
+ ref = ref_pix;
91
+ }
92
+ else // Use the top and top-right neighbours.
93
+ {
94
+ ref = srcPix + 1;
95
+ }
96
+
97
+ // Pass every row.
98
+ int angleSum = 0;
99
+ for (int y = 0; y < width; y++)
100
+ {
101
+ angleSum += angle;
102
+ int offset = angleSum >> 5;
103
+ int fraction = angleSum & 31;
104
+
105
+ if (fraction) // Interpolate
106
+ {
107
+ if (width >= 8 && sizeof(pixel) == 1)
108
+ {
109
+ const int16x8_t f0 = vdupq_n_s16(32 - fraction);
110
+ const int16x8_t f1 = vdupq_n_s16(fraction);
111
+ for (int x = 0; x < width; x += 8)
112
+ {
113
+ uint8x8_t in0 = *(uint8x8_t *)&refoffset + x;
114
+ uint8x8_t in1 = *(uint8x8_t *)&refoffset + x + 1;
115
+ int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
116
+ lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
117
+ lo = vshrq_n_s16(lo, 5);
118
+ *(uint8x8_t *)&dsty * dstStride + x = vmovn_u16(lo);
119
+ }
120
+ }
121
+ else if (width >= 4 && sizeof(pixel) == 2)
122
+ {
123
+ const int32x4_t f0 = vdupq_n_s32(32 - fraction);
124
+ const int32x4_t f1 = vdupq_n_s32(fraction);
125
+ for (int x = 0; x < width; x += 4)
126
+ {
127
+ uint16x4_t in0 = *(uint16x4_t *)&refoffset + x;
128
+ uint16x4_t in1 = *(uint16x4_t *)&refoffset + x + 1;
129
+ int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
130
+ lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
131
+ lo = vshrq_n_s32(lo, 5);
132
+ *(uint16x4_t *)&dsty * dstStride + x = vmovn_u32(lo);
133
+ }
134
+ }
135
+ else
136
+ {
137
+ for (int x = 0; x < width; x++)
138
+ {
139
+ dsty * dstStride + x = (pixel)(((32 - fraction) * refoffset + x + fraction * refoffset + x + 1 + 16) >> 5);
140
+ }
141
+ }
142
+ }
143
+ else // Copy.
144
+ {
145
+ memcpy(&dsty * dstStride, &refoffset, sizeof(pixel)*width);
146
+ }
147
+ }
148
+ }
149
+
150
+ // Flip for horizontal.
151
+ if (horMode)
152
+ {
153
+ if (width == 8)
154
+ {
155
+ transpose8x8(dst, dst, dstStride, dstStride);
156
+ }
157
+ else if (width == 16)
158
+ {
159
+ transpose16x16(dst, dst, dstStride, dstStride);
160
+ }
161
+ else if (width == 32)
162
+ {
163
+ transpose32x32(dst, dst, dstStride, dstStride);
164
+ }
165
+ else
166
+ {
167
+ for (int y = 0; y < width - 1; y++)
168
+ {
169
+ for (int x = y + 1; x < width; x++)
170
+ {
171
+ pixel tmp = dsty * dstStride + x;
172
+ dsty * dstStride + x = dstx * dstStride + y;
173
+ dstx * dstStride + y = tmp;
174
+ }
175
+ }
176
+ }
177
+ }
178
+}
179
+
180
+template<int log2Size>
181
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
182
+{
183
+ const int size = 1 << log2Size;
184
+ for (int mode = 2; mode <= 34; mode++)
185
+ {
186
+ pixel *srcPix = (g_intraFilterFlagsmode & size ? filtPix : refPix);
187
+ pixel *out = dest + ((mode - 2) << (log2Size * 2));
188
+
189
+ intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
190
+
191
+ // Optimize code don't flip buffer
192
+ bool modeHor = (mode < 18);
193
+
194
+ // transpose the block if this is a horizontal mode
195
+ if (modeHor)
196
+ {
197
+ if (size == 8)
198
+ {
199
+ transpose8x8(out, out, size, size);
200
+ }
201
+ else if (size == 16)
202
+ {
203
+ transpose16x16(out, out, size, size);
204
+ }
205
+ else if (size == 32)
206
+ {
207
+ transpose32x32(out, out, size, size);
208
+ }
209
+ else
210
+ {
211
+ for (int k = 0; k < size - 1; k++)
212
+ {
213
+ for (int l = k + 1; l < size; l++)
214
+ {
215
+ pixel tmp = outk * size + l;
216
+ outk * size + l = outl * size + k;
217
+ outl * size + k = tmp;
218
+ }
219
+ }
220
+ }
221
+ }
222
+ }
223
+}
224
+}
225
+
226
+namespace X265_NS
227
+{
228
+// x265 private namespace
229
+
230
+void setupIntraPrimitives_neon(EncoderPrimitives &p)
231
+{
232
+ for (int i = 2; i < NUM_INTRA_MODE; i++)
233
+ {
234
+ p.cuBLOCK_8x8.intra_predi = intra_pred_ang_neon<8>;
235
+ p.cuBLOCK_16x16.intra_predi = intra_pred_ang_neon<16>;
236
+ p.cuBLOCK_32x32.intra_predi = intra_pred_ang_neon<32>;
237
+ }
238
+ p.cuBLOCK_4x4.intra_pred2 = intra_pred_ang_neon<4>;
239
+ p.cuBLOCK_4x4.intra_pred10 = intra_pred_ang_neon<4>;
240
+ p.cuBLOCK_4x4.intra_pred18 = intra_pred_ang_neon<4>;
241
+ p.cuBLOCK_4x4.intra_pred26 = intra_pred_ang_neon<4>;
242
+ p.cuBLOCK_4x4.intra_pred34 = intra_pred_ang_neon<4>;
243
+
244
+ p.cuBLOCK_4x4.intra_pred_allangs = all_angs_pred_neon<2>;
245
+ p.cuBLOCK_8x8.intra_pred_allangs = all_angs_pred_neon<3>;
246
+ p.cuBLOCK_16x16.intra_pred_allangs = all_angs_pred_neon<4>;
247
+ p.cuBLOCK_32x32.intra_pred_allangs = all_angs_pred_neon<5>;
248
+}
249
+
250
+}
251
+
252
+
253
+
254
+#else
255
+
256
+namespace X265_NS
257
+{
258
+// x265 private namespace
259
+void setupIntraPrimitives_neon(EncoderPrimitives &p)
260
+{}
261
+}
262
+
263
+#endif
264
+
265
+
266
+
267
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.h
Added
17
1
2
+#ifndef INTRAPRED_PRIM_H__
3
+
4
+#if defined(__aarch64__)
5
+
6
+namespace X265_NS
7
+{
8
+// x265 private namespace
9
+
10
+void setupIntraPrimitives_neon(EncoderPrimitives &p);
11
+}
12
+
13
+#endif
14
+
15
+#endif
16
+
17
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S
Added
1438
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+// Macros below follow these conventions:
29
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
+// - constants in registers: v24, v25, v26, v27, v31
31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
+// - _32b macros output a result in v17.4s
33
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
+
35
+#include "asm.S"
36
+
37
+.arch armv8-a
38
+
39
+#ifdef __APPLE__
40
+.section __RODATA,__rodata
41
+#else
42
+.section .rodata
43
+#endif
44
+
45
+.align 4
46
+
47
+.macro vextin8 v
48
+ ldp d6, d7, x11, #16
49
+.if \v == 0
50
+ // qpel_filter_0 only uses values in v3
51
+ ext v3.8b, v6.8b, v7.8b, #4
52
+.else
53
+.if \v != 3
54
+ ext v0.8b, v6.8b, v7.8b, #1
55
+.endif
56
+ ext v1.8b, v6.8b, v7.8b, #2
57
+ ext v2.8b, v6.8b, v7.8b, #3
58
+ ext v3.8b, v6.8b, v7.8b, #4
59
+ ext v4.8b, v6.8b, v7.8b, #5
60
+ ext v5.8b, v6.8b, v7.8b, #6
61
+ ext v6.8b, v6.8b, v7.8b, #7
62
+.endif
63
+.endm
64
+
65
+.macro vextin8_64 v
66
+ ldp q6, q7, x11, #32
67
+.if \v == 0
68
+ // qpel_filter_0 only uses values in v3
69
+ ext v3.16b, v6.16b, v7.16b, #4
70
+.else
71
+.if \v != 3
72
+ // qpel_filter_3 does not use values in v0
73
+ ext v0.16b, v6.16b, v7.16b, #1
74
+.endif
75
+ ext v1.16b, v6.16b, v7.16b, #2
76
+ ext v2.16b, v6.16b, v7.16b, #3
77
+ ext v3.16b, v6.16b, v7.16b, #4
78
+ ext v4.16b, v6.16b, v7.16b, #5
79
+ ext v5.16b, v6.16b, v7.16b, #6
80
+.if \v == 1
81
+ ext v6.16b, v6.16b, v7.16b, #7
82
+ // qpel_filter_1 does not use v7
83
+.else
84
+ ext v16.16b, v6.16b, v7.16b, #7
85
+ ext v7.16b, v6.16b, v7.16b, #8
86
+ mov v6.16b, v16.16b
87
+.endif
88
+.endif
89
+.endm
90
+
91
+.macro vextin8_chroma v
92
+ ldp d6, d7, x11, #16
93
+.if \v == 0
94
+ // qpel_filter_chroma_0 only uses values in v1
95
+ ext v1.8b, v6.8b, v7.8b, #2
96
+.else
97
+ ext v0.8b, v6.8b, v7.8b, #1
98
+ ext v1.8b, v6.8b, v7.8b, #2
99
+ ext v2.8b, v6.8b, v7.8b, #3
100
+ ext v3.8b, v6.8b, v7.8b, #4
101
+.endif
102
+.endm
103
+
104
+.macro vextin8_chroma_64 v
105
+ ldp q16, q17, x11, #32
106
+.if \v == 0
107
+ // qpel_filter_chroma_0 only uses values in v1
108
+ ext v1.16b, v16.16b, v17.16b, #2
109
+.else
110
+ ext v0.16b, v16.16b, v17.16b, #1
111
+ ext v1.16b, v16.16b, v17.16b, #2
112
+ ext v2.16b, v16.16b, v17.16b, #3
113
+ ext v3.16b, v16.16b, v17.16b, #4
114
+.endif
115
+.endm
116
+
117
+.macro qpel_load_32b v
118
+.if \v == 0
119
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
120
+ ld1 {v3.8b}, x6, x1
121
+.elseif \v == 1 || \v == 2 || \v == 3
122
+.if \v != 3 // not used in qpel_filter_3
123
+ ld1 {v0.8b}, x6, x1
124
+.else
125
+ add x6, x6, x1
126
+.endif
127
+ ld1 {v1.8b}, x6, x1
128
+ ld1 {v2.8b}, x6, x1
129
+ ld1 {v3.8b}, x6, x1
130
+ ld1 {v4.8b}, x6, x1
131
+ ld1 {v5.8b}, x6, x1
132
+.if \v != 1 // not used in qpel_filter_1
133
+ ld1 {v6.8b}, x6, x1
134
+ ld1 {v7.8b}, x6
135
+.else
136
+ ld1 {v6.8b}, x6
137
+.endif
138
+.endif
139
+.endm
140
+
141
+.macro qpel_load_64b v
142
+.if \v == 0
143
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
144
+ ld1 {v3.16b}, x6, x1
145
+.elseif \v == 1 || \v == 2 || \v == 3
146
+.if \v != 3 // not used in qpel_filter_3
147
+ ld1 {v0.16b}, x6, x1
148
+.else
149
+ add x6, x6, x1
150
+.endif
151
+ ld1 {v1.16b}, x6, x1
152
+ ld1 {v2.16b}, x6, x1
153
+ ld1 {v3.16b}, x6, x1
154
+ ld1 {v4.16b}, x6, x1
155
+ ld1 {v5.16b}, x6, x1
156
+.if \v != 1 // not used in qpel_filter_1
157
+ ld1 {v6.16b}, x6, x1
158
+ ld1 {v7.16b}, x6
159
+.else
160
+ ld1 {v6.16b}, x6
161
+.endif
162
+.endif
163
+.endm
164
+
165
+.macro qpel_chroma_load_32b v
166
+.if \v == 0
167
+ // qpel_filter_chroma_0 only uses values in v1
168
+ add x6, x6, x1
169
+ ldr d1, x6
170
+.else
171
+ ld1 {v0.8b}, x6, x1
172
+ ld1 {v1.8b}, x6, x1
173
+ ld1 {v2.8b}, x6, x1
174
+ ld1 {v3.8b}, x6
175
+.endif
176
+.endm
177
+
178
+.macro qpel_chroma_load_64b v
179
+.if \v == 0
180
+ // qpel_filter_chroma_0 only uses values in v1
181
+ add x6, x6, x1
182
+ ldr q1, x6
183
+.else
184
+ ld1 {v0.16b}, x6, x1
185
+ ld1 {v1.16b}, x6, x1
186
+ ld1 {v2.16b}, x6, x1
187
+ ld1 {v3.16b}, x6
188
+.endif
189
+.endm
190
+
191
+// a, b, c, d, e, f, g, h
192
+// .hword 0, 0, 0, 64, 0, 0, 0, 0
193
+.macro qpel_start_0
194
+ movi v24.16b, #64
195
+.endm
196
+
197
+.macro qpel_filter_0_32b
198
+ umull v17.8h, v3.8b, v24.8b // 64*d
199
+.endm
200
+
201
+.macro qpel_filter_0_64b
202
+ qpel_filter_0_32b
203
+ umull2 v18.8h, v3.16b, v24.16b // 64*d
204
+.endm
205
+
206
+.macro qpel_start_0_1
207
+ movi v24.8h, #64
208
+.endm
209
+
210
+.macro qpel_filter_0_32b_1
211
+ smull v17.4s, v3.4h, v24.4h // 64*d0
212
+ smull2 v18.4s, v3.8h, v24.8h // 64*d1
213
+.endm
214
+
215
+// a, b, c, d, e, f, g, h
216
+// .hword -1, 4, -10, 58, 17, -5, 1, 0
217
+.macro qpel_start_1
218
+ movi v24.16b, #58
219
+ movi v25.16b, #10
220
+ movi v26.16b, #17
221
+ movi v27.16b, #5
222
+.endm
223
+
224
+.macro qpel_filter_1_32b
225
+ umull v19.8h, v2.8b, v25.8b // c*10
226
+ umull v17.8h, v3.8b, v24.8b // d*58
227
+ umull v21.8h, v4.8b, v26.8b // e*17
228
+ umull v23.8h, v5.8b, v27.8b // f*5
229
+ sub v17.8h, v17.8h, v19.8h // d*58 - c*10
230
+ ushll v18.8h, v1.8b, #2 // b*4
231
+ add v17.8h, v17.8h, v21.8h // d*58 - c*10 + e*17
232
+ usubl v21.8h, v6.8b, v0.8b // g - a
233
+ add v17.8h, v17.8h, v18.8h // d*58 - c*10 + e*17 + b*4
234
+ sub v21.8h, v21.8h, v23.8h // g - a - f*5
235
+ add v17.8h, v17.8h, v21.8h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
236
+.endm
237
+
238
+.macro qpel_filter_1_64b
239
+ qpel_filter_1_32b
240
+ umull2 v20.8h, v2.16b, v25.16b // c*10
241
+ umull2 v18.8h, v3.16b, v24.16b // d*58
242
+ umull2 v21.8h, v4.16b, v26.16b // e*17
243
+ umull2 v23.8h, v5.16b, v27.16b // f*5
244
+ sub v18.8h, v18.8h, v20.8h // d*58 - c*10
245
+ ushll2 v28.8h, v1.16b, #2 // b*4
246
+ add v18.8h, v18.8h, v21.8h // d*58 - c*10 + e*17
247
+ usubl2 v21.8h, v6.16b, v0.16b // g - a
248
+ add v18.8h, v18.8h, v28.8h // d*58 - c*10 + e*17 + b*4
249
+ sub v21.8h, v21.8h, v23.8h // g - a - f*5
250
+ add v18.8h, v18.8h, v21.8h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
251
+.endm
252
+
253
+.macro qpel_start_1_1
254
+ movi v24.8h, #58
255
+ movi v25.8h, #10
256
+ movi v26.8h, #17
257
+ movi v27.8h, #5
258
+.endm
259
+
260
+.macro qpel_filter_1_32b_1
261
+ smull v17.4s, v3.4h, v24.4h // 58 * d0
262
+ smull2 v18.4s, v3.8h, v24.8h // 58 * d1
263
+ smull v19.4s, v2.4h, v25.4h // 10 * c0
264
+ smull2 v20.4s, v2.8h, v25.8h // 10 * c1
265
+ smull v21.4s, v4.4h, v26.4h // 17 * e0
266
+ smull2 v22.4s, v4.8h, v26.8h // 17 * e1
267
+ smull v23.4s, v5.4h, v27.4h // 5 * f0
268
+ smull2 v16.4s, v5.8h, v27.8h // 5 * f1
269
+ sub v17.4s, v17.4s, v19.4s // 58 * d0 - 10 * c0
270
+ sub v18.4s, v18.4s, v20.4s // 58 * d1 - 10 * c1
271
+ sshll v19.4s, v1.4h, #2 // 4 * b0
272
+ sshll2 v20.4s, v1.8h, #2 // 4 * b1
273
+ add v17.4s, v17.4s, v21.4s // 58 * d0 - 10 * c0 + 17 * e0
274
+ add v18.4s, v18.4s, v22.4s // 58 * d1 - 10 * c1 + 17 * e1
275
+ ssubl v21.4s, v6.4h, v0.4h // g0 - a0
276
+ ssubl2 v22.4s, v6.8h, v0.8h // g1 - a1
277
+ add v17.4s, v17.4s, v19.4s // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
278
+ add v18.4s, v18.4s, v20.4s // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
279
+ sub v21.4s, v21.4s, v23.4s // g0 - a0 - 5 * f0
280
+ sub v22.4s, v22.4s, v16.4s // g1 - a1 - 5 * f1
281
+ add v17.4s, v17.4s, v21.4s // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
282
+ add v18.4s, v18.4s, v22.4s // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
283
+.endm
284
+
285
+// a, b, c, d, e, f, g, h
286
+// .hword -1, 4, -11, 40, 40, -11, 4, -1
287
+.macro qpel_start_2
288
+ movi v24.8h, #11
289
+ movi v25.8h, #40
290
+.endm
291
+
292
+.macro qpel_filter_2_32b
293
+ uaddl v17.8h, v3.8b, v4.8b // d + e
294
+ uaddl v19.8h, v2.8b, v5.8b // c + f
295
+ uaddl v23.8h, v1.8b, v6.8b // b + g
296
+ uaddl v21.8h, v0.8b, v7.8b // a + h
297
+ mul v17.8h, v17.8h, v25.8h // 40 * (d + e)
298
+ mul v19.8h, v19.8h, v24.8h // 11 * (c + f)
299
+ shl v23.8h, v23.8h, #2 // (b + g) * 4
300
+ add v19.8h, v19.8h, v21.8h // 11 * (c + f) + a + h
301
+ add v17.8h, v17.8h, v23.8h // 40 * (d + e) + (b + g) * 4
302
+ sub v17.8h, v17.8h, v19.8h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
303
+.endm
304
+
305
+.macro qpel_filter_2_64b
306
+ qpel_filter_2_32b
307
+ uaddl2 v27.8h, v3.16b, v4.16b // d + e
308
+ uaddl2 v16.8h, v2.16b, v5.16b // c + f
309
+ uaddl2 v23.8h, v1.16b, v6.16b // b + g
310
+ uaddl2 v21.8h, v0.16b, v7.16b // a + h
311
+ mul v27.8h, v27.8h, v25.8h // 40 * (d + e)
312
+ mul v16.8h, v16.8h, v24.8h // 11 * (c + f)
313
+ shl v23.8h, v23.8h, #2 // (b + g) * 4
314
+ add v16.8h, v16.8h, v21.8h // 11 * (c + f) + a + h
315
+ add v27.8h, v27.8h, v23.8h // 40 * (d + e) + (b + g) * 4
316
+ sub v18.8h, v27.8h, v16.8h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
317
+.endm
318
+
319
+.macro qpel_start_2_1
320
+ movi v24.4s, #11
321
+ movi v25.4s, #40
322
+.endm
323
+
324
+.macro qpel_filter_2_32b_1
325
+ saddl v17.4s, v3.4h, v4.4h // d0 + e0
326
+ saddl2 v18.4s, v3.8h, v4.8h // d1 + e1
327
+ saddl v19.4s, v2.4h, v5.4h // c0 + f0
328
+ saddl2 v20.4s, v2.8h, v5.8h // c1 + f1
329
+ mul v19.4s, v19.4s, v24.4s // 11 * (c0 + f0)
330
+ mul v20.4s, v20.4s, v24.4s // 11 * (c1 + f1)
331
+ saddl v23.4s, v1.4h, v6.4h // b0 + g0
332
+ mul v17.4s, v17.4s, v25.4s // 40 * (d0 + e0)
333
+ mul v18.4s, v18.4s, v25.4s // 40 * (d1 + e1)
334
+ saddl2 v16.4s, v1.8h, v6.8h // b1 + g1
335
+ saddl v21.4s, v0.4h, v7.4h // a0 + h0
336
+ saddl2 v22.4s, v0.8h, v7.8h // a1 + h1
337
+ shl v23.4s, v23.4s, #2 // 4*(b0+g0)
338
+ shl v16.4s, v16.4s, #2 // 4*(b1+g1)
339
+ add v19.4s, v19.4s, v21.4s // 11 * (c0 + f0) + a0 + h0
340
+ add v20.4s, v20.4s, v22.4s // 11 * (c1 + f1) + a1 + h1
341
+ add v17.4s, v17.4s, v23.4s // 40 * (d0 + e0) + 4*(b0+g0)
342
+ add v18.4s, v18.4s, v16.4s // 40 * (d1 + e1) + 4*(b1+g1)
343
+ sub v17.4s, v17.4s, v19.4s // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
344
+ sub v18.4s, v18.4s, v20.4s // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
345
+.endm
346
+
347
+// a, b, c, d, e, f, g, h
348
+// .hword 0, 1, -5, 17, 58, -10, 4, -1
349
+.macro qpel_start_3
350
+ movi v24.16b, #17
351
+ movi v25.16b, #5
352
+ movi v26.16b, #58
353
+ movi v27.16b, #10
354
+.endm
355
+
356
+.macro qpel_filter_3_32b
357
+ umull v19.8h, v2.8b, v25.8b // c * 5
358
+ umull v17.8h, v3.8b, v24.8b // d * 17
359
+ umull v21.8h, v4.8b, v26.8b // e * 58
360
+ umull v23.8h, v5.8b, v27.8b // f * 10
361
+ sub v17.8h, v17.8h, v19.8h // d * 17 - c * 5
362
+ ushll v19.8h, v6.8b, #2 // g * 4
363
+ add v17.8h, v17.8h, v21.8h // d * 17 - c * 5 + e * 58
364
+ usubl v21.8h, v1.8b, v7.8b // b - h
365
+ add v17.8h, v17.8h, v19.8h // d * 17 - c * 5 + e * 58 + g * 4
366
+ sub v21.8h, v21.8h, v23.8h // b - h - f * 10
367
+ add v17.8h, v17.8h, v21.8h // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
368
+.endm
369
+
370
+.macro qpel_filter_3_64b
371
+ qpel_filter_3_32b
372
+ umull2 v16.8h, v2.16b, v25.16b // c * 5
373
+ umull2 v18.8h, v3.16b, v24.16b // d * 17
374
+ umull2 v21.8h, v4.16b, v26.16b // e * 58
375
+ umull2 v23.8h, v5.16b, v27.16b // f * 10
376
+ sub v18.8h, v18.8h, v16.8h // d * 17 - c * 5
377
+ ushll2 v16.8h, v6.16b, #2 // g * 4
378
+ add v18.8h, v18.8h, v21.8h // d * 17 - c * 5 + e * 58
379
+ usubl2 v21.8h, v1.16b, v7.16b // b - h
380
+ add v18.8h, v18.8h, v16.8h // d * 17 - c * 5 + e * 58 + g * 4
381
+ sub v21.8h, v21.8h, v23.8h // b - h - f * 10
382
+ add v18.8h, v18.8h, v21.8h // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
383
+.endm
384
+
385
+.macro qpel_start_3_1
386
+ movi v24.8h, #17
387
+ movi v25.8h, #5
388
+ movi v26.8h, #58
389
+ movi v27.8h, #10
390
+.endm
391
+
392
+.macro qpel_filter_3_32b_1
393
+ smull v17.4s, v3.4h, v24.4h // 17 * d0
394
+ smull2 v18.4s, v3.8h, v24.8h // 17 * d1
395
+ smull v19.4s, v2.4h, v25.4h // 5 * c0
396
+ smull2 v20.4s, v2.8h, v25.8h // 5 * c1
397
+ smull v21.4s, v4.4h, v26.4h // 58 * e0
398
+ smull2 v22.4s, v4.8h, v26.8h // 58 * e1
399
+ smull v23.4s, v5.4h, v27.4h // 10 * f0
400
+ smull2 v16.4s, v5.8h, v27.8h // 10 * f1
401
+ sub v17.4s, v17.4s, v19.4s // 17 * d0 - 5 * c0
402
+ sub v18.4s, v18.4s, v20.4s // 17 * d1 - 5 * c1
403
+ sshll v19.4s, v6.4h, #2 // 4 * g0
404
+ sshll2 v20.4s, v6.8h, #2 // 4 * g1
405
+ add v17.4s, v17.4s, v21.4s // 17 * d0 - 5 * c0 + 58 * e0
406
+ add v18.4s, v18.4s, v22.4s // 17 * d1 - 5 * c1 + 58 * e1
407
+ ssubl v21.4s, v1.4h, v7.4h // b0 - h0
408
+ ssubl2 v22.4s, v1.8h, v7.8h // b1 - h1
409
+ add v17.4s, v17.4s, v19.4s // 17 * d0 - 5 * c0 + 58 * e0 + 4 * g0
410
+ add v18.4s, v18.4s, v20.4s // 17 * d1 - 5 * c1 + 58 * e1 + 4 * g1
411
+ sub v21.4s, v21.4s, v23.4s // b0 - h0 - 10 * f0
412
+ sub v22.4s, v22.4s, v16.4s // b1 - h1 - 10 * f1
413
+ add v17.4s, v17.4s, v21.4s // 17 * d0 - 5 * c0 + 58 * e0 + 4 * g0 + b0 - h0 - 10 * f0
414
+ add v18.4s, v18.4s, v22.4s // 17 * d1 - 5 * c1 + 58 * e1 + 4 * g1 + b1 - h1 - 10 * f1
415
+.endm
416
+
417
+.macro qpel_start_chroma_0
418
+ movi v24.16b, #64
419
+.endm
420
+
421
+.macro qpel_filter_chroma_0_32b
422
+ umull v17.8h, v1.8b, v24.8b // 64*b
423
+.endm
424
+
425
+.macro qpel_filter_chroma_0_64b
426
+ umull v17.8h, v1.8b, v24.8b // 64*b
427
+ umull2 v18.8h, v1.16b, v24.16b // 64*b
428
+.endm
429
+
430
+.macro qpel_start_chroma_0_1
431
+ movi v24.8h, #64
432
+.endm
433
+
434
+.macro qpel_filter_chroma_0_32b_1
435
+ smull v17.4s, v1.4h, v24.4h // 64*b0
436
+ smull2 v18.4s, v1.8h, v24.8h // 64*b1
437
+.endm
438
+
439
+.macro qpel_start_chroma_1
440
+ movi v24.16b, #58
441
+ movi v25.16b, #10
442
+.endm
443
+
444
+.macro qpel_filter_chroma_1_32b
445
+ umull v17.8h, v1.8b, v24.8b // 58 * b
446
+ umull v19.8h, v2.8b, v25.8b // 10 * c
447
+ uaddl v22.8h, v0.8b, v3.8b // a + d
448
+ shl v22.8h, v22.8h, #1 // 2 * (a+d)
449
+ sub v17.8h, v17.8h, v22.8h // 58*b - 2*(a+d)
450
+ add v17.8h, v17.8h, v19.8h // 58*b-2*(a+d) + 10*c
451
+.endm
452
+
453
+.macro qpel_filter_chroma_1_64b
454
+ umull v17.8h, v1.8b, v24.8b // 58 * b
455
+ umull2 v18.8h, v1.16b, v24.16b // 58 * b
456
+ umull v19.8h, v2.8b, v25.8b // 10 * c
457
+ umull2 v20.8h, v2.16b, v25.16b // 10 * c
458
+ uaddl v22.8h, v0.8b, v3.8b // a + d
459
+ uaddl2 v23.8h, v0.16b, v3.16b // a + d
460
+ shl v22.8h, v22.8h, #1 // 2 * (a+d)
461
+ shl v23.8h, v23.8h, #1 // 2 * (a+d)
462
+ sub v17.8h, v17.8h, v22.8h // 58*b - 2*(a+d)
463
+ sub v18.8h, v18.8h, v23.8h // 58*b - 2*(a+d)
464
+ add v17.8h, v17.8h, v19.8h // 58*b-2*(a+d) + 10*c
465
+ add v18.8h, v18.8h, v20.8h // 58*b-2*(a+d) + 10*c
466
+.endm
467
+
468
+.macro qpel_start_chroma_1_1
469
+ movi v24.8h, #58
470
+ movi v25.8h, #10
471
+.endm
472
+
473
+.macro qpel_filter_chroma_1_32b_1
474
+ smull v17.4s, v1.4h, v24.4h // 58 * b0
475
+ smull2 v18.4s, v1.8h, v24.8h // 58 * b1
476
+ smull v19.4s, v2.4h, v25.4h // 10 * c0
477
+ smull2 v20.4s, v2.8h, v25.8h // 10 * c1
478
+ add v22.8h, v0.8h, v3.8h // a + d
479
+ sshll v21.4s, v22.4h, #1 // 2 * (a0+d0)
480
+ sshll2 v22.4s, v22.8h, #1 // 2 * (a1+d1)
481
+ sub v17.4s, v17.4s, v21.4s // 58*b0 - 2*(a0+d0)
482
+ sub v18.4s, v18.4s, v22.4s // 58*b1 - 2*(a1+d1)
483
+ add v17.4s, v17.4s, v19.4s // 58*b0-2*(a0+d0) + 10*c0
484
+ add v18.4s, v18.4s, v20.4s // 58*b1-2*(a1+d1) + 10*c1
485
+.endm
486
+
487
+.macro qpel_start_chroma_2
488
+ movi v25.16b, #54
489
+.endm
490
+
491
+.macro qpel_filter_chroma_2_32b
492
+ umull v17.8h, v1.8b, v25.8b // 54 * b
493
+ ushll v19.8h, v0.8b, #2 // 4 * a
494
+ ushll v21.8h, v2.8b, #4 // 16 * c
495
+ ushll v23.8h, v3.8b, #1 // 2 * d
496
+ add v17.8h, v17.8h, v21.8h // 54*b + 16*c
497
+ add v19.8h, v19.8h, v23.8h // 4*a + 2*d
498
+ sub v17.8h, v17.8h, v19.8h // 54*b+16*c - (4*a+2*d)
499
+.endm
500
+
501
+.macro qpel_filter_chroma_2_64b
502
+ umull v17.8h, v1.8b, v25.8b // 54 * b
503
+ umull2 v18.8h, v1.16b, v25.16b // 54 * b
504
+ ushll v19.8h, v0.8b, #2 // 4 * a
505
+ ushll2 v20.8h, v0.16b, #2 // 4 * a
506
+ ushll v21.8h, v2.8b, #4 // 16 * c
507
+ ushll2 v22.8h, v2.16b, #4 // 16 * c
508
+ ushll v23.8h, v3.8b, #1 // 2 * d
509
+ ushll2 v24.8h, v3.16b, #1 // 2 * d
510
+ add v17.8h, v17.8h, v21.8h // 54*b + 16*c
511
+ add v18.8h, v18.8h, v22.8h // 54*b + 16*c
512
+ add v19.8h, v19.8h, v23.8h // 4*a + 2*d
513
+ add v20.8h, v20.8h, v24.8h // 4*a + 2*d
514
+ sub v17.8h, v17.8h, v19.8h // 54*b+16*c - (4*a+2*d)
515
+ sub v18.8h, v18.8h, v20.8h // 54*b+16*c - (4*a+2*d)
516
+.endm
517
+
518
+.macro qpel_start_chroma_2_1
519
+ movi v25.8h, #54
520
+.endm
521
+
522
+.macro qpel_filter_chroma_2_32b_1
523
+ smull v17.4s, v1.4h, v25.4h // 54 * b0
524
+ smull2 v18.4s, v1.8h, v25.8h // 54 * b1
525
+ sshll v19.4s, v0.4h, #2 // 4 * a0
526
+ sshll2 v20.4s, v0.8h, #2 // 4 * a1
527
+ sshll v21.4s, v2.4h, #4 // 16 * c0
528
+ sshll2 v22.4s, v2.8h, #4 // 16 * c1
529
+ sshll v23.4s, v3.4h, #1 // 2 * d0
530
+ sshll2 v24.4s, v3.8h, #1 // 2 * d1
531
+ add v17.4s, v17.4s, v21.4s // 54*b0 + 16*c0
532
+ add v18.4s, v18.4s, v22.4s // 54*b1 + 16*c1
533
+ add v19.4s, v19.4s, v23.4s // 4*a0 + 2*d0
534
+ add v20.4s, v20.4s, v24.4s // 4*a1 + 2*d1
535
+ sub v17.4s, v17.4s, v19.4s // 54*b0+16*c0 - (4*a0+2*d0)
536
+ sub v18.4s, v18.4s, v20.4s // 54*b1+16*c1 - (4*a1+2*d1)
537
+.endm
538
+
539
+.macro qpel_start_chroma_3
540
+ movi v25.16b, #46
541
+ movi v26.16b, #28
542
+ movi v27.16b, #6
543
+.endm
544
+
545
+.macro qpel_filter_chroma_3_32b
546
+ umull v17.8h, v1.8b, v25.8b // 46 * b
547
+ umull v19.8h, v2.8b, v26.8b // 28 * c
548
+ ushll v21.8h, v3.8b, #2 // 4 * d
549
+ umull v23.8h, v0.8b, v27.8b // 6 * a
550
+ add v17.8h, v17.8h, v19.8h // 46*b + 28*c
551
+ add v21.8h, v21.8h, v23.8h // 4*d + 6*a
552
+ sub v17.8h, v17.8h, v21.8h // 46*b+28*c - (4*d+6*a)
553
+.endm
554
+
555
+.macro qpel_filter_chroma_3_64b
556
+ umull v17.8h, v1.8b, v25.8b // 46 * b
557
+ umull2 v18.8h, v1.16b, v25.16b // 46 * b
558
+ umull v19.8h, v2.8b, v26.8b // 28 * c
559
+ umull2 v20.8h, v2.16b, v26.16b // 28 * c
560
+ ushll v21.8h, v3.8b, #2 // 4 * d
561
+ ushll2 v22.8h, v3.16b, #2 // 4 * d
562
+ umull v23.8h, v0.8b, v27.8b // 6 * a
563
+ umull2 v24.8h, v0.16b, v27.16b // 6 * a
564
+ add v17.8h, v17.8h, v19.8h // 46*b + 28*c
565
+ add v18.8h, v18.8h, v20.8h // 46*b + 28*c
566
+ add v21.8h, v21.8h, v23.8h // 4*d + 6*a
567
+ add v22.8h, v22.8h, v24.8h // 4*d + 6*a
568
+ sub v17.8h, v17.8h, v21.8h // 46*b+28*c - (4*d+6*a)
569
+ sub v18.8h, v18.8h, v22.8h // 46*b+28*c - (4*d+6*a)
570
+.endm
571
+
572
+.macro qpel_start_chroma_3_1
573
+ movi v25.8h, #46
574
+ movi v26.8h, #28
575
+ movi v27.8h, #6
576
+.endm
577
+
578
+.macro qpel_filter_chroma_3_32b_1
579
+ smull v17.4s, v1.4h, v25.4h // 46 * b0
580
+ smull2 v18.4s, v1.8h, v25.8h // 46 * b1
581
+ smull v19.4s, v2.4h, v26.4h // 28 * c0
582
+ smull2 v20.4s, v2.8h, v26.8h // 28 * c1
583
+ sshll v21.4s, v3.4h, #2 // 4 * d0
584
+ sshll2 v22.4s, v3.8h, #2 // 4 * d1
585
+ smull v23.4s, v0.4h, v27.4h // 6 * a0
586
+ smull2 v24.4s, v0.8h, v27.8h // 6 * a1
587
+ add v17.4s, v17.4s, v19.4s // 46*b0 + 28*c0
588
+ add v18.4s, v18.4s, v20.4s // 46*b1 + 28*c1
589
+ add v21.4s, v21.4s, v23.4s // 4*d0 + 6*a0
590
+ add v22.4s, v22.4s, v24.4s // 4*d1 + 6*a1
591
+ sub v17.4s, v17.4s, v21.4s // 46*b0+28*c0 - (4*d0+6*a0)
592
+ sub v18.4s, v18.4s, v22.4s // 46*b1+28*c1 - (4*d1+6*a1)
593
+.endm
594
+
595
+.macro qpel_start_chroma_4
596
+ movi v24.8h, #36
597
+.endm
598
+
599
+.macro qpel_filter_chroma_4_32b
600
+ uaddl v20.8h, v0.8b, v3.8b // a + d
601
+ uaddl v17.8h, v1.8b, v2.8b // b + c
602
+ shl v20.8h, v20.8h, #2 // 4 * (a+d)
603
+ mul v17.8h, v17.8h, v24.8h // 36 * (b+c)
604
+ sub v17.8h, v17.8h, v20.8h // 36*(b+c) - 4*(a+d)
605
+.endm
606
+
607
+.macro qpel_filter_chroma_4_64b
608
+ uaddl v20.8h, v0.8b, v3.8b // a + d
609
+ uaddl2 v21.8h, v0.16b, v3.16b // a + d
610
+ uaddl v17.8h, v1.8b, v2.8b // b + c
611
+ uaddl2 v18.8h, v1.16b, v2.16b // b + c
612
+ shl v20.8h, v20.8h, #2 // 4 * (a+d)
613
+ shl v21.8h, v21.8h, #2 // 4 * (a+d)
614
+ mul v17.8h, v17.8h, v24.8h // 36 * (b+c)
615
+ mul v18.8h, v18.8h, v24.8h // 36 * (b+c)
616
+ sub v17.8h, v17.8h, v20.8h // 36*(b+c) - 4*(a+d)
617
+ sub v18.8h, v18.8h, v21.8h // 36*(b+c) - 4*(a+d)
618
+.endm
619
+
620
+.macro qpel_start_chroma_4_1
621
+ movi v24.8h, #36
622
+.endm
623
+
624
+.macro qpel_filter_chroma_4_32b_1
625
+ add v20.8h, v0.8h, v3.8h // a + d
626
+ add v21.8h, v1.8h, v2.8h // b + c
627
+ smull v17.4s, v21.4h, v24.4h // 36 * (b0+c0)
628
+ smull2 v18.4s, v21.8h, v24.8h // 36 * (b1+c1)
629
+ sshll v21.4s, v20.4h, #2 // 4 * (a0+d0)
630
+ sshll2 v22.4s, v20.8h, #2 // 4 * (a1+d1)
631
+ sub v17.4s, v17.4s, v21.4s // 36*(b0+c0) - 4*(a0+d0)
632
+ sub v18.4s, v18.4s, v22.4s // 36*(b1+c1) - 4*(a1+d1)
633
+.endm
634
+
635
+.macro qpel_start_chroma_5
636
+ movi v25.16b, #28
637
+ movi v26.16b, #46
638
+ movi v27.16b, #6
639
+.endm
640
+
641
+.macro qpel_filter_chroma_5_32b
642
+ umull v17.8h, v1.8b, v25.8b // 28 * b
643
+ umull v19.8h, v2.8b, v26.8b // 46 * c
644
+ ushll v21.8h, v0.8b, #2 // 4 * a
645
+ umull v23.8h, v3.8b, v27.8b // 6 * d
646
+ add v17.8h, v17.8h, v19.8h // 28*b + 46*c
647
+ add v21.8h, v21.8h, v23.8h // 4*a + 6*d
648
+ sub v17.8h, v17.8h, v21.8h // 28*b+46*c - (4*a+6*d)
649
+.endm
650
+
651
+.macro qpel_filter_chroma_5_64b
652
+ umull v17.8h, v1.8b, v25.8b // 28 * b
653
+ umull2 v18.8h, v1.16b, v25.16b // 28 * b
654
+ umull v19.8h, v2.8b, v26.8b // 46 * c
655
+ umull2 v20.8h, v2.16b, v26.16b // 46 * c
656
+ ushll v21.8h, v0.8b, #2 // 4 * a
657
+ ushll2 v22.8h, v0.16b, #2 // 4 * a
658
+ umull v23.8h, v3.8b, v27.8b // 6 * d
659
+ umull2 v24.8h, v3.16b, v27.16b // 6 * d
660
+ add v17.8h, v17.8h, v19.8h // 28*b + 46*c
661
+ add v18.8h, v18.8h, v20.8h // 28*b + 46*c
662
+ add v21.8h, v21.8h, v23.8h // 4*a + 6*d
663
+ add v22.8h, v22.8h, v24.8h // 4*a + 6*d
664
+ sub v17.8h, v17.8h, v21.8h // 28*b+46*c - (4*a+6*d)
665
+ sub v18.8h, v18.8h, v22.8h // 28*b+46*c - (4*a+6*d)
666
+.endm
667
+
668
+.macro qpel_start_chroma_5_1
669
+ movi v25.8h, #28
670
+ movi v26.8h, #46
671
+ movi v27.8h, #6
672
+.endm
673
+
674
+.macro qpel_filter_chroma_5_32b_1
675
+ smull v17.4s, v1.4h, v25.4h // 28 * b0
676
+ smull2 v18.4s, v1.8h, v25.8h // 28 * b1
677
+ smull v19.4s, v2.4h, v26.4h // 46 * c0
678
+ smull2 v20.4s, v2.8h, v26.8h // 46 * c1
679
+ sshll v21.4s, v0.4h, #2 // 4 * a0
680
+ sshll2 v22.4s, v0.8h, #2 // 4 * a1
681
+ smull v23.4s, v3.4h, v27.4h // 6 * d0
682
+ smull2 v24.4s, v3.8h, v27.8h // 6 * d1
683
+ add v17.4s, v17.4s, v19.4s // 28*b0 + 46*c0
684
+ add v18.4s, v18.4s, v20.4s // 28*b1 + 46*c1
685
+ add v21.4s, v21.4s, v23.4s // 4*a0 + 6*d0
686
+ add v22.4s, v22.4s, v24.4s // 4*a1 + 6*d1
687
+ sub v17.4s, v17.4s, v21.4s // 28*b0+46*c0 - (4*a0+6*d0)
688
+ sub v18.4s, v18.4s, v22.4s // 28*b1+46*c1 - (4*a1+6*d1)
689
+.endm
690
+
691
+.macro qpel_start_chroma_6
692
+ movi v25.16b, #54
693
+.endm
694
+
695
+.macro qpel_filter_chroma_6_32b
696
+ umull v17.8h, v2.8b, v25.8b // 54 * c
697
+ ushll v19.8h, v0.8b, #1 // 2 * a
698
+ ushll v21.8h, v1.8b, #4 // 16 * b
699
+ ushll v23.8h, v3.8b, #2 // 4 * d
700
+ add v17.8h, v17.8h, v21.8h // 54*c + 16*b
701
+ add v19.8h, v19.8h, v23.8h // 2*a + 4*d
702
+ sub v17.8h, v17.8h, v19.8h // 54*c+16*b - (2*a+4*d)
703
+.endm
704
+
705
+.macro qpel_filter_chroma_6_64b
706
+ umull v17.8h, v2.8b, v25.8b // 54 * c
707
+ umull2 v18.8h, v2.16b, v25.16b // 54 * c
708
+ ushll v19.8h, v0.8b, #1 // 2 * a
709
+ ushll2 v20.8h, v0.16b, #1 // 2 * a
710
+ ushll v21.8h, v1.8b, #4 // 16 * b
711
+ ushll2 v22.8h, v1.16b, #4 // 16 * b
712
+ ushll v23.8h, v3.8b, #2 // 4 * d
713
+ ushll2 v24.8h, v3.16b, #2 // 4 * d
714
+ add v17.8h, v17.8h, v21.8h // 54*c + 16*b
715
+ add v18.8h, v18.8h, v22.8h // 54*c + 16*b
716
+ add v19.8h, v19.8h, v23.8h // 2*a + 4*d
717
+ add v20.8h, v20.8h, v24.8h // 2*a + 4*d
718
+ sub v17.8h, v17.8h, v19.8h // 54*c+16*b - (2*a+4*d)
719
+ sub v18.8h, v18.8h, v20.8h // 54*c+16*b - (2*a+4*d)
720
+.endm
721
+
722
+.macro qpel_start_chroma_6_1
723
+ movi v25.8h, #54
724
+.endm
725
+
726
+.macro qpel_filter_chroma_6_32b_1
727
+ smull v17.4s, v2.4h, v25.4h // 54 * c0
728
+ smull2 v18.4s, v2.8h, v25.8h // 54 * c1
729
+ sshll v19.4s, v0.4h, #1 // 2 * a0
730
+ sshll2 v20.4s, v0.8h, #1 // 2 * a1
731
+ sshll v21.4s, v1.4h, #4 // 16 * b0
732
+ sshll2 v22.4s, v1.8h, #4 // 16 * b1
733
+ sshll v23.4s, v3.4h, #2 // 4 * d0
734
+ sshll2 v24.4s, v3.8h, #2 // 4 * d1
735
+ add v17.4s, v17.4s, v21.4s // 54*c0 + 16*b0
736
+ add v18.4s, v18.4s, v22.4s // 54*c1 + 16*b1
737
+ add v19.4s, v19.4s, v23.4s // 2*a0 + 4*d0
738
+ add v20.4s, v20.4s, v24.4s // 2*a1 + 4*d1
739
+ sub v17.4s, v17.4s, v19.4s // 54*c0+16*b0 - (2*a0+4*d0)
740
+ sub v18.4s, v18.4s, v20.4s // 54*c1+16*b1 - (2*a1+4*d1)
741
+.endm
742
+
743
+.macro qpel_start_chroma_7
744
+ movi v24.16b, #58
745
+ movi v25.16b, #10
746
+.endm
747
+
748
+.macro qpel_filter_chroma_7_32b
749
+ uaddl v20.8h, v0.8b, v3.8b // a + d
750
+ umull v17.8h, v2.8b, v24.8b // 58 * c
751
+ shl v20.8h, v20.8h, #1 // 2 * (a+d)
752
+ umull v19.8h, v1.8b, v25.8b // 10 * b
753
+ sub v17.8h, v17.8h, v20.8h // 58*c - 2*(a+d)
754
+ add v17.8h, v17.8h, v19.8h // 58*c-2*(a+d) + 10*b
755
+.endm
756
+
757
+.macro qpel_filter_chroma_7_64b
758
+ uaddl v20.8h, v0.8b, v3.8b // a + d
759
+ uaddl2 v21.8h, v0.16b, v3.16b // a + d
760
+ umull v17.8h, v2.8b, v24.8b // 58 * c
761
+ umull2 v18.8h, v2.16b, v24.16b // 58 * c
762
+ shl v20.8h, v20.8h, #1 // 2 * (a+d)
763
+ shl v21.8h, v21.8h, #1 // 2 * (a+d)
764
+ umull v22.8h, v1.8b, v25.8b // 10 * b
765
+ umull2 v23.8h, v1.16b, v25.16b // 10 * b
766
+ sub v17.8h, v17.8h, v20.8h // 58*c - 2*(a+d)
767
+ sub v18.8h, v18.8h, v21.8h // 58*c - 2*(a+d)
768
+ add v17.8h, v17.8h, v22.8h // 58*c-2*(a+d) + 10*b
769
+ add v18.8h, v18.8h, v23.8h // 58*c-2*(a+d) + 10*b
770
+.endm
771
+
772
+.macro qpel_start_chroma_7_1
773
+ movi v24.8h, #58
774
+ movi v25.8h, #10
775
+.endm
776
+
777
+.macro qpel_filter_chroma_7_32b_1
778
+ add v20.8h, v0.8h, v3.8h // a + d
779
+ smull v17.4s, v2.4h, v24.4h // 58 * c0
780
+ smull2 v18.4s, v2.8h, v24.8h // 58 * c1
781
+ sshll v21.4s, v20.4h, #1 // 2 * (a0+d0)
782
+ sshll2 v22.4s, v20.8h, #1 // 2 * (a1+d1)
783
+ smull v19.4s, v1.4h, v25.4h // 10 * b0
784
+ smull2 v20.4s, v1.8h, v25.8h // 10 * b1
785
+ sub v17.4s, v17.4s, v21.4s // 58*c0 - 2*(a0+d0)
786
+ sub v18.4s, v18.4s, v22.4s // 58*c1 - 2*(a1+d1)
787
+ add v17.4s, v17.4s, v19.4s // 58*c0-2*(a0+d0) + 10*b0
788
+ add v18.4s, v18.4s, v20.4s // 58*c1-2*(a1+d1) + 10*b1
789
+.endm
790
+
791
+.macro vpp_end
792
+ add v17.8h, v17.8h, v31.8h
793
+ sqshrun v17.8b, v17.8h, #6
794
+.endm
795
+
796
+.macro FILTER_LUMA_VPP w, h, v
797
+ lsl x10, x1, #2 // x10 = 4 * x1
798
+ sub x11, x10, x1 // x11 = 3 * x1
799
+ sub x0, x0, x11 // src -= (8 / 2 - 1) * srcStride
800
+ mov x5, #\h
801
+ mov w12, #32
802
+ dup v31.8h, w12
803
+ qpel_start_\v
804
+.loop_luma_vpp_\v\()_\w\()x\h:
805
+ mov x7, x2
806
+ mov x9, #0
807
+.loop_luma_vpp_w8_\v\()_\w\()x\h:
808
+ add x6, x0, x9
809
+.if \w == 8 || \w == 24
810
+ qpel_load_32b \v
811
+ qpel_filter_\v\()_32b
812
+ vpp_end
813
+ str d17, x7, #8
814
+ add x9, x9, #8
815
+.elseif \w == 12
816
+ qpel_load_32b \v
817
+ qpel_filter_\v\()_32b
818
+ vpp_end
819
+ str d17, x7, #8
820
+ add x6, x0, #8
821
+ qpel_load_32b \v
822
+ qpel_filter_\v\()_32b
823
+ vpp_end
824
+ fmov w6, s17
825
+ str w6, x7, #4
826
+ add x9, x9, #12
827
+.else
828
+ qpel_load_64b \v
829
+ qpel_filter_\v\()_64b
830
+ vpp_end
831
+ add v18.8h, v18.8h, v31.8h
832
+ sqshrun2 v17.16b, v18.8h, #6
833
+ str q17, x7, #16
834
+ add x9, x9, #16
835
+.endif
836
+ cmp x9, #\w
837
+ blt .loop_luma_vpp_w8_\v\()_\w\()x\h
838
+ add x0, x0, x1
839
+ add x2, x2, x3
840
+ sub x5, x5, #1
841
+ cbnz x5, .loop_luma_vpp_\v\()_\w\()x\h
842
+ ret
843
+.endm
844
+
845
+.macro vps_end
846
+ sub v17.8h, v17.8h, v31.8h
847
+.endm
848
+
849
+.macro FILTER_VPS w, h, v
850
+ lsl x3, x3, #1
851
+ lsl x10, x1, #2 // x10 = 4 * x1
852
+ sub x11, x10, x1 // x11 = 3 * x1
853
+ sub x0, x0, x11 // src -= (8 / 2 - 1) * srcStride
854
+ mov x5, #\h
855
+ mov w12, #8192
856
+ dup v31.8h, w12
857
+ qpel_start_\v
858
+.loop_ps_\v\()_\w\()x\h:
859
+ mov x7, x2
860
+ mov x9, #0
861
+.loop_ps_w8_\v\()_\w\()x\h:
862
+ add x6, x0, x9
863
+.if \w == 8 || \w == 24
864
+ qpel_load_32b \v
865
+ qpel_filter_\v\()_32b
866
+ vps_end
867
+ str q17, x7, #16
868
+ add x9, x9, #8
869
+.elseif \w == 12
870
+ qpel_load_32b \v
871
+ qpel_filter_\v\()_32b
872
+ vps_end
873
+ str q17, x7, #16
874
+ add x6, x0, #8
875
+ qpel_load_32b \v
876
+ qpel_filter_\v\()_32b
877
+ vps_end
878
+ str d17, x7, #8
879
+ add x9, x9, #12
880
+.else
881
+ qpel_load_64b \v
882
+ qpel_filter_\v\()_64b
883
+ vps_end
884
+ sub v18.8h, v18.8h, v31.8h
885
+ stp q17, q18, x7, #32
886
+ add x9, x9, #16
887
+.endif
888
+ cmp x9, #\w
889
+ blt .loop_ps_w8_\v\()_\w\()x\h
890
+ add x0, x0, x1
891
+ add x2, x2, x3
892
+ sub x5, x5, #1
893
+ cbnz x5, .loop_ps_\v\()_\w\()x\h
894
+ ret
895
+.endm
896
+
897
+.macro vsp_end
898
+ add v17.4s, v17.4s, v31.4s
899
+ add v18.4s, v18.4s, v31.4s
900
+ sqshrun v17.4h, v17.4s, #12
901
+ sqshrun2 v17.8h, v18.4s, #12
902
+ sqxtun v17.8b, v17.8h
903
+.endm
904
+
905
+.macro FILTER_VSP w, h, v
906
+ lsl x1, x1, #1
907
+ lsl x10, x1, #2 // x10 = 4 * x1
908
+ sub x11, x10, x1 // x11 = 3 * x1
909
+ sub x0, x0, x11
910
+ mov x5, #\h
911
+ mov w12, #1
912
+ lsl w12, w12, #19
913
+ add w12, w12, #2048
914
+ dup v31.4s, w12
915
+ mov x12, #\w
916
+ lsl x12, x12, #1
917
+ qpel_start_\v\()_1
918
+.loop_luma_vsp_\v\()_\w\()x\h:
919
+ mov x7, x2
920
+ mov x9, #0
921
+.loop_luma_vsp_w8_\v\()_\w\()x\h:
922
+ add x6, x0, x9
923
+ qpel_load_64b \v
924
+ qpel_filter_\v\()_32b_1
925
+ vsp_end
926
+ str d17, x7, #8
927
+ add x9, x9, #16
928
+.if \w == 12
929
+ add x6, x0, #16
930
+ qpel_load_64b \v
931
+ qpel_filter_\v\()_32b_1
932
+ vsp_end
933
+ str s17, x7, #4
934
+ add x9, x9, #8
935
+.endif
936
+ cmp x9, x12
937
+ blt .loop_luma_vsp_w8_\v\()_\w\()x\h
938
+ add x0, x0, x1
939
+ add x2, x2, x3
940
+ sub x5, x5, #1
941
+ cbnz x5, .loop_luma_vsp_\v\()_\w\()x\h
942
+ ret
943
+.endm
944
+
945
+.macro vss_end
946
+ sshr v17.4s, v17.4s, #6
947
+ sshr v18.4s, v18.4s, #6
948
+ uzp1 v17.8h, v17.8h, v18.8h
949
+.endm
950
+
951
+.macro FILTER_VSS w, h, v
952
+ lsl x1, x1, #1
953
+ lsl x10, x1, #2 // x10 = 4 * x1
954
+ sub x11, x10, x1 // x11 = 3 * x1
955
+ sub x0, x0, x11
956
+ lsl x3, x3, #1
957
+ mov x5, #\h
958
+ mov x12, #\w
959
+ lsl x12, x12, #1
960
+ qpel_start_\v\()_1
961
+.loop_luma_vss_\v\()_\w\()x\h:
962
+ mov x7, x2
963
+ mov x9, #0
964
+.loop_luma_vss_w8_\v\()_\w\()x\h:
965
+ add x6, x0, x9
966
+ qpel_load_64b \v
967
+ qpel_filter_\v\()_32b_1
968
+ vss_end
969
+.if \w == 4
970
+ str s17, x7, #4
971
+ add x9, x9, #4
972
+.else
973
+ str q17, x7, #16
974
+ add x9, x9, #16
975
+.if \w == 12
976
+ add x6, x0, x9
977
+ qpel_load_64b \v
978
+ qpel_filter_\v\()_32b_1
979
+ vss_end
980
+ str d17, x7, #8
981
+ add x9, x9, #8
982
+.endif
983
+.endif
984
+ cmp x9, x12
985
+ blt .loop_luma_vss_w8_\v\()_\w\()x\h
986
+ add x0, x0, x1
987
+ add x2, x2, x3
988
+ sub x5, x5, #1
989
+ cbnz x5, .loop_luma_vss_\v\()_\w\()x\h
990
+ ret
991
+.endm
992
+
993
+.macro hpp_end
994
+ add v17.8h, v17.8h, v31.8h
995
+ sqshrun v17.8b, v17.8h, #6
996
+.endm
997
+
998
+.macro FILTER_HPP w, h, v
999
+ mov w6, #\h
1000
+ sub x3, x3, #\w
1001
+ mov w12, #32
1002
+ dup v31.8h, w12
1003
+ qpel_start_\v
1004
+.if \w == 4
1005
+.rept \h
1006
+ mov x11, x0
1007
+ sub x11, x11, #4
1008
+ vextin8 \v
1009
+ qpel_filter_\v\()_32b
1010
+ hpp_end
1011
+ str s17, x2, #4
1012
+ add x0, x0, x1
1013
+ add x2, x2, x3
1014
+.endr
1015
+ ret
1016
+.else
1017
+.loop1_hpp_\v\()_\w\()x\h:
1018
+ mov x7, #\w
1019
+ mov x11, x0
1020
+ sub x11, x11, #4
1021
+.loop2_hpp_\v\()_\w\()x\h:
1022
+ vextin8 \v
1023
+ qpel_filter_\v\()_32b
1024
+ hpp_end
1025
+ str d17, x2, #8
1026
+ sub x11, x11, #8
1027
+ sub x7, x7, #8
1028
+.if \w == 12
1029
+ vextin8 \v
1030
+ qpel_filter_\v\()_32b
1031
+ hpp_end
1032
+ str s17, x2, #4
1033
+ sub x7, x7, #4
1034
+.endif
1035
+ cbnz x7, .loop2_hpp_\v\()_\w\()x\h
1036
+ sub x6, x6, #1
1037
+ add x0, x0, x1
1038
+ add x2, x2, x3
1039
+ cbnz x6, .loop1_hpp_\v\()_\w\()x\h
1040
+ ret
1041
+.endif
1042
+.endm
1043
+
1044
+.macro hps_end
1045
+ sub v17.8h, v17.8h, v31.8h
1046
+.endm
1047
+
1048
+.macro FILTER_HPS w, h, v
1049
+ sub x3, x3, #\w
1050
+ lsl x3, x3, #1
1051
+ mov w12, #8192
1052
+ dup v31.8h, w12
1053
+ qpel_start_\v
1054
+.if \w == 4
1055
+.loop_hps_\v\()_\w\()x\h\():
1056
+ mov x11, x0
1057
+ sub x11, x11, #4
1058
+ vextin8 \v
1059
+ qpel_filter_\v\()_32b
1060
+ hps_end
1061
+ str d17, x2, #8
1062
+ sub w6, w6, #1
1063
+ add x0, x0, x1
1064
+ add x2, x2, x3
1065
+ cbnz w6, .loop_hps_\v\()_\w\()x\h
1066
+ ret
1067
+.else
1068
+.loop1_hps_\v\()_\w\()x\h\():
1069
+ mov w7, #\w
1070
+ mov x11, x0
1071
+ sub x11, x11, #4
1072
+.loop2_hps_\v\()_\w\()x\h\():
1073
+.if \w == 8 || \w == 12 || \w == 24
1074
+ vextin8 \v
1075
+ qpel_filter_\v\()_32b
1076
+ hps_end
1077
+ str q17, x2, #16
1078
+ sub w7, w7, #8
1079
+ sub x11, x11, #8
1080
+.if \w == 12
1081
+ vextin8 \v
1082
+ qpel_filter_\v\()_32b
1083
+ hps_end
1084
+ str d17, x2, #8
1085
+ sub w7, w7, #4
1086
+.endif
1087
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
1088
+ vextin8_64 \v
1089
+ qpel_filter_\v\()_64b
1090
+ hps_end
1091
+ sub v18.8h, v18.8h, v31.8h
1092
+ stp q17, q18, x2, #32
1093
+ sub w7, w7, #16
1094
+ sub x11, x11, #16
1095
+.endif
1096
+ cbnz w7, .loop2_hps_\v\()_\w\()x\h
1097
+ sub w6, w6, #1
1098
+ add x0, x0, x1
1099
+ add x2, x2, x3
1100
+ cbnz w6, .loop1_hps_\v\()_\w\()x\h
1101
+ ret
1102
+.endif
1103
+.endm
1104
+
1105
+.macro FILTER_CHROMA_VPP w, h, v
1106
+ qpel_start_chroma_\v
1107
+ mov w12, #32
1108
+ dup v31.8h, w12
1109
+ sub x0, x0, x1
1110
+ mov x5, #\h
1111
+.loop_chroma_vpp_\v\()_\w\()x\h:
1112
+ mov x7, x2
1113
+ mov x9, #0
1114
+.loop_chroma_vpp_w8_\v\()_\w\()x\h:
1115
+ add x6, x0, x9
1116
+ qpel_chroma_load_32b \v
1117
+ qpel_filter_chroma_\v\()_32b
1118
+ vpp_end
1119
+ add x9, x9, #8
1120
+.if \w == 2
1121
+ fmov w12, s17
1122
+ strh w12, x7, #2
1123
+.elseif \w == 4
1124
+ str s17, x7, #4
1125
+.elseif \w == 6
1126
+ str s17, x7, #4
1127
+ umov w12, v17.h2
1128
+ strh w12, x7, #2
1129
+.elseif \w == 12
1130
+ str d17, x7, #8
1131
+ add x6, x0, x9
1132
+ qpel_chroma_load_32b \v
1133
+ qpel_filter_chroma_\v\()_32b
1134
+ vpp_end
1135
+ str s17, x7, #4
1136
+ add x9, x9, #8
1137
+.else
1138
+ str d17, x7, #8
1139
+.endif
1140
+ cmp x9, #\w
1141
+ blt .loop_chroma_vpp_w8_\v\()_\w\()x\h
1142
+ add x0, x0, x1
1143
+ add x2, x2, x3
1144
+ sub x5, x5, #1
1145
+ cbnz x5, .loop_chroma_vpp_\v\()_\w\()x\h
1146
+ ret
1147
+.endm
1148
+
1149
+.macro FILTER_CHROMA_VPS w, h, v
1150
+ qpel_start_chroma_\v
1151
+ mov w12, #8192
1152
+ dup v31.8h, w12
1153
+ lsl x3, x3, #1
1154
+ sub x0, x0, x1
1155
+ mov x5, #\h
1156
+.loop_vps_\v\()_\w\()x\h:
1157
+ mov x7, x2
1158
+ mov x9, #0
1159
+.loop_vps_w8_\v\()_\w\()x\h:
1160
+ add x6, x0, x9
1161
+ qpel_chroma_load_32b \v
1162
+ qpel_filter_chroma_\v\()_32b
1163
+ vps_end
1164
+ add x9, x9, #8
1165
+.if \w == 2
1166
+ str s17, x7, #4
1167
+.elseif \w == 4
1168
+ str d17, x7, #8
1169
+.elseif \w == 6
1170
+ str d17, x7, #8
1171
+ st1 {v17.s}2, x7, #4
1172
+.elseif \w == 12
1173
+ str q17, x7, #16
1174
+ add x6, x0, x9
1175
+ qpel_chroma_load_32b \v
1176
+ qpel_filter_chroma_\v\()_32b
1177
+ vps_end
1178
+ str d17, x7, #8
1179
+ add x9, x9, #8
1180
+.else
1181
+ str q17, x7, #16
1182
+.endif
1183
+ cmp x9, #\w
1184
+ blt .loop_vps_w8_\v\()_\w\()x\h
1185
+
1186
+ add x0, x0, x1
1187
+ add x2, x2, x3
1188
+ sub x5, x5, #1
1189
+ cbnz x5, .loop_vps_\v\()_\w\()x\h
1190
+ ret
1191
+.endm
1192
+
1193
+.macro FILTER_CHROMA_VSP w, h, v
1194
+ lsl x1, x1, #1
1195
+ sub x0, x0, x1
1196
+ mov x5, #\h
1197
+ mov w12, #1
1198
+ lsl w12, w12, #19
1199
+ add w12, w12, #2048
1200
+ dup v31.4s, w12
1201
+ mov x12, #\w
1202
+ lsl x12, x12, #1
1203
+ qpel_start_chroma_\v\()_1
1204
+.loop_vsp_\v\()_\w\()x\h:
1205
+ mov x7, x2
1206
+ mov x9, #0
1207
+.loop_vsp_w8_\v\()_\w\()x\h:
1208
+ add x6, x0, x9
1209
+ qpel_chroma_load_64b \v
1210
+ qpel_filter_chroma_\v\()_32b_1
1211
+ vsp_end
1212
+ add x9, x9, #16
1213
+.if \w == 4
1214
+ str s17, x7, #4
1215
+.elseif \w == 12
1216
+ str d17, x7, #8
1217
+ add x6, x0, x9
1218
+ qpel_chroma_load_64b \v
1219
+ qpel_filter_chroma_\v\()_32b_1
1220
+ vsp_end
1221
+ str s17, x7, #4
1222
+ add x9, x9, #8
1223
+.else
1224
+ str d17, x7, #8
1225
+.endif
1226
+ cmp x9, x12
1227
+ blt .loop_vsp_w8_\v\()_\w\()x\h
1228
+ add x0, x0, x1
1229
+ add x2, x2, x3
1230
+ sub x5, x5, #1
1231
+ cbnz x5, .loop_vsp_\v\()_\w\()x\h
1232
+ ret
1233
+.endm
1234
+
1235
+.macro FILTER_CHROMA_VSS w, h, v
1236
+ lsl x1, x1, #1
1237
+ sub x0, x0, x1
1238
+ lsl x3, x3, #1
1239
+ mov x5, #\h
1240
+ mov x12, #\w
1241
+ lsl x12, x12, #1
1242
+ qpel_start_chroma_\v\()_1
1243
+.loop_vss_\v\()_\w\()x\h:
1244
+ mov x7, x2
1245
+ mov x9, #0
1246
+.if \w == 4
1247
+.rept 2
1248
+ add x6, x0, x9
1249
+ qpel_chroma_load_64b \v
1250
+ qpel_filter_chroma_\v\()_32b_1
1251
+ vss_end
1252
+ str s17, x7, #4
1253
+ add x9, x9, #4
1254
+.endr
1255
+.else
1256
+.loop_vss_w8_\v\()_\w\()x\h:
1257
+ add x6, x0, x9
1258
+ qpel_chroma_load_64b \v
1259
+ qpel_filter_chroma_\v\()_32b_1
1260
+ vss_end
1261
+ str q17, x7, #16
1262
+ add x9, x9, #16
1263
+.if \w == 12
1264
+ add x6, x0, x9
1265
+ qpel_chroma_load_64b \v
1266
+ qpel_filter_chroma_\v\()_32b_1
1267
+ vss_end
1268
+ str d17, x7, #8
1269
+ add x9, x9, #8
1270
+.endif
1271
+ cmp x9, x12
1272
+ blt .loop_vss_w8_\v\()_\w\()x\h
1273
+.endif
1274
+ add x0, x0, x1
1275
+ add x2, x2, x3
1276
+ sub x5, x5, #1
1277
+ cbnz x5, .loop_vss_\v\()_\w\()x\h
1278
+ ret
1279
+.endm
1280
+
1281
+.macro FILTER_CHROMA_HPP w, h, v
1282
+ qpel_start_chroma_\v
1283
+ mov w12, #32
1284
+ dup v31.8h, w12
1285
+ mov w6, #\h
1286
+ sub x3, x3, #\w
1287
+.if \w == 2 || \w == 4 || \w == 6 || \w == 12
1288
+.loop4_chroma_hpp_\v\()_\w\()x\h:
1289
+ mov x11, x0
1290
+ sub x11, x11, #2
1291
+ vextin8_chroma \v
1292
+ qpel_filter_chroma_\v\()_32b
1293
+ hpp_end
1294
+.if \w == 2
1295
+ fmov w12, s17
1296
+ strh w12, x2, #2
1297
+.elseif \w == 4
1298
+ str s17, x2, #4
1299
+.elseif \w == 6
1300
+ str s17, x2, #4
1301
+ umov w12, v17.h2
1302
+ strh w12, x2, #2
1303
+.elseif \w == 12
1304
+ str d17, x2, #8
1305
+ sub x11, x11, #8
1306
+ vextin8_chroma \v
1307
+ qpel_filter_chroma_\v\()_32b
1308
+ hpp_end
1309
+ str s17, x2, #4
1310
+.endif
1311
+ sub w6, w6, #1
1312
+ add x0, x0, x1
1313
+ add x2, x2, x3
1314
+ cbnz w6, .loop4_chroma_hpp_\v\()_\w\()x\h
1315
+ ret
1316
+.else
1317
+.loop2_chroma_hpp_\v\()_\w\()x\h:
1318
+ mov x7, #\w
1319
+ lsr x7, x7, #3
1320
+ mov x11, x0
1321
+ sub x11, x11, #2
1322
+.loop3_chroma_hpp_\v\()_\w\()x\h:
1323
+.if \w == 8 || \w == 24
1324
+ vextin8_chroma \v
1325
+ qpel_filter_chroma_\v\()_32b
1326
+ hpp_end
1327
+ str d17, x2, #8
1328
+ sub x7, x7, #1
1329
+ sub x11, x11, #8
1330
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
1331
+ vextin8_chroma_64 \v
1332
+ qpel_filter_chroma_\v\()_64b
1333
+ hpp_end
1334
+ add v18.8h, v18.8h, v31.8h
1335
+ sqshrun2 v17.16b, v18.8h, #6
1336
+ str q17, x2, #16
1337
+ sub x7, x7, #2
1338
+ sub x11, x11, #16
1339
+.endif
1340
+ cbnz x7, .loop3_chroma_hpp_\v\()_\w\()x\h
1341
+ sub w6, w6, #1
1342
+ add x0, x0, x1
1343
+ add x2, x2, x3
1344
+ cbnz w6, .loop2_chroma_hpp_\v\()_\w\()x\h
1345
+ ret
1346
+.endif
1347
+.endm
1348
+
1349
+.macro CHROMA_HPS_2_4_6_12 w, v
1350
+ mov x11, x0
1351
+ sub x11, x11, #2
1352
+ vextin8_chroma \v
1353
+ qpel_filter_chroma_\v\()_32b
1354
+ hps_end
1355
+ sub x11, x11, #8
1356
+.if \w == 2
1357
+ str s17, x2, #4
1358
+.elseif \w == 4
1359
+ str d17, x2, #8
1360
+.elseif \w == 6
1361
+ str d17, x2, #8
1362
+ st1 {v17.s}2, x2, #4
1363
+.elseif \w == 12
1364
+ str q17, x2, #16
1365
+ vextin8_chroma \v
1366
+ qpel_filter_chroma_\v\()_32b
1367
+ sub v17.8h, v17.8h, v31.8h
1368
+ str d17, x2, #8
1369
+.endif
1370
+ add x0, x0, x1
1371
+ add x2, x2, x3
1372
+.endm
1373
+
1374
+.macro FILTER_CHROMA_HPS w, h, v
1375
+ qpel_start_chroma_\v
1376
+ mov w12, #8192
1377
+ dup v31.8h, w12
1378
+ sub x3, x3, #\w
1379
+ lsl x3, x3, #1
1380
+
1381
+.if \w == 2 || \w == 4 || \w == 6 || \w == 12
1382
+ cmp x5, #0
1383
+ beq 0f
1384
+ sub x0, x0, x1
1385
+.rept 3
1386
+ CHROMA_HPS_2_4_6_12 \w, \v
1387
+.endr
1388
+0:
1389
+.rept \h
1390
+ CHROMA_HPS_2_4_6_12 \w, \v
1391
+.endr
1392
+ ret
1393
+.else
1394
+ mov w10, #\h
1395
+ cmp x5, #0
1396
+ beq 9f
1397
+ sub x0, x0, x1
1398
+ add w10, w10, #3
1399
+9:
1400
+ mov w6, w10
1401
+.loop1_chroma_hps_\v\()_\w\()x\h\():
1402
+ mov x7, #\w
1403
+ lsr x7, x7, #3
1404
+ mov x11, x0
1405
+ sub x11, x11, #2
1406
+.loop2_chroma_hps_\v\()_\w\()x\h\():
1407
+.if \w == 8 || \w == 24
1408
+ vextin8_chroma \v
1409
+ qpel_filter_chroma_\v\()_32b
1410
+ hps_end
1411
+ str q17, x2, #16
1412
+ sub x7, x7, #1
1413
+ sub x11, x11, #8
1414
+.elseif \w == 16 || \w == 32 || \w == 48 || \w == 64
1415
+ vextin8_chroma_64 \v
1416
+ qpel_filter_chroma_\v\()_64b
1417
+ hps_end
1418
+ sub v18.8h, v18.8h, v31.8h
1419
+ stp q17, q18, x2, #32
1420
+ sub x7, x7, #2
1421
+ sub x11, x11, #16
1422
+.endif
1423
+ cbnz x7, .loop2_chroma_hps_\v\()_\w\()x\h\()
1424
+ sub w6, w6, #1
1425
+ add x0, x0, x1
1426
+ add x2, x2, x3
1427
+ cbnz w6, .loop1_chroma_hps_\v\()_\w\()x\h\()
1428
+ ret
1429
+.endif
1430
+.endm
1431
+
1432
+const g_lumaFilter, align=8
1433
+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0
1434
+.word -1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0
1435
+.word -1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1
1436
+.word 0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
1437
+endconst
1438
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S
Added
1284
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm-sve.S"
40
+#include "ipfilter-common.S"
41
+
42
+.arch armv8-a+sve2
43
+
44
+#ifdef __APPLE__
45
+.section __RODATA,__rodata
46
+#else
47
+.section .rodata
48
+#endif
49
+
50
+.align 4
51
+
52
+.text
53
+
54
+.macro qpel_load_32b_sve2 v
55
+.if \v == 0
56
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
57
+ ld1b {z3.h}, p0/z, x6
58
+ add x6, x6, x1
59
+.elseif \v == 1 || \v == 2 || \v == 3
60
+.if \v != 3 // not used in qpel_filter_3
61
+ ld1b {z0.h}, p0/z, x6
62
+ add x6, x6, x1
63
+.else
64
+ add x6, x6, x1
65
+.endif
66
+ ld1b {z1.h}, p0/z, x6
67
+ add x6, x6, x1
68
+ ld1b {z2.h}, p0/z, x6
69
+ add x6, x6, x1
70
+ ld1b {z3.h}, p0/z, x6
71
+ add x6, x6, x1
72
+ ld1b {z4.h}, p0/z, x6
73
+ add x6, x6, x1
74
+ ld1b {z5.h}, p0/z, x6
75
+ add x6, x6, x1
76
+.if \v != 1 // not used in qpel_filter_1
77
+ ld1b {z6.h}, p0/z, x6
78
+ add x6, x6, x1
79
+ ld1b {z7.h}, p0/z, x6
80
+.else
81
+ ld1b {z6.h}, p0/z, x6
82
+.endif
83
+.endif
84
+.endm
85
+
86
+.macro qpel_load_64b_sve2_gt_16 v
87
+.if \v == 0
88
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
89
+ ld1b {z3.h}, p2/z, x6
90
+ add x6, x6, x1
91
+.elseif \v == 1 || \v == 2 || \v == 3
92
+.if \v != 3 // not used in qpel_filter_3
93
+ ld1b {z0.h}, p2/z, x6
94
+ add x6, x6, x1
95
+.else
96
+ add x6, x6, x1
97
+.endif
98
+ ld1b {z1.h}, p2/z, x6
99
+ add x6, x6, x1
100
+ ld1b {z2.h}, p2/z, x6
101
+ add x6, x6, x1
102
+ ld1b {z3.h}, p2/z, x6
103
+ add x6, x6, x1
104
+ ld1b {z4.h}, p2/z, x6
105
+ add x6, x6, x1
106
+ ld1b {z5.h}, p2/z, x6
107
+ add x6, x6, x1
108
+.if \v != 1 // not used in qpel_filter_1
109
+ ld1b {z6.h}, p2/z, x6
110
+ add x6, x6, x1
111
+ ld1b {z7.h}, p2/z, x6
112
+.else
113
+ ld1b {z6.h}, p2/z, x6
114
+.endif
115
+.endif
116
+.endm
117
+
118
+.macro qpel_chroma_load_32b_sve2 v
119
+.if \v == 0
120
+ // qpel_filter_chroma_0 only uses values in v1
121
+ add x6, x6, x1
122
+ ld1b {z1.h}, p0/z, x6
123
+.else
124
+ ld1b {z0.h}, p0/z, x6
125
+ add x6, x6, x1
126
+ ld1b {z1.h}, p0/z, x6
127
+ add x6, x6, x1
128
+ ld1b {z2.h}, p0/z, x6
129
+ add x6, x6, x1
130
+ ld1b {z3.h}, p0/z, x6
131
+.endif
132
+.endm
133
+
134
+.macro qpel_start_sve2_0
135
+ mov z24.h, #64
136
+.endm
137
+
138
+.macro qpel_filter_sve2_0_32b
139
+ mul z17.h, z3.h, z24.h // 64*d
140
+.endm
141
+
142
+.macro qpel_filter_sve2_0_64b
143
+ qpel_filter_sve2_0_32b
144
+ mul z18.h, z11.h, z24.h
145
+.endm
146
+
147
+.macro qpel_start_sve2_1
148
+ mov z24.h, #58
149
+ mov z25.h, #10
150
+ mov z26.h, #17
151
+ mov z27.h, #5
152
+.endm
153
+
154
+.macro qpel_filter_sve2_1_32b
155
+ mul z19.h, z2.h, z25.h // c*10
156
+ mul z17.h, z3.h, z24.h // d*58
157
+ mul z21.h, z4.h, z26.h // e*17
158
+ mul z23.h, z5.h, z27.h // f*5
159
+ sub z17.h, z17.h, z19.h // d*58 - c*10
160
+ lsl z18.h, z1.h, #2 // b*4
161
+ add z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
+ sub z21.h, z6.h, z0.h // g - a
163
+ add z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
+ sub z21.h, z21.h, z23.h // g - a - f*5
165
+ add z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
+.endm
167
+
168
+.macro qpel_filter_sve2_1_64b
169
+ qpel_filter_sve2_1_32b
170
+ mul z20.h, z10.h, z25.h // c*10
171
+ mul z18.h, z11.h, z24.h // d*58
172
+ mul z21.h, z12.h, z26.h // e*17
173
+ mul z23.h, z13.h, z27.h // f*5
174
+ sub z18.h, z18.h, z20.h // d*58 - c*10
175
+ lsl z28.h, z30.h, #2 // b*4
176
+ add z18.h, z18.h, z21.h // d*58 - c*10 + e*17
177
+ sub z21.h, z14.h, z29.h // g - a
178
+ add z18.h, z18.h, z28.h // d*58 - c*10 + e*17 + b*4
179
+ sub z21.h, z21.h, z23.h // g - a - f*5
180
+ add z18.h, z18.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
+.endm
182
+
183
+.macro qpel_start_sve2_2
184
+ mov z24.h, #11
185
+ mov z25.h, #40
186
+.endm
187
+
188
+.macro qpel_filter_sve2_2_32b
189
+ add z17.h, z3.h, z4.h // d + e
190
+ add z19.h, z2.h, z5.h // c + f
191
+ add z23.h, z1.h, z6.h // b + g
192
+ add z21.h, z0.h, z7.h // a + h
193
+ mul z17.h, z17.h, z25.h // 40 * (d + e)
194
+ mul z19.h, z19.h, z24.h // 11 * (c + f)
195
+ lsl z23.h, z23.h, #2 // (b + g) * 4
196
+ add z19.h, z19.h, z21.h // 11 * (c + f) + a + h
197
+ add z17.h, z17.h, z23.h // 40 * (d + e) + (b + g) * 4
198
+ sub z17.h, z17.h, z19.h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
+.endm
200
+
201
+.macro qpel_filter_sve2_2_64b
202
+ qpel_filter_sve2_2_32b
203
+ add z27.h, z11.h, z12.h // d + e
204
+ add z16.h, z10.h, z13.h // c + f
205
+ add z23.h, z30.h, z14.h // b + g
206
+ add z21.h, z29.h, z15.h // a + h
207
+ mul z27.h, z27.h, z25.h // 40 * (d + e)
208
+ mul z16.h, z16.h, z24.h // 11 * (c + f)
209
+ lsl z23.h, z23.h, #2 // (b + g) * 4
210
+ add z16.h, z16.h, z21.h // 11 * (c + f) + a + h
211
+ add z27.h, z27.h, z23.h // 40 * (d + e) + (b + g) * 4
212
+ sub z18.h, z27.h, z16.h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
213
+.endm
214
+
215
+.macro qpel_start_sve2_3
216
+ mov z24.h, #17
217
+ mov z25.h, #5
218
+ mov z26.h, #58
219
+ mov z27.h, #10
220
+.endm
221
+
222
+.macro qpel_filter_sve2_3_32b
223
+ mul z19.h, z2.h, z25.h // c * 5
224
+ mul z17.h, z3.h, z24.h // d * 17
225
+ mul z21.h, z4.h, z26.h // e * 58
226
+ mul z23.h, z5.h, z27.h // f * 10
227
+ sub z17.h, z17.h, z19.h // d * 17 - c * 5
228
+ lsl z19.h, z6.h, #2 // g * 4
229
+ add z17.h, z17.h, z21.h // d * 17 - c * 5 + e * 58
230
+ sub z21.h, z1.h, z7.h // b - h
231
+ add z17.h, z17.h, z19.h // d * 17 - c * 5 + e * 58 + g * 4
232
+ sub z21.h, z21.h, z23.h // b - h - f * 10
233
+ add z17.h, z17.h, z21.h // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
234
+.endm
235
+
236
+.macro qpel_filter_sve2_3_64b
237
+ qpel_filter_sve2_3_32b
238
+ mul z16.h, z10.h, z25.h // c * 5
239
+ mul z18.h, z11.h, z24.h // d * 17
240
+ mul z21.h, z12.h, z26.h // e * 58
241
+ mul z23.h, z13.h, z27.h // f * 10
242
+ sub z18.h, z18.h, z16.h // d * 17 - c * 5
243
+ lsl z16.h, z14.h, #2 // g * 4
244
+ add z18.h, z18.h, z21.h // d * 17 - c * 5 + e * 58
245
+ sub z21.h, z30.h, z15.h // b - h
246
+ add z18.h, z18.h, z16.h // d * 17 - c * 5 + e * 58 + g * 4
247
+ sub z21.h, z21.h, z23.h // b - h - f * 10
248
+ add z18.h, z18.h, z21.h // d * 17 - c * 5 + e * 58 + g * 4 + b - h - f * 10
249
+.endm
250
+
251
+.macro qpel_start_chroma_sve2_0
252
+ mov z29.h, #64
253
+.endm
254
+
255
+.macro qpel_filter_chroma_sve2_0_32b
256
+ mul z17.h, z1.h, z29.h // 64*b
257
+.endm
258
+
259
+.macro qpel_start_chroma_sve2_1
260
+ mov z29.h, #58
261
+ mov z30.h, #10
262
+.endm
263
+
264
+.macro qpel_filter_chroma_sve2_1_32b
265
+ mul z17.h, z1.h, z29.h // 58 * b
266
+ mul z19.h, z2.h, z30.h // 10 * c
267
+ add z22.h, z0.h, z3.h // a + d
268
+ lsl z22.h, z22.h, #1 // 2 * (a+d)
269
+ sub z17.h, z17.h, z22.h // 58*b - 2*(a+d)
270
+ add z17.h, z17.h, z19.h // 58*b-2*(a+d) + 10*c
271
+.endm
272
+
273
+.macro qpel_start_chroma_sve2_2
274
+ mov z30.h, #54
275
+.endm
276
+
277
+.macro qpel_filter_chroma_sve2_2_32b
278
+ mul z17.h, z1.h, z30.h // 54 * b
279
+ lsl z19.h, z0.h, #2 // 4 * a
280
+ lsl z21.h, z2.h, #4 // 16 * c
281
+ lsl z23.h, z3.h, #1 // 2 * d
282
+ add z17.h, z17.h, z21.h // 54*b + 16*c
283
+ add z19.h, z19.h, z23.h // 4*a + 2*d
284
+ sub z17.h, z17.h, z19.h // 54*b+16*c - (4*a+2*d)
285
+.endm
286
+
287
+.macro qpel_start_chroma_sve2_3
288
+ mov z28.h, #46
289
+ mov z29.h, #28
290
+ mov z30.h, #6
291
+.endm
292
+
293
+.macro qpel_filter_chroma_sve2_3_32b
294
+ mul z17.h, z1.h, z28.h // 46 * b
295
+ mul z19.h, z2.h, z29.h // 28 * c
296
+ lsl z21.h, z3.h, #2 // 4 * d
297
+ mul z23.h, z0.h, z30.h // 6 * a
298
+ add z17.h, z17.h, z19.h // 46*b + 28*c
299
+ add z21.h, z21.h, z23.h // 4*d + 6*a
300
+ sub z17.h, z17.h, z21.h // 46*b+28*c - (4*d+6*a)
301
+.endm
302
+
303
+.macro qpel_start_chroma_sve2_4
304
+ mov z29.h, #36
305
+.endm
306
+
307
+.macro qpel_filter_chroma_sve2_4_32b
308
+ add z20.h, z0.h, z3.h // a + d
309
+ add z17.h, z1.h, z2.h // b + c
310
+ lsl z20.h, z20.h, #2 // 4 * (a+d)
311
+ mul z17.h, z17.h, z29.h // 36 * (b+c)
312
+ sub z17.h, z17.h, z20.h // 36*(b+c) - 4*(a+d)
313
+.endm
314
+
315
+.macro qpel_start_chroma_sve2_5
316
+ mov z28.h, #28
317
+ mov z29.h, #46
318
+ mov z30.h, #6
319
+.endm
320
+
321
+.macro qpel_filter_chroma_sve2_5_32b
322
+ mul z17.h, z1.h, z28.h // 28 * b
323
+ mul z19.h, z2.h, z29.h // 46 * c
324
+ lsl z21.h, z0.h, #2 // 4 * a
325
+ mul z23.h, z3.h, z30.h // 6 * d
326
+ add z17.h, z17.h, z19.h // 28*b + 46*c
327
+ add z21.h, z21.h, z23.h // 4*a + 6*d
328
+ sub z17.h, z17.h, z21.h // 28*b+46*c - (4*a+6*d)
329
+.endm
330
+
331
+.macro qpel_start_chroma_sve2_6
332
+ mov z30.h, #54
333
+.endm
334
+
335
+.macro qpel_filter_chroma_sve2_6_32b
336
+ mul z17.h, z2.h, z30.h // 54 * c
337
+ lsl z19.h, z0.h, #1 // 2 * a
338
+ lsl z21.h, z1.h, #4 // 16 * b
339
+ lsl z23.h, z3.h, #2 // 4 * d
340
+ add z17.h, z17.h, z21.h // 54*c + 16*b
341
+ add z19.h, z19.h, z23.h // 2*a + 4*d
342
+ sub z17.h, z17.h, z19.h // 54*c+16*b - (2*a+4*d)
343
+.endm
344
+
345
+.macro qpel_start_chroma_sve2_7
346
+ mov z29.h, #58
347
+ mov z30.h, #10
348
+.endm
349
+
350
+.macro qpel_filter_chroma_sve2_7_32b
351
+ add z20.h, z0.h, z3.h // a + d
352
+ mul z17.h, z2.h, z29.h // 58 * c
353
+ lsl z20.h, z20.h, #1 // 2 * (a+d)
354
+ mul z19.h, z1.h, z30.h // 10 * b
355
+ sub z17.h, z17.h, z20.h // 58*c - 2*(a+d)
356
+ add z17.h, z17.h, z19.h // 58*c-2*(a+d) + 10*b
357
+.endm
358
+
359
+.macro vpp_end_sve2
360
+ add z17.h, z17.h, z31.h
361
+ sqshrun v17.8b, v17.8h, #6
362
+.endm
363
+
364
+.macro FILTER_LUMA_VPP_SVE2 w, h, v
365
+ lsl x10, x1, #2 // x10 = 4 * x1
366
+ sub x11, x10, x1 // x11 = 3 * x1
367
+ sub x0, x0, x11 // src -= (8 / 2 - 1) * srcStride
368
+ mov x5, #\h
369
+ mov z31.h, #32
370
+ rdvl x9, #1
371
+ cmp x9, #16
372
+ bgt .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h
373
+ qpel_start_\v
374
+.loop_luma_vpp_sve2_\v\()_\w\()x\h:
375
+ mov x7, x2
376
+ mov x9, #0
377
+.loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
378
+ add x6, x0, x9
379
+.if \w == 8 || \w == 24
380
+ qpel_load_32b \v
381
+ qpel_filter_\v\()_32b
382
+ vpp_end
383
+ str d17, x7, #8
384
+ add x9, x9, #8
385
+.elseif \w == 12
386
+ qpel_load_32b \v
387
+ qpel_filter_\v\()_32b
388
+ vpp_end
389
+ str d17, x7, #8
390
+ add x6, x0, #8
391
+ qpel_load_32b \v
392
+ qpel_filter_\v\()_32b
393
+ vpp_end
394
+ fmov w6, s17
395
+ str w6, x7, #4
396
+ add x9, x9, #12
397
+.else
398
+ qpel_load_64b \v
399
+ qpel_filter_\v\()_64b
400
+ vpp_end
401
+ add v18.8h, v18.8h, v31.8h
402
+ sqshrun2 v17.16b, v18.8h, #6
403
+ str q17, x7, #16
404
+ add x9, x9, #16
405
+.endif
406
+ cmp x9, #\w
407
+ blt .loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
408
+ add x0, x0, x1
409
+ add x2, x2, x3
410
+ sub x5, x5, #1
411
+ cbnz x5, .loop_luma_vpp_sve2_\v\()_\w\()x\h
412
+ ret
413
+.vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h:
414
+ ptrue p0.h, vl8
415
+ ptrue p2.h, vl16
416
+ qpel_start_sve2_\v
417
+.gt_16_loop_luma_vpp_sve2_\v\()_\w\()x\h:
418
+ mov x7, x2
419
+ mov x9, #0
420
+.gt_16_loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
421
+ add x6, x0, x9
422
+.if \w == 8 || \w == 24
423
+ qpel_load_32b_sve2 \v
424
+ qpel_filter_sve2_\v\()_32b
425
+ vpp_end_sve2
426
+ str d17, x7, #8
427
+ add x9, x9, #8
428
+.elseif \w == 12
429
+ qpel_load_32b_sve2 \v
430
+ qpel_filter_sve2_\v\()_32b
431
+ vpp_end_sve2
432
+ str d17, x7, #8
433
+ add x6, x0, #8
434
+ qpel_load_32b_sve2 \v
435
+ qpel_filter_sve2_\v\()_32b
436
+ vpp_end_sve2
437
+ fmov w6, s17
438
+ str w6, x7, #4
439
+ add x9, x9, #12
440
+.else
441
+ qpel_load_64b_sve2_gt_16 \v
442
+ qpel_filter_sve2_\v\()_32b
443
+ vpp_end_sve2
444
+ add z18.h, z18.h, z31.h
445
+ sqshrun2 v17.16b, v18.8h, #6
446
+ str q17, x7, #16
447
+ add x9, x9, #16
448
+.endif
449
+ cmp x9, #\w
450
+ blt .gt_16_loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
451
+ add x0, x0, x1
452
+ add x2, x2, x3
453
+ sub x5, x5, #1
454
+ cbnz x5, .gt_16_loop_luma_vpp_sve2_\v\()_\w\()x\h
455
+ ret
456
+.endm
457
+
458
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
459
+.macro LUMA_VPP_SVE2 w, h
460
+function x265_interp_8tap_vert_pp_\w\()x\h\()_sve2
461
+ cmp x4, #0
462
+ b.eq 0f
463
+ cmp x4, #1
464
+ b.eq 1f
465
+ cmp x4, #2
466
+ b.eq 2f
467
+ cmp x4, #3
468
+ b.eq 3f
469
+0:
470
+ FILTER_LUMA_VPP_SVE2 \w, \h, 0
471
+1:
472
+ FILTER_LUMA_VPP_SVE2 \w, \h, 1
473
+2:
474
+ FILTER_LUMA_VPP_SVE2 \w, \h, 2
475
+3:
476
+ FILTER_LUMA_VPP_SVE2 \w, \h, 3
477
+endfunc
478
+.endm
479
+
480
+LUMA_VPP_SVE2 8, 4
481
+LUMA_VPP_SVE2 8, 8
482
+LUMA_VPP_SVE2 8, 16
483
+LUMA_VPP_SVE2 8, 32
484
+LUMA_VPP_SVE2 12, 16
485
+LUMA_VPP_SVE2 16, 4
486
+LUMA_VPP_SVE2 16, 8
487
+LUMA_VPP_SVE2 16, 16
488
+LUMA_VPP_SVE2 16, 32
489
+LUMA_VPP_SVE2 16, 64
490
+LUMA_VPP_SVE2 16, 12
491
+LUMA_VPP_SVE2 24, 32
492
+LUMA_VPP_SVE2 32, 8
493
+LUMA_VPP_SVE2 32, 16
494
+LUMA_VPP_SVE2 32, 32
495
+LUMA_VPP_SVE2 32, 64
496
+LUMA_VPP_SVE2 32, 24
497
+LUMA_VPP_SVE2 48, 64
498
+LUMA_VPP_SVE2 64, 16
499
+LUMA_VPP_SVE2 64, 32
500
+LUMA_VPP_SVE2 64, 64
501
+LUMA_VPP_SVE2 64, 48
502
+
503
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
504
+.macro LUMA_VPS_4xN_SVE2 h
505
+function x265_interp_8tap_vert_ps_4x\h\()_sve2
506
+ lsl x3, x3, #1
507
+ lsl x5, x4, #6
508
+ lsl x4, x1, #2
509
+ sub x4, x4, x1
510
+ sub x0, x0, x4
511
+
512
+ mov z28.s, #8192
513
+ mov x4, #\h
514
+ movrel x12, g_lumaFilter
515
+ add x12, x12, x5
516
+ ptrue p0.s, vl4
517
+ ld1rd {z16.d}, p0/z, x12
518
+ ld1rd {z17.d}, p0/z, x12, #8
519
+ ld1rd {z18.d}, p0/z, x12, #16
520
+ ld1rd {z19.d}, p0/z, x12, #24
521
+ ld1rd {z20.d}, p0/z, x12, #32
522
+ ld1rd {z21.d}, p0/z, x12, #40
523
+ ld1rd {z22.d}, p0/z, x12, #48
524
+ ld1rd {z23.d}, p0/z, x12, #56
525
+
526
+.loop_vps_sve2_4x\h:
527
+ mov x6, x0
528
+
529
+ ld1b {z0.s}, p0/z, x6
530
+ add x6, x6, x1
531
+ ld1b {z1.s}, p0/z, x6
532
+ add x6, x6, x1
533
+ ld1b {z2.s}, p0/z, x6
534
+ add x6, x6, x1
535
+ ld1b {z3.s}, p0/z, x6
536
+ add x6, x6, x1
537
+ ld1b {z4.s}, p0/z, x6
538
+ add x6, x6, x1
539
+ ld1b {z5.s}, p0/z, x6
540
+ add x6, x6, x1
541
+ ld1b {z6.s}, p0/z, x6
542
+ add x6, x6, x1
543
+ ld1b {z7.s}, p0/z, x6
544
+ add x6, x6, x1
545
+
546
+ mul z0.s, z0.s, z16.s
547
+ mla z0.s, p0/m, z1.s, z17.s
548
+ mla z0.s, p0/m, z2.s, z18.s
549
+ mla z0.s, p0/m, z3.s, z19.s
550
+ mla z0.s, p0/m, z4.s, z20.s
551
+ mla z0.s, p0/m, z5.s, z21.s
552
+ mla z0.s, p0/m, z6.s, z22.s
553
+ mla z0.s, p0/m, z7.s, z23.s
554
+
555
+ sub z0.s, z0.s, z28.s
556
+ sqxtn v0.4h, v0.4s
557
+ st1 {v0.8b}, x2, x3
558
+
559
+ add x0, x0, x1
560
+ sub x4, x4, #1
561
+ cbnz x4, .loop_vps_sve2_4x\h
562
+ ret
563
+endfunc
564
+.endm
565
+
566
+LUMA_VPS_4xN_SVE2 4
567
+LUMA_VPS_4xN_SVE2 8
568
+LUMA_VPS_4xN_SVE2 16
569
+
570
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
571
+.macro LUMA_VSP_4xN_SVE2 h
572
+function x265_interp_8tap_vert_sp_4x\h\()_sve2
573
+ lsl x5, x4, #6
574
+ lsl x1, x1, #1
575
+ lsl x4, x1, #2
576
+ sub x4, x4, x1
577
+ sub x0, x0, x4
578
+
579
+ mov w12, #1
580
+ lsl w12, w12, #19
581
+ add w12, w12, #2048
582
+ dup v24.4s, w12
583
+ mov x4, #\h
584
+ movrel x12, g_lumaFilter
585
+ add x12, x12, x5
586
+
587
+ ptrue p0.s, vl4
588
+ ld1rd {z16.d}, p0/z, x12
589
+ ld1rd {z17.d}, p0/z, x12, #8
590
+ ld1rd {z18.d}, p0/z, x12, #16
591
+ ld1rd {z19.d}, p0/z, x12, #24
592
+ ld1rd {z20.d}, p0/z, x12, #32
593
+ ld1rd {z21.d}, p0/z, x12, #40
594
+ ld1rd {z22.d}, p0/z, x12, #48
595
+ ld1rd {z23.d}, p0/z, x12, #56
596
+
597
+.loop_vsp_sve2_4x\h:
598
+ mov x6, x0
599
+
600
+ ld1 {v0.8b}, x6, x1
601
+ ld1 {v1.8b}, x6, x1
602
+ ld1 {v2.8b}, x6, x1
603
+ ld1 {v3.8b}, x6, x1
604
+ ld1 {v4.8b}, x6, x1
605
+ ld1 {v5.8b}, x6, x1
606
+ ld1 {v6.8b}, x6, x1
607
+ ld1 {v7.8b}, x6, x1
608
+
609
+ sunpklo z0.s, z0.h
610
+ sunpklo z1.s, z1.h
611
+ mul z0.s, z0.s, z16.s
612
+ sunpklo z2.s, z2.h
613
+ mla z0.s, p0/m, z1.s, z17.s
614
+ sunpklo z3.s, z3.h
615
+ mla z0.s, p0/m, z2.s, z18.s
616
+ sunpklo z4.s, z4.h
617
+ mla z0.s, p0/m, z3.s, z19.s
618
+ sunpklo z5.s, z5.h
619
+ mla z0.s, p0/m, z4.s, z20.s
620
+ sunpklo z6.s, z6.h
621
+ mla z0.s, p0/m, z5.s, z21.s
622
+ sunpklo z7.s, z7.h
623
+ mla z0.s, p0/m, z6.s, z22.s
624
+
625
+ mla z0.s, p0/m, z7.s, z23.s
626
+
627
+ add z0.s, z0.s, z24.s
628
+ sqshrun v0.4h, v0.4s, #12
629
+ sqxtun v0.8b, v0.8h
630
+ st1 {v0.s}0, x2, x3
631
+
632
+ add x0, x0, x1
633
+ sub x4, x4, #1
634
+ cbnz x4, .loop_vsp_sve2_4x\h
635
+ ret
636
+endfunc
637
+.endm
638
+
639
+LUMA_VSP_4xN_SVE2 4
640
+LUMA_VSP_4xN_SVE2 8
641
+LUMA_VSP_4xN_SVE2 16
642
+
643
+.macro vps_end_sve2
644
+ sub z17.h, z17.h, z31.h
645
+.endm
646
+
647
+.macro FILTER_VPS_SVE2 w, h, v
648
+ lsl x3, x3, #1
649
+ lsl x10, x1, #2 // x10 = 4 * x1
650
+ sub x11, x10, x1 // x11 = 3 * x1
651
+ sub x0, x0, x11 // src -= (8 / 2 - 1) * srcStride
652
+ mov x5, #\h
653
+ mov z31.h, #8192
654
+ rdvl x14, #1
655
+ cmp x14, #16
656
+ bgt .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h
657
+ qpel_start_\v
658
+.loop_ps_sve2_\v\()_\w\()x\h:
659
+ mov x7, x2
660
+ mov x9, #0
661
+.loop_ps_w8_sve2_\v\()_\w\()x\h:
662
+ add x6, x0, x9
663
+.if \w == 8 || \w == 24
664
+ qpel_load_32b \v
665
+ qpel_filter_\v\()_32b
666
+ vps_end
667
+ str q17, x7, #16
668
+ add x9, x9, #8
669
+.elseif \w == 12
670
+ qpel_load_32b \v
671
+ qpel_filter_\v\()_32b
672
+ vps_end
673
+ str q17, x7, #16
674
+ add x6, x0, #8
675
+ qpel_load_32b \v
676
+ qpel_filter_\v\()_32b
677
+ vps_end
678
+ str d17, x7, #8
679
+ add x9, x9, #12
680
+.else
681
+ qpel_load_64b \v
682
+ qpel_filter_\v\()_64b
683
+ vps_end
684
+ sub v18.8h, v18.8h, v31.8h
685
+ stp q17, q18, x7, #32
686
+ add x9, x9, #16
687
+.endif
688
+ cmp x9, #\w
689
+ blt .loop_ps_w8_sve2_\v\()_\w\()x\h
690
+ add x0, x0, x1
691
+ add x2, x2, x3
692
+ sub x5, x5, #1
693
+ cbnz x5, .loop_ps_sve2_\v\()_\w\()x\h
694
+ ret
695
+.vl_gt_16_FILTER_VPS_\v\()_\w\()x\h:
696
+ ptrue p0.h, vl8
697
+ ptrue p2.h, vl16
698
+ qpel_start_sve2_\v
699
+.gt_16_loop_ps_sve2_\v\()_\w\()x\h:
700
+ mov x7, x2
701
+ mov x9, #0
702
+.gt_16_loop_ps_w8_sve2_\v\()_\w\()x\h:
703
+ add x6, x0, x9
704
+.if \w == 8 || \w == 24
705
+ qpel_load_32b_sve2 \v
706
+ qpel_filter_sve2_\v\()_32b
707
+ vps_end_sve2
708
+ str q17, x7, #16
709
+ add x9, x9, #8
710
+.elseif \w == 12
711
+ qpel_load_32b_sve2 \v
712
+ qpel_filter_sve2_\v\()_32b
713
+ vps_end_sve2
714
+ str q17, x7, #16
715
+ add x6, x0, #8
716
+ qpel_load_32b_sve2 \v
717
+ qpel_filter_sve2_\v\()_32b
718
+ vps_end_sve2
719
+ str d17, x7, #8
720
+ add x9, x9, #12
721
+.else
722
+ qpel_load_64b_sve2_gt_16 \v
723
+ qpel_filter_sve2_\v\()_32b
724
+ vps_end_sve2
725
+ sub z18.h, z18.h, z31.h
726
+ stp q17, q18, x7, #32
727
+ add x9, x9, #16
728
+.endif
729
+ cmp x9, #\w
730
+ blt .gt_16_loop_ps_w8_sve2_\v\()_\w\()x\h
731
+ add x0, x0, x1
732
+ add x2, x2, x3
733
+ sub x5, x5, #1
734
+ cbnz x5, .gt_16_loop_ps_sve2_\v\()_\w\()x\h
735
+ ret
736
+.endm
737
+
738
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
739
+.macro LUMA_VPS_SVE2 w, h
740
+function x265_interp_8tap_vert_ps_\w\()x\h\()_sve2
741
+ cmp x4, #0
742
+ beq 0f
743
+ cmp x4, #1
744
+ beq 1f
745
+ cmp x4, #2
746
+ beq 2f
747
+ cmp x4, #3
748
+ beq 3f
749
+0:
750
+ FILTER_VPS_SVE2 \w, \h, 0
751
+1:
752
+ FILTER_VPS_SVE2 \w, \h, 1
753
+2:
754
+ FILTER_VPS_SVE2 \w, \h, 2
755
+3:
756
+ FILTER_VPS_SVE2 \w, \h, 3
757
+endfunc
758
+.endm
759
+
760
+LUMA_VPS_SVE2 8, 4
761
+LUMA_VPS_SVE2 8, 8
762
+LUMA_VPS_SVE2 8, 16
763
+LUMA_VPS_SVE2 8, 32
764
+LUMA_VPS_SVE2 12, 16
765
+LUMA_VPS_SVE2 16, 4
766
+LUMA_VPS_SVE2 16, 8
767
+LUMA_VPS_SVE2 16, 16
768
+LUMA_VPS_SVE2 16, 32
769
+LUMA_VPS_SVE2 16, 64
770
+LUMA_VPS_SVE2 16, 12
771
+LUMA_VPS_SVE2 24, 32
772
+LUMA_VPS_SVE2 32, 8
773
+LUMA_VPS_SVE2 32, 16
774
+LUMA_VPS_SVE2 32, 32
775
+LUMA_VPS_SVE2 32, 64
776
+LUMA_VPS_SVE2 32, 24
777
+LUMA_VPS_SVE2 48, 64
778
+LUMA_VPS_SVE2 64, 16
779
+LUMA_VPS_SVE2 64, 32
780
+LUMA_VPS_SVE2 64, 64
781
+LUMA_VPS_SVE2 64, 48
782
+
783
+// ***** luma_vss *****
784
+.macro vss_end_sve2
785
+ asr z17.s, z17.s, #6
786
+ asr z18.s, z18.s, #6
787
+ uzp1 v17.8h, v17.8h, v18.8h
788
+.endm
789
+
790
+.macro FILTER_VSS_SVE2 w, h, v
791
+ lsl x1, x1, #1
792
+ lsl x10, x1, #2 // x10 = 4 * x1
793
+ sub x11, x10, x1 // x11 = 3 * x1
794
+ sub x0, x0, x11
795
+ lsl x3, x3, #1
796
+ mov x5, #\h
797
+ mov x12, #\w
798
+ lsl x12, x12, #1
799
+ qpel_start_\v\()_1
800
+.loop_luma_vss_sve2_\v\()_\w\()x\h:
801
+ mov x7, x2
802
+ mov x9, #0
803
+.loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
804
+ add x6, x0, x9
805
+ qpel_load_64b \v
806
+ qpel_filter_\v\()_32b_1
807
+ vss_end_sve2
808
+.if \w == 4
809
+ str s17, x7, #4
810
+ add x9, x9, #4
811
+.else
812
+ str q17, x7, #16
813
+ add x9, x9, #16
814
+.if \w == 12
815
+ add x6, x0, x9
816
+ qpel_load_64b \v
817
+ qpel_filter_\v\()_32b_1
818
+ vss_end_sve2
819
+ str d17, x7, #8
820
+ add x9, x9, #8
821
+.endif
822
+.endif
823
+ cmp x9, x12
824
+ blt .loop_luma_vss_w8_sve2_\v\()_\w\()x\h
825
+ add x0, x0, x1
826
+ add x2, x2, x3
827
+ sub x5, x5, #1
828
+ cbnz x5, .loop_luma_vss_sve2_\v\()_\w\()x\h
829
+ ret
830
+.endm
831
+
832
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
833
+.macro LUMA_VSS_SVE2 w, h
834
+function x265_interp_8tap_vert_ss_\w\()x\h\()_sve2
835
+ cmp x4, #0
836
+ beq 0f
837
+ cmp x4, #1
838
+ beq 1f
839
+ cmp x4, #2
840
+ beq 2f
841
+ cmp x4, #3
842
+ beq 3f
843
+0:
844
+ FILTER_VSS_SVE2 \w, \h, 0
845
+1:
846
+ FILTER_VSS_SVE2 \w, \h, 1
847
+2:
848
+ FILTER_VSS_SVE2 \w, \h, 2
849
+3:
850
+ FILTER_VSS_SVE2 \w, \h, 3
851
+endfunc
852
+.endm
853
+
854
+LUMA_VSS_SVE2 4, 4
855
+LUMA_VSS_SVE2 4, 8
856
+LUMA_VSS_SVE2 4, 16
857
+LUMA_VSS_SVE2 8, 4
858
+LUMA_VSS_SVE2 8, 8
859
+LUMA_VSS_SVE2 8, 16
860
+LUMA_VSS_SVE2 8, 32
861
+LUMA_VSS_SVE2 12, 16
862
+LUMA_VSS_SVE2 16, 4
863
+LUMA_VSS_SVE2 16, 8
864
+LUMA_VSS_SVE2 16, 16
865
+LUMA_VSS_SVE2 16, 32
866
+LUMA_VSS_SVE2 16, 64
867
+LUMA_VSS_SVE2 16, 12
868
+LUMA_VSS_SVE2 32, 8
869
+LUMA_VSS_SVE2 32, 16
870
+LUMA_VSS_SVE2 32, 32
871
+LUMA_VSS_SVE2 32, 64
872
+LUMA_VSS_SVE2 32, 24
873
+LUMA_VSS_SVE2 64, 16
874
+LUMA_VSS_SVE2 64, 32
875
+LUMA_VSS_SVE2 64, 64
876
+LUMA_VSS_SVE2 64, 48
877
+LUMA_VSS_SVE2 24, 32
878
+LUMA_VSS_SVE2 48, 64
879
+
880
+// ***** luma_hps *****
881
+
882
+.macro FILTER_CHROMA_VPP_SVE2 w, h, v
883
+ ptrue p0.h, vl8
884
+ qpel_start_chroma_sve2_\v
885
+ mov z31.h, #32
886
+ sub x0, x0, x1
887
+ mov x5, #\h
888
+.loop_chroma_vpp_sve2_\v\()_\w\()x\h:
889
+ mov x7, x2
890
+ mov x9, #0
891
+.loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
892
+ add x6, x0, x9
893
+ qpel_chroma_load_32b_sve2 \v
894
+ qpel_filter_chroma_sve2_\v\()_32b
895
+ vpp_end_sve2
896
+ add x9, x9, #8
897
+.if \w == 2
898
+ fmov w12, s17
899
+ strh w12, x7, #2
900
+.elseif \w == 4
901
+ str s17, x7, #4
902
+.elseif \w == 6
903
+ str s17, x7, #4
904
+ umov w12, v17.h2
905
+ strh w12, x7, #2
906
+.elseif \w == 12
907
+ str d17, x7, #8
908
+ add x6, x0, x9
909
+ qpel_chroma_load_32b_sve2 \v
910
+ qpel_filter_chroma_sve2_\v\()_32b
911
+ vpp_end_sve2
912
+ str s17, x7, #4
913
+ add x9, x9, #8
914
+.else
915
+ str d17, x7, #8
916
+.endif
917
+ cmp x9, #\w
918
+ blt .loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
919
+ add x0, x0, x1
920
+ add x2, x2, x3
921
+ sub x5, x5, #1
922
+ cbnz x5, .loop_chroma_vpp_sve2_\v\()_\w\()x\h
923
+ ret
924
+.endm
925
+
926
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
927
+.macro CHROMA_VPP_SVE2 w, h
928
+function x265_interp_4tap_vert_pp_\w\()x\h\()_sve2
929
+ cmp x4, #0
930
+ beq 0f
931
+ cmp x4, #1
932
+ beq 1f
933
+ cmp x4, #2
934
+ beq 2f
935
+ cmp x4, #3
936
+ beq 3f
937
+ cmp x4, #4
938
+ beq 4f
939
+ cmp x4, #5
940
+ beq 5f
941
+ cmp x4, #6
942
+ beq 6f
943
+ cmp x4, #7
944
+ beq 7f
945
+0:
946
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 0
947
+1:
948
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 1
949
+2:
950
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 2
951
+3:
952
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 3
953
+4:
954
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 4
955
+5:
956
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 5
957
+6:
958
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 6
959
+7:
960
+ FILTER_CHROMA_VPP_SVE2 \w, \h, 7
961
+endfunc
962
+.endm
963
+
964
+CHROMA_VPP_SVE2 2, 4
965
+CHROMA_VPP_SVE2 2, 8
966
+CHROMA_VPP_SVE2 2, 16
967
+CHROMA_VPP_SVE2 4, 2
968
+CHROMA_VPP_SVE2 4, 4
969
+CHROMA_VPP_SVE2 4, 8
970
+CHROMA_VPP_SVE2 4, 16
971
+CHROMA_VPP_SVE2 4, 32
972
+CHROMA_VPP_SVE2 6, 8
973
+CHROMA_VPP_SVE2 6, 16
974
+CHROMA_VPP_SVE2 8, 2
975
+CHROMA_VPP_SVE2 8, 4
976
+CHROMA_VPP_SVE2 8, 6
977
+CHROMA_VPP_SVE2 8, 8
978
+CHROMA_VPP_SVE2 8, 16
979
+CHROMA_VPP_SVE2 8, 32
980
+CHROMA_VPP_SVE2 8, 12
981
+CHROMA_VPP_SVE2 8, 64
982
+CHROMA_VPP_SVE2 12, 16
983
+CHROMA_VPP_SVE2 12, 32
984
+CHROMA_VPP_SVE2 16, 4
985
+CHROMA_VPP_SVE2 16, 8
986
+CHROMA_VPP_SVE2 16, 12
987
+CHROMA_VPP_SVE2 16, 16
988
+CHROMA_VPP_SVE2 16, 32
989
+CHROMA_VPP_SVE2 16, 64
990
+CHROMA_VPP_SVE2 16, 24
991
+CHROMA_VPP_SVE2 32, 8
992
+CHROMA_VPP_SVE2 32, 16
993
+CHROMA_VPP_SVE2 32, 24
994
+CHROMA_VPP_SVE2 32, 32
995
+CHROMA_VPP_SVE2 32, 64
996
+CHROMA_VPP_SVE2 32, 48
997
+CHROMA_VPP_SVE2 24, 32
998
+CHROMA_VPP_SVE2 24, 64
999
+CHROMA_VPP_SVE2 64, 16
1000
+CHROMA_VPP_SVE2 64, 32
1001
+CHROMA_VPP_SVE2 64, 48
1002
+CHROMA_VPP_SVE2 64, 64
1003
+CHROMA_VPP_SVE2 48, 64
1004
+
1005
+.macro FILTER_CHROMA_VPS_SVE2 w, h, v
1006
+ ptrue p0.h, vl8
1007
+ qpel_start_chroma_sve2_\v
1008
+ mov z31.h, #8192
1009
+ lsl x3, x3, #1
1010
+ sub x0, x0, x1
1011
+ mov x5, #\h
1012
+.loop_vps_sve2_\v\()_\w\()x\h:
1013
+ mov x7, x2
1014
+ mov x9, #0
1015
+.loop_vps_w8_sve2_\v\()_\w\()x\h:
1016
+ add x6, x0, x9
1017
+ qpel_chroma_load_32b_sve2 \v
1018
+ qpel_filter_chroma_sve2_\v\()_32b
1019
+ vps_end_sve2
1020
+ add x9, x9, #8
1021
+.if \w == 2
1022
+ str s17, x7, #4
1023
+.elseif \w == 4
1024
+ str d17, x7, #8
1025
+.elseif \w == 6
1026
+ str d17, x7, #8
1027
+ st1 {v17.s}2, x7, #4
1028
+.elseif \w == 12
1029
+ str q17, x7, #16
1030
+ add x6, x0, x9
1031
+ qpel_chroma_load_32b_sve2 \v
1032
+ qpel_filter_chroma_sve2_\v\()_32b
1033
+ vps_end_sve2
1034
+ str d17, x7, #8
1035
+ add x9, x9, #8
1036
+.else
1037
+ str q17, x7, #16
1038
+.endif
1039
+ cmp x9, #\w
1040
+ blt .loop_vps_w8_sve2_\v\()_\w\()x\h
1041
+
1042
+ add x0, x0, x1
1043
+ add x2, x2, x3
1044
+ sub x5, x5, #1
1045
+ cbnz x5, .loop_vps_sve2_\v\()_\w\()x\h
1046
+ ret
1047
+.endm
1048
+
1049
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1050
+.macro CHROMA_VPS_SVE2 w, h
1051
+function x265_interp_4tap_vert_ps_\w\()x\h\()_sve2
1052
+ cmp x4, #0
1053
+ beq 0f
1054
+ cmp x4, #1
1055
+ beq 1f
1056
+ cmp x4, #2
1057
+ beq 2f
1058
+ cmp x4, #3
1059
+ beq 3f
1060
+ cmp x4, #4
1061
+ beq 4f
1062
+ cmp x4, #5
1063
+ beq 5f
1064
+ cmp x4, #6
1065
+ beq 6f
1066
+ cmp x4, #7
1067
+ beq 7f
1068
+0:
1069
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 0
1070
+1:
1071
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 1
1072
+2:
1073
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 2
1074
+3:
1075
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 3
1076
+4:
1077
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 4
1078
+5:
1079
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 5
1080
+6:
1081
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 6
1082
+7:
1083
+ FILTER_CHROMA_VPS_SVE2 \w, \h, 7
1084
+endfunc
1085
+.endm
1086
+
1087
+CHROMA_VPS_SVE2 2, 4
1088
+CHROMA_VPS_SVE2 2, 8
1089
+CHROMA_VPS_SVE2 2, 16
1090
+CHROMA_VPS_SVE2 4, 2
1091
+CHROMA_VPS_SVE2 4, 4
1092
+CHROMA_VPS_SVE2 4, 8
1093
+CHROMA_VPS_SVE2 4, 16
1094
+CHROMA_VPS_SVE2 4, 32
1095
+CHROMA_VPS_SVE2 6, 8
1096
+CHROMA_VPS_SVE2 6, 16
1097
+CHROMA_VPS_SVE2 8, 2
1098
+CHROMA_VPS_SVE2 8, 4
1099
+CHROMA_VPS_SVE2 8, 6
1100
+CHROMA_VPS_SVE2 8, 8
1101
+CHROMA_VPS_SVE2 8, 16
1102
+CHROMA_VPS_SVE2 8, 32
1103
+CHROMA_VPS_SVE2 8, 12
1104
+CHROMA_VPS_SVE2 8, 64
1105
+CHROMA_VPS_SVE2 12, 16
1106
+CHROMA_VPS_SVE2 12, 32
1107
+CHROMA_VPS_SVE2 16, 4
1108
+CHROMA_VPS_SVE2 16, 8
1109
+CHROMA_VPS_SVE2 16, 12
1110
+CHROMA_VPS_SVE2 16, 16
1111
+CHROMA_VPS_SVE2 16, 32
1112
+CHROMA_VPS_SVE2 16, 64
1113
+CHROMA_VPS_SVE2 16, 24
1114
+CHROMA_VPS_SVE2 32, 8
1115
+CHROMA_VPS_SVE2 32, 16
1116
+CHROMA_VPS_SVE2 32, 24
1117
+CHROMA_VPS_SVE2 32, 32
1118
+CHROMA_VPS_SVE2 32, 64
1119
+CHROMA_VPS_SVE2 32, 48
1120
+CHROMA_VPS_SVE2 24, 32
1121
+CHROMA_VPS_SVE2 24, 64
1122
+CHROMA_VPS_SVE2 64, 16
1123
+CHROMA_VPS_SVE2 64, 32
1124
+CHROMA_VPS_SVE2 64, 48
1125
+CHROMA_VPS_SVE2 64, 64
1126
+CHROMA_VPS_SVE2 48, 64
1127
+
1128
+.macro qpel_start_chroma_sve2_0_1
1129
+ mov z24.h, #64
1130
+.endm
1131
+
1132
+.macro qpel_start_chroma_sve2_1_1
1133
+ mov z24.h, #58
1134
+ mov z25.h, #10
1135
+.endm
1136
+
1137
+.macro qpel_start_chroma_sve2_2_1
1138
+ mov z25.h, #54
1139
+.endm
1140
+
1141
+.macro qpel_start_chroma_sve2_3_1
1142
+ mov z25.h, #46
1143
+ mov z26.h, #28
1144
+ mov z27.h, #6
1145
+.endm
1146
+
1147
+.macro qpel_start_chroma_sve2_4_1
1148
+ mov z24.h, #36
1149
+.endm
1150
+
1151
+.macro qpel_start_chroma_sve2_5_1
1152
+ mov z25.h, #28
1153
+ mov z26.h, #46
1154
+ mov z27.h, #6
1155
+.endm
1156
+
1157
+.macro qpel_start_chroma_sve2_6_1
1158
+ mov z25.h, #54
1159
+.endm
1160
+
1161
+.macro qpel_start_chroma_sve2_7_1
1162
+ mov z24.h, #58
1163
+ mov z25.h, #10
1164
+.endm
1165
+
1166
+.macro FILTER_CHROMA_VSS_SVE2 w, h, v
1167
+ lsl x1, x1, #1
1168
+ sub x0, x0, x1
1169
+ lsl x3, x3, #1
1170
+ mov x5, #\h
1171
+ mov x12, #\w
1172
+ lsl x12, x12, #1
1173
+ qpel_start_chroma_sve2_\v\()_1
1174
+.loop_vss_sve2_\v\()_\w\()x\h:
1175
+ mov x7, x2
1176
+ mov x9, #0
1177
+.if \w == 4
1178
+.rept 2
1179
+ add x6, x0, x9
1180
+ qpel_chroma_load_64b \v
1181
+ qpel_filter_chroma_\v\()_32b_1
1182
+ vss_end_sve2
1183
+ str s17, x7, #4
1184
+ add x9, x9, #4
1185
+.endr
1186
+.else
1187
+.loop_vss_w8_sve2_\v\()_\w\()x\h:
1188
+ add x6, x0, x9
1189
+ qpel_chroma_load_64b \v
1190
+ qpel_filter_chroma_\v\()_32b_1
1191
+ vss_end_sve2
1192
+ str q17, x7, #16
1193
+ add x9, x9, #16
1194
+.if \w == 12
1195
+ add x6, x0, x9
1196
+ qpel_chroma_load_64b \v
1197
+ qpel_filter_chroma_\v\()_32b_1
1198
+ vss_end_sve2
1199
+ str d17, x7, #8
1200
+ add x9, x9, #8
1201
+.endif
1202
+ cmp x9, x12
1203
+ blt .loop_vss_w8_sve2_\v\()_\w\()x\h
1204
+.endif
1205
+ add x0, x0, x1
1206
+ add x2, x2, x3
1207
+ sub x5, x5, #1
1208
+ cbnz x5, .loop_vss_sve2_\v\()_\w\()x\h
1209
+ ret
1210
+.endm
1211
+
1212
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
1213
+.macro CHROMA_VSS_SVE2 w, h
1214
+function x265_interp_4tap_vert_ss_\w\()x\h\()_sve2
1215
+ cmp x4, #0
1216
+ beq 0f
1217
+ cmp x4, #1
1218
+ beq 1f
1219
+ cmp x4, #2
1220
+ beq 2f
1221
+ cmp x4, #3
1222
+ beq 3f
1223
+ cmp x4, #4
1224
+ beq 4f
1225
+ cmp x4, #5
1226
+ beq 5f
1227
+ cmp x4, #6
1228
+ beq 6f
1229
+ cmp x4, #7
1230
+ beq 7f
1231
+0:
1232
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 0
1233
+1:
1234
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 1
1235
+2:
1236
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 2
1237
+3:
1238
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 3
1239
+4:
1240
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 4
1241
+5:
1242
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 5
1243
+6:
1244
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 6
1245
+7:
1246
+ FILTER_CHROMA_VSS_SVE2 \w, \h, 7
1247
+endfunc
1248
+.endm
1249
+
1250
+CHROMA_VSS_SVE2 4, 4
1251
+CHROMA_VSS_SVE2 4, 8
1252
+CHROMA_VSS_SVE2 4, 16
1253
+CHROMA_VSS_SVE2 4, 32
1254
+CHROMA_VSS_SVE2 8, 2
1255
+CHROMA_VSS_SVE2 8, 4
1256
+CHROMA_VSS_SVE2 8, 6
1257
+CHROMA_VSS_SVE2 8, 8
1258
+CHROMA_VSS_SVE2 8, 16
1259
+CHROMA_VSS_SVE2 8, 32
1260
+CHROMA_VSS_SVE2 8, 12
1261
+CHROMA_VSS_SVE2 8, 64
1262
+CHROMA_VSS_SVE2 12, 16
1263
+CHROMA_VSS_SVE2 12, 32
1264
+CHROMA_VSS_SVE2 16, 4
1265
+CHROMA_VSS_SVE2 16, 8
1266
+CHROMA_VSS_SVE2 16, 12
1267
+CHROMA_VSS_SVE2 16, 16
1268
+CHROMA_VSS_SVE2 16, 32
1269
+CHROMA_VSS_SVE2 16, 64
1270
+CHROMA_VSS_SVE2 16, 24
1271
+CHROMA_VSS_SVE2 32, 8
1272
+CHROMA_VSS_SVE2 32, 16
1273
+CHROMA_VSS_SVE2 32, 24
1274
+CHROMA_VSS_SVE2 32, 32
1275
+CHROMA_VSS_SVE2 32, 64
1276
+CHROMA_VSS_SVE2 32, 48
1277
+CHROMA_VSS_SVE2 24, 32
1278
+CHROMA_VSS_SVE2 24, 64
1279
+CHROMA_VSS_SVE2 64, 16
1280
+CHROMA_VSS_SVE2 64, 32
1281
+CHROMA_VSS_SVE2 64, 48
1282
+CHROMA_VSS_SVE2 64, 64
1283
+CHROMA_VSS_SVE2 48, 64
1284
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S
Added
1056
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm.S"
40
+#include "ipfilter-common.S"
41
+
42
+#ifdef __APPLE__
43
+.section __RODATA,__rodata
44
+#else
45
+.section .rodata
46
+#endif
47
+
48
+.align 4
49
+
50
+.text
51
+
52
+// ***** luma_vpp *****
53
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
+.macro LUMA_VPP_4xN h
55
+function x265_interp_8tap_vert_pp_4x\h\()_neon
56
+ movrel x10, g_luma_s16
57
+ sub x0, x0, x1
58
+ sub x0, x0, x1, lsl #1 // src -= 3 * srcStride
59
+ lsl x4, x4, #4
60
+ ldr q0, x10, x4 // q0 = luma interpolate coeff
61
+ dup v24.8h, v0.h0
62
+ dup v25.8h, v0.h1
63
+ trn1 v24.2d, v24.2d, v25.2d
64
+ dup v26.8h, v0.h2
65
+ dup v27.8h, v0.h3
66
+ trn1 v26.2d, v26.2d, v27.2d
67
+ dup v28.8h, v0.h4
68
+ dup v29.8h, v0.h5
69
+ trn1 v28.2d, v28.2d, v29.2d
70
+ dup v30.8h, v0.h6
71
+ dup v31.8h, v0.h7
72
+ trn1 v30.2d, v30.2d, v31.2d
73
+
74
+ // prepare to load 8 lines
75
+ ld1 {v0.s}0, x0, x1
76
+ ld1 {v0.s}1, x0, x1
77
+ ushll v0.8h, v0.8b, #0
78
+ ld1 {v1.s}0, x0, x1
79
+ ld1 {v1.s}1, x0, x1
80
+ ushll v1.8h, v1.8b, #0
81
+ ld1 {v2.s}0, x0, x1
82
+ ld1 {v2.s}1, x0, x1
83
+ ushll v2.8h, v2.8b, #0
84
+ ld1 {v3.s}0, x0, x1
85
+ ld1 {v3.s}1, x0, x1
86
+ ushll v3.8h, v3.8b, #0
87
+
88
+ mov x9, #\h
89
+.loop_4x\h:
90
+ ld1 {v4.s}0, x0, x1
91
+ ld1 {v4.s}1, x0, x1
92
+ ushll v4.8h, v4.8b, #0
93
+
94
+ // row0-1
95
+ mul v16.8h, v0.8h, v24.8h
96
+ ext v21.16b, v0.16b, v1.16b, #8
97
+ mul v17.8h, v21.8h, v24.8h
98
+ mov v0.16b, v1.16b
99
+
100
+ // row2-3
101
+ mla v16.8h, v1.8h, v26.8h
102
+ ext v21.16b, v1.16b, v2.16b, #8
103
+ mla v17.8h, v21.8h, v26.8h
104
+ mov v1.16b, v2.16b
105
+
106
+ // row4-5
107
+ mla v16.8h, v2.8h, v28.8h
108
+ ext v21.16b, v2.16b, v3.16b, #8
109
+ mla v17.8h, v21.8h, v28.8h
110
+ mov v2.16b, v3.16b
111
+
112
+ // row6-7
113
+ mla v16.8h, v3.8h, v30.8h
114
+ ext v21.16b, v3.16b, v4.16b, #8
115
+ mla v17.8h, v21.8h, v30.8h
116
+ mov v3.16b, v4.16b
117
+
118
+ // sum row0-7
119
+ trn1 v20.2d, v16.2d, v17.2d
120
+ trn2 v21.2d, v16.2d, v17.2d
121
+ add v16.8h, v20.8h, v21.8h
122
+
123
+ sqrshrun v16.8b, v16.8h, #6
124
+ st1 {v16.s}0, x2, x3
125
+ st1 {v16.s}1, x2, x3
126
+
127
+ sub x9, x9, #2
128
+ cbnz x9, .loop_4x\h
129
+ ret
130
+endfunc
131
+.endm
132
+
133
+LUMA_VPP_4xN 4
134
+LUMA_VPP_4xN 8
135
+LUMA_VPP_4xN 16
136
+
137
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
+.macro LUMA_VPP w, h
139
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
+ cmp x4, #0
141
+ b.eq 0f
142
+ cmp x4, #1
143
+ b.eq 1f
144
+ cmp x4, #2
145
+ b.eq 2f
146
+ cmp x4, #3
147
+ b.eq 3f
148
+0:
149
+ FILTER_LUMA_VPP \w, \h, 0
150
+1:
151
+ FILTER_LUMA_VPP \w, \h, 1
152
+2:
153
+ FILTER_LUMA_VPP \w, \h, 2
154
+3:
155
+ FILTER_LUMA_VPP \w, \h, 3
156
+endfunc
157
+.endm
158
+
159
+LUMA_VPP 8, 4
160
+LUMA_VPP 8, 8
161
+LUMA_VPP 8, 16
162
+LUMA_VPP 8, 32
163
+LUMA_VPP 12, 16
164
+LUMA_VPP 16, 4
165
+LUMA_VPP 16, 8
166
+LUMA_VPP 16, 16
167
+LUMA_VPP 16, 32
168
+LUMA_VPP 16, 64
169
+LUMA_VPP 16, 12
170
+LUMA_VPP 24, 32
171
+LUMA_VPP 32, 8
172
+LUMA_VPP 32, 16
173
+LUMA_VPP 32, 32
174
+LUMA_VPP 32, 64
175
+LUMA_VPP 32, 24
176
+LUMA_VPP 48, 64
177
+LUMA_VPP 64, 16
178
+LUMA_VPP 64, 32
179
+LUMA_VPP 64, 64
180
+LUMA_VPP 64, 48
181
+
182
+// ***** luma_vps *****
183
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
+.macro LUMA_VPS_4xN h
185
+function x265_interp_8tap_vert_ps_4x\h\()_neon
186
+ lsl x3, x3, #1
187
+ lsl x5, x4, #6
188
+ lsl x4, x1, #2
189
+ sub x4, x4, x1
190
+ sub x0, x0, x4
191
+
192
+ mov w6, #8192
193
+ dup v28.4s, w6
194
+ mov x4, #\h
195
+ movrel x12, g_lumaFilter
196
+ add x12, x12, x5
197
+ ld1r {v16.2d}, x12, #8
198
+ ld1r {v17.2d}, x12, #8
199
+ ld1r {v18.2d}, x12, #8
200
+ ld1r {v19.2d}, x12, #8
201
+ ld1r {v20.2d}, x12, #8
202
+ ld1r {v21.2d}, x12, #8
203
+ ld1r {v22.2d}, x12, #8
204
+ ld1r {v23.2d}, x12, #8
205
+
206
+.loop_vps_4x\h:
207
+ mov x6, x0
208
+
209
+ ld1 {v0.s}0, x6, x1
210
+ ld1 {v1.s}0, x6, x1
211
+ ld1 {v2.s}0, x6, x1
212
+ ld1 {v3.s}0, x6, x1
213
+ ld1 {v4.s}0, x6, x1
214
+ ld1 {v5.s}0, x6, x1
215
+ ld1 {v6.s}0, x6, x1
216
+ ld1 {v7.s}0, x6, x1
217
+ uxtl v0.8h, v0.8b
218
+ uxtl v0.4s, v0.4h
219
+
220
+ uxtl v1.8h, v1.8b
221
+ uxtl v1.4s, v1.4h
222
+ mul v0.4s, v0.4s, v16.4s
223
+
224
+ uxtl v2.8h, v2.8b
225
+ uxtl v2.4s, v2.4h
226
+ mla v0.4s, v1.4s, v17.4s
227
+
228
+ uxtl v3.8h, v3.8b
229
+ uxtl v3.4s, v3.4h
230
+ mla v0.4s, v2.4s, v18.4s
231
+
232
+ uxtl v4.8h, v4.8b
233
+ uxtl v4.4s, v4.4h
234
+ mla v0.4s, v3.4s, v19.4s
235
+
236
+ uxtl v5.8h, v5.8b
237
+ uxtl v5.4s, v5.4h
238
+ mla v0.4s, v4.4s, v20.4s
239
+
240
+ uxtl v6.8h, v6.8b
241
+ uxtl v6.4s, v6.4h
242
+ mla v0.4s, v5.4s, v21.4s
243
+
244
+ uxtl v7.8h, v7.8b
245
+ uxtl v7.4s, v7.4h
246
+ mla v0.4s, v6.4s, v22.4s
247
+
248
+ mla v0.4s, v7.4s, v23.4s
249
+
250
+ sub v0.4s, v0.4s, v28.4s
251
+ sqxtn v0.4h, v0.4s
252
+ st1 {v0.8b}, x2, x3
253
+
254
+ add x0, x0, x1
255
+ sub x4, x4, #1
256
+ cbnz x4, .loop_vps_4x\h
257
+ ret
258
+endfunc
259
+.endm
260
+
261
+LUMA_VPS_4xN 4
262
+LUMA_VPS_4xN 8
263
+LUMA_VPS_4xN 16
264
+
265
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
266
+.macro LUMA_VPS w, h
267
+function x265_interp_8tap_vert_ps_\w\()x\h\()_neon
268
+ cmp x4, #0
269
+ beq 0f
270
+ cmp x4, #1
271
+ beq 1f
272
+ cmp x4, #2
273
+ beq 2f
274
+ cmp x4, #3
275
+ beq 3f
276
+0:
277
+ FILTER_VPS \w, \h, 0
278
+1:
279
+ FILTER_VPS \w, \h, 1
280
+2:
281
+ FILTER_VPS \w, \h, 2
282
+3:
283
+ FILTER_VPS \w, \h, 3
284
+endfunc
285
+.endm
286
+
287
+LUMA_VPS 8, 4
288
+LUMA_VPS 8, 8
289
+LUMA_VPS 8, 16
290
+LUMA_VPS 8, 32
291
+LUMA_VPS 12, 16
292
+LUMA_VPS 16, 4
293
+LUMA_VPS 16, 8
294
+LUMA_VPS 16, 16
295
+LUMA_VPS 16, 32
296
+LUMA_VPS 16, 64
297
+LUMA_VPS 16, 12
298
+LUMA_VPS 24, 32
299
+LUMA_VPS 32, 8
300
+LUMA_VPS 32, 16
301
+LUMA_VPS 32, 32
302
+LUMA_VPS 32, 64
303
+LUMA_VPS 32, 24
304
+LUMA_VPS 48, 64
305
+LUMA_VPS 64, 16
306
+LUMA_VPS 64, 32
307
+LUMA_VPS 64, 64
308
+LUMA_VPS 64, 48
309
+
310
+// ***** luma_vsp *****
311
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
312
+.macro LUMA_VSP_4xN h
313
+function x265_interp_8tap_vert_sp_4x\h\()_neon
314
+ lsl x5, x4, #6
315
+ lsl x1, x1, #1
316
+ lsl x4, x1, #2
317
+ sub x4, x4, x1
318
+ sub x0, x0, x4
319
+
320
+ mov w12, #1
321
+ lsl w12, w12, #19
322
+ add w12, w12, #2048
323
+ dup v24.4s, w12
324
+ mov x4, #\h
325
+ movrel x12, g_lumaFilter
326
+ add x12, x12, x5
327
+ ld1r {v16.2d}, x12, #8
328
+ ld1r {v17.2d}, x12, #8
329
+ ld1r {v18.2d}, x12, #8
330
+ ld1r {v19.2d}, x12, #8
331
+ ld1r {v20.2d}, x12, #8
332
+ ld1r {v21.2d}, x12, #8
333
+ ld1r {v22.2d}, x12, #8
334
+ ld1r {v23.2d}, x12, #8
335
+.loop_vsp_4x\h:
336
+ mov x6, x0
337
+
338
+ ld1 {v0.8b}, x6, x1
339
+ ld1 {v1.8b}, x6, x1
340
+ ld1 {v2.8b}, x6, x1
341
+ ld1 {v3.8b}, x6, x1
342
+ ld1 {v4.8b}, x6, x1
343
+ ld1 {v5.8b}, x6, x1
344
+ ld1 {v6.8b}, x6, x1
345
+ ld1 {v7.8b}, x6, x1
346
+
347
+ sshll v0.4s, v0.4h, #0
348
+ sshll v1.4s, v1.4h, #0
349
+ mul v0.4s, v0.4s, v16.4s
350
+ sshll v2.4s, v2.4h, #0
351
+ mla v0.4s, v1.4s, v17.4s
352
+ sshll v3.4s, v3.4h, #0
353
+ mla v0.4s, v2.4s, v18.4s
354
+ sshll v4.4s, v4.4h, #0
355
+ mla v0.4s, v3.4s, v19.4s
356
+ sshll v5.4s, v5.4h, #0
357
+ mla v0.4s, v4.4s, v20.4s
358
+ sshll v6.4s, v6.4h, #0
359
+ mla v0.4s, v5.4s, v21.4s
360
+ sshll v7.4s, v7.4h, #0
361
+ mla v0.4s, v6.4s, v22.4s
362
+
363
+ mla v0.4s, v7.4s, v23.4s
364
+
365
+ add v0.4s, v0.4s, v24.4s
366
+ sqshrun v0.4h, v0.4s, #12
367
+ sqxtun v0.8b, v0.8h
368
+ st1 {v0.s}0, x2, x3
369
+
370
+ add x0, x0, x1
371
+ sub x4, x4, #1
372
+ cbnz x4, .loop_vsp_4x\h
373
+ ret
374
+endfunc
375
+.endm
376
+
377
+LUMA_VSP_4xN 4
378
+LUMA_VSP_4xN 8
379
+LUMA_VSP_4xN 16
380
+
381
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
382
+.macro LUMA_VSP w, h
383
+function x265_interp_8tap_vert_sp_\w\()x\h\()_neon
384
+ cmp x4, #0
385
+ beq 0f
386
+ cmp x4, #1
387
+ beq 1f
388
+ cmp x4, #2
389
+ beq 2f
390
+ cmp x4, #3
391
+ beq 3f
392
+0:
393
+ FILTER_VSP \w, \h, 0
394
+1:
395
+ FILTER_VSP \w, \h, 1
396
+2:
397
+ FILTER_VSP \w, \h, 2
398
+3:
399
+ FILTER_VSP \w, \h, 3
400
+endfunc
401
+.endm
402
+
403
+LUMA_VSP 8, 4
404
+LUMA_VSP 8, 8
405
+LUMA_VSP 8, 16
406
+LUMA_VSP 8, 32
407
+LUMA_VSP 12, 16
408
+LUMA_VSP 16, 4
409
+LUMA_VSP 16, 8
410
+LUMA_VSP 16, 16
411
+LUMA_VSP 16, 32
412
+LUMA_VSP 16, 64
413
+LUMA_VSP 16, 12
414
+LUMA_VSP 32, 8
415
+LUMA_VSP 32, 16
416
+LUMA_VSP 32, 32
417
+LUMA_VSP 32, 64
418
+LUMA_VSP 32, 24
419
+LUMA_VSP 64, 16
420
+LUMA_VSP 64, 32
421
+LUMA_VSP 64, 64
422
+LUMA_VSP 64, 48
423
+LUMA_VSP 24, 32
424
+LUMA_VSP 48, 64
425
+
426
+// ***** luma_vss *****
427
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
428
+.macro LUMA_VSS w, h
429
+function x265_interp_8tap_vert_ss_\w\()x\h\()_neon
430
+ cmp x4, #0
431
+ beq 0f
432
+ cmp x4, #1
433
+ beq 1f
434
+ cmp x4, #2
435
+ beq 2f
436
+ cmp x4, #3
437
+ beq 3f
438
+0:
439
+ FILTER_VSS \w, \h, 0
440
+1:
441
+ FILTER_VSS \w, \h, 1
442
+2:
443
+ FILTER_VSS \w, \h, 2
444
+3:
445
+ FILTER_VSS \w, \h, 3
446
+endfunc
447
+.endm
448
+
449
+LUMA_VSS 4, 4
450
+LUMA_VSS 4, 8
451
+LUMA_VSS 4, 16
452
+LUMA_VSS 8, 4
453
+LUMA_VSS 8, 8
454
+LUMA_VSS 8, 16
455
+LUMA_VSS 8, 32
456
+LUMA_VSS 12, 16
457
+LUMA_VSS 16, 4
458
+LUMA_VSS 16, 8
459
+LUMA_VSS 16, 16
460
+LUMA_VSS 16, 32
461
+LUMA_VSS 16, 64
462
+LUMA_VSS 16, 12
463
+LUMA_VSS 32, 8
464
+LUMA_VSS 32, 16
465
+LUMA_VSS 32, 32
466
+LUMA_VSS 32, 64
467
+LUMA_VSS 32, 24
468
+LUMA_VSS 64, 16
469
+LUMA_VSS 64, 32
470
+LUMA_VSS 64, 64
471
+LUMA_VSS 64, 48
472
+LUMA_VSS 24, 32
473
+LUMA_VSS 48, 64
474
+
475
+// ***** luma_hpp *****
476
+// void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
477
+.macro LUMA_HPP w, h
478
+function x265_interp_horiz_pp_\w\()x\h\()_neon
479
+ cmp x4, #0
480
+ beq 0f
481
+ cmp x4, #1
482
+ beq 1f
483
+ cmp x4, #2
484
+ beq 2f
485
+ cmp x4, #3
486
+ beq 3f
487
+0:
488
+ FILTER_HPP \w, \h, 0
489
+1:
490
+ FILTER_HPP \w, \h, 1
491
+2:
492
+ FILTER_HPP \w, \h, 2
493
+3:
494
+ FILTER_HPP \w, \h, 3
495
+endfunc
496
+.endm
497
+
498
+LUMA_HPP 4, 4
499
+LUMA_HPP 4, 8
500
+LUMA_HPP 4, 16
501
+LUMA_HPP 8, 4
502
+LUMA_HPP 8, 8
503
+LUMA_HPP 8, 16
504
+LUMA_HPP 8, 32
505
+LUMA_HPP 12, 16
506
+LUMA_HPP 16, 4
507
+LUMA_HPP 16, 8
508
+LUMA_HPP 16, 12
509
+LUMA_HPP 16, 16
510
+LUMA_HPP 16, 32
511
+LUMA_HPP 16, 64
512
+LUMA_HPP 24, 32
513
+LUMA_HPP 32, 8
514
+LUMA_HPP 32, 16
515
+LUMA_HPP 32, 24
516
+LUMA_HPP 32, 32
517
+LUMA_HPP 32, 64
518
+LUMA_HPP 48, 64
519
+LUMA_HPP 64, 16
520
+LUMA_HPP 64, 32
521
+LUMA_HPP 64, 48
522
+LUMA_HPP 64, 64
523
+
524
+// ***** luma_hps *****
525
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
526
+.macro LUMA_HPS w, h
527
+function x265_interp_horiz_ps_\w\()x\h\()_neon
528
+ mov w10, #\h
529
+ cmp w5, #0
530
+ b.eq 6f
531
+ sub x0, x0, x1, lsl #2
532
+ add x0, x0, x1
533
+ add w10, w10, #7
534
+6:
535
+ mov w6, w10
536
+ cmp w4, #0
537
+ b.eq 0f
538
+ cmp w4, #1
539
+ b.eq 1f
540
+ cmp w4, #2
541
+ b.eq 2f
542
+ cmp w4, #3
543
+ b.eq 3f
544
+0:
545
+ FILTER_HPS \w, \h, 0
546
+1:
547
+ FILTER_HPS \w, \h, 1
548
+2:
549
+ FILTER_HPS \w, \h, 2
550
+3:
551
+ FILTER_HPS \w, \h, 3
552
+endfunc
553
+.endm
554
+
555
+LUMA_HPS 4, 4
556
+LUMA_HPS 4, 8
557
+LUMA_HPS 4, 16
558
+LUMA_HPS 8, 4
559
+LUMA_HPS 8, 8
560
+LUMA_HPS 8, 16
561
+LUMA_HPS 8, 32
562
+LUMA_HPS 12, 16
563
+LUMA_HPS 16, 4
564
+LUMA_HPS 16, 8
565
+LUMA_HPS 16, 12
566
+LUMA_HPS 16, 16
567
+LUMA_HPS 16, 32
568
+LUMA_HPS 16, 64
569
+LUMA_HPS 24, 32
570
+LUMA_HPS 32, 8
571
+LUMA_HPS 32, 16
572
+LUMA_HPS 32, 24
573
+LUMA_HPS 32, 32
574
+LUMA_HPS 32, 64
575
+LUMA_HPS 48, 64
576
+LUMA_HPS 64, 16
577
+LUMA_HPS 64, 32
578
+LUMA_HPS 64, 48
579
+LUMA_HPS 64, 64
580
+
581
+// ***** chroma_vpp *****
582
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
583
+.macro CHROMA_VPP w, h
584
+function x265_interp_4tap_vert_pp_\w\()x\h\()_neon
585
+ cmp x4, #0
586
+ beq 0f
587
+ cmp x4, #1
588
+ beq 1f
589
+ cmp x4, #2
590
+ beq 2f
591
+ cmp x4, #3
592
+ beq 3f
593
+ cmp x4, #4
594
+ beq 4f
595
+ cmp x4, #5
596
+ beq 5f
597
+ cmp x4, #6
598
+ beq 6f
599
+ cmp x4, #7
600
+ beq 7f
601
+0:
602
+ FILTER_CHROMA_VPP \w, \h, 0
603
+1:
604
+ FILTER_CHROMA_VPP \w, \h, 1
605
+2:
606
+ FILTER_CHROMA_VPP \w, \h, 2
607
+3:
608
+ FILTER_CHROMA_VPP \w, \h, 3
609
+4:
610
+ FILTER_CHROMA_VPP \w, \h, 4
611
+5:
612
+ FILTER_CHROMA_VPP \w, \h, 5
613
+6:
614
+ FILTER_CHROMA_VPP \w, \h, 6
615
+7:
616
+ FILTER_CHROMA_VPP \w, \h, 7
617
+endfunc
618
+.endm
619
+
620
+CHROMA_VPP 2, 4
621
+CHROMA_VPP 2, 8
622
+CHROMA_VPP 2, 16
623
+CHROMA_VPP 4, 2
624
+CHROMA_VPP 4, 4
625
+CHROMA_VPP 4, 8
626
+CHROMA_VPP 4, 16
627
+CHROMA_VPP 4, 32
628
+CHROMA_VPP 6, 8
629
+CHROMA_VPP 6, 16
630
+CHROMA_VPP 8, 2
631
+CHROMA_VPP 8, 4
632
+CHROMA_VPP 8, 6
633
+CHROMA_VPP 8, 8
634
+CHROMA_VPP 8, 16
635
+CHROMA_VPP 8, 32
636
+CHROMA_VPP 8, 12
637
+CHROMA_VPP 8, 64
638
+CHROMA_VPP 12, 16
639
+CHROMA_VPP 12, 32
640
+CHROMA_VPP 16, 4
641
+CHROMA_VPP 16, 8
642
+CHROMA_VPP 16, 12
643
+CHROMA_VPP 16, 16
644
+CHROMA_VPP 16, 32
645
+CHROMA_VPP 16, 64
646
+CHROMA_VPP 16, 24
647
+CHROMA_VPP 32, 8
648
+CHROMA_VPP 32, 16
649
+CHROMA_VPP 32, 24
650
+CHROMA_VPP 32, 32
651
+CHROMA_VPP 32, 64
652
+CHROMA_VPP 32, 48
653
+CHROMA_VPP 24, 32
654
+CHROMA_VPP 24, 64
655
+CHROMA_VPP 64, 16
656
+CHROMA_VPP 64, 32
657
+CHROMA_VPP 64, 48
658
+CHROMA_VPP 64, 64
659
+CHROMA_VPP 48, 64
660
+
661
+// ***** chroma_vps *****
662
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
663
+.macro CHROMA_VPS w, h
664
+function x265_interp_4tap_vert_ps_\w\()x\h\()_neon
665
+ cmp x4, #0
666
+ beq 0f
667
+ cmp x4, #1
668
+ beq 1f
669
+ cmp x4, #2
670
+ beq 2f
671
+ cmp x4, #3
672
+ beq 3f
673
+ cmp x4, #4
674
+ beq 4f
675
+ cmp x4, #5
676
+ beq 5f
677
+ cmp x4, #6
678
+ beq 6f
679
+ cmp x4, #7
680
+ beq 7f
681
+0:
682
+ FILTER_CHROMA_VPS \w, \h, 0
683
+1:
684
+ FILTER_CHROMA_VPS \w, \h, 1
685
+2:
686
+ FILTER_CHROMA_VPS \w, \h, 2
687
+3:
688
+ FILTER_CHROMA_VPS \w, \h, 3
689
+4:
690
+ FILTER_CHROMA_VPS \w, \h, 4
691
+5:
692
+ FILTER_CHROMA_VPS \w, \h, 5
693
+6:
694
+ FILTER_CHROMA_VPS \w, \h, 6
695
+7:
696
+ FILTER_CHROMA_VPS \w, \h, 7
697
+endfunc
698
+.endm
699
+
700
+CHROMA_VPS 2, 4
701
+CHROMA_VPS 2, 8
702
+CHROMA_VPS 2, 16
703
+CHROMA_VPS 4, 2
704
+CHROMA_VPS 4, 4
705
+CHROMA_VPS 4, 8
706
+CHROMA_VPS 4, 16
707
+CHROMA_VPS 4, 32
708
+CHROMA_VPS 6, 8
709
+CHROMA_VPS 6, 16
710
+CHROMA_VPS 8, 2
711
+CHROMA_VPS 8, 4
712
+CHROMA_VPS 8, 6
713
+CHROMA_VPS 8, 8
714
+CHROMA_VPS 8, 16
715
+CHROMA_VPS 8, 32
716
+CHROMA_VPS 8, 12
717
+CHROMA_VPS 8, 64
718
+CHROMA_VPS 12, 16
719
+CHROMA_VPS 12, 32
720
+CHROMA_VPS 16, 4
721
+CHROMA_VPS 16, 8
722
+CHROMA_VPS 16, 12
723
+CHROMA_VPS 16, 16
724
+CHROMA_VPS 16, 32
725
+CHROMA_VPS 16, 64
726
+CHROMA_VPS 16, 24
727
+CHROMA_VPS 32, 8
728
+CHROMA_VPS 32, 16
729
+CHROMA_VPS 32, 24
730
+CHROMA_VPS 32, 32
731
+CHROMA_VPS 32, 64
732
+CHROMA_VPS 32, 48
733
+CHROMA_VPS 24, 32
734
+CHROMA_VPS 24, 64
735
+CHROMA_VPS 64, 16
736
+CHROMA_VPS 64, 32
737
+CHROMA_VPS 64, 48
738
+CHROMA_VPS 64, 64
739
+CHROMA_VPS 48, 64
740
+
741
+// ***** chroma_vsp *****
742
+// void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
743
+.macro CHROMA_VSP w, h
744
+function x265_interp_4tap_vert_sp_\w\()x\h\()_neon
745
+ cmp x4, #0
746
+ beq 0f
747
+ cmp x4, #1
748
+ beq 1f
749
+ cmp x4, #2
750
+ beq 2f
751
+ cmp x4, #3
752
+ beq 3f
753
+ cmp x4, #4
754
+ beq 4f
755
+ cmp x4, #5
756
+ beq 5f
757
+ cmp x4, #6
758
+ beq 6f
759
+ cmp x4, #7
760
+ beq 7f
761
+0:
762
+ FILTER_CHROMA_VSP \w, \h, 0
763
+1:
764
+ FILTER_CHROMA_VSP \w, \h, 1
765
+2:
766
+ FILTER_CHROMA_VSP \w, \h, 2
767
+3:
768
+ FILTER_CHROMA_VSP \w, \h, 3
769
+4:
770
+ FILTER_CHROMA_VSP \w, \h, 4
771
+5:
772
+ FILTER_CHROMA_VSP \w, \h, 5
773
+6:
774
+ FILTER_CHROMA_VSP \w, \h, 6
775
+7:
776
+ FILTER_CHROMA_VSP \w, \h, 7
777
+endfunc
778
+.endm
779
+
780
+CHROMA_VSP 4, 4
781
+CHROMA_VSP 4, 8
782
+CHROMA_VSP 4, 16
783
+CHROMA_VSP 4, 32
784
+CHROMA_VSP 8, 2
785
+CHROMA_VSP 8, 4
786
+CHROMA_VSP 8, 6
787
+CHROMA_VSP 8, 8
788
+CHROMA_VSP 8, 16
789
+CHROMA_VSP 8, 32
790
+CHROMA_VSP 8, 12
791
+CHROMA_VSP 8, 64
792
+CHROMA_VSP 12, 16
793
+CHROMA_VSP 12, 32
794
+CHROMA_VSP 16, 4
795
+CHROMA_VSP 16, 8
796
+CHROMA_VSP 16, 12
797
+CHROMA_VSP 16, 16
798
+CHROMA_VSP 16, 32
799
+CHROMA_VSP 16, 64
800
+CHROMA_VSP 16, 24
801
+CHROMA_VSP 32, 8
802
+CHROMA_VSP 32, 16
803
+CHROMA_VSP 32, 24
804
+CHROMA_VSP 32, 32
805
+CHROMA_VSP 32, 64
806
+CHROMA_VSP 32, 48
807
+CHROMA_VSP 24, 32
808
+CHROMA_VSP 24, 64
809
+CHROMA_VSP 64, 16
810
+CHROMA_VSP 64, 32
811
+CHROMA_VSP 64, 48
812
+CHROMA_VSP 64, 64
813
+CHROMA_VSP 48, 64
814
+
815
+// ***** chroma_vss *****
816
+// void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
817
+.macro CHROMA_VSS w, h
818
+function x265_interp_4tap_vert_ss_\w\()x\h\()_neon
819
+ cmp x4, #0
820
+ beq 0f
821
+ cmp x4, #1
822
+ beq 1f
823
+ cmp x4, #2
824
+ beq 2f
825
+ cmp x4, #3
826
+ beq 3f
827
+ cmp x4, #4
828
+ beq 4f
829
+ cmp x4, #5
830
+ beq 5f
831
+ cmp x4, #6
832
+ beq 6f
833
+ cmp x4, #7
834
+ beq 7f
835
+0:
836
+ FILTER_CHROMA_VSS \w, \h, 0
837
+1:
838
+ FILTER_CHROMA_VSS \w, \h, 1
839
+2:
840
+ FILTER_CHROMA_VSS \w, \h, 2
841
+3:
842
+ FILTER_CHROMA_VSS \w, \h, 3
843
+4:
844
+ FILTER_CHROMA_VSS \w, \h, 4
845
+5:
846
+ FILTER_CHROMA_VSS \w, \h, 5
847
+6:
848
+ FILTER_CHROMA_VSS \w, \h, 6
849
+7:
850
+ FILTER_CHROMA_VSS \w, \h, 7
851
+endfunc
852
+.endm
853
+
854
+CHROMA_VSS 4, 4
855
+CHROMA_VSS 4, 8
856
+CHROMA_VSS 4, 16
857
+CHROMA_VSS 4, 32
858
+CHROMA_VSS 8, 2
859
+CHROMA_VSS 8, 4
860
+CHROMA_VSS 8, 6
861
+CHROMA_VSS 8, 8
862
+CHROMA_VSS 8, 16
863
+CHROMA_VSS 8, 32
864
+CHROMA_VSS 8, 12
865
+CHROMA_VSS 8, 64
866
+CHROMA_VSS 12, 16
867
+CHROMA_VSS 12, 32
868
+CHROMA_VSS 16, 4
869
+CHROMA_VSS 16, 8
870
+CHROMA_VSS 16, 12
871
+CHROMA_VSS 16, 16
872
+CHROMA_VSS 16, 32
873
+CHROMA_VSS 16, 64
874
+CHROMA_VSS 16, 24
875
+CHROMA_VSS 32, 8
876
+CHROMA_VSS 32, 16
877
+CHROMA_VSS 32, 24
878
+CHROMA_VSS 32, 32
879
+CHROMA_VSS 32, 64
880
+CHROMA_VSS 32, 48
881
+CHROMA_VSS 24, 32
882
+CHROMA_VSS 24, 64
883
+CHROMA_VSS 64, 16
884
+CHROMA_VSS 64, 32
885
+CHROMA_VSS 64, 48
886
+CHROMA_VSS 64, 64
887
+CHROMA_VSS 48, 64
888
+
889
+// ***** chroma_hpp *****
890
+// void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
891
+.macro CHROMA_HPP w, h
892
+function x265_interp_4tap_horiz_pp_\w\()x\h\()_neon
893
+ cmp x4, #0
894
+ beq 0f
895
+ cmp x4, #1
896
+ beq 1f
897
+ cmp x4, #2
898
+ beq 2f
899
+ cmp x4, #3
900
+ beq 3f
901
+ cmp x4, #4
902
+ beq 4f
903
+ cmp x4, #5
904
+ beq 5f
905
+ cmp x4, #6
906
+ beq 6f
907
+ cmp x4, #7
908
+ beq 7f
909
+0:
910
+ FILTER_CHROMA_HPP \w, \h, 0
911
+1:
912
+ FILTER_CHROMA_HPP \w, \h, 1
913
+2:
914
+ FILTER_CHROMA_HPP \w, \h, 2
915
+3:
916
+ FILTER_CHROMA_HPP \w, \h, 3
917
+4:
918
+ FILTER_CHROMA_HPP \w, \h, 4
919
+5:
920
+ FILTER_CHROMA_HPP \w, \h, 5
921
+6:
922
+ FILTER_CHROMA_HPP \w, \h, 6
923
+7:
924
+ FILTER_CHROMA_HPP \w, \h, 7
925
+endfunc
926
+.endm
927
+
928
+CHROMA_HPP 2, 4
929
+CHROMA_HPP 2, 8
930
+CHROMA_HPP 2, 16
931
+CHROMA_HPP 4, 2
932
+CHROMA_HPP 4, 4
933
+CHROMA_HPP 4, 8
934
+CHROMA_HPP 4, 16
935
+CHROMA_HPP 4, 32
936
+CHROMA_HPP 6, 8
937
+CHROMA_HPP 6, 16
938
+CHROMA_HPP 8, 2
939
+CHROMA_HPP 8, 4
940
+CHROMA_HPP 8, 6
941
+CHROMA_HPP 8, 8
942
+CHROMA_HPP 8, 12
943
+CHROMA_HPP 8, 16
944
+CHROMA_HPP 8, 32
945
+CHROMA_HPP 8, 64
946
+CHROMA_HPP 12, 16
947
+CHROMA_HPP 12, 32
948
+CHROMA_HPP 16, 4
949
+CHROMA_HPP 16, 8
950
+CHROMA_HPP 16, 12
951
+CHROMA_HPP 16, 16
952
+CHROMA_HPP 16, 24
953
+CHROMA_HPP 16, 32
954
+CHROMA_HPP 16, 64
955
+CHROMA_HPP 24, 32
956
+CHROMA_HPP 24, 64
957
+CHROMA_HPP 32, 8
958
+CHROMA_HPP 32, 16
959
+CHROMA_HPP 32, 24
960
+CHROMA_HPP 32, 32
961
+CHROMA_HPP 32, 48
962
+CHROMA_HPP 32, 64
963
+CHROMA_HPP 48, 64
964
+CHROMA_HPP 64, 16
965
+CHROMA_HPP 64, 32
966
+CHROMA_HPP 64, 48
967
+CHROMA_HPP 64, 64
968
+
969
+// ***** chroma_hps *****
970
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
971
+.macro CHROMA_HPS w, h
972
+function x265_interp_4tap_horiz_ps_\w\()x\h\()_neon
973
+ cmp x4, #0
974
+ beq 0f
975
+ cmp x4, #1
976
+ beq 1f
977
+ cmp x4, #2
978
+ beq 2f
979
+ cmp x4, #3
980
+ beq 3f
981
+ cmp x4, #4
982
+ beq 4f
983
+ cmp x4, #5
984
+ beq 5f
985
+ cmp x4, #6
986
+ beq 6f
987
+ cmp x4, #7
988
+ beq 7f
989
+0:
990
+ FILTER_CHROMA_HPS \w, \h, 0
991
+1:
992
+ FILTER_CHROMA_HPS \w, \h, 1
993
+2:
994
+ FILTER_CHROMA_HPS \w, \h, 2
995
+3:
996
+ FILTER_CHROMA_HPS \w, \h, 3
997
+4:
998
+ FILTER_CHROMA_HPS \w, \h, 4
999
+5:
1000
+ FILTER_CHROMA_HPS \w, \h, 5
1001
+6:
1002
+ FILTER_CHROMA_HPS \w, \h, 6
1003
+7:
1004
+ FILTER_CHROMA_HPS \w, \h, 7
1005
+endfunc
1006
+.endm
1007
+
1008
+CHROMA_HPS 2, 4
1009
+CHROMA_HPS 2, 8
1010
+CHROMA_HPS 2, 16
1011
+CHROMA_HPS 4, 2
1012
+CHROMA_HPS 4, 4
1013
+CHROMA_HPS 4, 8
1014
+CHROMA_HPS 4, 16
1015
+CHROMA_HPS 4, 32
1016
+CHROMA_HPS 6, 8
1017
+CHROMA_HPS 6, 16
1018
+CHROMA_HPS 8, 2
1019
+CHROMA_HPS 8, 4
1020
+CHROMA_HPS 8, 6
1021
+CHROMA_HPS 8, 8
1022
+CHROMA_HPS 8, 12
1023
+CHROMA_HPS 8, 16
1024
+CHROMA_HPS 8, 32
1025
+CHROMA_HPS 8, 64
1026
+CHROMA_HPS 12, 16
1027
+CHROMA_HPS 12, 32
1028
+CHROMA_HPS 16, 4
1029
+CHROMA_HPS 16, 8
1030
+CHROMA_HPS 16, 12
1031
+CHROMA_HPS 16, 16
1032
+CHROMA_HPS 16, 24
1033
+CHROMA_HPS 16, 32
1034
+CHROMA_HPS 16, 64
1035
+CHROMA_HPS 24, 32
1036
+CHROMA_HPS 24, 64
1037
+CHROMA_HPS 32, 8
1038
+CHROMA_HPS 32, 16
1039
+CHROMA_HPS 32, 24
1040
+CHROMA_HPS 32, 32
1041
+CHROMA_HPS 32, 48
1042
+CHROMA_HPS 32, 64
1043
+CHROMA_HPS 48, 64
1044
+CHROMA_HPS 64, 16
1045
+CHROMA_HPS 64, 32
1046
+CHROMA_HPS 64, 48
1047
+CHROMA_HPS 64, 64
1048
+
1049
+const g_luma_s16, align=8
1050
+// a, b, c, d, e, f, g, h
1051
+.hword 0, 0, 0, 64, 0, 0, 0, 0
1052
+.hword -1, 4, -10, 58, 17, -5, 1, 0
1053
+.hword -1, 4, -11, 40, 40, -11, 4, -1
1054
+.hword 0, 1, -5, 17, 58, -10, 4, -1
1055
+endconst
1056
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp
Added
293
1
2
+#include "loopfilter-prim.h"
3
+
4
+#define PIXEL_MIN 0
5
+
6
+
7
+
8
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
9
+#include<arm_neon.h>
10
+
11
+namespace
12
+{
13
+
14
+
15
+/* get the sign of input variable (TODO: this is a dup, make common) */
16
+static inline int8_t signOf(int x)
17
+{
18
+ return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
19
+}
20
+
21
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
22
+{
23
+ int16x8_t in = vsubl_u8(in0, in1);
24
+ return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
25
+}
26
+
27
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
28
+{
29
+ int x = 0;
30
+ for (; (x + 8) <= endX; x += 8)
31
+ {
32
+ *(int8x8_t *)&dstx = sign_diff_neon(*(uint8x8_t *)&src1x, *(uint8x8_t *)&src2x);
33
+ }
34
+
35
+ for (; x < endX; x++)
36
+ {
37
+ dstx = signOf(src1x - src2x);
38
+ }
39
+}
40
+
41
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
42
+{
43
+
44
+
45
+ int y;
46
+ int8_t signRight, signLeft0;
47
+ int8_t edgeType;
48
+
49
+ for (y = 0; y < 2; y++)
50
+ {
51
+ signLeft0 = signLefty;
52
+ int x = 0;
53
+
54
+ if (width >= 8)
55
+ {
56
+ int8x8_t vsignRight;
57
+ int8x8x2_t shifter;
58
+ shifter.val10 = signLeft0;
59
+ static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
60
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
61
+ for (; (x + 8) <= width; x += 8)
62
+ {
63
+ uint8x8_t in = *(uint8x8_t *)&recx;
64
+ vsignRight = sign_diff_neon(in, *(uint8x8_t *)&recx + 1);
65
+ shifter.val0 = vneg_s8(vsignRight);
66
+ int8x8_t tmp = shifter.val0;
67
+ int8x8_t edge = vtbl2_s8(shifter, index);
68
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
69
+ shifter.val10 = tmp7;
70
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
71
+ t1 = vaddw_u8(t1, in);
72
+ t1 = vmaxq_s16(t1, vdupq_n_s16(0));
73
+ t1 = vminq_s16(t1, vdupq_n_s16(255));
74
+ *(uint8x8_t *)&recx = vmovn_u16(t1);
75
+ }
76
+ signLeft0 = shifter.val10;
77
+ }
78
+ for (; x < width; x++)
79
+ {
80
+ signRight = ((recx - recx + 1) < 0) ? -1 : ((recx - recx + 1) > 0) ? 1 : 0;
81
+ edgeType = signRight + signLeft0 + 2;
82
+ signLeft0 = -signRight;
83
+ recx = x265_clip(recx + offsetEoedgeType);
84
+ }
85
+ rec += stride;
86
+ }
87
+}
88
+
89
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
90
+{
91
+ int x = 0;
92
+ int8_t signDown;
93
+ int edgeType;
94
+
95
+ if (width >= 8)
96
+ {
97
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
98
+ for (; (x + 8) <= width; x += 8)
99
+ {
100
+ uint8x8_t in0 = *(uint8x8_t *)&recx;
101
+ uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
102
+ int8x8_t vsignDown = sign_diff_neon(in0, in1);
103
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
104
+ *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown);
105
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
106
+ t1 = vaddw_u8(t1, in0);
107
+ *(uint8x8_t *)&recx = vqmovun_s16(t1);
108
+ }
109
+ }
110
+ for (; x < width; x++)
111
+ {
112
+ signDown = signOf(recx - recx + stride);
113
+ edgeType = signDown + upBuff1x + 2;
114
+ upBuff1x = -signDown;
115
+ recx = x265_clip(recx + offsetEoedgeType);
116
+ }
117
+}
118
+
119
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
120
+{
121
+ int y;
122
+ int8_t signDown;
123
+ int edgeType;
124
+
125
+ for (y = 0; y < 2; y++)
126
+ {
127
+ int x = 0;
128
+ if (width >= 8)
129
+ {
130
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
131
+ for (; (x + 8) <= width; x += 8)
132
+ {
133
+ uint8x8_t in0 = *(uint8x8_t *)&recx;
134
+ uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
135
+ int8x8_t vsignDown = sign_diff_neon(in0, in1);
136
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
137
+ *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown);
138
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
139
+ t1 = vaddw_u8(t1, in0);
140
+ t1 = vmaxq_s16(t1, vdupq_n_s16(0));
141
+ t1 = vminq_s16(t1, vdupq_n_s16(255));
142
+ *(uint8x8_t *)&recx = vmovn_u16(t1);
143
+
144
+ }
145
+ }
146
+ for (; x < width; x++)
147
+ {
148
+ signDown = signOf(recx - recx + stride);
149
+ edgeType = signDown + upBuff1x + 2;
150
+ upBuff1x = -signDown;
151
+ recx = x265_clip(recx + offsetEoedgeType);
152
+ }
153
+ rec += stride;
154
+ }
155
+}
156
+
157
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
158
+{
159
+ int x;
160
+
161
+ if (abs(buff1 - bufft) < 16)
162
+ {
163
+ for (x = 0; x < width; x++)
164
+ {
165
+ int8_t signDown = signOf(recx - recx + stride + 1);
166
+ int edgeType = signDown + buff1x + 2;
167
+ bufftx + 1 = -signDown;
168
+ recx = x265_clip(recx + offsetEoedgeType);;
169
+ }
170
+ }
171
+ else
172
+ {
173
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
174
+ x = 0;
175
+ for (; (x + 8) <= width; x += 8)
176
+ {
177
+ uint8x8_t in0 = *(uint8x8_t *)&recx;
178
+ uint8x8_t in1 = *(uint8x8_t *)&recx + stride + 1;
179
+ int8x8_t vsignDown = sign_diff_neon(in0, in1);
180
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1x), vdup_n_s8(2));
181
+ *(int8x8_t *)&bufftx + 1 = vneg_s8(vsignDown);
182
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
183
+ t1 = vaddw_u8(t1, in0);
184
+ t1 = vmaxq_s16(t1, vdupq_n_s16(0));
185
+ t1 = vminq_s16(t1, vdupq_n_s16(255));
186
+ *(uint8x8_t *)&recx = vmovn_u16(t1);
187
+ }
188
+ for (; x < width; x++)
189
+ {
190
+ int8_t signDown = signOf(recx - recx + stride + 1);
191
+ int edgeType = signDown + buff1x + 2;
192
+ bufftx + 1 = -signDown;
193
+ recx = x265_clip(recx + offsetEoedgeType);;
194
+ }
195
+
196
+ }
197
+}
198
+
199
+
200
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
201
+{
202
+ int8_t signDown;
203
+ int8_t edgeType;
204
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
205
+
206
+ int x = startX + 1;
207
+ for (; (x + 8) <= endX; x += 8)
208
+ {
209
+ uint8x8_t in0 = *(uint8x8_t *)&recx;
210
+ uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
211
+ int8x8_t vsignDown = sign_diff_neon(in0, in1);
212
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
213
+ *(int8x8_t *)&upBuff1x - 1 = vneg_s8(vsignDown);
214
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
215
+ t1 = vaddw_u8(t1, in0);
216
+ t1 = vmaxq_s16(t1, vdupq_n_s16(0));
217
+ t1 = vminq_s16(t1, vdupq_n_s16(255));
218
+ *(uint8x8_t *)&recx = vmovn_u16(t1);
219
+
220
+ }
221
+ for (; x < endX; x++)
222
+ {
223
+ signDown = signOf(recx - recx + stride);
224
+ edgeType = signDown + upBuff1x + 2;
225
+ upBuff1x - 1 = -signDown;
226
+ recx = x265_clip(recx + offsetEoedgeType);
227
+ }
228
+}
229
+
230
+static void processSaoCUB0_neon(pixel *rec, const int8_t *offset, int ctuWidth, int ctuHeight, intptr_t stride)
231
+{
232
+#define SAO_BO_BITS 5
233
+ const int boShift = X265_DEPTH - SAO_BO_BITS;
234
+ int x, y;
235
+ int8x8x4_t table;
236
+ table = *(int8x8x4_t *)offset;
237
+
238
+ for (y = 0; y < ctuHeight; y++)
239
+ {
240
+
241
+ for (x = 0; (x + 8) <= ctuWidth; x += 8)
242
+ {
243
+ int8x8_t in = *(int8x8_t *)&recx;
244
+ int8x8_t offsets = vtbl4_s8(table, vshr_n_u8(in, boShift));
245
+ int16x8_t tmp = vmovl_s8(offsets);
246
+ tmp = vaddw_u8(tmp, in);
247
+ tmp = vmaxq_s16(tmp, vdupq_n_s16(0));
248
+ tmp = vminq_s16(tmp, vdupq_n_s16(255));
249
+ *(uint8x8_t *)&recx = vmovn_u16(tmp);
250
+ }
251
+ for (; x < ctuWidth; x++)
252
+ {
253
+ recx = x265_clip(recx + offsetrecx >> boShift);
254
+ }
255
+ rec += stride;
256
+ }
257
+}
258
+
259
+}
260
+
261
+
262
+
263
+namespace X265_NS
264
+{
265
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
266
+{
267
+ p.saoCuOrgE0 = processSaoCUE0_neon;
268
+ p.saoCuOrgE1 = processSaoCUE1_neon;
269
+ p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows_neon;
270
+ p.saoCuOrgE20 = processSaoCUE2_neon;
271
+ p.saoCuOrgE21 = processSaoCUE2_neon;
272
+ p.saoCuOrgE30 = processSaoCUE3_neon;
273
+ p.saoCuOrgE31 = processSaoCUE3_neon;
274
+ p.saoCuOrgB0 = processSaoCUB0_neon;
275
+ p.sign = calSign_neon;
276
+
277
+}
278
+
279
+
280
+#else //HIGH_BIT_DEPTH
281
+
282
+
283
+namespace X265_NS
284
+{
285
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &)
286
+{
287
+}
288
+
289
+#endif
290
+
291
+
292
+}
293
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.h
Added
18
1
2
+#ifndef _LOOPFILTER_NEON_H__
3
+#define _LOOPFILTER_NEON_H__
4
+
5
+#include "common.h"
6
+#include "primitives.h"
7
+
8
+#define PIXEL_MIN 0
9
+
10
+namespace X265_NS
11
+{
12
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
13
+
14
+};
15
+
16
+
17
+#endif
18
x265_3.6.tar.gz/source/common/aarch64/mc-a-common.S
Added
50
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.macro addAvg_start
37
+ lsl x3, x3, #1
38
+ lsl x4, x4, #1
39
+ mov w11, #0x40
40
+ dup v30.16b, w11
41
+.endm
42
+
43
+.macro addavg_1 v0, v1
44
+ add \v0\().8h, \v0\().8h, \v1\().8h
45
+ saddl v16.4s, \v0\().4h, v30.4h
46
+ saddl2 v17.4s, \v0\().8h, v30.8h
47
+ shrn \v0\().4h, v16.4s, #7
48
+ shrn2 \v0\().8h, v17.4s, #7
49
+.endm
50
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S
Added
926
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "mc-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_avg_pp_12x16_sve2)
41
+ sub x1, x1, #4
42
+ sub x3, x3, #4
43
+ sub x5, x5, #4
44
+ ptrue p0.s, vl1
45
+ ptrue p1.b, vl8
46
+ mov x11, #4
47
+.rept 16
48
+ ld1w {z0.s}, p0/z, x2
49
+ ld1b {z1.b}, p1/z, x2, x11
50
+ ld1w {z2.s}, p0/z, x4
51
+ ld1b {z3.b}, p1/z, x4, x11
52
+ add x2, x2, #4
53
+ add x2, x2, x3
54
+ add x4, x4, #4
55
+ add x4, x4, x5
56
+ urhadd z0.b, p1/m, z0.b, z2.b
57
+ urhadd z1.b, p1/m, z1.b, z3.b
58
+ st1b {z0.b}, p1, x0
59
+ st1b {z1.b}, p1, x0, x11
60
+ add x0, x0, #4
61
+ add x0, x0, x1
62
+.endr
63
+ ret
64
+endfunc
65
+
66
+function PFX(pixel_avg_pp_24x32_sve2)
67
+ mov w12, #4
68
+ rdvl x9, #1
69
+ cmp x9, #16
70
+ bgt .vl_gt_16_pixel_avg_pp_24x32
71
+ sub x1, x1, #16
72
+ sub x3, x3, #16
73
+ sub x5, x5, #16
74
+.lpavg_24x32_sve2:
75
+ sub w12, w12, #1
76
+.rept 8
77
+ ld1 {v0.16b}, x2, #16
78
+ ld1 {v1.8b}, x2, x3
79
+ ld1 {v2.16b}, x4, #16
80
+ ld1 {v3.8b}, x4, x5
81
+ urhadd v0.16b, v0.16b, v2.16b
82
+ urhadd v1.8b, v1.8b, v3.8b
83
+ st1 {v0.16b}, x0, #16
84
+ st1 {v1.8b}, x0, x1
85
+.endr
86
+ cbnz w12, .lpavg_24x32_sve2
87
+ ret
88
+.vl_gt_16_pixel_avg_pp_24x32:
89
+ mov x10, #24
90
+ mov x11, #0
91
+ whilelt p0.b, x11, x10
92
+.vl_gt_16_loop_pixel_avg_pp_24x32:
93
+ sub w12, w12, #1
94
+.rept 8
95
+ ld1b {z0.b}, p0/z, x2
96
+ ld1b {z2.b}, p0/z, x4
97
+ add x2, x2, x3
98
+ add x4, x4, x5
99
+ urhadd z0.b, p0/m, z0.b, z2.b
100
+ st1b {z0.b}, p0, x0
101
+ add x0, x0, x1
102
+.endr
103
+ cbnz w12, .vl_gt_16_loop_pixel_avg_pp_24x32
104
+ ret
105
+endfunc
106
+
107
+.macro pixel_avg_pp_32xN_sve2 h
108
+function PFX(pixel_avg_pp_32x\h\()_sve2)
109
+ rdvl x9, #1
110
+ cmp x9, #16
111
+ bgt .vl_gt_16_pixel_avg_pp_32_\h
112
+.rept \h
113
+ ld1 {v0.16b-v1.16b}, x2, x3
114
+ ld1 {v2.16b-v3.16b}, x4, x5
115
+ urhadd v0.16b, v0.16b, v2.16b
116
+ urhadd v1.16b, v1.16b, v3.16b
117
+ st1 {v0.16b-v1.16b}, x0, x1
118
+.endr
119
+ ret
120
+.vl_gt_16_pixel_avg_pp_32_\h:
121
+ ptrue p0.b, vl32
122
+.rept \h
123
+ ld1b {z0.b}, p0/z, x2
124
+ ld1b {z2.b}, p0/z, x4
125
+ add x2, x2, x3
126
+ add x4, x4, x5
127
+ urhadd z0.b, p0/m, z0.b, z2.b
128
+ st1b {z0.b}, p0, x0
129
+ add x0, x0, x1
130
+.endr
131
+ ret
132
+endfunc
133
+.endm
134
+
135
+pixel_avg_pp_32xN_sve2 8
136
+pixel_avg_pp_32xN_sve2 16
137
+pixel_avg_pp_32xN_sve2 24
138
+
139
+.macro pixel_avg_pp_32xN1_sve2 h
140
+function PFX(pixel_avg_pp_32x\h\()_sve2)
141
+ rdvl x9, #1
142
+ cmp x9, #16
143
+ bgt .vl_gt_16_pixel_avg_pp_32xN1_\h
144
+ mov w12, #\h / 8
145
+.lpavg_sve2_32x\h\():
146
+ sub w12, w12, #1
147
+.rept 8
148
+ ld1 {v0.16b-v1.16b}, x2, x3
149
+ ld1 {v2.16b-v3.16b}, x4, x5
150
+ urhadd v0.16b, v0.16b, v2.16b
151
+ urhadd v1.16b, v1.16b, v3.16b
152
+ st1 {v0.16b-v1.16b}, x0, x1
153
+.endr
154
+ cbnz w12, .lpavg_sve2_32x\h
155
+ ret
156
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
157
+ ptrue p0.b, vl32
158
+ mov w12, #\h / 8
159
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
160
+ sub w12, w12, #1
161
+.rept 8
162
+ ld1b {z0.b}, p0/z, x2
163
+ ld1b {z2.b}, p0/z, x4
164
+ add x2, x2, x3
165
+ add x4, x4, x5
166
+ urhadd z0.b, p0/m, z0.b, z2.b
167
+ st1b {z0.b}, p0, x0
168
+ add x0, x0, x1
169
+.endr
170
+ cbnz w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
171
+ ret
172
+endfunc
173
+.endm
174
+
175
+pixel_avg_pp_32xN1_sve2 32
176
+pixel_avg_pp_32xN1_sve2 64
177
+
178
+function PFX(pixel_avg_pp_48x64_sve2)
179
+ rdvl x9, #1
180
+ cmp x9, #16
181
+ bgt .vl_gt_16_pixel_avg_pp_48x64
182
+ mov w12, #8
183
+.lpavg_48x64_sve2:
184
+ sub w12, w12, #1
185
+.rept 8
186
+ ld1 {v0.16b-v2.16b}, x2, x3
187
+ ld1 {v3.16b-v5.16b}, x4, x5
188
+ urhadd v0.16b, v0.16b, v3.16b
189
+ urhadd v1.16b, v1.16b, v4.16b
190
+ urhadd v2.16b, v2.16b, v5.16b
191
+ st1 {v0.16b-v2.16b}, x0, x1
192
+.endr
193
+ cbnz w12, .lpavg_48x64_sve2
194
+ ret
195
+.vl_gt_16_pixel_avg_pp_48x64:
196
+ cmp x9, #32
197
+ bgt .vl_gt_32_pixel_avg_pp_48x64
198
+ ptrue p0.b, vl32
199
+ ptrue p1.b, vl16
200
+ mov w12, #8
201
+.vl_eq_32_pixel_avg_pp_48x64:
202
+ sub w12, w12, #1
203
+.rept 8
204
+ ld1b {z0.b}, p0/z, x2
205
+ ld1b {z1.b}, p1/z, x2, #1, mul vl
206
+ ld1b {z2.b}, p0/z, x4
207
+ ld1b {z3.b}, p1/z, x4, #1, mul vl
208
+ add x2, x2, x3
209
+ add x4, x4, x5
210
+ urhadd z0.b, p0/m, z0.b, z2.b
211
+ urhadd z1.b, p1/m, z1.b, z3.b
212
+ st1b {z0.b}, p0, x0
213
+ st1b {z1.b}, p1, x0, #1, mul vl
214
+ add x0, x0, x1
215
+.endr
216
+ cbnz w12, .vl_eq_32_pixel_avg_pp_48x64
217
+ ret
218
+.vl_gt_32_pixel_avg_pp_48x64:
219
+ mov x10, #48
220
+ mov x11, #0
221
+ whilelt p0.b, x11, x10
222
+ mov w12, #8
223
+.loop_gt_32_pixel_avg_pp_48x64:
224
+ sub w12, w12, #1
225
+.rept 8
226
+ ld1b {z0.b}, p0/z, x2
227
+ ld1b {z2.b}, p0/z, x4
228
+ add x2, x2, x3
229
+ add x4, x4, x5
230
+ urhadd z0.b, p0/m, z0.b, z2.b
231
+ st1b {z0.b}, p0, x0
232
+ add x0, x0, x1
233
+.endr
234
+ cbnz w12, .loop_gt_32_pixel_avg_pp_48x64
235
+ ret
236
+endfunc
237
+
238
+.macro pixel_avg_pp_64xN_sve2 h
239
+function PFX(pixel_avg_pp_64x\h\()_sve2)
240
+ rdvl x9, #1
241
+ cmp x9, #16
242
+ bgt .vl_gt_16_pixel_avg_pp_64x\h
243
+ mov w12, #\h / 4
244
+.lpavg_sve2_64x\h\():
245
+ sub w12, w12, #1
246
+.rept 4
247
+ ld1 {v0.16b-v3.16b}, x2, x3
248
+ ld1 {v4.16b-v7.16b}, x4, x5
249
+ urhadd v0.16b, v0.16b, v4.16b
250
+ urhadd v1.16b, v1.16b, v5.16b
251
+ urhadd v2.16b, v2.16b, v6.16b
252
+ urhadd v3.16b, v3.16b, v7.16b
253
+ st1 {v0.16b-v3.16b}, x0, x1
254
+.endr
255
+ cbnz w12, .lpavg_sve2_64x\h
256
+ ret
257
+.vl_gt_16_pixel_avg_pp_64x\h\():
258
+ cmp x9, #48
259
+ bgt .vl_gt_48_pixel_avg_pp_64x\h
260
+ ptrue p0.b, vl32
261
+ mov w12, #\h / 4
262
+.vl_eq_32_pixel_avg_pp_64x\h\():
263
+ sub w12, w12, #1
264
+.rept 4
265
+ ld1b {z0.b}, p0/z, x2
266
+ ld1b {z1.b}, p0/z, x2, #1, mul vl
267
+ ld1b {z2.b}, p0/z, x4
268
+ ld1b {z3.b}, p0/z, x4, #1, mul vl
269
+ add x2, x2, x3
270
+ add x4, x4, x5
271
+ urhadd z0.b, p0/m, z0.b, z2.b
272
+ urhadd z1.b, p0/m, z1.b, z3.b
273
+ st1b {z0.b}, p0, x0
274
+ st1b {z1.b}, p0, x0, #1, mul vl
275
+ add x0, x0, x1
276
+.endr
277
+ cbnz w12, .vl_eq_32_pixel_avg_pp_64x\h
278
+ ret
279
+.vl_gt_48_pixel_avg_pp_64x\h\():
280
+ ptrue p0.b, vl64
281
+ mov w12, #\h / 4
282
+.vl_eq_64_pixel_avg_pp_64x\h\():
283
+ sub w12, w12, #1
284
+.rept 4
285
+ ld1b {z0.b}, p0/z, x2
286
+ ld1b {z2.b}, p0/z, x4
287
+ add x2, x2, x3
288
+ add x4, x4, x5
289
+ urhadd z0.b, p0/m, z0.b, z2.b
290
+ st1b {z0.b}, p0, x0
291
+ add x0, x0, x1
292
+.endr
293
+ cbnz w12, .vl_eq_64_pixel_avg_pp_64x\h
294
+ ret
295
+endfunc
296
+.endm
297
+
298
+pixel_avg_pp_64xN_sve2 16
299
+pixel_avg_pp_64xN_sve2 32
300
+pixel_avg_pp_64xN_sve2 48
301
+pixel_avg_pp_64xN_sve2 64
302
+
303
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
304
+
305
+.macro addAvg_2xN_sve2 h
306
+function PFX(addAvg_2x\h\()_sve2)
307
+ ptrue p0.s, vl2
308
+ ptrue p1.h, vl4
309
+ ptrue p2.h, vl2
310
+.rept \h / 2
311
+ ld1rw {z0.s}, p0/z, x0
312
+ ld1rw {z1.s}, p0/z, x1
313
+ add x0, x0, x3, lsl #1
314
+ add x1, x1, x4, lsl #1
315
+ ld1rw {z2.s}, p0/z, x0
316
+ ld1rw {z3.s}, p0/z, x1
317
+ add x0, x0, x3, lsl #1
318
+ add x1, x1, x4, lsl #1
319
+ add z0.h, p1/m, z0.h, z1.h
320
+ add z2.h, p1/m, z2.h, z3.h
321
+ sqrshrnb z0.b, z0.h, #7
322
+ add z0.b, z0.b, #0x80
323
+ sqrshrnb z2.b, z2.h, #7
324
+ add z2.b, z2.b, #0x80
325
+ st1b {z0.h}, p2, x2
326
+ add x2, x2, x5
327
+ st1b {z2.h}, p2, x2
328
+ add x2, x2, x5
329
+.endr
330
+ ret
331
+endfunc
332
+.endm
333
+
334
+addAvg_2xN_sve2 4
335
+addAvg_2xN_sve2 8
336
+addAvg_2xN_sve2 16
337
+
338
+.macro addAvg_6xN_sve2 h
339
+function PFX(addAvg_6x\h\()_sve2)
340
+ mov w12, #\h / 2
341
+ ptrue p0.b, vl16
342
+ ptrue p2.h, vl6
343
+.loop_sve2_addavg_6x\h\():
344
+ sub w12, w12, #1
345
+ ld1b {z0.b}, p0/z, x0
346
+ ld1b {z1.b}, p0/z, x1
347
+ add x0, x0, x3, lsl #1
348
+ add x1, x1, x4, lsl #1
349
+ ld1b {z2.b}, p0/z, x0
350
+ ld1b {z3.b}, p0/z, x1
351
+ add x0, x0, x3, lsl #1
352
+ add x1, x1, x4, lsl #1
353
+ add z0.h, p0/m, z0.h, z1.h
354
+ add z2.h, p0/m, z2.h, z3.h
355
+ sqrshrnb z0.b, z0.h, #7
356
+ sqrshrnb z2.b, z2.h, #7
357
+ add z0.b, z0.b, #0x80
358
+ add z2.b, z2.b, #0x80
359
+ st1b {z0.h}, p2, x2
360
+ add x2, x2, x5
361
+ st1b {z2.h}, p2, x2
362
+ add x2, x2, x5
363
+ cbnz w12, .loop_sve2_addavg_6x\h
364
+ ret
365
+endfunc
366
+.endm
367
+
368
+addAvg_6xN_sve2 8
369
+addAvg_6xN_sve2 16
370
+
371
+.macro addAvg_8xN_sve2 h
372
+function PFX(addAvg_8x\h\()_sve2)
373
+ ptrue p0.b, vl16
374
+.rept \h / 2
375
+ ld1b {z0.b}, p0/z, x0
376
+ ld1b {z1.b}, p0/z, x1
377
+ add x0, x0, x3, lsl #1
378
+ add x1, x1, x4, lsl #1
379
+ ld1b {z2.b}, p0/z, x0
380
+ ld1b {z3.b}, p0/z, x1
381
+ add x0, x0, x3, lsl #1
382
+ add x1, x1, x4, lsl #1
383
+ add z0.h, p0/m, z0.h, z1.h
384
+ add z2.h, p0/m, z2.h, z3.h
385
+ sqrshrnb z0.b, z0.h, #7
386
+ add z0.b, z0.b, #0x80
387
+ sqrshrnb z2.b, z2.h, #7
388
+ add z2.b, z2.b, #0x80
389
+ st1b {z0.h}, p0, x2
390
+ add x2, x2, x5
391
+ st1b {z2.h}, p0, x2
392
+ add x2, x2, x5
393
+.endr
394
+ ret
395
+endfunc
396
+.endm
397
+
398
+.macro addAvg_8xN1_sve2 h
399
+function PFX(addAvg_8x\h\()_sve2)
400
+ mov w12, #\h / 2
401
+ ptrue p0.b, vl16
402
+.loop_sve2_addavg_8x\h\():
403
+ sub w12, w12, #1
404
+ ld1b {z0.b}, p0/z, x0
405
+ ld1b {z1.b}, p0/z, x1
406
+ add x0, x0, x3, lsl #1
407
+ add x1, x1, x4, lsl #1
408
+ ld1b {z2.b}, p0/z, x0
409
+ ld1b {z3.b}, p0/z, x1
410
+ add x0, x0, x3, lsl #1
411
+ add x1, x1, x4, lsl #1
412
+ add z0.h, p0/m, z0.h, z1.h
413
+ add z2.h, p0/m, z2.h, z3.h
414
+ sqrshrnb z0.b, z0.h, #7
415
+ add z0.b, z0.b, #0x80
416
+ sqrshrnb z2.b, z2.h, #7
417
+ add z2.b, z2.b, #0x80
418
+ st1b {z0.h}, p0, x2
419
+ add x2, x2, x5
420
+ st1b {z2.h}, p0, x2
421
+ add x2, x2, x5
422
+ cbnz w12, .loop_sve2_addavg_8x\h
423
+ ret
424
+endfunc
425
+.endm
426
+
427
+addAvg_8xN_sve2 2
428
+addAvg_8xN_sve2 4
429
+addAvg_8xN_sve2 6
430
+addAvg_8xN_sve2 8
431
+addAvg_8xN_sve2 12
432
+addAvg_8xN_sve2 16
433
+addAvg_8xN1_sve2 32
434
+addAvg_8xN1_sve2 64
435
+
436
+.macro addAvg_12xN_sve2 h
437
+function PFX(addAvg_12x\h\()_sve2)
438
+ mov w12, #\h
439
+ rdvl x9, #1
440
+ cmp x9, #16
441
+ bgt .vl_gt_16_addAvg_12x\h
442
+ ptrue p0.b, vl16
443
+ ptrue p1.b, vl8
444
+.loop_sve2_addavg_12x\h\():
445
+ sub w12, w12, #1
446
+ ld1b {z0.b}, p0/z, x0
447
+ ld1b {z1.b}, p0/z, x1
448
+ ld1b {z2.b}, p1/z, x0, #1, mul vl
449
+ ld1b {z3.b}, p1/z, x1, #1, mul vl
450
+ add x0, x0, x3, lsl #1
451
+ add x1, x1, x4, lsl #1
452
+ add z0.h, p0/m, z0.h, z1.h
453
+ add z2.h, p1/m, z2.h, z3.h
454
+ sqrshrnb z0.b, z0.h, #7
455
+ add z0.b, z0.b, #0x80
456
+ sqrshrnb z2.b, z2.h, #7
457
+ add z2.b, z2.b, #0x80
458
+ st1b {z0.h}, p0, x2
459
+ st1b {z2.h}, p1, x2, #1, mul vl
460
+ add x2, x2, x5
461
+ cbnz w12, .loop_sve2_addavg_12x\h
462
+ ret
463
+.vl_gt_16_addAvg_12x\h\():
464
+ mov x10, #24
465
+ mov x11, #0
466
+ whilelt p0.b, x11, x10
467
+.loop_sve2_gt_16_addavg_12x\h\():
468
+ sub w12, w12, #1
469
+ ld1b {z0.b}, p0/z, x0
470
+ ld1b {z1.b}, p0/z, x1
471
+ add x0, x0, x3, lsl #1
472
+ add x1, x1, x4, lsl #1
473
+ add z0.h, p0/m, z0.h, z1.h
474
+ sqrshrnb z0.b, z0.h, #7
475
+ add z0.b, z0.b, #0x80
476
+ sqrshrnb z2.b, z2.h, #7
477
+ add z2.b, z2.b, #0x80
478
+ st1b {z0.h}, p0, x2
479
+ add x2, x2, x5
480
+ cbnz w12, .loop_sve2_gt_16_addavg_12x\h
481
+ ret
482
+endfunc
483
+.endm
484
+
485
+addAvg_12xN_sve2 16
486
+addAvg_12xN_sve2 32
487
+
488
+.macro addAvg_16xN_sve2 h
489
+function PFX(addAvg_16x\h\()_sve2)
490
+ mov w12, #\h
491
+ rdvl x9, #1
492
+ cmp x9, #16
493
+ bgt .vl_gt_16_addAvg_16x\h
494
+ ptrue p0.b, vl16
495
+.loop_eq_16_sve2_addavg_16x\h\():
496
+ sub w12, w12, #1
497
+ ld1b {z0.b}, p0/z, x0
498
+ ld1b {z1.b}, p0/z, x1
499
+ ld1b {z2.b}, p0/z, x0, #1, mul vl
500
+ ld1b {z3.b}, p0/z, x1, #1, mul vl
501
+ add x0, x0, x3, lsl #1
502
+ add x1, x1, x4, lsl #1
503
+ add z0.h, p0/m, z0.h, z1.h
504
+ add z2.h, p0/m, z2.h, z3.h
505
+ sqrshrnb z0.b, z0.h, #7
506
+ add z0.b, z0.b, #0x80
507
+ sqrshrnb z2.b, z2.h, #7
508
+ add z2.b, z2.b, #0x80
509
+ st1b {z0.h}, p0, x2
510
+ st1b {z2.h}, p0, x2, #1, mul vl
511
+ add x2, x2, x5
512
+ cbnz w12, .loop_eq_16_sve2_addavg_16x\h
513
+ ret
514
+.vl_gt_16_addAvg_16x\h\():
515
+ cmp x9, #32
516
+ bgt .vl_gt_32_addAvg_16x\h
517
+ ptrue p0.b, vl32
518
+.loop_gt_16_sve2_addavg_16x\h\():
519
+ sub w12, w12, #1
520
+ ld1b {z0.b}, p0/z, x0
521
+ ld1b {z1.b}, p0/z, x1
522
+ add x0, x0, x3, lsl #1
523
+ add x1, x1, x4, lsl #1
524
+ add z0.h, p0/m, z0.h, z1.h
525
+ sqrshrnb z0.b, z0.h, #7
526
+ add z0.b, z0.b, #0x80
527
+ st1b {z0.h}, p1, x2
528
+ add x2, x2, x5
529
+ cbnz w12, .loop_gt_16_sve2_addavg_16x\h
530
+ ret
531
+.vl_gt_32_addAvg_16x\h\():
532
+ mov x10, #48
533
+ mov x11, #0
534
+ whilelt p0.b, x11, x10
535
+.loop_gt_32_sve2_addavg_16x\h\():
536
+ sub w12, w12, #1
537
+ ld1b {z0.b}, p0/z, x0
538
+ add x0, x0, x3, lsl #1
539
+ add x1, x1, x4, lsl #1
540
+ add z0.h, p0/m, z0.h, z1.h
541
+ sqrshrnb z0.b, z0.h, #7
542
+ add z0.b, z0.b, #0x80
543
+ st1b {z0.h}, p0, x2
544
+ add x2, x2, x5
545
+ cbnz w12, .loop_gt_32_sve2_addavg_16x\h
546
+ ret
547
+endfunc
548
+.endm
549
+
550
+addAvg_16xN_sve2 4
551
+addAvg_16xN_sve2 8
552
+addAvg_16xN_sve2 12
553
+addAvg_16xN_sve2 16
554
+addAvg_16xN_sve2 24
555
+addAvg_16xN_sve2 32
556
+addAvg_16xN_sve2 64
557
+
558
+.macro addAvg_24xN_sve2 h
559
+function PFX(addAvg_24x\h\()_sve2)
560
+ mov w12, #\h
561
+ rdvl x9, #1
562
+ cmp x9, #16
563
+ bgt .vl_gt_16_addAvg_24x\h
564
+ addAvg_start
565
+.loop_eq_16_sve2_addavg_24x\h\():
566
+ sub w12, w12, #1
567
+ ld1 {v0.16b-v2.16b}, x0, x3
568
+ ld1 {v3.16b-v5.16b}, x1, x4
569
+ addavg_1 v0, v3
570
+ addavg_1 v1, v4
571
+ addavg_1 v2, v5
572
+ sqxtun v0.8b, v0.8h
573
+ sqxtun v1.8b, v1.8h
574
+ sqxtun v2.8b, v2.8h
575
+ st1 {v0.8b-v2.8b}, x2, x5
576
+ cbnz w12, .loop_eq_16_sve2_addavg_24x\h
577
+ ret
578
+.vl_gt_16_addAvg_24x\h\():
579
+ cmp x9, #48
580
+ bgt .vl_gt_48_addAvg_24x\h
581
+ ptrue p0.b, vl32
582
+ ptrue p1.b, vl16
583
+.loop_gt_16_sve2_addavg_24x\h\():
584
+ sub w12, w12, #1
585
+ ld1b {z0.b}, p0/z, x0
586
+ ld1b {z1.b}, p1/z, x0, #1, mul vl
587
+ ld1b {z2.b}, p0/z, x1
588
+ ld1b {z3.b}, p1/z, x1, #1, mul vl
589
+ add x0, x0, x3, lsl #1
590
+ add x1, x1, x4, lsl #1
591
+ add z0.h, p0/m, z0.h, z2.h
592
+ add z1.h, p1/m, z1.h, z3.h
593
+ sqrshrnb z0.b, z0.h, #7
594
+ add z0.b, z0.b, #0x80
595
+ sqrshrnb z1.b, z1.h, #7
596
+ add z1.b, z1.b, #0x80
597
+ st1b {z0.h}, p0, x2
598
+ st1b {z1.h}, p1, x2, #1, mul vl
599
+ add x2, x2, x5
600
+ cbnz w12, .loop_gt_16_sve2_addavg_24x\h
601
+ ret
602
+.vl_gt_48_addAvg_24x\h\():
603
+ mov x10, #48
604
+ mov x11, #0
605
+ whilelt p0.b, x11, x10
606
+.loop_gt_48_sve2_addavg_24x\h\():
607
+ sub w12, w12, #1
608
+ ld1b {z0.b}, p0/z, x0
609
+ ld1b {z2.b}, p0/z, x1
610
+ add x0, x0, x3, lsl #1
611
+ add x1, x1, x4, lsl #1
612
+ add z0.h, p0/m, z0.h, z2.h
613
+ sqrshrnb z0.b, z0.h, #7
614
+ add z0.b, z0.b, #0x80
615
+ st1b {z0.h}, p0, x2
616
+ add x2, x2, x5
617
+ cbnz w12, .loop_gt_48_sve2_addavg_24x\h
618
+ ret
619
+endfunc
620
+.endm
621
+
622
+addAvg_24xN_sve2 32
623
+addAvg_24xN_sve2 64
624
+
625
+.macro addAvg_32xN_sve2 h
626
+function PFX(addAvg_32x\h\()_sve2)
627
+ mov w12, #\h
628
+ rdvl x9, #1
629
+ cmp x9, #16
630
+ bgt .vl_gt_16_addAvg_32x\h
631
+ ptrue p0.b, vl16
632
+.loop_eq_16_sve2_addavg_32x\h\():
633
+ sub w12, w12, #1
634
+ ld1b {z0.b}, p0/z, x0
635
+ ld1b {z1.b}, p0/z, x0, #1, mul vl
636
+ ld1b {z2.b}, p0/z, x0, #2, mul vl
637
+ ld1b {z3.b}, p0/z, x0, #3, mul vl
638
+ ld1b {z4.b}, p0/z, x1
639
+ ld1b {z5.b}, p0/z, x1, #1, mul vl
640
+ ld1b {z6.b}, p0/z, x1, #2, mul vl
641
+ ld1b {z7.b}, p0/z, x1, #3, mul vl
642
+ add x0, x0, x3, lsl #1
643
+ add x1, x1, x4, lsl #1
644
+ add z0.h, p0/m, z0.h, z4.h
645
+ add z1.h, p0/m, z1.h, z5.h
646
+ add z2.h, p0/m, z2.h, z6.h
647
+ add z3.h, p0/m, z3.h, z7.h
648
+ sqrshrnb z0.b, z0.h, #7
649
+ add z0.b, z0.b, #0x80
650
+ sqrshrnb z1.b, z1.h, #7
651
+ add z1.b, z1.b, #0x80
652
+ sqrshrnb z2.b, z2.h, #7
653
+ add z2.b, z2.b, #0x80
654
+ sqrshrnb z3.b, z3.h, #7
655
+ add z3.b, z3.b, #0x80
656
+ st1b {z0.h}, p0, x2
657
+ st1b {z1.h}, p0, x2, #1, mul vl
658
+ st1b {z2.h}, p0, x2, #2, mul vl
659
+ st1b {z3.h}, p0, x2, #3, mul vl
660
+ add x2, x2, x5
661
+ cbnz w12, .loop_eq_16_sve2_addavg_32x\h
662
+ ret
663
+.vl_gt_16_addAvg_32x\h\():
664
+ cmp x9, #48
665
+ bgt .vl_gt_48_addAvg_32x\h
666
+ ptrue p0.b, vl32
667
+.loop_gt_eq_32_sve2_addavg_32x\h\():
668
+ sub w12, w12, #1
669
+ ld1b {z0.b}, p0/z, x0
670
+ ld1b {z1.b}, p0/z, x0, #1, mul vl
671
+ ld1b {z2.b}, p0/z, x1
672
+ ld1b {z3.b}, p0/z, x1, #1, mul vl
673
+ add x0, x0, x3, lsl #1
674
+ add x1, x1, x4, lsl #1
675
+ add z0.h, p0/m, z0.h, z2.h
676
+ add z1.h, p0/m, z1.h, z3.h
677
+ sqrshrnb z0.b, z0.h, #7
678
+ add z1.b, z1.b, #0x80
679
+ sqrshrnb z1.b, z1.h, #7
680
+ add z0.b, z0.b, #0x80
681
+ st1b {z0.h}, p0, x2
682
+ st1b {z1.h}, p0, x2, #1, mul vl
683
+ add x2, x2, x5
684
+ cbnz w12, .loop_gt_eq_32_sve2_addavg_32x\h
685
+ ret
686
+.vl_gt_48_addAvg_32x\h\():
687
+ ptrue p0.b, vl64
688
+.loop_eq_64_sve2_addavg_32x\h\():
689
+ sub w12, w12, #1
690
+ ld1b {z0.b}, p0/z, x0
691
+ ld1b {z1.b}, p0/z, x1
692
+ add x0, x0, x3, lsl #1
693
+ add x1, x1, x4, lsl #1
694
+ add z0.h, p0/m, z0.h, z1.h
695
+ sqrshrnb z0.b, z0.h, #7
696
+ add z0.b, z0.b, #0x80
697
+ st1b {z0.h}, p0, x2
698
+ add x2, x2, x5
699
+ cbnz w12, .loop_eq_64_sve2_addavg_32x\h
700
+ ret
701
+endfunc
702
+.endm
703
+
704
+addAvg_32xN_sve2 8
705
+addAvg_32xN_sve2 16
706
+addAvg_32xN_sve2 24
707
+addAvg_32xN_sve2 32
708
+addAvg_32xN_sve2 48
709
+addAvg_32xN_sve2 64
710
+
711
+function PFX(addAvg_48x64_sve2)
712
+ mov w12, #64
713
+ rdvl x9, #1
714
+ cmp x9, #16
715
+ bgt .vl_gt_16_addAvg_48x64
716
+ addAvg_start
717
+ sub x3, x3, #64
718
+ sub x4, x4, #64
719
+.loop_eq_16_sve2_addavg_48x64:
720
+ sub w12, w12, #1
721
+ ld1 {v0.8h-v3.8h}, x0, #64
722
+ ld1 {v4.8h-v7.8h}, x1, #64
723
+ ld1 {v20.8h-v21.8h}, x0, x3
724
+ ld1 {v22.8h-v23.8h}, x1, x4
725
+ addavg_1 v0, v4
726
+ addavg_1 v1, v5
727
+ addavg_1 v2, v6
728
+ addavg_1 v3, v7
729
+ addavg_1 v20, v22
730
+ addavg_1 v21, v23
731
+ sqxtun v0.8b, v0.8h
732
+ sqxtun2 v0.16b, v1.8h
733
+ sqxtun v1.8b, v2.8h
734
+ sqxtun2 v1.16b, v3.8h
735
+ sqxtun v2.8b, v20.8h
736
+ sqxtun2 v2.16b, v21.8h
737
+ st1 {v0.16b-v2.16b}, x2, x5
738
+ cbnz w12, .loop_eq_16_sve2_addavg_48x64
739
+ ret
740
+.vl_gt_16_addAvg_48x64:
741
+ cmp x9, #48
742
+ bgt .vl_gt_48_addAvg_48x64
743
+ ptrue p0.b, vl32
744
+.loop_gt_eq_32_sve2_addavg_48x64:
745
+ sub w12, w12, #1
746
+ ld1b {z0.b}, p0/z, x0
747
+ ld1b {z1.b}, p0/z, x0, #1, mul vl
748
+ ld1b {z2.b}, p0/z, x0, #2, mul vl
749
+ ld1b {z4.b}, p0/z, x1
750
+ ld1b {z5.b}, p0/z, x1, #1, mul vl
751
+ ld1b {z6.b}, p0/z, x1, #2, mul vl
752
+ add x0, x0, x3, lsl #1
753
+ add x1, x1, x4, lsl #1
754
+ add z0.h, p0/m, z0.h, z4.h
755
+ add z1.h, p0/m, z1.h, z5.h
756
+ add z2.h, p0/m, z2.h, z6.h
757
+ sqrshrnb z0.b, z0.h, #7
758
+ add z0.b, z0.b, #0x80
759
+ sqrshrnb z1.b, z1.h, #7
760
+ add z1.b, z1.b, #0x80
761
+ sqrshrnb z2.b, z2.h, #7
762
+ add z2.b, z2.b, #0x80
763
+ st1b {z0.h}, p0, x2
764
+ st1b {z1.h}, p0, x2, #1, mul vl
765
+ st1b {z2.h}, p0, x2, #2, mul vl
766
+ add x2, x2, x5
767
+ cbnz w12, .loop_gt_eq_32_sve2_addavg_48x64
768
+ ret
769
+.vl_gt_48_addAvg_48x64:
770
+ cmp x9, #112
771
+ bgt .vl_gt_112_addAvg_48x64
772
+ ptrue p0.b, vl64
773
+ ptrue p1.b, vl32
774
+.loop_gt_48_sve2_addavg_48x64:
775
+ sub w12, w12, #1
776
+ ld1b {z0.b}, p0/z, x0
777
+ ld1b {z1.b}, p1/z, x0, #1, mul vl
778
+ ld1b {z4.b}, p0/z, x1
779
+ ld1b {z5.b}, p1/z, x1, #1, mul vl
780
+ add x0, x0, x3, lsl #1
781
+ add x1, x1, x4, lsl #1
782
+ add z0.h, p0/m, z0.h, z4.h
783
+ add z1.h, p1/m, z1.h, z5.h
784
+ sqrshrnb z0.b, z0.h, #7
785
+ add z0.b, z0.b, #0x80
786
+ sqrshrnb z1.b, z1.h, #7
787
+ add z1.b, z1.b, #0x80
788
+ st1b {z0.h}, p0, x2
789
+ st1b {z1.h}, p1, x2, #1, mul vl
790
+ add x2, x2, x5
791
+ cbnz w12, .loop_gt_48_sve2_addavg_48x64
792
+ ret
793
+.vl_gt_112_addAvg_48x64:
794
+ mov x10, #96
795
+ mov x11, #0
796
+ whilelt p0.b, x11, x10
797
+.loop_gt_112_sve2_addavg_48x64:
798
+ sub w12, w12, #1
799
+ ld1b {z0.b}, p0/z, x0
800
+ ld1b {z4.b}, p0/z, x1
801
+ add x0, x0, x3, lsl #1
802
+ add x1, x1, x4, lsl #1
803
+ add z0.h, p0/m, z0.h, z4.h
804
+ sqrshrnb z0.b, z0.h, #7
805
+ add z0.b, z0.b, #0x80
806
+ st1b {z0.h}, p0, x2
807
+ add x2, x2, x5
808
+ cbnz w12, .loop_gt_112_sve2_addavg_48x64
809
+ ret
810
+endfunc
811
+
812
+.macro addAvg_64xN_sve2 h
813
+function PFX(addAvg_64x\h\()_sve2)
814
+ mov w12, #\h
815
+ rdvl x9, #1
816
+ cmp x9, #16
817
+ bgt .vl_gt_16_addAvg_64x\h
818
+ addAvg_start
819
+ sub x3, x3, #64
820
+ sub x4, x4, #64
821
+.loop_eq_16_sve2_addavg_64x\h\():
822
+ sub w12, w12, #1
823
+ ld1 {v0.8h-v3.8h}, x0, #64
824
+ ld1 {v4.8h-v7.8h}, x1, #64
825
+ ld1 {v20.8h-v23.8h}, x0, x3
826
+ ld1 {v24.8h-v27.8h}, x1, x4
827
+ addavg_1 v0, v4
828
+ addavg_1 v1, v5
829
+ addavg_1 v2, v6
830
+ addavg_1 v3, v7
831
+ addavg_1 v20, v24
832
+ addavg_1 v21, v25
833
+ addavg_1 v22, v26
834
+ addavg_1 v23, v27
835
+ sqxtun v0.8b, v0.8h
836
+ sqxtun2 v0.16b, v1.8h
837
+ sqxtun v1.8b, v2.8h
838
+ sqxtun2 v1.16b, v3.8h
839
+ sqxtun v2.8b, v20.8h
840
+ sqxtun2 v2.16b, v21.8h
841
+ sqxtun v3.8b, v22.8h
842
+ sqxtun2 v3.16b, v23.8h
843
+ st1 {v0.16b-v3.16b}, x2, x5
844
+ cbnz w12, .loop_eq_16_sve2_addavg_64x\h
845
+ ret
846
+.vl_gt_16_addAvg_64x\h\():
847
+ cmp x9, #48
848
+ bgt .vl_gt_48_addAvg_64x\h
849
+ ptrue p0.b, vl32
850
+.loop_gt_eq_32_sve2_addavg_64x\h\():
851
+ sub w12, w12, #1
852
+ ld1b {z0.b}, p0/z, x0
853
+ ld1b {z1.b}, p0/z, x0, #1, mul vl
854
+ ld1b {z2.b}, p0/z, x0, #2, mul vl
855
+ ld1b {z3.b}, p0/z, x0, #3, mul vl
856
+ ld1b {z4.b}, p0/z, x1
857
+ ld1b {z5.b}, p0/z, x1, #1, mul vl
858
+ ld1b {z6.b}, p0/z, x1, #2, mul vl
859
+ ld1b {z7.b}, p0/z, x1, #3, mul vl
860
+ add x0, x0, x3, lsl #1
861
+ add x1, x1, x4, lsl #1
862
+ add z0.h, p0/m, z0.h, z4.h
863
+ add z1.h, p0/m, z1.h, z5.h
864
+ add z2.h, p0/m, z2.h, z6.h
865
+ add z3.h, p0/m, z3.h, z7.h
866
+ sqrshrnb z0.b, z0.h, #7
867
+ add z0.b, z0.b, #0x80
868
+ sqrshrnb z1.b, z1.h, #7
869
+ add z1.b, z1.b, #0x80
870
+ sqrshrnb z2.b, z2.h, #7
871
+ add z2.b, z2.b, #0x80
872
+ sqrshrnb z3.b, z3.h, #7
873
+ add z3.b, z3.b, #0x80
874
+ st1b {z0.h}, p0, x2
875
+ st1b {z1.h}, p0, x2, #1, mul vl
876
+ st1b {z2.h}, p0, x2, #2, mul vl
877
+ st1b {z3.h}, p0, x2, #3, mul vl
878
+ add x2, x2, x5
879
+ cbnz w12, .loop_gt_eq_32_sve2_addavg_64x\h
880
+ ret
881
+.vl_gt_48_addAvg_64x\h\():
882
+ cmp x9, #112
883
+ bgt .vl_gt_112_addAvg_64x\h
884
+ ptrue p0.b, vl64
885
+.loop_gt_eq_48_sve2_addavg_64x\h\():
886
+ sub w12, w12, #1
887
+ ld1b {z0.b}, p0/z, x0
888
+ ld1b {z1.b}, p0/z, x0, #1, mul vl
889
+ ld1b {z4.b}, p0/z, x1
890
+ ld1b {z5.b}, p0/z, x1, #1, mul vl
891
+ add x0, x0, x3, lsl #1
892
+ add x1, x1, x4, lsl #1
893
+ add z0.h, p0/m, z0.h, z4.h
894
+ add z1.h, p0/m, z1.h, z5.h
895
+ sqrshrnb z0.b, z0.h, #7
896
+ add z0.b, z0.b, #0x80
897
+ sqrshrnb z1.b, z1.h, #7
898
+ add z1.b, z1.b, #0x80
899
+ st1b {z0.h}, p0, x2
900
+ st1b {z1.h}, p0, x2, #1, mul vl
901
+ add x2, x2, x5
902
+ cbnz w12, .loop_gt_eq_48_sve2_addavg_64x\h
903
+ ret
904
+.vl_gt_112_addAvg_64x\h\():
905
+ ptrue p0.b, vl128
906
+.loop_gt_eq_128_sve2_addavg_64x\h\():
907
+ sub w12, w12, #1
908
+ ld1b {z0.b}, p0/z, x0
909
+ ld1b {z4.b}, p0/z, x1
910
+ add x0, x0, x3, lsl #1
911
+ add x1, x1, x4, lsl #1
912
+ add z0.h, p0/m, z0.h, z4.h
913
+ sqrshrnb z0.b, z0.h, #7
914
+ add z0.b, z0.b, #0x80
915
+ st1b {z0.h}, p0, x2
916
+ add x2, x2, x5
917
+ cbnz w12, .loop_gt_eq_128_sve2_addavg_64x\h
918
+ ret
919
+endfunc
920
+.endm
921
+
922
+addAvg_64xN_sve2 16
923
+addAvg_64xN_sve2 32
924
+addAvg_64xN_sve2 48
925
+addAvg_64xN_sve2 64
926
x265_3.5.tar.gz/source/common/aarch64/mc-a.S -> x265_3.6.tar.gz/source/common/aarch64/mc-a.S
Changed
534
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ * Sebastian Pop <spop@amazon.com>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
12
*****************************************************************************/
13
14
#include "asm.S"
15
+#include "mc-a-common.S"
16
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
.section .rodata
21
+#endif
22
23
.align 4
24
25
.text
26
27
.macro pixel_avg_pp_4xN_neon h
28
-function x265_pixel_avg_pp_4x\h\()_neon
29
+function PFX(pixel_avg_pp_4x\h\()_neon)
30
.rept \h
31
ld1 {v0.s}0, x2, x3
32
ld1 {v1.s}0, x4, x5
33
34
pixel_avg_pp_4xN_neon 16
35
36
.macro pixel_avg_pp_8xN_neon h
37
-function x265_pixel_avg_pp_8x\h\()_neon
38
+function PFX(pixel_avg_pp_8x\h\()_neon)
39
.rept \h
40
ld1 {v0.8b}, x2, x3
41
ld1 {v1.8b}, x4, x5
42
43
pixel_avg_pp_8xN_neon 8
44
pixel_avg_pp_8xN_neon 16
45
pixel_avg_pp_8xN_neon 32
46
+
47
+function PFX(pixel_avg_pp_12x16_neon)
48
+ sub x1, x1, #4
49
+ sub x3, x3, #4
50
+ sub x5, x5, #4
51
+.rept 16
52
+ ld1 {v0.s}0, x2, #4
53
+ ld1 {v1.8b}, x2, x3
54
+ ld1 {v2.s}0, x4, #4
55
+ ld1 {v3.8b}, x4, x5
56
+ urhadd v4.8b, v0.8b, v2.8b
57
+ urhadd v5.8b, v1.8b, v3.8b
58
+ st1 {v4.s}0, x0, #4
59
+ st1 {v5.8b}, x0, x1
60
+.endr
61
+ ret
62
+endfunc
63
+
64
+.macro pixel_avg_pp_16xN_neon h
65
+function PFX(pixel_avg_pp_16x\h\()_neon)
66
+.rept \h
67
+ ld1 {v0.16b}, x2, x3
68
+ ld1 {v1.16b}, x4, x5
69
+ urhadd v2.16b, v0.16b, v1.16b
70
+ st1 {v2.16b}, x0, x1
71
+.endr
72
+ ret
73
+endfunc
74
+.endm
75
+
76
+pixel_avg_pp_16xN_neon 4
77
+pixel_avg_pp_16xN_neon 8
78
+pixel_avg_pp_16xN_neon 12
79
+pixel_avg_pp_16xN_neon 16
80
+pixel_avg_pp_16xN_neon 32
81
+
82
+function PFX(pixel_avg_pp_16x64_neon)
83
+ mov w12, #8
84
+.lpavg_16x64:
85
+ sub w12, w12, #1
86
+.rept 8
87
+ ld1 {v0.16b}, x2, x3
88
+ ld1 {v1.16b}, x4, x5
89
+ urhadd v2.16b, v0.16b, v1.16b
90
+ st1 {v2.16b}, x0, x1
91
+.endr
92
+ cbnz w12, .lpavg_16x64
93
+ ret
94
+endfunc
95
+
96
+function PFX(pixel_avg_pp_24x32_neon)
97
+ sub x1, x1, #16
98
+ sub x3, x3, #16
99
+ sub x5, x5, #16
100
+ mov w12, #4
101
+.lpavg_24x32:
102
+ sub w12, w12, #1
103
+.rept 8
104
+ ld1 {v0.16b}, x2, #16
105
+ ld1 {v1.8b}, x2, x3
106
+ ld1 {v2.16b}, x4, #16
107
+ ld1 {v3.8b}, x4, x5
108
+ urhadd v0.16b, v0.16b, v2.16b
109
+ urhadd v1.8b, v1.8b, v3.8b
110
+ st1 {v0.16b}, x0, #16
111
+ st1 {v1.8b}, x0, x1
112
+.endr
113
+ cbnz w12, .lpavg_24x32
114
+ ret
115
+endfunc
116
+
117
+.macro pixel_avg_pp_32xN_neon h
118
+function PFX(pixel_avg_pp_32x\h\()_neon)
119
+.rept \h
120
+ ld1 {v0.16b-v1.16b}, x2, x3
121
+ ld1 {v2.16b-v3.16b}, x4, x5
122
+ urhadd v0.16b, v0.16b, v2.16b
123
+ urhadd v1.16b, v1.16b, v3.16b
124
+ st1 {v0.16b-v1.16b}, x0, x1
125
+.endr
126
+ ret
127
+endfunc
128
+.endm
129
+
130
+pixel_avg_pp_32xN_neon 8
131
+pixel_avg_pp_32xN_neon 16
132
+pixel_avg_pp_32xN_neon 24
133
+
134
+.macro pixel_avg_pp_32xN1_neon h
135
+function PFX(pixel_avg_pp_32x\h\()_neon)
136
+ mov w12, #\h / 8
137
+.lpavg_32x\h\():
138
+ sub w12, w12, #1
139
+.rept 8
140
+ ld1 {v0.16b-v1.16b}, x2, x3
141
+ ld1 {v2.16b-v3.16b}, x4, x5
142
+ urhadd v0.16b, v0.16b, v2.16b
143
+ urhadd v1.16b, v1.16b, v3.16b
144
+ st1 {v0.16b-v1.16b}, x0, x1
145
+.endr
146
+ cbnz w12, .lpavg_32x\h
147
+ ret
148
+endfunc
149
+.endm
150
+
151
+pixel_avg_pp_32xN1_neon 32
152
+pixel_avg_pp_32xN1_neon 64
153
+
154
+function PFX(pixel_avg_pp_48x64_neon)
155
+ mov w12, #8
156
+.lpavg_48x64:
157
+ sub w12, w12, #1
158
+.rept 8
159
+ ld1 {v0.16b-v2.16b}, x2, x3
160
+ ld1 {v3.16b-v5.16b}, x4, x5
161
+ urhadd v0.16b, v0.16b, v3.16b
162
+ urhadd v1.16b, v1.16b, v4.16b
163
+ urhadd v2.16b, v2.16b, v5.16b
164
+ st1 {v0.16b-v2.16b}, x0, x1
165
+.endr
166
+ cbnz w12, .lpavg_48x64
167
+ ret
168
+endfunc
169
+
170
+.macro pixel_avg_pp_64xN_neon h
171
+function PFX(pixel_avg_pp_64x\h\()_neon)
172
+ mov w12, #\h / 4
173
+.lpavg_64x\h\():
174
+ sub w12, w12, #1
175
+.rept 4
176
+ ld1 {v0.16b-v3.16b}, x2, x3
177
+ ld1 {v4.16b-v7.16b}, x4, x5
178
+ urhadd v0.16b, v0.16b, v4.16b
179
+ urhadd v1.16b, v1.16b, v5.16b
180
+ urhadd v2.16b, v2.16b, v6.16b
181
+ urhadd v3.16b, v3.16b, v7.16b
182
+ st1 {v0.16b-v3.16b}, x0, x1
183
+.endr
184
+ cbnz w12, .lpavg_64x\h
185
+ ret
186
+endfunc
187
+.endm
188
+
189
+pixel_avg_pp_64xN_neon 16
190
+pixel_avg_pp_64xN_neon 32
191
+pixel_avg_pp_64xN_neon 48
192
+pixel_avg_pp_64xN_neon 64
193
+
194
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
195
+.macro addAvg_2xN h
196
+function PFX(addAvg_2x\h\()_neon)
197
+ addAvg_start
198
+.rept \h / 2
199
+ ldr w10, x0
200
+ ldr w11, x1
201
+ add x0, x0, x3
202
+ add x1, x1, x4
203
+ ldr w12, x0
204
+ ldr w13, x1
205
+ add x0, x0, x3
206
+ add x1, x1, x4
207
+ dup v0.2s, w10
208
+ dup v1.2s, w11
209
+ dup v2.2s, w12
210
+ dup v3.2s, w13
211
+ add v0.4h, v0.4h, v1.4h
212
+ add v2.4h, v2.4h, v3.4h
213
+ saddl v0.4s, v0.4h, v30.4h
214
+ saddl v2.4s, v2.4h, v30.4h
215
+ shrn v0.4h, v0.4s, #7
216
+ shrn2 v0.8h, v2.4s, #7
217
+ sqxtun v0.8b, v0.8h
218
+ st1 {v0.h}0, x2, x5
219
+ st1 {v0.h}2, x2, x5
220
+.endr
221
+ ret
222
+endfunc
223
+.endm
224
+
225
+addAvg_2xN 4
226
+addAvg_2xN 8
227
+addAvg_2xN 16
228
+
229
+.macro addAvg_4xN h
230
+function PFX(addAvg_4x\h\()_neon)
231
+ addAvg_start
232
+.rept \h / 2
233
+ ld1 {v0.8b}, x0, x3
234
+ ld1 {v1.8b}, x1, x4
235
+ ld1 {v2.8b}, x0, x3
236
+ ld1 {v3.8b}, x1, x4
237
+ add v0.4h, v0.4h, v1.4h
238
+ add v2.4h, v2.4h, v3.4h
239
+ saddl v0.4s, v0.4h, v30.4h
240
+ saddl v2.4s, v2.4h, v30.4h
241
+ shrn v0.4h, v0.4s, #7
242
+ shrn2 v0.8h, v2.4s, #7
243
+ sqxtun v0.8b, v0.8h
244
+ st1 {v0.s}0, x2, x5
245
+ st1 {v0.s}1, x2, x5
246
+.endr
247
+ ret
248
+endfunc
249
+.endm
250
+
251
+addAvg_4xN 2
252
+addAvg_4xN 4
253
+addAvg_4xN 8
254
+addAvg_4xN 16
255
+addAvg_4xN 32
256
+
257
+.macro addAvg_6xN h
258
+function PFX(addAvg_6x\h\()_neon)
259
+ addAvg_start
260
+ mov w12, #\h / 2
261
+ sub x5, x5, #4
262
+.loop_addavg_6x\h:
263
+ sub w12, w12, #1
264
+ ld1 {v0.16b}, x0, x3
265
+ ld1 {v1.16b}, x1, x4
266
+ ld1 {v2.16b}, x0, x3
267
+ ld1 {v3.16b}, x1, x4
268
+ add v0.8h, v0.8h, v1.8h
269
+ add v2.8h, v2.8h, v3.8h
270
+ saddl v16.4s, v0.4h, v30.4h
271
+ saddl2 v17.4s, v0.8h, v30.8h
272
+ saddl v18.4s, v2.4h, v30.4h
273
+ saddl2 v19.4s, v2.8h, v30.8h
274
+ shrn v0.4h, v16.4s, #7
275
+ shrn2 v0.8h, v17.4s, #7
276
+ shrn v1.4h, v18.4s, #7
277
+ shrn2 v1.8h, v19.4s, #7
278
+ sqxtun v0.8b, v0.8h
279
+ sqxtun v1.8b, v1.8h
280
+ str s0, x2, #4
281
+ st1 {v0.h}2, x2, x5
282
+ str s1, x2, #4
283
+ st1 {v1.h}2, x2, x5
284
+ cbnz w12, .loop_addavg_6x\h
285
+ ret
286
+endfunc
287
+.endm
288
+
289
+addAvg_6xN 8
290
+addAvg_6xN 16
291
+
292
+.macro addAvg_8xN h
293
+function PFX(addAvg_8x\h\()_neon)
294
+ addAvg_start
295
+.rept \h / 2
296
+ ld1 {v0.16b}, x0, x3
297
+ ld1 {v1.16b}, x1, x4
298
+ ld1 {v2.16b}, x0, x3
299
+ ld1 {v3.16b}, x1, x4
300
+ add v0.8h, v0.8h, v1.8h
301
+ add v2.8h, v2.8h, v3.8h
302
+ saddl v16.4s, v0.4h, v30.4h
303
+ saddl2 v17.4s, v0.8h, v30.8h
304
+ saddl v18.4s, v2.4h, v30.4h
305
+ saddl2 v19.4s, v2.8h, v30.8h
306
+ shrn v0.4h, v16.4s, #7
307
+ shrn2 v0.8h, v17.4s, #7
308
+ shrn v1.4h, v18.4s, #7
309
+ shrn2 v1.8h, v19.4s, #7
310
+ sqxtun v0.8b, v0.8h
311
+ sqxtun v1.8b, v1.8h
312
+ st1 {v0.8b}, x2, x5
313
+ st1 {v1.8b}, x2, x5
314
+.endr
315
+ ret
316
+endfunc
317
+.endm
318
+
319
+.macro addAvg_8xN1 h
320
+function PFX(addAvg_8x\h\()_neon)
321
+ addAvg_start
322
+ mov w12, #\h / 2
323
+.loop_addavg_8x\h:
324
+ sub w12, w12, #1
325
+ ld1 {v0.16b}, x0, x3
326
+ ld1 {v1.16b}, x1, x4
327
+ ld1 {v2.16b}, x0, x3
328
+ ld1 {v3.16b}, x1, x4
329
+ add v0.8h, v0.8h, v1.8h
330
+ add v2.8h, v2.8h, v3.8h
331
+ saddl v16.4s, v0.4h, v30.4h
332
+ saddl2 v17.4s, v0.8h, v30.8h
333
+ saddl v18.4s, v2.4h, v30.4h
334
+ saddl2 v19.4s, v2.8h, v30.8h
335
+ shrn v0.4h, v16.4s, #7
336
+ shrn2 v0.8h, v17.4s, #7
337
+ shrn v1.4h, v18.4s, #7
338
+ shrn2 v1.8h, v19.4s, #7
339
+ sqxtun v0.8b, v0.8h
340
+ sqxtun v1.8b, v1.8h
341
+ st1 {v0.8b}, x2, x5
342
+ st1 {v1.8b}, x2, x5
343
+ cbnz w12, .loop_addavg_8x\h
344
+ ret
345
+endfunc
346
+.endm
347
+
348
+addAvg_8xN 2
349
+addAvg_8xN 4
350
+addAvg_8xN 6
351
+addAvg_8xN 8
352
+addAvg_8xN 12
353
+addAvg_8xN 16
354
+addAvg_8xN1 32
355
+addAvg_8xN1 64
356
+
357
+.macro addAvg_12xN h
358
+function PFX(addAvg_12x\h\()_neon)
359
+ addAvg_start
360
+ sub x3, x3, #16
361
+ sub x4, x4, #16
362
+ sub x5, x5, #8
363
+ mov w12, #\h
364
+.loop_addAvg_12X\h\():
365
+ sub w12, w12, #1
366
+ ld1 {v0.16b}, x0, #16
367
+ ld1 {v1.16b}, x1, #16
368
+ ld1 {v2.8b}, x0, x3
369
+ ld1 {v3.8b}, x1, x4
370
+ add v0.8h, v0.8h, v1.8h
371
+ add v2.4h, v2.4h, v3.4h
372
+ saddl v16.4s, v0.4h, v30.4h
373
+ saddl2 v17.4s, v0.8h, v30.8h
374
+ saddl v18.4s, v2.4h, v30.4h
375
+ shrn v0.4h, v16.4s, #7
376
+ shrn2 v0.8h, v17.4s, #7
377
+ shrn v1.4h, v18.4s, #7
378
+ sqxtun v0.8b, v0.8h
379
+ sqxtun v1.8b, v1.8h
380
+ st1 {v0.8b}, x2, #8
381
+ st1 {v1.s}0, x2, x5
382
+ cbnz w12, .loop_addAvg_12X\h
383
+ ret
384
+endfunc
385
+.endm
386
+
387
+addAvg_12xN 16
388
+addAvg_12xN 32
389
+
390
+.macro addAvg_16xN h
391
+function PFX(addAvg_16x\h\()_neon)
392
+ addAvg_start
393
+ mov w12, #\h
394
+.loop_addavg_16x\h:
395
+ sub w12, w12, #1
396
+ ld1 {v0.8h-v1.8h}, x0, x3
397
+ ld1 {v2.8h-v3.8h}, x1, x4
398
+ addavg_1 v0, v2
399
+ addavg_1 v1, v3
400
+ sqxtun v0.8b, v0.8h
401
+ sqxtun2 v0.16b, v1.8h
402
+ st1 {v0.16b}, x2, x5
403
+ cbnz w12, .loop_addavg_16x\h
404
+ ret
405
+endfunc
406
+.endm
407
+
408
+addAvg_16xN 4
409
+addAvg_16xN 8
410
+addAvg_16xN 12
411
+addAvg_16xN 16
412
+addAvg_16xN 24
413
+addAvg_16xN 32
414
+addAvg_16xN 64
415
+
416
+.macro addAvg_24xN h
417
+function PFX(addAvg_24x\h\()_neon)
418
+ addAvg_start
419
+ mov w12, #\h
420
+.loop_addavg_24x\h\():
421
+ sub w12, w12, #1
422
+ ld1 {v0.16b-v2.16b}, x0, x3
423
+ ld1 {v3.16b-v5.16b}, x1, x4
424
+ addavg_1 v0, v3
425
+ addavg_1 v1, v4
426
+ addavg_1 v2, v5
427
+ sqxtun v0.8b, v0.8h
428
+ sqxtun v1.8b, v1.8h
429
+ sqxtun v2.8b, v2.8h
430
+ st1 {v0.8b-v2.8b}, x2, x5
431
+ cbnz w12, .loop_addavg_24x\h
432
+ ret
433
+endfunc
434
+.endm
435
+
436
+addAvg_24xN 32
437
+addAvg_24xN 64
438
+
439
+.macro addAvg_32xN h
440
+function PFX(addAvg_32x\h\()_neon)
441
+ addAvg_start
442
+ mov w12, #\h
443
+.loop_addavg_32x\h\():
444
+ sub w12, w12, #1
445
+ ld1 {v0.8h-v3.8h}, x0, x3
446
+ ld1 {v4.8h-v7.8h}, x1, x4
447
+ addavg_1 v0, v4
448
+ addavg_1 v1, v5
449
+ addavg_1 v2, v6
450
+ addavg_1 v3, v7
451
+ sqxtun v0.8b, v0.8h
452
+ sqxtun v1.8b, v1.8h
453
+ sqxtun v2.8b, v2.8h
454
+ sqxtun v3.8b, v3.8h
455
+ st1 {v0.8b-v3.8b}, x2, x5
456
+ cbnz w12, .loop_addavg_32x\h
457
+ ret
458
+endfunc
459
+.endm
460
+
461
+addAvg_32xN 8
462
+addAvg_32xN 16
463
+addAvg_32xN 24
464
+addAvg_32xN 32
465
+addAvg_32xN 48
466
+addAvg_32xN 64
467
+
468
+function PFX(addAvg_48x64_neon)
469
+ addAvg_start
470
+ sub x3, x3, #64
471
+ sub x4, x4, #64
472
+ mov w12, #64
473
+.loop_addavg_48x64:
474
+ sub w12, w12, #1
475
+ ld1 {v0.8h-v3.8h}, x0, #64
476
+ ld1 {v4.8h-v7.8h}, x1, #64
477
+ ld1 {v20.8h-v21.8h}, x0, x3
478
+ ld1 {v22.8h-v23.8h}, x1, x4
479
+ addavg_1 v0, v4
480
+ addavg_1 v1, v5
481
+ addavg_1 v2, v6
482
+ addavg_1 v3, v7
483
+ addavg_1 v20, v22
484
+ addavg_1 v21, v23
485
+ sqxtun v0.8b, v0.8h
486
+ sqxtun2 v0.16b, v1.8h
487
+ sqxtun v1.8b, v2.8h
488
+ sqxtun2 v1.16b, v3.8h
489
+ sqxtun v2.8b, v20.8h
490
+ sqxtun2 v2.16b, v21.8h
491
+ st1 {v0.16b-v2.16b}, x2, x5
492
+ cbnz w12, .loop_addavg_48x64
493
+ ret
494
+endfunc
495
+
496
+.macro addAvg_64xN h
497
+function PFX(addAvg_64x\h\()_neon)
498
+ addAvg_start
499
+ mov w12, #\h
500
+ sub x3, x3, #64
501
+ sub x4, x4, #64
502
+.loop_addavg_64x\h\():
503
+ sub w12, w12, #1
504
+ ld1 {v0.8h-v3.8h}, x0, #64
505
+ ld1 {v4.8h-v7.8h}, x1, #64
506
+ ld1 {v20.8h-v23.8h}, x0, x3
507
+ ld1 {v24.8h-v27.8h}, x1, x4
508
+ addavg_1 v0, v4
509
+ addavg_1 v1, v5
510
+ addavg_1 v2, v6
511
+ addavg_1 v3, v7
512
+ addavg_1 v20, v24
513
+ addavg_1 v21, v25
514
+ addavg_1 v22, v26
515
+ addavg_1 v23, v27
516
+ sqxtun v0.8b, v0.8h
517
+ sqxtun2 v0.16b, v1.8h
518
+ sqxtun v1.8b, v2.8h
519
+ sqxtun2 v1.16b, v3.8h
520
+ sqxtun v2.8b, v20.8h
521
+ sqxtun2 v2.16b, v21.8h
522
+ sqxtun v3.8b, v22.8h
523
+ sqxtun2 v3.16b, v23.8h
524
+ st1 {v0.16b-v3.16b}, x2, x5
525
+ cbnz w12, .loop_addavg_64x\h
526
+ ret
527
+endfunc
528
+.endm
529
+
530
+addAvg_64xN 16
531
+addAvg_64xN 32
532
+addAvg_64xN 48
533
+addAvg_64xN 64
534
x265_3.6.tar.gz/source/common/aarch64/p2s-common.S
Added
104
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+#if HIGH_BIT_DEPTH
39
+# if BIT_DEPTH == 10
40
+# define P2S_SHIFT 4
41
+# elif BIT_DEPTH == 12
42
+# define P2S_SHIFT 2
43
+# endif
44
+.macro p2s_start
45
+ add x3, x3, x3
46
+ add x1, x1, x1
47
+ movi v31.8h, #0xe0, lsl #8
48
+.endm
49
+
50
+#else // if !HIGH_BIT_DEPTH
51
+# define P2S_SHIFT 6
52
+.macro p2s_start
53
+ add x3, x3, x3
54
+ movi v31.8h, #0xe0, lsl #8
55
+.endm
56
+#endif // HIGH_BIT_DEPTH
57
+
58
+.macro p2s_2x2
59
+#if HIGH_BIT_DEPTH
60
+ ld1 {v0.s}0, x0, x1
61
+ ld1 {v0.s}1, x0, x1
62
+ shl v3.8h, v0.8h, #P2S_SHIFT
63
+#else
64
+ ldrh w10, x0
65
+ add x0, x0, x1
66
+ ldrh w11, x0
67
+ orr w10, w10, w11, lsl #16
68
+ add x0, x0, x1
69
+ dup v0.4s, w10
70
+ ushll v3.8h, v0.8b, #P2S_SHIFT
71
+#endif
72
+ add v3.8h, v3.8h, v31.8h
73
+ st1 {v3.s}0, x2, x3
74
+ st1 {v3.s}1, x2, x3
75
+.endm
76
+
77
+.macro p2s_6x2
78
+#if HIGH_BIT_DEPTH
79
+ ld1 {v0.d}0, x0, #8
80
+ ld1 {v1.s}0, x0, x1
81
+ ld1 {v0.d}1, x0, #8
82
+ ld1 {v1.s}1, x0, x1
83
+ shl v3.8h, v0.8h, #P2S_SHIFT
84
+ shl v4.8h, v1.8h, #P2S_SHIFT
85
+#else
86
+ ldr s0, x0
87
+ ldrh w10, x0, #4
88
+ add x0, x0, x1
89
+ ld1 {v0.s}1, x0
90
+ ldrh w11, x0, #4
91
+ add x0, x0, x1
92
+ orr w10, w10, w11, lsl #16
93
+ dup v1.4s, w10
94
+ ushll v3.8h, v0.8b, #P2S_SHIFT
95
+ ushll v4.8h, v1.8b, #P2S_SHIFT
96
+#endif
97
+ add v3.8h, v3.8h, v31.8h
98
+ add v4.8h, v4.8h, v31.8h
99
+ st1 {v3.d}0, x2, #8
100
+ st1 {v4.s}0, x2, x3
101
+ st1 {v3.d}1, x2, #8
102
+ st1 {v4.s}1, x2, x3
103
+.endm
104
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S
Added
447
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "p2s-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+#if HIGH_BIT_DEPTH
41
+# if BIT_DEPTH == 10
42
+# define P2S_SHIFT 4
43
+# elif BIT_DEPTH == 12
44
+# define P2S_SHIFT 2
45
+# endif
46
+
47
+.macro p2s_start_sve
48
+ add x3, x3, x3
49
+ add x1, x1, x1
50
+ mov z31.h, #0xe0, lsl #8
51
+.endm
52
+
53
+#else // if !HIGH_BIT_DEPTH
54
+# define P2S_SHIFT 6
55
+.macro p2s_start_sve
56
+ add x3, x3, x3
57
+ mov z31.h, #0xe0, lsl #8
58
+.endm
59
+
60
+#endif // HIGH_BIT_DEPTH
61
+
62
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
63
+.macro p2s_2xN_sve h
64
+function PFX(filterPixelToShort_2x\h\()_sve)
65
+ p2s_start_sve
66
+.rept \h / 2
67
+ p2s_2x2
68
+.endr
69
+ ret
70
+endfunc
71
+.endm
72
+
73
+p2s_2xN_sve 4
74
+p2s_2xN_sve 8
75
+p2s_2xN_sve 16
76
+
77
+.macro p2s_6xN_sve h
78
+function PFX(filterPixelToShort_6x\h\()_sve)
79
+ p2s_start_sve
80
+ sub x3, x3, #8
81
+#if HIGH_BIT_DEPTH
82
+ sub x1, x1, #8
83
+#endif
84
+.rept \h / 2
85
+ p2s_6x2
86
+.endr
87
+ ret
88
+endfunc
89
+.endm
90
+
91
+p2s_6xN_sve 8
92
+p2s_6xN_sve 16
93
+
94
+function PFX(filterPixelToShort_4x2_sve)
95
+ p2s_start_sve
96
+#if HIGH_BIT_DEPTH
97
+ ptrue p0.h, vl8
98
+ index z1.d, #0, x1
99
+ index z2.d, #0, x3
100
+ ld1d {z3.d}, p0/z, x0, z1.d
101
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
102
+ add z3.h, p0/m, z3.h, z31.h
103
+ st1d {z3.d}, p0, x2, z2.d
104
+#else
105
+ ptrue p0.h, vl4
106
+ ld1b {z0.h}, p0/z, x0
107
+ add x0, x0, x1
108
+ ld1b {z1.h}, p0/z, x0
109
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
110
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
111
+ add z0.h, p0/m, z0.h, z31.h
112
+ add z1.h, p0/m, z1.h, z31.h
113
+ st1h {z0.h}, p0, x2
114
+ add x2, x2, x3
115
+ st1h {z1.h}, p0, x2
116
+#endif
117
+ ret
118
+endfunc
119
+
120
+
121
+.macro p2s_8xN_sve h
122
+function PFX(filterPixelToShort_8x\h\()_sve)
123
+ p2s_start_sve
124
+ ptrue p0.h, vl8
125
+.rept \h
126
+#if HIGH_BIT_DEPTH
127
+ ld1d {z0.d}, p0/z, x0
128
+ add x0, x0, x1
129
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
130
+ add z0.h, p0/m, z0.h, z31.h
131
+ st1h {z0.h}, p0, x2
132
+ add x2, x2, x3
133
+#else
134
+ ld1b {z0.h}, p0/z, x0
135
+ add x0, x0, x1
136
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
137
+ add z0.h, p0/m, z0.h, z31.h
138
+ st1h {z0.h}, p0, x2
139
+ add x2, x2, x3
140
+#endif
141
+.endr
142
+ ret
143
+endfunc
144
+.endm
145
+
146
+p2s_8xN_sve 2
147
+
148
+.macro p2s_32xN_sve h
149
+function PFX(filterPixelToShort_32x\h\()_sve)
150
+#if HIGH_BIT_DEPTH
151
+ p2s_start_sve
152
+ rdvl x9, #1
153
+ cmp x9, #16
154
+ bgt .vl_gt_16_filterPixelToShort_high_32x\h
155
+ ptrue p0.h, vl8
156
+.rept \h
157
+ ld1h {z0.h}, p0/z, x0
158
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
159
+ ld1h {z2.h}, p0/z, x0, #2, mul vl
160
+ ld1h {z3.h}, p0/z, x0, #3, mul vl
161
+ add x0, x0, x1
162
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
163
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
164
+ lsl z2.h, p0/m, z2.h, #P2S_SHIFT
165
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
166
+ add z0.h, p0/m, z0.h, z31.h
167
+ add z1.h, p0/m, z1.h, z31.h
168
+ add z2.h, p0/m, z2.h, z31.h
169
+ add z3.h, p0/m, z3.h, z31.h
170
+ st1h {z0.h}, p0, x2
171
+ st1h {z1.h}, p0, x2, #1, mul vl
172
+ st1h {z2.h}, p0, x2, #2, mul vl
173
+ st1h {z3.h}, p0, x2, #3, mul vl
174
+ add x2, x2, x3
175
+.endr
176
+ ret
177
+.vl_gt_16_filterPixelToShort_high_32x\h\():
178
+ cmp x9, #48
179
+ bgt .vl_gt_48_filterPixelToShort_high_32x\h
180
+ ptrue p0.h, vl16
181
+.rept \h
182
+ ld1h {z0.h}, p0/z, x0
183
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
184
+ add x0, x0, x1
185
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
186
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
187
+ add z0.h, p0/m, z0.h, z31.h
188
+ add z1.h, p0/m, z1.h, z31.h
189
+ st1h {z0.h}, p0, x2
190
+ st1h {z1.h}, p0, x2, #1, mul vl
191
+ add x2, x2, x3
192
+.endr
193
+ ret
194
+.vl_gt_48_filterPixelToShort_high_32x\h\():
195
+ ptrue p0.h, vl32
196
+.rept \h
197
+ ld1h {z0.h}, p0/z, x0
198
+ add x0, x0, x1
199
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
200
+ add z0.h, p0/m, z0.h, z31.h
201
+ st1h {z0.h}, p0, x2
202
+ add x2, x2, x3
203
+.endr
204
+ ret
205
+#else
206
+ p2s_start
207
+ mov x9, #\h
208
+.loop_filter_sve_P2S_32x\h:
209
+ sub x9, x9, #1
210
+ ld1 {v0.16b-v1.16b}, x0, x1
211
+ ushll v22.8h, v0.8b, #P2S_SHIFT
212
+ ushll2 v23.8h, v0.16b, #P2S_SHIFT
213
+ ushll v24.8h, v1.8b, #P2S_SHIFT
214
+ ushll2 v25.8h, v1.16b, #P2S_SHIFT
215
+ add v22.8h, v22.8h, v31.8h
216
+ add v23.8h, v23.8h, v31.8h
217
+ add v24.8h, v24.8h, v31.8h
218
+ add v25.8h, v25.8h, v31.8h
219
+ st1 {v22.16b-v25.16b}, x2, x3
220
+ cbnz x9, .loop_filter_sve_P2S_32x\h
221
+ ret
222
+#endif
223
+endfunc
224
+.endm
225
+
226
+p2s_32xN_sve 8
227
+p2s_32xN_sve 16
228
+p2s_32xN_sve 24
229
+p2s_32xN_sve 32
230
+p2s_32xN_sve 48
231
+p2s_32xN_sve 64
232
+
233
+.macro p2s_64xN_sve h
234
+function PFX(filterPixelToShort_64x\h\()_sve)
235
+#if HIGH_BIT_DEPTH
236
+ p2s_start_sve
237
+ rdvl x9, #1
238
+ cmp x9, #16
239
+ bgt .vl_gt_16_filterPixelToShort_high_64x\h
240
+ ptrue p0.h, vl8
241
+.rept \h
242
+ ld1h {z0.h}, p0/z, x0
243
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
244
+ ld1h {z2.h}, p0/z, x0, #2, mul vl
245
+ ld1h {z3.h}, p0/z, x0, #3, mul vl
246
+ ld1h {z4.h}, p0/z, x0, #4, mul vl
247
+ ld1h {z5.h}, p0/z, x0, #5, mul vl
248
+ ld1h {z6.h}, p0/z, x0, #6, mul vl
249
+ ld1h {z7.h}, p0/z, x0, #7, mul vl
250
+ add x0, x0, x1
251
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
252
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
253
+ lsl z2.h, p0/m, z2.h, #P2S_SHIFT
254
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
255
+ lsl z4.h, p0/m, z4.h, #P2S_SHIFT
256
+ lsl z5.h, p0/m, z5.h, #P2S_SHIFT
257
+ lsl z6.h, p0/m, z6.h, #P2S_SHIFT
258
+ lsl z7.h, p0/m, z7.h, #P2S_SHIFT
259
+ add z0.h, p0/m, z0.h, z31.h
260
+ add z1.h, p0/m, z1.h, z31.h
261
+ add z2.h, p0/m, z2.h, z31.h
262
+ add z3.h, p0/m, z3.h, z31.h
263
+ add z4.h, p0/m, z4.h, z31.h
264
+ add z5.h, p0/m, z5.h, z31.h
265
+ add z6.h, p0/m, z6.h, z31.h
266
+ add z7.h, p0/m, z7.h, z31.h
267
+ st1h {z0.h}, p0, x2
268
+ st1h {z1.h}, p0, x2, #1, mul vl
269
+ st1h {z2.h}, p0, x2, #2, mul vl
270
+ st1h {z3.h}, p0, x2, #3, mul vl
271
+ st1h {z4.h}, p0, x2, #4, mul vl
272
+ st1h {z5.h}, p0, x2, #5, mul vl
273
+ st1h {z6.h}, p0, x2, #6, mul vl
274
+ st1h {z7.h}, p0, x2, #7, mul vl
275
+ add x2, x2, x3
276
+.endr
277
+ ret
278
+.vl_gt_16_filterPixelToShort_high_64x\h\():
279
+ cmp x9, #48
280
+ bgt .vl_gt_48_filterPixelToShort_high_64x\h
281
+ ptrue p0.h, vl16
282
+.rept \h
283
+ ld1h {z0.h}, p0/z, x0
284
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
285
+ ld1h {z2.h}, p0/z, x0, #2, mul vl
286
+ ld1h {z3.h}, p0/z, x0, #3, mul vl
287
+ add x0, x0, x1
288
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
289
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
290
+ lsl z2.h, p0/m, z2.h, #P2S_SHIFT
291
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
292
+ add z0.h, p0/m, z0.h, z31.h
293
+ add z1.h, p0/m, z1.h, z31.h
294
+ add z2.h, p0/m, z2.h, z31.h
295
+ add z3.h, p0/m, z3.h, z31.h
296
+ st1h {z0.h}, p0, x2
297
+ st1h {z1.h}, p0, x2, #1, mul vl
298
+ st1h {z2.h}, p0, x2, #2, mul vl
299
+ st1h {z3.h}, p0, x2, #3, mul vl
300
+ add x2, x2, x3
301
+.endr
302
+ ret
303
+.vl_gt_48_filterPixelToShort_high_64x\h\():
304
+ cmp x9, #112
305
+ bgt .vl_gt_112_filterPixelToShort_high_64x\h
306
+ ptrue p0.h, vl32
307
+.rept \h
308
+ ld1h {z0.h}, p0/z, x0
309
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
310
+ add x0, x0, x1
311
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
312
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
313
+ add z0.h, p0/m, z0.h, z31.h
314
+ add z1.h, p0/m, z1.h, z31.h
315
+ st1h {z0.h}, p0, x2
316
+ st1h {z1.h}, p0, x2, #1, mul vl
317
+ add x2, x2, x3
318
+.endr
319
+ ret
320
+.vl_gt_112_filterPixelToShort_high_64x\h\():
321
+ ptrue p0.h, vl64
322
+.rept \h
323
+ ld1h {z0.h}, p0/z, x0
324
+ add x0, x0, x1
325
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
326
+ add z0.h, p0/m, z0.h, z31.h
327
+ st1h {z0.h}, p0, x2
328
+ add x2, x2, x3
329
+.endr
330
+ ret
331
+#else
332
+ p2s_start
333
+ sub x3, x3, #64
334
+ mov x9, #\h
335
+.loop_filter_sve_P2S_64x\h:
336
+ sub x9, x9, #1
337
+ ld1 {v0.16b-v3.16b}, x0, x1
338
+ ushll v16.8h, v0.8b, #P2S_SHIFT
339
+ ushll2 v17.8h, v0.16b, #P2S_SHIFT
340
+ ushll v18.8h, v1.8b, #P2S_SHIFT
341
+ ushll2 v19.8h, v1.16b, #P2S_SHIFT
342
+ ushll v20.8h, v2.8b, #P2S_SHIFT
343
+ ushll2 v21.8h, v2.16b, #P2S_SHIFT
344
+ ushll v22.8h, v3.8b, #P2S_SHIFT
345
+ ushll2 v23.8h, v3.16b, #P2S_SHIFT
346
+ add v16.8h, v16.8h, v31.8h
347
+ add v17.8h, v17.8h, v31.8h
348
+ add v18.8h, v18.8h, v31.8h
349
+ add v19.8h, v19.8h, v31.8h
350
+ add v20.8h, v20.8h, v31.8h
351
+ add v21.8h, v21.8h, v31.8h
352
+ add v22.8h, v22.8h, v31.8h
353
+ add v23.8h, v23.8h, v31.8h
354
+ st1 {v16.16b-v19.16b}, x2, #64
355
+ st1 {v20.16b-v23.16b}, x2, x3
356
+ cbnz x9, .loop_filter_sve_P2S_64x\h
357
+ ret
358
+#endif
359
+endfunc
360
+.endm
361
+
362
+p2s_64xN_sve 16
363
+p2s_64xN_sve 32
364
+p2s_64xN_sve 48
365
+p2s_64xN_sve 64
366
+
367
+function PFX(filterPixelToShort_48x64_sve)
368
+#if HIGH_BIT_DEPTH
369
+ p2s_start_sve
370
+ rdvl x9, #1
371
+ cmp x9, #16
372
+ bgt .vl_gt_16_filterPixelToShort_high_48x64
373
+ ptrue p0.h, vl8
374
+.rept 64
375
+ ld1h {z0.h}, p0/z, x0
376
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
377
+ ld1h {z2.h}, p0/z, x0, #2, mul vl
378
+ ld1h {z3.h}, p0/z, x0, #3, mul vl
379
+ ld1h {z4.h}, p0/z, x0, #4, mul vl
380
+ ld1h {z5.h}, p0/z, x0, #5, mul vl
381
+ add x0, x0, x1
382
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
383
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
384
+ lsl z2.h, p0/m, z2.h, #P2S_SHIFT
385
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
386
+ lsl z4.h, p0/m, z4.h, #P2S_SHIFT
387
+ lsl z5.h, p0/m, z5.h, #P2S_SHIFT
388
+ add z0.h, p0/m, z0.h, z31.h
389
+ add z1.h, p0/m, z1.h, z31.h
390
+ add z2.h, p0/m, z2.h, z31.h
391
+ add z3.h, p0/m, z3.h, z31.h
392
+ add z4.h, p0/m, z4.h, z31.h
393
+ add z5.h, p0/m, z5.h, z31.h
394
+ st1h {z0.h}, p0, x2
395
+ st1h {z1.h}, p0, x2, #1, mul vl
396
+ st1h {z2.h}, p0, x2, #2, mul vl
397
+ st1h {z3.h}, p0, x2, #3, mul vl
398
+ st1h {z4.h}, p0, x2, #4, mul vl
399
+ st1h {z5.h}, p0, x2, #5, mul vl
400
+ add x2, x2, x3
401
+.endr
402
+ ret
403
+.vl_gt_16_filterPixelToShort_high_48x64:
404
+ ptrue p0.h, vl16
405
+.rept 64
406
+ ld1h {z0.h}, p0/z, x0
407
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
408
+ ld1h {z2.h}, p0/z, x0, #2, mul vl
409
+ add x0, x0, x1
410
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
411
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
412
+ lsl z2.h, p0/m, z2.h, #P2S_SHIFT
413
+ add z0.h, p0/m, z0.h, z31.h
414
+ add z1.h, p0/m, z1.h, z31.h
415
+ add z2.h, p0/m, z2.h, z31.h
416
+ st1h {z0.h}, p0, x2
417
+ st1h {z1.h}, p0, x2, #1, mul vl
418
+ st1h {z2.h}, p0, x2, #2, mul vl
419
+ add x2, x2, x3
420
+.endr
421
+ ret
422
+#else
423
+ p2s_start
424
+ sub x3, x3, #64
425
+ mov x9, #64
426
+.loop_filterP2S_sve_48x64:
427
+ sub x9, x9, #1
428
+ ld1 {v0.16b-v2.16b}, x0, x1
429
+ ushll v16.8h, v0.8b, #P2S_SHIFT
430
+ ushll2 v17.8h, v0.16b, #P2S_SHIFT
431
+ ushll v18.8h, v1.8b, #P2S_SHIFT
432
+ ushll2 v19.8h, v1.16b, #P2S_SHIFT
433
+ ushll v20.8h, v2.8b, #P2S_SHIFT
434
+ ushll2 v21.8h, v2.16b, #P2S_SHIFT
435
+ add v16.8h, v16.8h, v31.8h
436
+ add v17.8h, v17.8h, v31.8h
437
+ add v18.8h, v18.8h, v31.8h
438
+ add v19.8h, v19.8h, v31.8h
439
+ add v20.8h, v20.8h, v31.8h
440
+ add v21.8h, v21.8h, v31.8h
441
+ st1 {v16.16b-v19.16b}, x2, #64
442
+ st1 {v20.16b-v21.16b}, x2, x3
443
+ cbnz x9, .loop_filterP2S_sve_48x64
444
+ ret
445
+#endif
446
+endfunc
447
x265_3.6.tar.gz/source/common/aarch64/p2s.S
Added
388
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "p2s-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
39
+.macro p2s_2xN h
40
+function PFX(filterPixelToShort_2x\h\()_neon)
41
+ p2s_start
42
+.rept \h / 2
43
+ p2s_2x2
44
+.endr
45
+ ret
46
+endfunc
47
+.endm
48
+
49
+p2s_2xN 4
50
+p2s_2xN 8
51
+p2s_2xN 16
52
+
53
+.macro p2s_6xN h
54
+function PFX(filterPixelToShort_6x\h\()_neon)
55
+ p2s_start
56
+ sub x3, x3, #8
57
+#if HIGH_BIT_DEPTH
58
+ sub x1, x1, #8
59
+#endif
60
+.rept \h / 2
61
+ p2s_6x2
62
+.endr
63
+ ret
64
+endfunc
65
+.endm
66
+
67
+p2s_6xN 8
68
+p2s_6xN 16
69
+
70
+function PFX(filterPixelToShort_4x2_neon)
71
+ p2s_start
72
+#if HIGH_BIT_DEPTH
73
+ ld1 {v0.d}0, x0, x1
74
+ ld1 {v0.d}1, x0, x1
75
+ shl v3.8h, v0.8h, #P2S_SHIFT
76
+#else
77
+ ld1 {v0.s}0, x0, x1
78
+ ld1 {v0.s}1, x0, x1
79
+ ushll v3.8h, v0.8b, #P2S_SHIFT
80
+#endif
81
+ add v3.8h, v3.8h, v31.8h
82
+ st1 {v3.d}0, x2, x3
83
+ st1 {v3.d}1, x2, x3
84
+ ret
85
+endfunc
86
+
87
+function PFX(filterPixelToShort_4x4_neon)
88
+ p2s_start
89
+#if HIGH_BIT_DEPTH
90
+ ld1 {v0.d}0, x0, x1
91
+ ld1 {v0.d}1, x0, x1
92
+ shl v3.8h, v0.8h, #P2S_SHIFT
93
+#else
94
+ ld1 {v0.s}0, x0, x1
95
+ ld1 {v0.s}1, x0, x1
96
+ ushll v3.8h, v0.8b, #P2S_SHIFT
97
+#endif
98
+ add v3.8h, v3.8h, v31.8h
99
+ st1 {v3.d}0, x2, x3
100
+ st1 {v3.d}1, x2, x3
101
+#if HIGH_BIT_DEPTH
102
+ ld1 {v1.d}0, x0, x1
103
+ ld1 {v1.d}1, x0, x1
104
+ shl v4.8h, v1.8h, #P2S_SHIFT
105
+#else
106
+ ld1 {v1.s}0, x0, x1
107
+ ld1 {v1.s}1, x0, x1
108
+ ushll v4.8h, v1.8b, #P2S_SHIFT
109
+#endif
110
+ add v4.8h, v4.8h, v31.8h
111
+ st1 {v4.d}0, x2, x3
112
+ st1 {v4.d}1, x2, x3
113
+ ret
114
+endfunc
115
+
116
+.macro p2s_4xN h
117
+function PFX(filterPixelToShort_4x\h\()_neon)
118
+ p2s_start
119
+.rept \h / 2
120
+#if HIGH_BIT_DEPTH
121
+ ld1 {v0.16b}, x0, x1
122
+ shl v0.8h, v0.8h, #P2S_SHIFT
123
+#else
124
+ ld1 {v0.8b}, x0, x1
125
+ ushll v0.8h, v0.8b, #P2S_SHIFT
126
+#endif
127
+ add v2.4h, v0.4h, v31.4h
128
+ st1 {v2.4h}, x2, x3
129
+#if HIGH_BIT_DEPTH
130
+ ld1 {v1.16b}, x0, x1
131
+ shl v1.8h, v1.8h, #P2S_SHIFT
132
+#else
133
+ ld1 {v1.8b}, x0, x1
134
+ ushll v1.8h, v1.8b, #P2S_SHIFT
135
+#endif
136
+ add v3.4h, v1.4h, v31.4h
137
+ st1 {v3.4h}, x2, x3
138
+.endr
139
+ ret
140
+endfunc
141
+.endm
142
+
143
+p2s_4xN 8
144
+p2s_4xN 16
145
+p2s_4xN 32
146
+
147
+.macro p2s_8xN h
148
+function PFX(filterPixelToShort_8x\h\()_neon)
149
+ p2s_start
150
+.rept \h / 2
151
+#if HIGH_BIT_DEPTH
152
+ ld1 {v0.16b}, x0, x1
153
+ ld1 {v1.16b}, x0, x1
154
+ shl v0.8h, v0.8h, #P2S_SHIFT
155
+ shl v1.8h, v1.8h, #P2S_SHIFT
156
+#else
157
+ ld1 {v0.8b}, x0, x1
158
+ ld1 {v1.8b}, x0, x1
159
+ ushll v0.8h, v0.8b, #P2S_SHIFT
160
+ ushll v1.8h, v1.8b, #P2S_SHIFT
161
+#endif
162
+ add v2.8h, v0.8h, v31.8h
163
+ st1 {v2.8h}, x2, x3
164
+ add v3.8h, v1.8h, v31.8h
165
+ st1 {v3.8h}, x2, x3
166
+.endr
167
+ ret
168
+endfunc
169
+.endm
170
+
171
+p2s_8xN 2
172
+p2s_8xN 4
173
+p2s_8xN 6
174
+p2s_8xN 8
175
+p2s_8xN 12
176
+p2s_8xN 16
177
+p2s_8xN 32
178
+p2s_8xN 64
179
+
180
+.macro p2s_12xN h
181
+function PFX(filterPixelToShort_12x\h\()_neon)
182
+ p2s_start
183
+ sub x3, x3, #16
184
+.rept \h
185
+#if HIGH_BIT_DEPTH
186
+ ld1 {v0.16b-v1.16b}, x0, x1
187
+ shl v2.8h, v0.8h, #P2S_SHIFT
188
+ shl v3.8h, v1.8h, #P2S_SHIFT
189
+#else
190
+ ld1 {v0.16b}, x0, x1
191
+ ushll v2.8h, v0.8b, #P2S_SHIFT
192
+ ushll2 v3.8h, v0.16b, #P2S_SHIFT
193
+#endif
194
+ add v2.8h, v2.8h, v31.8h
195
+ add v3.8h, v3.8h, v31.8h
196
+ st1 {v2.16b}, x2, #16
197
+ st1 {v3.8b}, x2, x3
198
+.endr
199
+ ret
200
+endfunc
201
+.endm
202
+
203
+p2s_12xN 16
204
+p2s_12xN 32
205
+
206
+.macro p2s_16xN h
207
+function PFX(filterPixelToShort_16x\h\()_neon)
208
+ p2s_start
209
+.rept \h
210
+#if HIGH_BIT_DEPTH
211
+ ld1 {v0.16b-v1.16b}, x0, x1
212
+ shl v2.8h, v0.8h, #P2S_SHIFT
213
+ shl v3.8h, v1.8h, #P2S_SHIFT
214
+#else
215
+ ld1 {v0.16b}, x0, x1
216
+ ushll v2.8h, v0.8b, #P2S_SHIFT
217
+ ushll2 v3.8h, v0.16b, #P2S_SHIFT
218
+#endif
219
+ add v2.8h, v2.8h, v31.8h
220
+ add v3.8h, v3.8h, v31.8h
221
+ st1 {v2.16b-v3.16b}, x2, x3
222
+.endr
223
+ ret
224
+endfunc
225
+.endm
226
+
227
+p2s_16xN 4
228
+p2s_16xN 8
229
+p2s_16xN 12
230
+p2s_16xN 16
231
+p2s_16xN 24
232
+p2s_16xN 32
233
+p2s_16xN 64
234
+
235
+.macro p2s_24xN h
236
+function PFX(filterPixelToShort_24x\h\()_neon)
237
+ p2s_start
238
+.rept \h
239
+#if HIGH_BIT_DEPTH
240
+ ld1 {v0.16b-v2.16b}, x0, x1
241
+ shl v3.8h, v0.8h, #P2S_SHIFT
242
+ shl v4.8h, v1.8h, #P2S_SHIFT
243
+ shl v5.8h, v2.8h, #P2S_SHIFT
244
+#else
245
+ ld1 {v0.8b-v2.8b}, x0, x1
246
+ ushll v3.8h, v0.8b, #P2S_SHIFT
247
+ ushll v4.8h, v1.8b, #P2S_SHIFT
248
+ ushll v5.8h, v2.8b, #P2S_SHIFT
249
+#endif
250
+ add v3.8h, v3.8h, v31.8h
251
+ add v4.8h, v4.8h, v31.8h
252
+ add v5.8h, v5.8h, v31.8h
253
+ st1 {v3.16b-v5.16b}, x2, x3
254
+.endr
255
+ ret
256
+endfunc
257
+.endm
258
+
259
+p2s_24xN 32
260
+p2s_24xN 64
261
+
262
+.macro p2s_32xN h
263
+function PFX(filterPixelToShort_32x\h\()_neon)
264
+ p2s_start
265
+ mov x9, #\h
266
+.loop_filterP2S_32x\h:
267
+ sub x9, x9, #1
268
+#if HIGH_BIT_DEPTH
269
+ ld1 {v0.16b-v3.16b}, x0, x1
270
+ shl v22.8h, v0.8h, #P2S_SHIFT
271
+ shl v23.8h, v1.8h, #P2S_SHIFT
272
+ shl v24.8h, v2.8h, #P2S_SHIFT
273
+ shl v25.8h, v3.8h, #P2S_SHIFT
274
+#else
275
+ ld1 {v0.16b-v1.16b}, x0, x1
276
+ ushll v22.8h, v0.8b, #P2S_SHIFT
277
+ ushll2 v23.8h, v0.16b, #P2S_SHIFT
278
+ ushll v24.8h, v1.8b, #P2S_SHIFT
279
+ ushll2 v25.8h, v1.16b, #P2S_SHIFT
280
+#endif
281
+ add v22.8h, v22.8h, v31.8h
282
+ add v23.8h, v23.8h, v31.8h
283
+ add v24.8h, v24.8h, v31.8h
284
+ add v25.8h, v25.8h, v31.8h
285
+ st1 {v22.16b-v25.16b}, x2, x3
286
+ cbnz x9, .loop_filterP2S_32x\h
287
+ ret
288
+endfunc
289
+.endm
290
+
291
+p2s_32xN 8
292
+p2s_32xN 16
293
+p2s_32xN 24
294
+p2s_32xN 32
295
+p2s_32xN 48
296
+p2s_32xN 64
297
+
298
+.macro p2s_64xN h
299
+function PFX(filterPixelToShort_64x\h\()_neon)
300
+ p2s_start
301
+#if HIGH_BIT_DEPTH
302
+ sub x1, x1, #64
303
+#endif
304
+ sub x3, x3, #64
305
+ mov x9, #\h
306
+.loop_filterP2S_64x\h:
307
+ sub x9, x9, #1
308
+#if HIGH_BIT_DEPTH
309
+ ld1 {v0.16b-v3.16b}, x0, #64
310
+ ld1 {v4.16b-v7.16b}, x0, x1
311
+ shl v16.8h, v0.8h, #P2S_SHIFT
312
+ shl v17.8h, v1.8h, #P2S_SHIFT
313
+ shl v18.8h, v2.8h, #P2S_SHIFT
314
+ shl v19.8h, v3.8h, #P2S_SHIFT
315
+ shl v20.8h, v4.8h, #P2S_SHIFT
316
+ shl v21.8h, v5.8h, #P2S_SHIFT
317
+ shl v22.8h, v6.8h, #P2S_SHIFT
318
+ shl v23.8h, v7.8h, #P2S_SHIFT
319
+#else
320
+ ld1 {v0.16b-v3.16b}, x0, x1
321
+ ushll v16.8h, v0.8b, #P2S_SHIFT
322
+ ushll2 v17.8h, v0.16b, #P2S_SHIFT
323
+ ushll v18.8h, v1.8b, #P2S_SHIFT
324
+ ushll2 v19.8h, v1.16b, #P2S_SHIFT
325
+ ushll v20.8h, v2.8b, #P2S_SHIFT
326
+ ushll2 v21.8h, v2.16b, #P2S_SHIFT
327
+ ushll v22.8h, v3.8b, #P2S_SHIFT
328
+ ushll2 v23.8h, v3.16b, #P2S_SHIFT
329
+#endif
330
+ add v16.8h, v16.8h, v31.8h
331
+ add v17.8h, v17.8h, v31.8h
332
+ add v18.8h, v18.8h, v31.8h
333
+ add v19.8h, v19.8h, v31.8h
334
+ add v20.8h, v20.8h, v31.8h
335
+ add v21.8h, v21.8h, v31.8h
336
+ add v22.8h, v22.8h, v31.8h
337
+ add v23.8h, v23.8h, v31.8h
338
+ st1 {v16.16b-v19.16b}, x2, #64
339
+ st1 {v20.16b-v23.16b}, x2, x3
340
+ cbnz x9, .loop_filterP2S_64x\h
341
+ ret
342
+endfunc
343
+.endm
344
+
345
+p2s_64xN 16
346
+p2s_64xN 32
347
+p2s_64xN 48
348
+p2s_64xN 64
349
+
350
+function PFX(filterPixelToShort_48x64_neon)
351
+ p2s_start
352
+#if HIGH_BIT_DEPTH
353
+ sub x1, x1, #64
354
+#endif
355
+ sub x3, x3, #64
356
+ mov x9, #64
357
+.loop_filterP2S_48x64:
358
+ sub x9, x9, #1
359
+#if HIGH_BIT_DEPTH
360
+ ld1 {v0.16b-v3.16b}, x0, #64
361
+ ld1 {v4.16b-v5.16b}, x0, x1
362
+ shl v16.8h, v0.8h, #P2S_SHIFT
363
+ shl v17.8h, v1.8h, #P2S_SHIFT
364
+ shl v18.8h, v2.8h, #P2S_SHIFT
365
+ shl v19.8h, v3.8h, #P2S_SHIFT
366
+ shl v20.8h, v4.8h, #P2S_SHIFT
367
+ shl v21.8h, v5.8h, #P2S_SHIFT
368
+#else
369
+ ld1 {v0.16b-v2.16b}, x0, x1
370
+ ushll v16.8h, v0.8b, #P2S_SHIFT
371
+ ushll2 v17.8h, v0.16b, #P2S_SHIFT
372
+ ushll v18.8h, v1.8b, #P2S_SHIFT
373
+ ushll2 v19.8h, v1.16b, #P2S_SHIFT
374
+ ushll v20.8h, v2.8b, #P2S_SHIFT
375
+ ushll2 v21.8h, v2.16b, #P2S_SHIFT
376
+#endif
377
+ add v16.8h, v16.8h, v31.8h
378
+ add v17.8h, v17.8h, v31.8h
379
+ add v18.8h, v18.8h, v31.8h
380
+ add v19.8h, v19.8h, v31.8h
381
+ add v20.8h, v20.8h, v31.8h
382
+ add v21.8h, v21.8h, v31.8h
383
+ st1 {v16.16b-v19.16b}, x2, #64
384
+ st1 {v20.16b-v21.16b}, x2, x3
385
+ cbnz x9, .loop_filterP2S_48x64
386
+ ret
387
+endfunc
388
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp
Added
2061
1
2
+#include "common.h"
3
+#include "slicetype.h" // LOWRES_COST_MASK
4
+#include "primitives.h"
5
+#include "x265.h"
6
+
7
+#include "pixel-prim.h"
8
+#include "arm64-utils.h"
9
+#if HAVE_NEON
10
+
11
+#include <arm_neon.h>
12
+
13
+using namespace X265_NS;
14
+
15
+
16
+
17
+namespace
18
+{
19
+
20
+
21
+/* SATD SA8D variants - based on x264 */
22
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
23
+{
24
+ sum = vaddq_s16(a, b);
25
+ sub = vsubq_s16(a, b);
26
+}
27
+
28
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
29
+{
30
+ t1 = vtrn1q_s16(s1, s2);
31
+ t2 = vtrn2q_s16(s1, s2);
32
+}
33
+
34
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
35
+{
36
+ t1 = vtrn1q_s32(s1, s2);
37
+ t2 = vtrn2q_s32(s1, s2);
38
+}
39
+
40
+#if (X265_DEPTH <= 10)
41
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
42
+{
43
+ t1 = vtrn1q_s64(s1, s2);
44
+ t2 = vtrn2q_s64(s1, s2);
45
+}
46
+#endif
47
+
48
+
49
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
50
+ int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d)
51
+{
52
+ SUMSUB_AB(s1, d1, a, b);
53
+ SUMSUB_AB(s2, d2, c, d);
54
+}
55
+
56
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
57
+ int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
58
+{
59
+ SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
60
+ SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
61
+}
62
+
63
+
64
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
65
+
66
+{
67
+
68
+ int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
69
+
70
+
71
+ SUMSUB_AB(v16, v17, v0, v1);
72
+ SUMSUB_AB(v18, v19, v2, v3);
73
+
74
+ SUMSUB_AB(v4 , v6 , v16, v18);
75
+ SUMSUB_AB(v5 , v7 , v17, v19);
76
+
77
+ v0 = vtrn1q_s16(v4, v5);
78
+ v1 = vtrn2q_s16(v4, v5);
79
+ v2 = vtrn1q_s16(v6, v7);
80
+ v3 = vtrn2q_s16(v6, v7);
81
+
82
+ SUMSUB_AB(v16, v17, v0, v1);
83
+ SUMSUB_AB(v18, v19, v2, v3);
84
+
85
+ v0 = vtrn1q_s32(v16, v18);
86
+ v1 = vtrn2q_s32(v16, v18);
87
+ v2 = vtrn1q_s32(v17, v19);
88
+ v3 = vtrn2q_s32(v17, v19);
89
+
90
+ v0 = vabsq_s16(v0);
91
+ v1 = vabsq_s16(v1);
92
+ v2 = vabsq_s16(v2);
93
+ v3 = vabsq_s16(v3);
94
+
95
+ v0 = vmaxq_u16(v0, v1);
96
+ v1 = vmaxq_u16(v2, v3);
97
+
98
+ v0 = vaddq_u16(v0, v1);
99
+ return vaddlvq_u16(v0);
100
+}
101
+
102
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
103
+{
104
+ int16x8_t v2, v3;
105
+ SUMSUB_AB(v2, v3, v0, v1);
106
+
107
+ v0 = vzip1q_s64(v2, v3);
108
+ v1 = vzip2q_s64(v2, v3);
109
+ SUMSUB_AB(v2, v3, v0, v1);
110
+
111
+ v0 = vtrn1q_s16(v2, v3);
112
+ v1 = vtrn2q_s16(v2, v3);
113
+ SUMSUB_AB(v2, v3, v0, v1);
114
+
115
+ v0 = vtrn1q_s32(v2, v3);
116
+ v1 = vtrn2q_s32(v2, v3);
117
+
118
+ v0 = vabsq_s16(v0);
119
+ v1 = vabsq_s16(v1);
120
+ v0 = vmaxq_u16(v0, v1);
121
+
122
+ return vaddlvq_s16(v0);
123
+}
124
+
125
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
126
+ int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
127
+{
128
+ int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
129
+
130
+ SUMSUB_AB(v16, v18, v0, v2);
131
+ SUMSUB_AB(v17, v19, v1, v3);
132
+
133
+ HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
134
+
135
+ transpose_8h(v0, v1, v16, v17);
136
+ transpose_8h(v2, v3, v18, v19);
137
+ transpose_8h(v4, v5, v20, v21);
138
+ transpose_8h(v6, v7, v22, v23);
139
+
140
+ SUMSUB_AB(v16, v17, v0, v1);
141
+ SUMSUB_AB(v18, v19, v2, v3);
142
+ SUMSUB_AB(v20, v21, v4, v5);
143
+ SUMSUB_AB(v22, v23, v6, v7);
144
+
145
+ transpose_4s(v0, v2, v16, v18);
146
+ transpose_4s(v1, v3, v17, v19);
147
+ transpose_4s(v4, v6, v20, v22);
148
+ transpose_4s(v5, v7, v21, v23);
149
+
150
+ v0 = vabsq_s16(v0);
151
+ v1 = vabsq_s16(v1);
152
+ v2 = vabsq_s16(v2);
153
+ v3 = vabsq_s16(v3);
154
+ v4 = vabsq_s16(v4);
155
+ v5 = vabsq_s16(v5);
156
+ v6 = vabsq_s16(v6);
157
+ v7 = vabsq_s16(v7);
158
+
159
+ v0 = vmaxq_u16(v0, v2);
160
+ v1 = vmaxq_u16(v1, v3);
161
+ v2 = vmaxq_u16(v4, v6);
162
+ v3 = vmaxq_u16(v5, v7);
163
+
164
+}
165
+
166
+#if HIGH_BIT_DEPTH
167
+
168
+#if (X265_DEPTH > 10)
169
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
170
+{
171
+ t1 = vtrn1q_s64(s1, s2);
172
+ t2 = vtrn2q_s64(s1, s2);
173
+}
174
+
175
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
176
+{
177
+ sum = vaddq_s32(a, b);
178
+ sub = vsubq_s32(a, b);
179
+}
180
+
181
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
182
+ const int16x8_t a, const int16x8_t b)
183
+{
184
+ suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
185
+ sumh = vaddl_high_s16(a, b);
186
+ subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
187
+ subh = vsubl_high_s16(a, b);
188
+}
189
+
190
+#endif
191
+
192
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
193
+ int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
194
+ int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
195
+{
196
+ uint16x8_t r0, r1, r2, r3;
197
+ uint16x8_t t0, t1, t2, t3;
198
+ int16x8_t v16, v17;
199
+ int16x8_t v18, v19;
200
+
201
+ r0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
202
+ r1 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
203
+ r2 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
204
+ r3 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
205
+
206
+ t0 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
207
+ t1 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
208
+ t2 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
209
+ t3 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
210
+
211
+ v16 = vsubq_u16(r0, t0);
212
+ v17 = vsubq_u16(r1, t1);
213
+ v18 = vsubq_u16(r2, t2);
214
+ v19 = vsubq_u16(r3, t3);
215
+
216
+ r0 = *(uint16x8_t *)(pix1 + 4 * stride_pix1);
217
+ r1 = *(uint16x8_t *)(pix1 + 5 * stride_pix1);
218
+ r2 = *(uint16x8_t *)(pix1 + 6 * stride_pix1);
219
+ r3 = *(uint16x8_t *)(pix1 + 7 * stride_pix1);
220
+
221
+ t0 = *(uint16x8_t *)(pix2 + 4 * stride_pix2);
222
+ t1 = *(uint16x8_t *)(pix2 + 5 * stride_pix2);
223
+ t2 = *(uint16x8_t *)(pix2 + 6 * stride_pix2);
224
+ t3 = *(uint16x8_t *)(pix2 + 7 * stride_pix2);
225
+
226
+ v20 = vsubq_u16(r0, t0);
227
+ v21 = vsubq_u16(r1, t1);
228
+ v22 = vsubq_u16(r2, t2);
229
+ v23 = vsubq_u16(r3, t3);
230
+
231
+ SUMSUB_AB(v0, v1, v16, v17);
232
+ SUMSUB_AB(v2, v3, v18, v19);
233
+
234
+}
235
+
236
+
237
+
238
+
239
+static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
240
+ int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
241
+{
242
+ uint8x16_t r0, r1, r2, r3;
243
+ uint8x16_t t0, t1, t2, t3;
244
+ int16x8_t v16, v17, v20, v21;
245
+ int16x8_t v18, v19, v22, v23;
246
+
247
+ r0 = *(int16x8_t *)(pix1 + 0 * stride_pix1);
248
+ r1 = *(int16x8_t *)(pix1 + 1 * stride_pix1);
249
+ r2 = *(int16x8_t *)(pix1 + 2 * stride_pix1);
250
+ r3 = *(int16x8_t *)(pix1 + 3 * stride_pix1);
251
+
252
+ t0 = *(int16x8_t *)(pix2 + 0 * stride_pix2);
253
+ t1 = *(int16x8_t *)(pix2 + 1 * stride_pix2);
254
+ t2 = *(int16x8_t *)(pix2 + 2 * stride_pix2);
255
+ t3 = *(int16x8_t *)(pix2 + 3 * stride_pix2);
256
+
257
+
258
+ v16 = vsubq_u16((r0), (t0));
259
+ v17 = vsubq_u16((r1), (t1));
260
+ v18 = vsubq_u16((r2), (t2));
261
+ v19 = vsubq_u16((r3), (t3));
262
+
263
+ r0 = *(int16x8_t *)(pix1 + 0 * stride_pix1 + 8);
264
+ r1 = *(int16x8_t *)(pix1 + 1 * stride_pix1 + 8);
265
+ r2 = *(int16x8_t *)(pix1 + 2 * stride_pix1 + 8);
266
+ r3 = *(int16x8_t *)(pix1 + 3 * stride_pix1 + 8);
267
+
268
+ t0 = *(int16x8_t *)(pix2 + 0 * stride_pix2 + 8);
269
+ t1 = *(int16x8_t *)(pix2 + 1 * stride_pix2 + 8);
270
+ t2 = *(int16x8_t *)(pix2 + 2 * stride_pix2 + 8);
271
+ t3 = *(int16x8_t *)(pix2 + 3 * stride_pix2 + 8);
272
+
273
+
274
+ v20 = vsubq_u16(r0, t0);
275
+ v21 = vsubq_u16(r1, t1);
276
+ v22 = vsubq_u16(r2, t2);
277
+ v23 = vsubq_u16(r3, t3);
278
+
279
+ SUMSUB_AB(v0, v1, v16, v17);
280
+ SUMSUB_AB(v2, v3, v18, v19);
281
+
282
+ _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
283
+
284
+}
285
+
286
+
287
+int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
288
+{
289
+ uint64x2_t t0, t1, r0, r1;
290
+ t00 = *(uint64_t *)(pix1 + 0 * stride_pix1);
291
+ t10 = *(uint64_t *)(pix1 + 1 * stride_pix1);
292
+ t01 = *(uint64_t *)(pix1 + 2 * stride_pix1);
293
+ t11 = *(uint64_t *)(pix1 + 3 * stride_pix1);
294
+
295
+ r00 = *(uint64_t *)(pix2 + 0 * stride_pix1);
296
+ r10 = *(uint64_t *)(pix2 + 1 * stride_pix2);
297
+ r01 = *(uint64_t *)(pix2 + 2 * stride_pix2);
298
+ r11 = *(uint64_t *)(pix2 + 3 * stride_pix2);
299
+
300
+ return _satd_4x4_neon(vsubq_u16(t0, r0), vsubq_u16(r1, t1));
301
+}
302
+
303
+
304
+
305
+
306
+
307
+
308
+int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
309
+{
310
+ uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
311
+
312
+ i0 = *(uint16x8_t *)(pix1 + 0 * stride_pix1);
313
+ i1 = *(uint16x8_t *)(pix2 + 0 * stride_pix2);
314
+ i2 = *(uint16x8_t *)(pix1 + 1 * stride_pix1);
315
+ i3 = *(uint16x8_t *)(pix2 + 1 * stride_pix2);
316
+ i4 = *(uint16x8_t *)(pix1 + 2 * stride_pix1);
317
+ i5 = *(uint16x8_t *)(pix2 + 2 * stride_pix2);
318
+ i6 = *(uint16x8_t *)(pix1 + 3 * stride_pix1);
319
+ i7 = *(uint16x8_t *)(pix2 + 3 * stride_pix2);
320
+
321
+ int16x8_t v0 = vsubq_u16(i0, i1);
322
+ int16x8_t v1 = vsubq_u16(i2, i3);
323
+ int16x8_t v2 = vsubq_u16(i4, i5);
324
+ int16x8_t v3 = vsubq_u16(i6, i7);
325
+
326
+ return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
327
+}
328
+
329
+
330
+int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
331
+{
332
+ int32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
333
+ int16x8_t v0, v1, v2, v3;
334
+ for (int offset = 0; offset <= 12; offset += 4) {
335
+ _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1, pix2 + offset * stride_pix2, stride_pix2, v0, v1, v2, v3);
336
+ v30 = vpadalq_u16(v30, v0);
337
+ v30 = vpadalq_u16(v30, v1);
338
+ v31 = vpadalq_u16(v31, v2);
339
+ v31 = vpadalq_u16(v31, v3);
340
+ }
341
+ return vaddvq_s32(vaddq_s32(v30, v31));
342
+
343
+}
344
+
345
+#else //HIGH_BIT_DEPTH
346
+
347
+static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
348
+ int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
349
+{
350
+ uint8x16_t r0, r1, r2, r3;
351
+ uint8x16_t t0, t1, t2, t3;
352
+ int16x8_t v16, v17, v20, v21;
353
+ int16x8_t v18, v19, v22, v23;
354
+
355
+ r0 = *(uint8x16_t *)(pix1 + 0 * stride_pix1);
356
+ r1 = *(uint8x16_t *)(pix1 + 1 * stride_pix1);
357
+ r2 = *(uint8x16_t *)(pix1 + 2 * stride_pix1);
358
+ r3 = *(uint8x16_t *)(pix1 + 3 * stride_pix1);
359
+
360
+ t0 = *(uint8x16_t *)(pix2 + 0 * stride_pix2);
361
+ t1 = *(uint8x16_t *)(pix2 + 1 * stride_pix2);
362
+ t2 = *(uint8x16_t *)(pix2 + 2 * stride_pix2);
363
+ t3 = *(uint8x16_t *)(pix2 + 3 * stride_pix2);
364
+
365
+
366
+
367
+ v16 = vsubl_u8(vget_low_u8(r0), vget_low_u8(t0));
368
+ v20 = vsubl_high_u8(r0, t0);
369
+ v17 = vsubl_u8(vget_low_u8(r1), vget_low_u8(t1));
370
+ v21 = vsubl_high_u8(r1, t1);
371
+ v18 = vsubl_u8(vget_low_u8(r2), vget_low_u8(t2));
372
+ v22 = vsubl_high_u8(r2, t2);
373
+ v19 = vsubl_u8(vget_low_u8(r3), vget_low_u8(t3));
374
+ v23 = vsubl_high_u8(r3, t3);
375
+
376
+ SUMSUB_AB(v0, v1, v16, v17);
377
+ SUMSUB_AB(v2, v3, v18, v19);
378
+
379
+ _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
380
+
381
+}
382
+
383
+
384
+static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
385
+ int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
386
+ int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
387
+{
388
+ uint8x8_t r0, r1, r2, r3;
389
+ uint8x8_t t0, t1, t2, t3;
390
+ int16x8_t v16, v17;
391
+ int16x8_t v18, v19;
392
+
393
+ r0 = *(uint8x8_t *)(pix1 + 0 * stride_pix1);
394
+ r1 = *(uint8x8_t *)(pix1 + 1 * stride_pix1);
395
+ r2 = *(uint8x8_t *)(pix1 + 2 * stride_pix1);
396
+ r3 = *(uint8x8_t *)(pix1 + 3 * stride_pix1);
397
+
398
+ t0 = *(uint8x8_t *)(pix2 + 0 * stride_pix2);
399
+ t1 = *(uint8x8_t *)(pix2 + 1 * stride_pix2);
400
+ t2 = *(uint8x8_t *)(pix2 + 2 * stride_pix2);
401
+ t3 = *(uint8x8_t *)(pix2 + 3 * stride_pix2);
402
+
403
+ v16 = vsubl_u8(r0, t0);
404
+ v17 = vsubl_u8(r1, t1);
405
+ v18 = vsubl_u8(r2, t2);
406
+ v19 = vsubl_u8(r3, t3);
407
+
408
+ r0 = *(uint8x8_t *)(pix1 + 4 * stride_pix1);
409
+ r1 = *(uint8x8_t *)(pix1 + 5 * stride_pix1);
410
+ r2 = *(uint8x8_t *)(pix1 + 6 * stride_pix1);
411
+ r3 = *(uint8x8_t *)(pix1 + 7 * stride_pix1);
412
+
413
+ t0 = *(uint8x8_t *)(pix2 + 4 * stride_pix2);
414
+ t1 = *(uint8x8_t *)(pix2 + 5 * stride_pix2);
415
+ t2 = *(uint8x8_t *)(pix2 + 6 * stride_pix2);
416
+ t3 = *(uint8x8_t *)(pix2 + 7 * stride_pix2);
417
+
418
+ v20 = vsubl_u8(r0, t0);
419
+ v21 = vsubl_u8(r1, t1);
420
+ v22 = vsubl_u8(r2, t2);
421
+ v23 = vsubl_u8(r3, t3);
422
+
423
+
424
+ SUMSUB_AB(v0, v1, v16, v17);
425
+ SUMSUB_AB(v2, v3, v18, v19);
426
+
427
+}
428
+
429
+int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
430
+{
431
+ uint32x2_t t0, t1, r0, r1;
432
+ t00 = *(uint32_t *)(pix1 + 0 * stride_pix1);
433
+ t10 = *(uint32_t *)(pix1 + 1 * stride_pix1);
434
+ t01 = *(uint32_t *)(pix1 + 2 * stride_pix1);
435
+ t11 = *(uint32_t *)(pix1 + 3 * stride_pix1);
436
+
437
+ r00 = *(uint32_t *)(pix2 + 0 * stride_pix1);
438
+ r10 = *(uint32_t *)(pix2 + 1 * stride_pix2);
439
+ r01 = *(uint32_t *)(pix2 + 2 * stride_pix2);
440
+ r11 = *(uint32_t *)(pix2 + 3 * stride_pix2);
441
+
442
+ return _satd_4x4_neon(vsubl_u8(t0, r0), vsubl_u8(r1, t1));
443
+}
444
+
445
+
446
+int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
447
+{
448
+ uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
449
+
450
+ i0 = *(uint8x8_t *)(pix1 + 0 * stride_pix1);
451
+ i1 = *(uint8x8_t *)(pix2 + 0 * stride_pix2);
452
+ i2 = *(uint8x8_t *)(pix1 + 1 * stride_pix1);
453
+ i3 = *(uint8x8_t *)(pix2 + 1 * stride_pix2);
454
+ i4 = *(uint8x8_t *)(pix1 + 2 * stride_pix1);
455
+ i5 = *(uint8x8_t *)(pix2 + 2 * stride_pix2);
456
+ i6 = *(uint8x8_t *)(pix1 + 3 * stride_pix1);
457
+ i7 = *(uint8x8_t *)(pix2 + 3 * stride_pix2);
458
+
459
+ int16x8_t v0 = vsubl_u8(i0, i1);
460
+ int16x8_t v1 = vsubl_u8(i2, i3);
461
+ int16x8_t v2 = vsubl_u8(i4, i5);
462
+ int16x8_t v3 = vsubl_u8(i6, i7);
463
+
464
+ return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
465
+}
466
+
467
+int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
468
+{
469
+ int16x8_t v30, v31;
470
+ int16x8_t v0, v1, v2, v3;
471
+
472
+ _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
473
+ v30 = vaddq_s16(v0, v1);
474
+ v31 = vaddq_s16(v2, v3);
475
+
476
+ _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
477
+ v0 = vaddq_s16(v0, v1);
478
+ v1 = vaddq_s16(v2, v3);
479
+ v30 = vaddq_s16(v30, v0);
480
+ v31 = vaddq_s16(v31, v1);
481
+
482
+ _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
483
+ v0 = vaddq_s16(v0, v1);
484
+ v1 = vaddq_s16(v2, v3);
485
+ v30 = vaddq_s16(v30, v0);
486
+ v31 = vaddq_s16(v31, v1);
487
+
488
+ _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
489
+ v0 = vaddq_s16(v0, v1);
490
+ v1 = vaddq_s16(v2, v3);
491
+ v30 = vaddq_s16(v30, v0);
492
+ v31 = vaddq_s16(v31, v1);
493
+
494
+ int32x4_t sum0 = vpaddlq_u16(v30);
495
+ int32x4_t sum1 = vpaddlq_u16(v31);
496
+ sum0 = vaddq_s32(sum0, sum1);
497
+ return vaddvq_s32(sum0);
498
+
499
+}
500
+#endif //HIGH_BIT_DEPTH
501
+
502
+
503
+static inline void _sa8d_8x8_neon_end(int16x8_t &v0, int16x8_t &v1, int16x8_t v2, int16x8_t v3,
504
+ int16x8_t v20, int16x8_t v21, int16x8_t v22, int16x8_t v23)
505
+{
506
+ int16x8_t v16, v17, v18, v19;
507
+ int16x8_t v4, v5, v6, v7;
508
+
509
+ SUMSUB_AB(v16, v18, v0, v2);
510
+ SUMSUB_AB(v17, v19, v1, v3);
511
+
512
+ HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
513
+
514
+ SUMSUB_AB(v0, v16, v16, v20);
515
+ SUMSUB_AB(v1, v17, v17, v21);
516
+ SUMSUB_AB(v2, v18, v18, v22);
517
+ SUMSUB_AB(v3, v19, v19, v23);
518
+
519
+ transpose_8h(v20, v21, v16, v17);
520
+ transpose_8h(v4, v5, v0, v1);
521
+ transpose_8h(v22, v23, v18, v19);
522
+ transpose_8h(v6, v7, v2, v3);
523
+
524
+#if (X265_DEPTH <= 10)
525
+
526
+ int16x8_t v24, v25;
527
+
528
+ SUMSUB_AB(v2, v3, v20, v21);
529
+ SUMSUB_AB(v24, v25, v4, v5);
530
+ SUMSUB_AB(v0, v1, v22, v23);
531
+ SUMSUB_AB(v4, v5, v6, v7);
532
+
533
+ transpose_4s(v20, v22, v2, v0);
534
+ transpose_4s(v21, v23, v3, v1);
535
+ transpose_4s(v16, v18, v24, v4);
536
+ transpose_4s(v17, v19, v25, v5);
537
+
538
+ SUMSUB_AB(v0, v2, v20, v22);
539
+ SUMSUB_AB(v1, v3, v21, v23);
540
+ SUMSUB_AB(v4, v6, v16, v18);
541
+ SUMSUB_AB(v5, v7, v17, v19);
542
+
543
+ transpose_2d(v16, v20, v0, v4);
544
+ transpose_2d(v17, v21, v1, v5);
545
+ transpose_2d(v18, v22, v2, v6);
546
+ transpose_2d(v19, v23, v3, v7);
547
+
548
+
549
+ v16 = vabsq_s16(v16);
550
+ v17 = vabsq_s16(v17);
551
+ v18 = vabsq_s16(v18);
552
+ v19 = vabsq_s16(v19);
553
+ v20 = vabsq_s16(v20);
554
+ v21 = vabsq_s16(v21);
555
+ v22 = vabsq_s16(v22);
556
+ v23 = vabsq_s16(v23);
557
+
558
+ v16 = vmaxq_u16(v16, v20);
559
+ v17 = vmaxq_u16(v17, v21);
560
+ v18 = vmaxq_u16(v18, v22);
561
+ v19 = vmaxq_u16(v19, v23);
562
+
563
+#if HIGH_BIT_DEPTH
564
+ v0 = vpaddlq_u16(v16);
565
+ v1 = vpaddlq_u16(v17);
566
+ v0 = vpadalq_u16(v0, v18);
567
+ v1 = vpadalq_u16(v1, v19);
568
+
569
+#else //HIGH_BIT_DEPTH
570
+
571
+ v0 = vaddq_u16(v16, v17);
572
+ v1 = vaddq_u16(v18, v19);
573
+
574
+#endif //HIGH_BIT_DEPTH
575
+
576
+#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32, each int16x8 is up-convreted to 2 int32x4 (low and high)
577
+
578
+ int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
579
+ int32x4_t v22l, v22h, v23l, v23h;
580
+ int32x4_t v4l, v4h, v5l, v5h;
581
+ int32x4_t v6l, v6h, v7l, v7h;
582
+ int32x4_t v16l, v16h, v17l, v17h;
583
+ int32x4_t v18l, v18h, v19l, v19h;
584
+ int32x4_t v20l, v20h, v21l, v21h;
585
+
586
+ ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
587
+ ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
588
+
589
+ v22l = vmovl_s16(vget_low_s16(v22));
590
+ v22h = vmovl_high_s16(v22);
591
+ v23l = vmovl_s16(vget_low_s16(v23));
592
+ v23h = vmovl_high_s16(v23);
593
+
594
+ ISUMSUB_AB(v0l, v1l, v22l, v23l);
595
+ ISUMSUB_AB(v0h, v1h, v22h, v23h);
596
+
597
+ v6l = vmovl_s16(vget_low_s16(v6));
598
+ v6h = vmovl_high_s16(v6);
599
+ v7l = vmovl_s16(vget_low_s16(v7));
600
+ v7h = vmovl_high_s16(v7);
601
+
602
+ ISUMSUB_AB(v4l, v5l, v6l, v7l);
603
+ ISUMSUB_AB(v4h, v5h, v6h, v7h);
604
+
605
+ transpose_2d(v20l, v22l, v2l, v0l);
606
+ transpose_2d(v21l, v23l, v3l, v1l);
607
+ transpose_2d(v16l, v18l, v24l, v4l);
608
+ transpose_2d(v17l, v19l, v25l, v5l);
609
+
610
+ transpose_2d(v20h, v22h, v2h, v0h);
611
+ transpose_2d(v21h, v23h, v3h, v1h);
612
+ transpose_2d(v16h, v18h, v24h, v4h);
613
+ transpose_2d(v17h, v19h, v25h, v5h);
614
+
615
+ ISUMSUB_AB(v0l, v2l, v20l, v22l);
616
+ ISUMSUB_AB(v1l, v3l, v21l, v23l);
617
+ ISUMSUB_AB(v4l, v6l, v16l, v18l);
618
+ ISUMSUB_AB(v5l, v7l, v17l, v19l);
619
+
620
+ ISUMSUB_AB(v0h, v2h, v20h, v22h);
621
+ ISUMSUB_AB(v1h, v3h, v21h, v23h);
622
+ ISUMSUB_AB(v4h, v6h, v16h, v18h);
623
+ ISUMSUB_AB(v5h, v7h, v17h, v19h);
624
+
625
+ v16l = v0l;
626
+ v16h = v4l;
627
+ v20l = v0h;
628
+ v20h = v4h;
629
+
630
+ v17l = v1l;
631
+ v17h = v5l;
632
+ v21l = v1h;
633
+ v21h = v5h;
634
+
635
+ v18l = v2l;
636
+ v18h = v6l;
637
+ v22l = v2h;
638
+ v22h = v6h;
639
+
640
+ v19l = v3l;
641
+ v19h = v7l;
642
+ v23l = v3h;
643
+ v23h = v7h;
644
+
645
+ v16l = vabsq_s32(v16l);
646
+ v17l = vabsq_s32(v17l);
647
+ v18l = vabsq_s32(v18l);
648
+ v19l = vabsq_s32(v19l);
649
+ v20l = vabsq_s32(v20l);
650
+ v21l = vabsq_s32(v21l);
651
+ v22l = vabsq_s32(v22l);
652
+ v23l = vabsq_s32(v23l);
653
+
654
+ v16h = vabsq_s32(v16h);
655
+ v17h = vabsq_s32(v17h);
656
+ v18h = vabsq_s32(v18h);
657
+ v19h = vabsq_s32(v19h);
658
+ v20h = vabsq_s32(v20h);
659
+ v21h = vabsq_s32(v21h);
660
+ v22h = vabsq_s32(v22h);
661
+ v23h = vabsq_s32(v23h);
662
+
663
+ v16l = vmaxq_u32(v16l, v20l);
664
+ v17l = vmaxq_u32(v17l, v21l);
665
+ v18l = vmaxq_u32(v18l, v22l);
666
+ v19l = vmaxq_u32(v19l, v23l);
667
+
668
+ v16h = vmaxq_u32(v16h, v20h);
669
+ v17h = vmaxq_u32(v17h, v21h);
670
+ v18h = vmaxq_u32(v18h, v22h);
671
+ v19h = vmaxq_u32(v19h, v23h);
672
+
673
+ v16l = vaddq_u32(v16l, v16h);
674
+ v17l = vaddq_u32(v17l, v17h);
675
+ v18l = vaddq_u32(v18l, v18h);
676
+ v19l = vaddq_u32(v19l, v19h);
677
+
678
+ v0 = vaddq_u32(v16l, v17l);
679
+ v1 = vaddq_u32(v18l, v19l);
680
+
681
+
682
+#endif
683
+
684
+}
685
+
686
+
687
+
688
+static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
689
+ int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
690
+{
691
+
692
+ int16x8_t v20, v21, v22, v23;
693
+ _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
694
+ _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
695
+
696
+}
697
+
698
+
699
+
700
+int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
701
+{
702
+ int16x8_t v30, v31;
703
+ int16x8_t v0, v1, v2, v3;
704
+
705
+ _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
706
+#if !(HIGH_BIT_DEPTH)
707
+ v30 = vaddq_u16(v0, v1);
708
+ v31 = vaddq_u16(v2, v3);
709
+
710
+ uint16x8_t sum = vaddq_u16(v30, v31);
711
+ return vaddvq_s32(vpaddlq_u16(sum));
712
+#else
713
+
714
+ v30 = vaddq_u16(v0, v1);
715
+ v31 = vaddq_u16(v2, v3);
716
+
717
+ int32x4_t sum = vpaddlq_u16(v30);
718
+ sum = vpadalq_u16(sum, v31);
719
+ return vaddvq_s32(sum);
720
+#endif
721
+}
722
+
723
+
724
+int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
725
+{
726
+ int16x8_t v0, v1, v2, v3;
727
+ int16x8_t v20, v21, v22, v23;
728
+
729
+ _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
730
+ _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
731
+
732
+#if HIGH_BIT_DEPTH
733
+ int32x4_t s = vaddq_u32(v0, v1);
734
+ return (vaddvq_u32(s) + 1) >> 1;
735
+#else
736
+ return (vaddlvq_s16(vaddq_u16(v0, v1)) + 1) >> 1;
737
+#endif
738
+}
739
+
740
+
741
+
742
+
743
+
744
+int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
745
+{
746
+ int16x8_t v0, v1, v2, v3;
747
+ int16x8_t v20, v21, v22, v23;
748
+ int32x4_t v30, v31;
749
+
750
+ _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
751
+ _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
752
+
753
+#if !(HIGH_BIT_DEPTH)
754
+ v30 = vpaddlq_u16(v0);
755
+ v31 = vpaddlq_u16(v1);
756
+#else
757
+ v30 = vaddq_s32(v0, v1);
758
+#endif
759
+
760
+ _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
761
+ _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
762
+
763
+#if !(HIGH_BIT_DEPTH)
764
+ v30 = vpadalq_u16(v30, v0);
765
+ v31 = vpadalq_u16(v31, v1);
766
+#else
767
+ v31 = vaddq_s32(v0, v1);
768
+#endif
769
+
770
+
771
+ _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
772
+ v23);
773
+ _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
774
+
775
+#if !(HIGH_BIT_DEPTH)
776
+ v30 = vpadalq_u16(v30, v0);
777
+ v31 = vpadalq_u16(v31, v1);
778
+#else
779
+ v30 = vaddq_s32(v30, v0);
780
+ v31 = vaddq_s32(v31, v1);
781
+#endif
782
+
783
+ _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
784
+ v22, v23);
785
+ _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
786
+
787
+#if !(HIGH_BIT_DEPTH)
788
+ v30 = vpadalq_u16(v30, v0);
789
+ v31 = vpadalq_u16(v31, v1);
790
+#else
791
+ v30 = vaddq_s32(v30, v0);
792
+ v31 = vaddq_s32(v31, v1);
793
+#endif
794
+
795
+ v30 = vaddq_u32(v30, v31);
796
+
797
+ return (vaddvq_u32(v30) + 1) >> 1;
798
+}
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+template<int size>
808
+void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
809
+{
810
+ for (int y = 0; y < size; y++)
811
+ {
812
+ int x = 0;
813
+ int16x8_t v = vdupq_n_s16(val);
814
+ for (; (x + 8) <= size; x += 8)
815
+ {
816
+ *(int16x8_t *)&dsty * dstride + x = v;
817
+ }
818
+ for (; x < size; x++)
819
+ {
820
+ dsty * dstride + x = val;
821
+ }
822
+ }
823
+}
824
+
825
+template<int lx, int ly>
826
+int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
827
+{
828
+ int sum = 0;
829
+
830
+
831
+ for (int y = 0; y < ly; y++)
832
+ {
833
+#if HIGH_BIT_DEPTH
834
+ int x = 0;
835
+ uint16x8_t vsum16_1 = vdupq_n_u16(0);
836
+ for (; (x + 8) <= lx; x += 8)
837
+ {
838
+ uint16x8_t p1 = *(uint16x8_t *)&pix1x;
839
+ uint16x8_t p2 = *(uint16x8_t *)&pix2x;
840
+ vsum16_1 = vabaq_s16(vsum16_1, p1, p2);
841
+
842
+ }
843
+ if (lx & 4)
844
+ {
845
+ uint16x4_t p1 = *(uint16x4_t *)&pix1x;
846
+ uint16x4_t p2 = *(uint16x4_t *)&pix2x;
847
+ sum += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
848
+ x += 4;
849
+ }
850
+ if (lx >= 4)
851
+ {
852
+ sum += vaddlvq_s16(vsum16_1);
853
+ }
854
+
855
+#else
856
+
857
+ int x = 0;
858
+ uint16x8_t vsum16_1 = vdupq_n_u16(0);
859
+ uint16x8_t vsum16_2 = vdupq_n_u16(0);
860
+
861
+ for (; (x + 16) <= lx; x += 16)
862
+ {
863
+ uint8x16_t p1 = *(uint8x16_t *)&pix1x;
864
+ uint8x16_t p2 = *(uint8x16_t *)&pix2x;
865
+ vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p2));
866
+ vsum16_2 = vabal_high_u8(vsum16_2, p1, p2);
867
+ }
868
+ if (lx & 8)
869
+ {
870
+ uint8x8_t p1 = *(uint8x8_t *)&pix1x;
871
+ uint8x8_t p2 = *(uint8x8_t *)&pix2x;
872
+ vsum16_1 = vabal_u8(vsum16_1, p1, p2);
873
+ x += 8;
874
+ }
875
+ if (lx & 4)
876
+ {
877
+ uint32x2_t p1 = vdup_n_u32(0);
878
+ p10 = *(uint32_t *)&pix1x;
879
+ uint32x2_t p2 = vdup_n_u32(0);
880
+ p20 = *(uint32_t *)&pix2x;
881
+ vsum16_1 = vabal_u8(vsum16_1, p1, p2);
882
+ x += 4;
883
+ }
884
+ if (lx >= 16)
885
+ {
886
+ vsum16_1 = vaddq_u16(vsum16_1, vsum16_2);
887
+ }
888
+ if (lx >= 4)
889
+ {
890
+ sum += vaddvq_u16(vsum16_1);
891
+ }
892
+
893
+#endif
894
+ if (lx & 3) for (; x < lx; x++)
895
+ {
896
+ sum += abs(pix1x - pix2x);
897
+ }
898
+
899
+ pix1 += stride_pix1;
900
+ pix2 += stride_pix2;
901
+ }
902
+
903
+ return sum;
904
+}
905
+
906
+template<int lx, int ly>
907
+void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, intptr_t frefstride,
908
+ int32_t *res)
909
+{
910
+ res0 = 0;
911
+ res1 = 0;
912
+ res2 = 0;
913
+ for (int y = 0; y < ly; y++)
914
+ {
915
+ int x = 0;
916
+ uint16x8_t vsum16_0 = vdupq_n_u16(0);
917
+ uint16x8_t vsum16_1 = vdupq_n_u16(0);
918
+ uint16x8_t vsum16_2 = vdupq_n_u16(0);
919
+#if HIGH_BIT_DEPTH
920
+ for (; (x + 8) <= lx; x += 8)
921
+ {
922
+ uint16x8_t p1 = *(uint16x8_t *)&pix1x;
923
+ uint16x8_t p2 = *(uint16x8_t *)&pix2x;
924
+ uint16x8_t p3 = *(uint16x8_t *)&pix3x;
925
+ uint16x8_t p4 = *(uint16x8_t *)&pix4x;
926
+ vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
927
+ vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
928
+ vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
929
+
930
+ }
931
+ if (lx & 4)
932
+ {
933
+ uint16x4_t p1 = *(uint16x4_t *)&pix1x;
934
+ uint16x4_t p2 = *(uint16x4_t *)&pix2x;
935
+ uint16x4_t p3 = *(uint16x4_t *)&pix3x;
936
+ uint16x4_t p4 = *(uint16x4_t *)&pix4x;
937
+ res0 += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
938
+ res1 += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p3));
939
+ res2 += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p4));
940
+ x += 4;
941
+ }
942
+ if (lx >= 4)
943
+ {
944
+ res0 += vaddlvq_s16(vsum16_0);
945
+ res1 += vaddlvq_s16(vsum16_1);
946
+ res2 += vaddlvq_s16(vsum16_2);
947
+ }
948
+#else
949
+
950
+ for (; (x + 16) <= lx; x += 16)
951
+ {
952
+ uint8x16_t p1 = *(uint8x16_t *)&pix1x;
953
+ uint8x16_t p2 = *(uint8x16_t *)&pix2x;
954
+ uint8x16_t p3 = *(uint8x16_t *)&pix3x;
955
+ uint8x16_t p4 = *(uint8x16_t *)&pix4x;
956
+ vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
957
+ vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
958
+ vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
959
+ vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
960
+ vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
961
+ vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
962
+ }
963
+ if (lx & 8)
964
+ {
965
+ uint8x8_t p1 = *(uint8x8_t *)&pix1x;
966
+ uint8x8_t p2 = *(uint8x8_t *)&pix2x;
967
+ uint8x8_t p3 = *(uint8x8_t *)&pix3x;
968
+ uint8x8_t p4 = *(uint8x8_t *)&pix4x;
969
+ vsum16_0 = vabal_u8(vsum16_0, p1, p2);
970
+ vsum16_1 = vabal_u8(vsum16_1, p1, p3);
971
+ vsum16_2 = vabal_u8(vsum16_2, p1, p4);
972
+ x += 8;
973
+ }
974
+ if (lx & 4)
975
+ {
976
+ uint32x2_t p1 = vdup_n_u32(0);
977
+ p10 = *(uint32_t *)&pix1x;
978
+ uint32x2_t p2 = vdup_n_u32(0);
979
+ p20 = *(uint32_t *)&pix2x;
980
+ uint32x2_t p3 = vdup_n_u32(0);
981
+ p30 = *(uint32_t *)&pix3x;
982
+ uint32x2_t p4 = vdup_n_u32(0);
983
+ p40 = *(uint32_t *)&pix4x;
984
+ vsum16_0 = vabal_u8(vsum16_0, p1, p2);
985
+ vsum16_1 = vabal_u8(vsum16_1, p1, p3);
986
+ vsum16_2 = vabal_u8(vsum16_2, p1, p4);
987
+ x += 4;
988
+ }
989
+ if (lx >= 4)
990
+ {
991
+ res0 += vaddvq_u16(vsum16_0);
992
+ res1 += vaddvq_u16(vsum16_1);
993
+ res2 += vaddvq_u16(vsum16_2);
994
+ }
995
+
996
+#endif
997
+ if (lx & 3) for (; x < lx; x++)
998
+ {
999
+ res0 += abs(pix1x - pix2x);
1000
+ res1 += abs(pix1x - pix3x);
1001
+ res2 += abs(pix1x - pix4x);
1002
+ }
1003
+
1004
+ pix1 += FENC_STRIDE;
1005
+ pix2 += frefstride;
1006
+ pix3 += frefstride;
1007
+ pix4 += frefstride;
1008
+ }
1009
+}
1010
+
1011
+template<int lx, int ly>
1012
+void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
1013
+ intptr_t frefstride, int32_t *res)
1014
+{
1015
+ int32x4_t result = {0};
1016
+ for (int y = 0; y < ly; y++)
1017
+ {
1018
+ int x = 0;
1019
+ uint16x8_t vsum16_0 = vdupq_n_u16(0);
1020
+ uint16x8_t vsum16_1 = vdupq_n_u16(0);
1021
+ uint16x8_t vsum16_2 = vdupq_n_u16(0);
1022
+ uint16x8_t vsum16_3 = vdupq_n_u16(0);
1023
+#if HIGH_BIT_DEPTH
1024
+ for (; (x + 16) <= lx; x += 16)
1025
+ {
1026
+ uint16x8x2_t p1 = vld1q_u16_x2(&pix1x);
1027
+ uint16x8x2_t p2 = vld1q_u16_x2(&pix2x);
1028
+ uint16x8x2_t p3 = vld1q_u16_x2(&pix3x);
1029
+ uint16x8x2_t p4 = vld1q_u16_x2(&pix4x);
1030
+ uint16x8x2_t p5 = vld1q_u16_x2(&pix5x);
1031
+ vsum16_0 = vabaq_s16(vsum16_0, p1.val0, p2.val0);
1032
+ vsum16_1 = vabaq_s16(vsum16_1, p1.val0, p3.val0);
1033
+ vsum16_2 = vabaq_s16(vsum16_2, p1.val0, p4.val0);
1034
+ vsum16_3 = vabaq_s16(vsum16_3, p1.val0, p5.val0);
1035
+ vsum16_0 = vabaq_s16(vsum16_0, p1.val1, p2.val1);
1036
+ vsum16_1 = vabaq_s16(vsum16_1, p1.val1, p3.val1);
1037
+ vsum16_2 = vabaq_s16(vsum16_2, p1.val1, p4.val1);
1038
+ vsum16_3 = vabaq_s16(vsum16_3, p1.val1, p5.val1);
1039
+ }
1040
+ if (lx & 8)
1041
+ {
1042
+ uint16x8_t p1 = *(uint16x8_t *)&pix1x;
1043
+ uint16x8_t p2 = *(uint16x8_t *)&pix2x;
1044
+ uint16x8_t p3 = *(uint16x8_t *)&pix3x;
1045
+ uint16x8_t p4 = *(uint16x8_t *)&pix4x;
1046
+ uint16x8_t p5 = *(uint16x8_t *)&pix5x;
1047
+ vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
1048
+ vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
1049
+ vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
1050
+ vsum16_3 = vabaq_s16(vsum16_3, p1, p5);
1051
+ x += 8;
1052
+ }
1053
+ if (lx & 4)
1054
+ {
1055
+ /* This is equivalent to getting the absolute difference of pix1x with each of
1056
+ * pix2 - pix5, then summing across the vector (4 values each) and adding the
1057
+ * result to result. */
1058
+ uint16x8_t p1 = vreinterpretq_s16_u64(
1059
+ vld1q_dup_u64((uint64_t *)&pix1x));
1060
+ uint16x8_t p2_3 = vcombine_s16(*(uint16x4_t *)&pix2x, *(uint16x4_t *)&pix3x);
1061
+ uint16x8_t p4_5 = vcombine_s16(*(uint16x4_t *)&pix4x, *(uint16x4_t *)&pix5x);
1062
+
1063
+ uint16x8_t a = vabdq_u16(p1, p2_3);
1064
+ uint16x8_t b = vabdq_u16(p1, p4_5);
1065
+
1066
+ result = vpadalq_s16(result, vpaddq_s16(a, b));
1067
+ x += 4;
1068
+ }
1069
+ if (lx >= 4)
1070
+ {
1071
+ /* This is equivalent to adding across each of the sum vectors and then adding
1072
+ * to result. */
1073
+ uint16x8_t a = vpaddq_s16(vsum16_0, vsum16_1);
1074
+ uint16x8_t b = vpaddq_s16(vsum16_2, vsum16_3);
1075
+ uint16x8_t c = vpaddq_s16(a, b);
1076
+ result = vpadalq_s16(result, c);
1077
+ }
1078
+
1079
+#else
1080
+
1081
+ for (; (x + 16) <= lx; x += 16)
1082
+ {
1083
+ uint8x16_t p1 = *(uint8x16_t *)&pix1x;
1084
+ uint8x16_t p2 = *(uint8x16_t *)&pix2x;
1085
+ uint8x16_t p3 = *(uint8x16_t *)&pix3x;
1086
+ uint8x16_t p4 = *(uint8x16_t *)&pix4x;
1087
+ uint8x16_t p5 = *(uint8x16_t *)&pix5x;
1088
+ vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
1089
+ vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
1090
+ vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
1091
+ vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
1092
+ vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
1093
+ vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
1094
+ vsum16_3 = vabal_u8(vsum16_3, vget_low_u8(p1), vget_low_u8(p5));
1095
+ vsum16_3 = vabal_high_u8(vsum16_3, p1, p5);
1096
+ }
1097
+ if (lx & 8)
1098
+ {
1099
+ uint8x8_t p1 = *(uint8x8_t *)&pix1x;
1100
+ uint8x8_t p2 = *(uint8x8_t *)&pix2x;
1101
+ uint8x8_t p3 = *(uint8x8_t *)&pix3x;
1102
+ uint8x8_t p4 = *(uint8x8_t *)&pix4x;
1103
+ uint8x8_t p5 = *(uint8x8_t *)&pix5x;
1104
+ vsum16_0 = vabal_u8(vsum16_0, p1, p2);
1105
+ vsum16_1 = vabal_u8(vsum16_1, p1, p3);
1106
+ vsum16_2 = vabal_u8(vsum16_2, p1, p4);
1107
+ vsum16_3 = vabal_u8(vsum16_3, p1, p5);
1108
+ x += 8;
1109
+ }
1110
+ if (lx & 4)
1111
+ {
1112
+ uint8x16_t p1 = vreinterpretq_u32_u8(
1113
+ vld1q_dup_u32((uint32_t *)&pix1x));
1114
+
1115
+ uint32x4_t p_x4;
1116
+ p_x4 = vld1q_lane_u32((uint32_t *)&pix2x, p_x4, 0);
1117
+ p_x4 = vld1q_lane_u32((uint32_t *)&pix3x, p_x4, 1);
1118
+ p_x4 = vld1q_lane_u32((uint32_t *)&pix4x, p_x4, 2);
1119
+ p_x4 = vld1q_lane_u32((uint32_t *)&pix5x, p_x4, 3);
1120
+
1121
+ uint16x8_t sum = vabdl_u8(vget_low_u8(p1), vget_low_u8(p_x4));
1122
+ uint16x8_t sum2 = vabdl_high_u8(p1, p_x4);
1123
+
1124
+ uint16x8_t a = vpaddq_u16(sum, sum2);
1125
+ result = vpadalq_u16(result, a);
1126
+ }
1127
+ if (lx >= 4)
1128
+ {
1129
+ result0 += vaddvq_u16(vsum16_0);
1130
+ result1 += vaddvq_u16(vsum16_1);
1131
+ result2 += vaddvq_u16(vsum16_2);
1132
+ result3 += vaddvq_u16(vsum16_3);
1133
+ }
1134
+
1135
+#endif
1136
+ if (lx & 3) for (; x < lx; x++)
1137
+ {
1138
+ result0 += abs(pix1x - pix2x);
1139
+ result1 += abs(pix1x - pix3x);
1140
+ result2 += abs(pix1x - pix4x);
1141
+ result3 += abs(pix1x - pix5x);
1142
+ }
1143
+
1144
+ pix1 += FENC_STRIDE;
1145
+ pix2 += frefstride;
1146
+ pix3 += frefstride;
1147
+ pix4 += frefstride;
1148
+ pix5 += frefstride;
1149
+ }
1150
+ vst1q_s32(res, result);
1151
+}
1152
+
1153
+
1154
+template<int lx, int ly, class T1, class T2>
1155
+sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
1156
+{
1157
+ sse_t sum = 0;
1158
+
1159
+ int32x4_t vsum1 = vdupq_n_s32(0);
1160
+ int32x4_t vsum2 = vdupq_n_s32(0);
1161
+ for (int y = 0; y < ly; y++)
1162
+ {
1163
+ int x = 0;
1164
+ for (; (x + 8) <= lx; x += 8)
1165
+ {
1166
+ int16x8_t tmp;
1167
+ if (sizeof(T1) == 2 && sizeof(T2) == 2)
1168
+ {
1169
+ tmp = vsubq_s16(*(int16x8_t *)&pix1x, *(int16x8_t *)&pix2x);
1170
+ }
1171
+ else if (sizeof(T1) == 1 && sizeof(T2) == 1)
1172
+ {
1173
+ tmp = vsubl_u8(*(uint8x8_t *)&pix1x, *(uint8x8_t *)&pix2x);
1174
+ }
1175
+ else
1176
+ {
1177
+ X265_CHECK(false, "unsupported sse");
1178
+ }
1179
+ vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
1180
+ vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
1181
+ }
1182
+ for (; x < lx; x++)
1183
+ {
1184
+ int tmp = pix1x - pix2x;
1185
+ sum += (tmp * tmp);
1186
+ }
1187
+
1188
+ if (sizeof(T1) == 2 && sizeof(T2) == 2)
1189
+ {
1190
+ int32x4_t vsum = vaddq_u32(vsum1, vsum2);;
1191
+ sum += vaddvq_u32(vsum);
1192
+ vsum1 = vsum2 = vdupq_n_u16(0);
1193
+ }
1194
+
1195
+ pix1 += stride_pix1;
1196
+ pix2 += stride_pix2;
1197
+ }
1198
+ int32x4_t vsum = vaddq_u32(vsum1, vsum2);
1199
+
1200
+ return sum + vaddvq_u32(vsum);
1201
+}
1202
+
1203
+
1204
+template<int bx, int by>
1205
+void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
1206
+{
1207
+ for (int y = 0; y < by; y++)
1208
+ {
1209
+ int x = 0;
1210
+ for (; (x + 8) <= bx; x += 8)
1211
+ {
1212
+#if HIGH_BIT_DEPTH
1213
+ *(int16x8_t *)&ax = *(int16x8_t *)&bx;
1214
+#else
1215
+ *(int16x8_t *)&ax = vmovl_u8(*(int8x8_t *)&bx);
1216
+#endif
1217
+ }
1218
+ for (; x < bx; x++)
1219
+ {
1220
+ ax = (int16_t)bx;
1221
+ }
1222
+
1223
+ a += stridea;
1224
+ b += strideb;
1225
+ }
1226
+}
1227
+
1228
+
1229
+template<int bx, int by>
1230
+void blockcopy_pp_neon(pixel *a, intptr_t stridea, const pixel *b, intptr_t strideb)
1231
+{
1232
+ for (int y = 0; y < by; y++)
1233
+ {
1234
+ int x = 0;
1235
+#if HIGH_BIT_DEPTH
1236
+ for (; (x + 8) <= bx; x += 8)
1237
+ {
1238
+ *(int16x8_t *)&ax = *(int16x8_t *)&bx;
1239
+ }
1240
+ if (bx & 4)
1241
+ {
1242
+ *(uint64_t *)&ax = *(uint64_t *)&bx;
1243
+ x += 4;
1244
+ }
1245
+#else
1246
+ for (; (x + 16) <= bx; x += 16)
1247
+ {
1248
+ *(uint8x16_t *)&ax = *(uint8x16_t *)&bx;
1249
+ }
1250
+ if (bx & 8)
1251
+ {
1252
+ *(uint8x8_t *)&ax = *(uint8x8_t *)&bx;
1253
+ x += 8;
1254
+ }
1255
+ if (bx & 4)
1256
+ {
1257
+ *(uint32_t *)&ax = *(uint32_t *)&bx;
1258
+ x += 4;
1259
+ }
1260
+#endif
1261
+ for (; x < bx; x++)
1262
+ {
1263
+ ax = bx;
1264
+ }
1265
+
1266
+ a += stridea;
1267
+ b += strideb;
1268
+ }
1269
+}
1270
+
1271
+
1272
+template<int bx, int by>
1273
+void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixel *b1, intptr_t sstride0,
1274
+ intptr_t sstride1)
1275
+{
1276
+ for (int y = 0; y < by; y++)
1277
+ {
1278
+ int x = 0;
1279
+ for (; (x + 8) <= bx; x += 8)
1280
+ {
1281
+#if HIGH_BIT_DEPTH
1282
+ *(int16x8_t *)&ax = vsubq_s16(*(int16x8_t *)&b0x, *(int16x8_t *)&b1x);
1283
+#else
1284
+ *(int16x8_t *)&ax = vsubl_u8(*(uint8x8_t *)&b0x, *(uint8x8_t *)&b1x);
1285
+#endif
1286
+ }
1287
+ for (; x < bx; x++)
1288
+ {
1289
+ ax = (int16_t)(b0x - b1x);
1290
+ }
1291
+
1292
+ b0 += sstride0;
1293
+ b1 += sstride1;
1294
+ a += dstride;
1295
+ }
1296
+}
1297
+
1298
+template<int bx, int by>
1299
+void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_t *b1, intptr_t sstride0,
1300
+ intptr_t sstride1)
1301
+{
1302
+ for (int y = 0; y < by; y++)
1303
+ {
1304
+ int x = 0;
1305
+ for (; (x + 8) <= bx; x += 8)
1306
+ {
1307
+ int16x8_t t;
1308
+ int16x8_t b1e = *(int16x8_t *)&b1x;
1309
+ int16x8_t b0e;
1310
+#if HIGH_BIT_DEPTH
1311
+ b0e = *(int16x8_t *)&b0x;
1312
+ t = vaddq_s16(b0e, b1e);
1313
+ t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
1314
+ t = vmaxq_s16(t, vdupq_n_s16(0));
1315
+ *(int16x8_t *)&ax = t;
1316
+#else
1317
+ b0e = vmovl_u8(*(uint8x8_t *)&b0x);
1318
+ t = vaddq_s16(b0e, b1e);
1319
+ *(uint8x8_t *)&ax = vqmovun_s16(t);
1320
+#endif
1321
+ }
1322
+ for (; x < bx; x++)
1323
+ {
1324
+ ax = (int16_t)x265_clip(b0x + b1x);
1325
+ }
1326
+
1327
+ b0 += sstride0;
1328
+ b1 += sstride1;
1329
+ a += dstride;
1330
+ }
1331
+}
1332
+
1333
+template<int bx, int by>
1334
+void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t src0Stride, intptr_t src1Stride,
1335
+ intptr_t dstStride)
1336
+{
1337
+
1338
+ const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
1339
+ const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
1340
+
1341
+ const int32x4_t addon = vdupq_n_s32(offset);
1342
+ for (int y = 0; y < by; y++)
1343
+ {
1344
+ int x = 0;
1345
+
1346
+ for (; (x + 8) <= bx; x += 8)
1347
+ {
1348
+ int16x8_t in0 = *(int16x8_t *)&src0x;
1349
+ int16x8_t in1 = *(int16x8_t *)&src1x;
1350
+ int32x4_t t1 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
1351
+ int32x4_t t2 = vaddl_high_s16(in0, in1);
1352
+ t1 = vaddq_s32(t1, addon);
1353
+ t2 = vaddq_s32(t2, addon);
1354
+ t1 = vshrq_n_s32(t1, shiftNum);
1355
+ t2 = vshrq_n_s32(t2, shiftNum);
1356
+ int16x8_t t = vuzp1q_s16(t1, t2);
1357
+#if HIGH_BIT_DEPTH
1358
+ t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
1359
+ t = vmaxq_s16(t, vdupq_n_s16(0));
1360
+ *(int16x8_t *)&dstx = t;
1361
+#else
1362
+ *(uint8x8_t *)&dstx = vqmovun_s16(t);
1363
+#endif
1364
+ }
1365
+ for (; x < bx; x += 2)
1366
+ {
1367
+ dstx + 0 = x265_clip((src0x + 0 + src1x + 0 + offset) >> shiftNum);
1368
+ dstx + 1 = x265_clip((src0x + 1 + src1x + 1 + offset) >> shiftNum);
1369
+ }
1370
+
1371
+ src0 += src0Stride;
1372
+ src1 += src1Stride;
1373
+ dst += dstStride;
1374
+ }
1375
+}
1376
+
1377
+template<int lx, int ly>
1378
+void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t sstride0, const pixel *src1,
1379
+ intptr_t sstride1, int)
1380
+{
1381
+ for (int y = 0; y < ly; y++)
1382
+ {
1383
+ int x = 0;
1384
+ for (; (x + 8) <= lx; x += 8)
1385
+ {
1386
+#if HIGH_BIT_DEPTH
1387
+ uint16x8_t in0 = *(uint16x8_t *)&src0x;
1388
+ uint16x8_t in1 = *(uint16x8_t *)&src1x;
1389
+ uint16x8_t t = vrhaddq_u16(in0, in1);
1390
+ *(uint16x8_t *)&dstx = t;
1391
+#else
1392
+ int16x8_t in0 = vmovl_u8(*(uint8x8_t *)&src0x);
1393
+ int16x8_t in1 = vmovl_u8(*(uint8x8_t *)&src1x);
1394
+ int16x8_t t = vrhaddq_s16(in0, in1);
1395
+ *(uint8x8_t *)&dstx = vmovn_u16(t);
1396
+#endif
1397
+ }
1398
+ for (; x < lx; x++)
1399
+ {
1400
+ dstx = (src0x + src1x + 1) >> 1;
1401
+ }
1402
+
1403
+ src0 += sstride0;
1404
+ src1 += sstride1;
1405
+ dst += dstride;
1406
+ }
1407
+}
1408
+
1409
+
1410
+template<int size>
1411
+void cpy1Dto2D_shl_neon(int16_t *dst, const int16_t *src, intptr_t dstStride, int shift)
1412
+{
1413
+ X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
1414
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
1415
+ X265_CHECK(shift >= 0, "invalid shift\n");
1416
+
1417
+ for (int i = 0; i < size; i++)
1418
+ {
1419
+ int j = 0;
1420
+ for (; (j + 8) <= size; j += 8)
1421
+ {
1422
+ *(int16x8_t *)&dstj = vshlq_s16(*(int16x8_t *)&srcj, vdupq_n_s16(shift));
1423
+ }
1424
+ for (; j < size; j++)
1425
+ {
1426
+ dstj = srcj << shift;
1427
+ }
1428
+ src += size;
1429
+ dst += dstStride;
1430
+ }
1431
+}
1432
+
1433
+
1434
+template<int size>
1435
+uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
1436
+{
1437
+ uint32_t sum = 0, sqr = 0;
1438
+
1439
+ int32x4_t vsqr = vdupq_n_s32(0);
1440
+ for (int y = 0; y < size; y++)
1441
+ {
1442
+ int x = 0;
1443
+ int16x8_t vsum = vdupq_n_s16(0);
1444
+ for (; (x + 8) <= size; x += 8)
1445
+ {
1446
+ int16x8_t in;
1447
+ in = vmovl_u8(*(uint8x8_t *)&pixx);
1448
+ vsum = vaddq_u16(vsum, in);
1449
+ vsqr = vmlal_s16(vsqr, vget_low_s16(in), vget_low_s16(in));
1450
+ vsqr = vmlal_high_s16(vsqr, in, in);
1451
+ }
1452
+ for (; x < size; x++)
1453
+ {
1454
+ sum += pixx;
1455
+ sqr += pixx * pixx;
1456
+ }
1457
+ sum += vaddvq_s16(vsum);
1458
+
1459
+ pix += i_stride;
1460
+ }
1461
+ sqr += vaddvq_u32(vsqr);
1462
+ return sum + ((uint64_t)sqr << 32);
1463
+}
1464
+
1465
+template<int blockSize>
1466
+void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, intptr_t stride)
1467
+{
1468
+ for (int y = 0; y < blockSize; y++)
1469
+ {
1470
+ int x = 0;
1471
+ for (; (x + 8) < blockSize; x += 8)
1472
+ {
1473
+ int16x8_t vfenc, vpred;
1474
+#if HIGH_BIT_DEPTH
1475
+ vfenc = *(int16x8_t *)&fencx;
1476
+ vpred = *(int16x8_t *)&predx;
1477
+#else
1478
+ vfenc = vmovl_u8(*(uint8x8_t *)&fencx);
1479
+ vpred = vmovl_u8(*(uint8x8_t *)&predx);
1480
+#endif
1481
+ *(int16x8_t *)&residualx = vsubq_s16(vfenc, vpred);
1482
+ }
1483
+ for (; x < blockSize; x++)
1484
+ {
1485
+ residualx = static_cast<int16_t>(fencx) - static_cast<int16_t>(predx);
1486
+ }
1487
+ fenc += stride;
1488
+ residual += stride;
1489
+ pred += stride;
1490
+ }
1491
+}
1492
+
1493
+template<int size>
1494
+int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
1495
+{
1496
+ static pixel zeroBuf8 /* = { 0 } */;
1497
+
1498
+ if (size)
1499
+ {
1500
+ int dim = 1 << (size + 2);
1501
+ uint32_t totEnergy = 0;
1502
+ for (int i = 0; i < dim; i += 8)
1503
+ {
1504
+ for (int j = 0; j < dim; j += 8)
1505
+ {
1506
+ /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
1507
+ int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
1508
+ (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
1509
+ int reconEnergy = pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
1510
+ (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
1511
+
1512
+ totEnergy += abs(sourceEnergy - reconEnergy);
1513
+ }
1514
+ }
1515
+ return totEnergy;
1516
+ }
1517
+ else
1518
+ {
1519
+ /* 4x4 is too small for sa8d */
1520
+ int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf,
1521
+ 0) >> 2);
1522
+ int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf,
1523
+ 0) >> 2);
1524
+ return abs(sourceEnergy - reconEnergy);
1525
+ }
1526
+}
1527
+
1528
+
1529
+template<int w, int h>
1530
+// Calculate sa8d in blocks of 8x8
1531
+int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
1532
+{
1533
+ int cost = 0;
1534
+
1535
+ for (int y = 0; y < h; y += 8)
1536
+ for (int x = 0; x < w; x += 8)
1537
+ {
1538
+ cost += pixel_sa8d_8x8_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
1539
+ }
1540
+
1541
+ return cost;
1542
+}
1543
+
1544
+template<int w, int h>
1545
+// Calculate sa8d in blocks of 16x16
1546
+int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
1547
+{
1548
+ int cost = 0;
1549
+
1550
+ for (int y = 0; y < h; y += 16)
1551
+ for (int x = 0; x < w; x += 16)
1552
+ {
1553
+ cost += pixel_sa8d_16x16_neon(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
1554
+ }
1555
+
1556
+ return cost;
1557
+}
1558
+
1559
+template<int size>
1560
+void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, int shift)
1561
+{
1562
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
1563
+ X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
1564
+ X265_CHECK(shift >= 0, "invalid shift\n");
1565
+
1566
+ for (int i = 0; i < size; i++)
1567
+ {
1568
+ for (int j = 0; j < size; j++)
1569
+ {
1570
+ dstj = srcj << shift;
1571
+ }
1572
+
1573
+ src += srcStride;
1574
+ dst += size;
1575
+ }
1576
+}
1577
+
1578
+
1579
+template<int w, int h>
1580
+// calculate satd in blocks of 4x4
1581
+int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
1582
+{
1583
+ int satd = 0;
1584
+
1585
+ for (int row = 0; row < h; row += 4)
1586
+ for (int col = 0; col < w; col += 4)
1587
+ satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1588
+ pix2 + row * stride_pix2 + col, stride_pix2);
1589
+
1590
+ return satd;
1591
+}
1592
+
1593
+template<int w, int h>
1594
+// calculate satd in blocks of 8x4
1595
+int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
1596
+{
1597
+ int satd = 0;
1598
+
1599
+ if (((w | h) & 15) == 0)
1600
+ {
1601
+ for (int row = 0; row < h; row += 16)
1602
+ for (int col = 0; col < w; col += 16)
1603
+ satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1604
+ pix2 + row * stride_pix2 + col, stride_pix2);
1605
+
1606
+ }
1607
+ else if (((w | h) & 7) == 0)
1608
+ {
1609
+ for (int row = 0; row < h; row += 8)
1610
+ for (int col = 0; col < w; col += 8)
1611
+ satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1612
+ pix2 + row * stride_pix2 + col, stride_pix2);
1613
+
1614
+ }
1615
+ else
1616
+ {
1617
+ for (int row = 0; row < h; row += 4)
1618
+ for (int col = 0; col < w; col += 8)
1619
+ satd += pixel_satd_8x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
1620
+ pix2 + row * stride_pix2 + col, stride_pix2);
1621
+ }
1622
+
1623
+ return satd;
1624
+}
1625
+
1626
+
1627
+template<int blockSize>
1628
+void transpose_neon(pixel *dst, const pixel *src, intptr_t stride)
1629
+{
1630
+ for (int k = 0; k < blockSize; k++)
1631
+ for (int l = 0; l < blockSize; l++)
1632
+ {
1633
+ dstk * blockSize + l = srcl * stride + k;
1634
+ }
1635
+}
1636
+
1637
+
1638
+template<>
1639
+void transpose_neon<8>(pixel *dst, const pixel *src, intptr_t stride)
1640
+{
1641
+ transpose8x8(dst, src, 8, stride);
1642
+}
1643
+
1644
+template<>
1645
+void transpose_neon<16>(pixel *dst, const pixel *src, intptr_t stride)
1646
+{
1647
+ transpose16x16(dst, src, 16, stride);
1648
+}
1649
+
1650
+template<>
1651
+void transpose_neon<32>(pixel *dst, const pixel *src, intptr_t stride)
1652
+{
1653
+ transpose32x32(dst, src, 32, stride);
1654
+}
1655
+
1656
+
1657
+template<>
1658
+void transpose_neon<64>(pixel *dst, const pixel *src, intptr_t stride)
1659
+{
1660
+ transpose32x32(dst, src, 64, stride);
1661
+ transpose32x32(dst + 32 * 64 + 32, src + 32 * stride + 32, 64, stride);
1662
+ transpose32x32(dst + 32 * 64, src + 32, 64, stride);
1663
+ transpose32x32(dst + 32, src + 32 * stride, 64, stride);
1664
+}
1665
+
1666
+
1667
+template<int size>
1668
+sse_t pixel_ssd_s_neon(const int16_t *a, intptr_t dstride)
1669
+{
1670
+ sse_t sum = 0;
1671
+
1672
+
1673
+ int32x4_t vsum = vdupq_n_s32(0);
1674
+
1675
+ for (int y = 0; y < size; y++)
1676
+ {
1677
+ int x = 0;
1678
+
1679
+ for (; (x + 8) <= size; x += 8)
1680
+ {
1681
+ int16x8_t in = *(int16x8_t *)&ax;
1682
+ vsum = vmlal_s16(vsum, vget_low_s16(in), vget_low_s16(in));
1683
+ vsum = vmlal_high_s16(vsum, (in), (in));
1684
+ }
1685
+ for (; x < size; x++)
1686
+ {
1687
+ sum += ax * ax;
1688
+ }
1689
+
1690
+ a += dstride;
1691
+ }
1692
+ return sum + vaddvq_s32(vsum);
1693
+}
1694
+
1695
+
1696
+};
1697
+
1698
+
1699
+
1700
+
1701
+namespace X265_NS
1702
+{
1703
+
1704
+
1705
+void setupPixelPrimitives_neon(EncoderPrimitives &p)
1706
+{
1707
+#define LUMA_PU(W, H) \
1708
+ p.puLUMA_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1709
+ p.puLUMA_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1710
+ p.puLUMA_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>; \
1711
+ p.puLUMA_ ## W ## x ## H.sad = sad_pp_neon<W, H>; \
1712
+ p.puLUMA_ ## W ## x ## H.sad_x3 = sad_x3_neon<W, H>; \
1713
+ p.puLUMA_ ## W ## x ## H.sad_x4 = sad_x4_neon<W, H>; \
1714
+ p.puLUMA_ ## W ## x ## H.pixelavg_ppNONALIGNED = pixelavg_pp_neon<W, H>; \
1715
+ p.puLUMA_ ## W ## x ## H.pixelavg_ppALIGNED = pixelavg_pp_neon<W, H>;
1716
+
1717
+#if !(HIGH_BIT_DEPTH)
1718
+#define LUMA_PU_S(W, H) \
1719
+ p.puLUMA_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1720
+ p.puLUMA_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1721
+ p.puLUMA_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>;
1722
+#else // !(HIGH_BIT_DEPTH)
1723
+#define LUMA_PU_S(W, H) \
1724
+ p.puLUMA_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1725
+ p.puLUMA_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1726
+ p.puLUMA_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>; \
1727
+ p.puLUMA_ ## W ## x ## H.sad_x3 = sad_x3_neon<W, H>; \
1728
+ p.puLUMA_ ## W ## x ## H.sad_x4 = sad_x4_neon<W, H>; \
1729
+ p.puLUMA_ ## W ## x ## H.pixelavg_ppNONALIGNED = pixelavg_pp_neon<W, H>; \
1730
+ p.puLUMA_ ## W ## x ## H.pixelavg_ppALIGNED = pixelavg_pp_neon<W, H>;
1731
+#endif // !(HIGH_BIT_DEPTH)
1732
+
1733
+#define LUMA_CU(W, H) \
1734
+ p.cuBLOCK_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
1735
+ p.cuBLOCK_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
1736
+ p.cuBLOCK_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>; \
1737
+ p.cuBLOCK_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1738
+ p.cuBLOCK_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
1739
+ p.cuBLOCK_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1740
+ p.cuBLOCK_ ## W ## x ## H.cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
1741
+ p.cuBLOCK_ ## W ## x ## H.cpy1Dto2D_shlNONALIGNED = cpy1Dto2D_shl_neon<W>; \
1742
+ p.cuBLOCK_ ## W ## x ## H.cpy1Dto2D_shlALIGNED = cpy1Dto2D_shl_neon<W>; \
1743
+ p.cuBLOCK_ ## W ## x ## H.psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
1744
+ p.cuBLOCK_ ## W ## x ## H.transpose = transpose_neon<W>;
1745
+
1746
+
1747
+ LUMA_PU_S(4, 4);
1748
+ LUMA_PU_S(8, 8);
1749
+ LUMA_PU(16, 16);
1750
+ LUMA_PU(32, 32);
1751
+ LUMA_PU(64, 64);
1752
+ LUMA_PU_S(4, 8);
1753
+ LUMA_PU_S(8, 4);
1754
+ LUMA_PU(16, 8);
1755
+ LUMA_PU_S(8, 16);
1756
+ LUMA_PU(16, 12);
1757
+ LUMA_PU(12, 16);
1758
+ LUMA_PU(16, 4);
1759
+ LUMA_PU_S(4, 16);
1760
+ LUMA_PU(32, 16);
1761
+ LUMA_PU(16, 32);
1762
+ LUMA_PU(32, 24);
1763
+ LUMA_PU(24, 32);
1764
+ LUMA_PU(32, 8);
1765
+ LUMA_PU_S(8, 32);
1766
+ LUMA_PU(64, 32);
1767
+ LUMA_PU(32, 64);
1768
+ LUMA_PU(64, 48);
1769
+ LUMA_PU(48, 64);
1770
+ LUMA_PU(64, 16);
1771
+ LUMA_PU(16, 64);
1772
+
1773
+#if defined(__APPLE__)
1774
+ p.puLUMA_4x4.sad = sad_pp_neon<4, 4>;
1775
+ p.puLUMA_4x8.sad = sad_pp_neon<4, 8>;
1776
+ p.puLUMA_4x16.sad = sad_pp_neon<4, 16>;
1777
+#endif // defined(__APPLE__)
1778
+ p.puLUMA_8x4.sad = sad_pp_neon<8, 4>;
1779
+ p.puLUMA_8x8.sad = sad_pp_neon<8, 8>;
1780
+ p.puLUMA_8x16.sad = sad_pp_neon<8, 16>;
1781
+ p.puLUMA_8x32.sad = sad_pp_neon<8, 32>;
1782
+
1783
+#if !(HIGH_BIT_DEPTH)
1784
+ p.puLUMA_4x4.sad_x3 = sad_x3_neon<4, 4>;
1785
+ p.puLUMA_4x4.sad_x4 = sad_x4_neon<4, 4>;
1786
+ p.puLUMA_4x8.sad_x3 = sad_x3_neon<4, 8>;
1787
+ p.puLUMA_4x8.sad_x4 = sad_x4_neon<4, 8>;
1788
+ p.puLUMA_4x16.sad_x3 = sad_x3_neon<4, 16>;
1789
+ p.puLUMA_4x16.sad_x4 = sad_x4_neon<4, 16>;
1790
+#endif // !(HIGH_BIT_DEPTH)
1791
+
1792
+ p.puLUMA_4x4.satd = pixel_satd_4x4_neon;
1793
+ p.puLUMA_8x4.satd = pixel_satd_8x4_neon;
1794
+
1795
+ p.puLUMA_8x8.satd = satd8_neon<8, 8>;
1796
+ p.puLUMA_16x16.satd = satd8_neon<16, 16>;
1797
+ p.puLUMA_16x8.satd = satd8_neon<16, 8>;
1798
+ p.puLUMA_8x16.satd = satd8_neon<8, 16>;
1799
+ p.puLUMA_16x12.satd = satd8_neon<16, 12>;
1800
+ p.puLUMA_16x4.satd = satd8_neon<16, 4>;
1801
+ p.puLUMA_32x32.satd = satd8_neon<32, 32>;
1802
+ p.puLUMA_32x16.satd = satd8_neon<32, 16>;
1803
+ p.puLUMA_16x32.satd = satd8_neon<16, 32>;
1804
+ p.puLUMA_32x24.satd = satd8_neon<32, 24>;
1805
+ p.puLUMA_24x32.satd = satd8_neon<24, 32>;
1806
+ p.puLUMA_32x8.satd = satd8_neon<32, 8>;
1807
+ p.puLUMA_8x32.satd = satd8_neon<8, 32>;
1808
+ p.puLUMA_64x64.satd = satd8_neon<64, 64>;
1809
+ p.puLUMA_64x32.satd = satd8_neon<64, 32>;
1810
+ p.puLUMA_32x64.satd = satd8_neon<32, 64>;
1811
+ p.puLUMA_64x48.satd = satd8_neon<64, 48>;
1812
+ p.puLUMA_48x64.satd = satd8_neon<48, 64>;
1813
+ p.puLUMA_64x16.satd = satd8_neon<64, 16>;
1814
+ p.puLUMA_16x64.satd = satd8_neon<16, 64>;
1815
+
1816
+#if HIGH_BIT_DEPTH
1817
+ p.puLUMA_4x8.satd = satd4_neon<4, 8>;
1818
+ p.puLUMA_4x16.satd = satd4_neon<4, 16>;
1819
+#endif // HIGH_BIT_DEPTH
1820
+
1821
+#if !defined(__APPLE__) || HIGH_BIT_DEPTH
1822
+ p.puLUMA_12x16.satd = satd4_neon<12, 16>;
1823
+#endif // !defined(__APPLE__)
1824
+
1825
+
1826
+ LUMA_CU(4, 4);
1827
+ LUMA_CU(8, 8);
1828
+ LUMA_CU(16, 16);
1829
+ LUMA_CU(32, 32);
1830
+ LUMA_CU(64, 64);
1831
+
1832
+#if !(HIGH_BIT_DEPTH)
1833
+ p.cuBLOCK_8x8.var = pixel_var_neon<8>;
1834
+ p.cuBLOCK_16x16.var = pixel_var_neon<16>;
1835
+#if defined(__APPLE__)
1836
+ p.cuBLOCK_32x32.var = pixel_var_neon<32>;
1837
+ p.cuBLOCK_64x64.var = pixel_var_neon<64>;
1838
+#endif // defined(__APPLE__)
1839
+#endif // !(HIGH_BIT_DEPTH)
1840
+
1841
+ p.cuBLOCK_16x16.blockfill_sNONALIGNED = blockfill_s_neon<16>;
1842
+ p.cuBLOCK_16x16.blockfill_sALIGNED = blockfill_s_neon<16>;
1843
+ p.cuBLOCK_32x32.blockfill_sNONALIGNED = blockfill_s_neon<32>;
1844
+ p.cuBLOCK_32x32.blockfill_sALIGNED = blockfill_s_neon<32>;
1845
+ p.cuBLOCK_64x64.blockfill_sNONALIGNED = blockfill_s_neon<64>;
1846
+ p.cuBLOCK_64x64.blockfill_sALIGNED = blockfill_s_neon<64>;
1847
+
1848
+
1849
+ p.cuBLOCK_4x4.calcresidualNONALIGNED = getResidual_neon<4>;
1850
+ p.cuBLOCK_4x4.calcresidualALIGNED = getResidual_neon<4>;
1851
+ p.cuBLOCK_8x8.calcresidualNONALIGNED = getResidual_neon<8>;
1852
+ p.cuBLOCK_8x8.calcresidualALIGNED = getResidual_neon<8>;
1853
+ p.cuBLOCK_16x16.calcresidualNONALIGNED = getResidual_neon<16>;
1854
+ p.cuBLOCK_16x16.calcresidualALIGNED = getResidual_neon<16>;
1855
+
1856
+#if defined(__APPLE__)
1857
+ p.cuBLOCK_32x32.calcresidualNONALIGNED = getResidual_neon<32>;
1858
+ p.cuBLOCK_32x32.calcresidualALIGNED = getResidual_neon<32>;
1859
+#endif // defined(__APPLE__)
1860
+
1861
+ p.cuBLOCK_4x4.sa8d = pixel_satd_4x4_neon;
1862
+ p.cuBLOCK_8x8.sa8d = pixel_sa8d_8x8_neon;
1863
+ p.cuBLOCK_16x16.sa8d = pixel_sa8d_16x16_neon;
1864
+ p.cuBLOCK_32x32.sa8d = sa8d16<32, 32>;
1865
+ p.cuBLOCK_64x64.sa8d = sa8d16<64, 64>;
1866
+
1867
+
1868
+#define CHROMA_PU_420(W, H) \
1869
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1870
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>; \
1871
+ p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1872
+
1873
+
1874
+ CHROMA_PU_420(4, 4);
1875
+ CHROMA_PU_420(8, 8);
1876
+ CHROMA_PU_420(16, 16);
1877
+ CHROMA_PU_420(32, 32);
1878
+ CHROMA_PU_420(4, 2);
1879
+ CHROMA_PU_420(8, 4);
1880
+ CHROMA_PU_420(4, 8);
1881
+ CHROMA_PU_420(8, 6);
1882
+ CHROMA_PU_420(6, 8);
1883
+ CHROMA_PU_420(8, 2);
1884
+ CHROMA_PU_420(2, 8);
1885
+ CHROMA_PU_420(16, 8);
1886
+ CHROMA_PU_420(8, 16);
1887
+ CHROMA_PU_420(16, 12);
1888
+ CHROMA_PU_420(12, 16);
1889
+ CHROMA_PU_420(16, 4);
1890
+ CHROMA_PU_420(4, 16);
1891
+ CHROMA_PU_420(32, 16);
1892
+ CHROMA_PU_420(16, 32);
1893
+ CHROMA_PU_420(32, 24);
1894
+ CHROMA_PU_420(24, 32);
1895
+ CHROMA_PU_420(32, 8);
1896
+ CHROMA_PU_420(8, 32);
1897
+
1898
+
1899
+
1900
+ p.chromaX265_CSP_I420.puCHROMA_420_2x2.satd = NULL;
1901
+ p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd = pixel_satd_4x4_neon;
1902
+ p.chromaX265_CSP_I420.puCHROMA_420_8x8.satd = satd8_neon<8, 8>;
1903
+ p.chromaX265_CSP_I420.puCHROMA_420_16x16.satd = satd8_neon<16, 16>;
1904
+ p.chromaX265_CSP_I420.puCHROMA_420_32x32.satd = satd8_neon<32, 32>;
1905
+
1906
+ p.chromaX265_CSP_I420.puCHROMA_420_4x2.satd = NULL;
1907
+ p.chromaX265_CSP_I420.puCHROMA_420_2x4.satd = NULL;
1908
+ p.chromaX265_CSP_I420.puCHROMA_420_8x4.satd = pixel_satd_8x4_neon;
1909
+ p.chromaX265_CSP_I420.puCHROMA_420_16x8.satd = satd8_neon<16, 8>;
1910
+ p.chromaX265_CSP_I420.puCHROMA_420_8x16.satd = satd8_neon<8, 16>;
1911
+ p.chromaX265_CSP_I420.puCHROMA_420_32x16.satd = satd8_neon<32, 16>;
1912
+ p.chromaX265_CSP_I420.puCHROMA_420_16x32.satd = satd8_neon<16, 32>;
1913
+
1914
+ p.chromaX265_CSP_I420.puCHROMA_420_8x6.satd = NULL;
1915
+ p.chromaX265_CSP_I420.puCHROMA_420_6x8.satd = NULL;
1916
+ p.chromaX265_CSP_I420.puCHROMA_420_8x2.satd = NULL;
1917
+ p.chromaX265_CSP_I420.puCHROMA_420_2x8.satd = NULL;
1918
+ p.chromaX265_CSP_I420.puCHROMA_420_16x12.satd = satd4_neon<16, 12>;
1919
+ p.chromaX265_CSP_I420.puCHROMA_420_16x4.satd = satd4_neon<16, 4>;
1920
+ p.chromaX265_CSP_I420.puCHROMA_420_32x24.satd = satd8_neon<32, 24>;
1921
+ p.chromaX265_CSP_I420.puCHROMA_420_24x32.satd = satd8_neon<24, 32>;
1922
+ p.chromaX265_CSP_I420.puCHROMA_420_32x8.satd = satd8_neon<32, 8>;
1923
+ p.chromaX265_CSP_I420.puCHROMA_420_8x32.satd = satd8_neon<8, 32>;
1924
+
1925
+#if HIGH_BIT_DEPTH
1926
+ p.chromaX265_CSP_I420.puCHROMA_420_4x8.satd = satd4_neon<4, 8>;
1927
+ p.chromaX265_CSP_I420.puCHROMA_420_4x16.satd = satd4_neon<4, 16>;
1928
+#endif // HIGH_BIT_DEPTH
1929
+
1930
+#if !defined(__APPLE__) || HIGH_BIT_DEPTH
1931
+ p.chromaX265_CSP_I420.puCHROMA_420_12x16.satd = satd4_neon<12, 16>;
1932
+#endif // !defined(__APPLE__)
1933
+
1934
+
1935
+#define CHROMA_CU_420(W, H) \
1936
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.sse_pp = sse_neon<W, H, pixel, pixel>; \
1937
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1938
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
1939
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
1940
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
1941
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
1942
+
1943
+#define CHROMA_CU_S_420(W, H) \
1944
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1945
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
1946
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
1947
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
1948
+ p.chromaX265_CSP_I420.cuBLOCK_420_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
1949
+
1950
+
1951
+ CHROMA_CU_S_420(4, 4)
1952
+ CHROMA_CU_420(8, 8)
1953
+ CHROMA_CU_420(16, 16)
1954
+ CHROMA_CU_420(32, 32)
1955
+
1956
+
1957
+ p.chromaX265_CSP_I420.cuBLOCK_8x8.sa8d = p.chromaX265_CSP_I420.puCHROMA_420_4x4.satd;
1958
+ p.chromaX265_CSP_I420.cuBLOCK_16x16.sa8d = sa8d8<8, 8>;
1959
+ p.chromaX265_CSP_I420.cuBLOCK_32x32.sa8d = sa8d16<16, 16>;
1960
+ p.chromaX265_CSP_I420.cuBLOCK_64x64.sa8d = sa8d16<32, 32>;
1961
+
1962
+
1963
+#define CHROMA_PU_422(W, H) \
1964
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.addAvgNONALIGNED = addAvg_neon<W, H>; \
1965
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.addAvgALIGNED = addAvg_neon<W, H>; \
1966
+ p.chromaX265_CSP_I422.puCHROMA_422_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
1967
+
1968
+
1969
+ CHROMA_PU_422(4, 8);
1970
+ CHROMA_PU_422(8, 16);
1971
+ CHROMA_PU_422(16, 32);
1972
+ CHROMA_PU_422(32, 64);
1973
+ CHROMA_PU_422(4, 4);
1974
+ CHROMA_PU_422(2, 8);
1975
+ CHROMA_PU_422(8, 8);
1976
+ CHROMA_PU_422(4, 16);
1977
+ CHROMA_PU_422(8, 12);
1978
+ CHROMA_PU_422(6, 16);
1979
+ CHROMA_PU_422(8, 4);
1980
+ CHROMA_PU_422(2, 16);
1981
+ CHROMA_PU_422(16, 16);
1982
+ CHROMA_PU_422(8, 32);
1983
+ CHROMA_PU_422(16, 24);
1984
+ CHROMA_PU_422(12, 32);
1985
+ CHROMA_PU_422(16, 8);
1986
+ CHROMA_PU_422(4, 32);
1987
+ CHROMA_PU_422(32, 32);
1988
+ CHROMA_PU_422(16, 64);
1989
+ CHROMA_PU_422(32, 48);
1990
+ CHROMA_PU_422(24, 64);
1991
+ CHROMA_PU_422(32, 16);
1992
+ CHROMA_PU_422(8, 64);
1993
+
1994
+
1995
+ p.chromaX265_CSP_I422.puCHROMA_422_2x4.satd = NULL;
1996
+ p.chromaX265_CSP_I422.puCHROMA_422_8x16.satd = satd8_neon<8, 16>;
1997
+ p.chromaX265_CSP_I422.puCHROMA_422_16x32.satd = satd8_neon<16, 32>;
1998
+ p.chromaX265_CSP_I422.puCHROMA_422_32x64.satd = satd8_neon<32, 64>;
1999
+ p.chromaX265_CSP_I422.puCHROMA_422_4x4.satd = pixel_satd_4x4_neon;
2000
+ p.chromaX265_CSP_I422.puCHROMA_422_2x8.satd = NULL;
2001
+ p.chromaX265_CSP_I422.puCHROMA_422_8x8.satd = satd8_neon<8, 8>;
2002
+ p.chromaX265_CSP_I422.puCHROMA_422_16x16.satd = satd8_neon<16, 16>;
2003
+ p.chromaX265_CSP_I422.puCHROMA_422_8x32.satd = satd8_neon<8, 32>;
2004
+ p.chromaX265_CSP_I422.puCHROMA_422_32x32.satd = satd8_neon<32, 32>;
2005
+ p.chromaX265_CSP_I422.puCHROMA_422_16x64.satd = satd8_neon<16, 64>;
2006
+ p.chromaX265_CSP_I422.puCHROMA_422_6x16.satd = NULL;
2007
+ p.chromaX265_CSP_I422.puCHROMA_422_8x4.satd = satd4_neon<8, 4>;
2008
+ p.chromaX265_CSP_I422.puCHROMA_422_2x16.satd = NULL;
2009
+ p.chromaX265_CSP_I422.puCHROMA_422_16x8.satd = satd8_neon<16, 8>;
2010
+ p.chromaX265_CSP_I422.puCHROMA_422_32x16.satd = satd8_neon<32, 16>;
2011
+
2012
+ p.chromaX265_CSP_I422.puCHROMA_422_8x12.satd = satd4_neon<8, 12>;
2013
+ p.chromaX265_CSP_I422.puCHROMA_422_8x64.satd = satd8_neon<8, 64>;
2014
+ p.chromaX265_CSP_I422.puCHROMA_422_12x32.satd = satd4_neon<12, 32>;
2015
+ p.chromaX265_CSP_I422.puCHROMA_422_16x24.satd = satd8_neon<16, 24>;
2016
+ p.chromaX265_CSP_I422.puCHROMA_422_24x64.satd = satd8_neon<24, 64>;
2017
+ p.chromaX265_CSP_I422.puCHROMA_422_32x48.satd = satd8_neon<32, 48>;
2018
+
2019
+#if HIGH_BIT_DEPTH
2020
+ p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd = satd4_neon<4, 8>;
2021
+ p.chromaX265_CSP_I422.puCHROMA_422_4x16.satd = satd4_neon<4, 16>;
2022
+ p.chromaX265_CSP_I422.puCHROMA_422_4x32.satd = satd4_neon<4, 32>;
2023
+#endif // HIGH_BIT_DEPTH
2024
+
2025
+
2026
+#define CHROMA_CU_422(W, H) \
2027
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.sse_pp = sse_neon<W, H, pixel, pixel>; \
2028
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
2029
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
2030
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
2031
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
2032
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
2033
+
2034
+#define CHROMA_CU_S_422(W, H) \
2035
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_pp = blockcopy_pp_neon<W, H>; \
2036
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.copy_ps = blockcopy_ps_neon<W, H>; \
2037
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.sub_ps = pixel_sub_ps_neon<W, H>; \
2038
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psNONALIGNED = pixel_add_ps_neon<W, H>; \
2039
+ p.chromaX265_CSP_I422.cuBLOCK_422_ ## W ## x ## H.add_psALIGNED = pixel_add_ps_neon<W, H>;
2040
+
2041
+
2042
+ CHROMA_CU_S_422(4, 8)
2043
+ CHROMA_CU_422(8, 16)
2044
+ CHROMA_CU_422(16, 32)
2045
+ CHROMA_CU_422(32, 64)
2046
+
2047
+ p.chromaX265_CSP_I422.cuBLOCK_8x8.sa8d = p.chromaX265_CSP_I422.puCHROMA_422_4x8.satd;
2048
+ p.chromaX265_CSP_I422.cuBLOCK_16x16.sa8d = sa8d8<8, 16>;
2049
+ p.chromaX265_CSP_I422.cuBLOCK_32x32.sa8d = sa8d16<16, 32>;
2050
+ p.chromaX265_CSP_I422.cuBLOCK_64x64.sa8d = sa8d16<32, 64>;
2051
+
2052
+
2053
+}
2054
+
2055
+
2056
+}
2057
+
2058
+
2059
+#endif
2060
+
2061
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.h
Added
25
1
2
+#ifndef PIXEL_PRIM_NEON_H__
3
+#define PIXEL_PRIM_NEON_H__
4
+
5
+#include "common.h"
6
+#include "slicetype.h" // LOWRES_COST_MASK
7
+#include "primitives.h"
8
+#include "x265.h"
9
+
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+
17
+void setupPixelPrimitives_neon(EncoderPrimitives &p);
18
+
19
+
20
+}
21
+
22
+
23
+#endif
24
+
25
x265_3.6.tar.gz/source/common/aarch64/pixel-util-common.S
Added
86
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.macro pixel_var_start
39
+ movi v0.16b, #0
40
+ movi v1.16b, #0
41
+ movi v2.16b, #0
42
+ movi v3.16b, #0
43
+.endm
44
+
45
+.macro pixel_var_1 v
46
+ uaddw v0.8h, v0.8h, \v\().8b
47
+ umull v30.8h, \v\().8b, \v\().8b
48
+ uaddw2 v1.8h, v1.8h, \v\().16b
49
+ umull2 v31.8h, \v\().16b, \v\().16b
50
+ uadalp v2.4s, v30.8h
51
+ uadalp v3.4s, v31.8h
52
+.endm
53
+
54
+.macro pixel_var_end
55
+ uaddlv s0, v0.8h
56
+ uaddlv s1, v1.8h
57
+ add v2.4s, v2.4s, v3.4s
58
+ fadd s0, s0, s1
59
+ uaddlv d2, v2.4s
60
+ fmov w0, s0
61
+ fmov x2, d2
62
+ orr x0, x0, x2, lsl #32
63
+.endm
64
+
65
+.macro ssimDist_start
66
+ movi v0.16b, #0
67
+ movi v1.16b, #0
68
+.endm
69
+
70
+.macro ssimDist_end
71
+ uaddlv d0, v0.4s
72
+ uaddlv d1, v1.4s
73
+ str d0, x6
74
+ str d1, x4
75
+.endm
76
+
77
+.macro normFact_start
78
+ movi v0.16b, #0
79
+.endm
80
+
81
+.macro normFact_end
82
+ uaddlv d0, v0.4s
83
+ str d0, x3
84
+.endm
85
+
86
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S
Added
375
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sub_ps_8x16_sve)
41
+ lsl x1, x1, #1
42
+ ptrue p0.h, vl8
43
+.rept 8
44
+ ld1b {z0.h}, p0/z, x2
45
+ ld1b {z1.h}, p0/z, x3
46
+ add x2, x2, x4
47
+ add x3, x3, x5
48
+ ld1b {z2.h}, p0/z, x2
49
+ ld1b {z3.h}, p0/z, x3
50
+ add x2, x2, x4
51
+ add x3, x3, x5
52
+ sub z4.h, z0.h, z1.h
53
+ sub z5.h, z2.h, z3.h
54
+ st1 {v4.8h}, x0, x1
55
+ st1 {v5.8h}, x0, x1
56
+.endr
57
+ ret
58
+endfunc
59
+
60
+//******* satd *******
61
+.macro satd_4x4_sve
62
+ ld1b {z0.h}, p0/z, x0
63
+ ld1b {z2.h}, p0/z, x2
64
+ add x0, x0, x1
65
+ add x2, x2, x3
66
+ ld1b {z1.h}, p0/z, x0
67
+ ld1b {z3.h}, p0/z, x2
68
+ add x0, x0, x1
69
+ add x2, x2, x3
70
+ ld1b {z4.h}, p0/z, x0
71
+ ld1b {z6.h}, p0/z, x2
72
+ add x0, x0, x1
73
+ add x2, x2, x3
74
+ ld1b {z5.h}, p0/z, x0
75
+ ld1b {z7.h}, p0/z, x2
76
+ add x0, x0, x1
77
+ add x2, x2, x3
78
+
79
+ sub z0.h, z0.h, z2.h
80
+ sub z1.h, z1.h, z3.h
81
+ sub z2.h, z4.h, z6.h
82
+ sub z3.h, z5.h, z7.h
83
+
84
+ add z4.h, z0.h, z2.h
85
+ add z5.h, z1.h, z3.h
86
+ sub z6.h, z0.h, z2.h
87
+ sub z7.h, z1.h, z3.h
88
+
89
+ add z0.h, z4.h, z5.h
90
+ sub z1.h, z4.h, z5.h
91
+
92
+ add z2.h, z6.h, z7.h
93
+ sub z3.h, z6.h, z7.h
94
+
95
+ trn1 z4.h, z0.h, z2.h
96
+ trn2 z5.h, z0.h, z2.h
97
+
98
+ trn1 z6.h, z1.h, z3.h
99
+ trn2 z7.h, z1.h, z3.h
100
+
101
+ add z0.h, z4.h, z5.h
102
+ sub z1.h, z4.h, z5.h
103
+
104
+ add z2.h, z6.h, z7.h
105
+ sub z3.h, z6.h, z7.h
106
+
107
+ trn1 z4.s, z0.s, z1.s
108
+ trn2 z5.s, z0.s, z1.s
109
+
110
+ trn1 z6.s, z2.s, z3.s
111
+ trn2 z7.s, z2.s, z3.s
112
+
113
+ abs z4.h, p0/m, z4.h
114
+ abs z5.h, p0/m, z5.h
115
+ abs z6.h, p0/m, z6.h
116
+ abs z7.h, p0/m, z7.h
117
+
118
+ smax z4.h, p0/m, z4.h, z5.h
119
+ smax z6.h, p0/m, z6.h, z7.h
120
+
121
+ add z0.h, z4.h, z6.h
122
+
123
+ uaddlp v0.2s, v0.4h
124
+ uaddlp v0.1d, v0.2s
125
+.endm
126
+
127
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
128
+function PFX(pixel_satd_4x4_sve)
129
+ ptrue p0.h, vl4
130
+ satd_4x4_sve
131
+ fmov x0, d0
132
+ ret
133
+endfunc
134
+
135
+function PFX(pixel_satd_8x4_sve)
136
+ ptrue p0.h, vl4
137
+ mov x4, x0
138
+ mov x5, x2
139
+ satd_4x4_sve
140
+ add x0, x4, #4
141
+ add x2, x5, #4
142
+ umov x6, v0.d0
143
+ satd_4x4_sve
144
+ umov x0, v0.d0
145
+ add x0, x0, x6
146
+ ret
147
+endfunc
148
+
149
+function PFX(pixel_satd_8x12_sve)
150
+ ptrue p0.h, vl4
151
+ mov x4, x0
152
+ mov x5, x2
153
+ mov x7, #0
154
+ satd_4x4_sve
155
+ umov x6, v0.d0
156
+ add x7, x7, x6
157
+ add x0, x4, #4
158
+ add x2, x5, #4
159
+ satd_4x4_sve
160
+ umov x6, v0.d0
161
+ add x7, x7, x6
162
+.rept 2
163
+ sub x0, x0, #4
164
+ sub x2, x2, #4
165
+ mov x4, x0
166
+ mov x5, x2
167
+ satd_4x4_sve
168
+ umov x6, v0.d0
169
+ add x7, x7, x6
170
+ add x0, x4, #4
171
+ add x2, x5, #4
172
+ satd_4x4_sve
173
+ umov x6, v0.d0
174
+ add x7, x7, x6
175
+.endr
176
+ mov x0, x7
177
+ ret
178
+endfunc
179
+
180
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
181
+ mov x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
182
+ ld1b {z0.h}, p0/z, x0
183
+ ld1b {z1.h}, p0/z, x0, x11
184
+ ld1b {z2.h}, p0/z, x2
185
+ ld1b {z3.h}, p0/z, x2, x11
186
+ add x0, x0, x1
187
+ add x2, x2, x3
188
+ ld1b {z4.h}, p0/z, x0
189
+ ld1b {z5.h}, p0/z, x0, x11
190
+ ld1b {z6.h}, p0/z, x2
191
+ ld1b {z7.h}, p0/z, x2, x11
192
+ add x0, x0, x1
193
+ add x2, x2, x3
194
+ ld1b {z29.h}, p0/z, x0
195
+ ld1b {z9.h}, p0/z, x0, x11
196
+ ld1b {z10.h}, p0/z, x2
197
+ ld1b {z11.h}, p0/z, x2, x11
198
+ add x0, x0, x1
199
+ add x2, x2, x3
200
+ ld1b {z12.h}, p0/z, x0
201
+ ld1b {z13.h}, p0/z, x0, x11
202
+ ld1b {z14.h}, p0/z, x2
203
+ ld1b {z15.h}, p0/z, x2, x11
204
+ add x0, x0, x1
205
+ add x2, x2, x3
206
+
207
+ sub \v0\().h, z0.h, z2.h
208
+ sub \v4\().h, z1.h, z3.h
209
+ sub \v1\().h, z4.h, z6.h
210
+ sub \v5\().h, z5.h, z7.h
211
+ sub \v2\().h, z29.h, z10.h
212
+ sub \v6\().h, z9.h, z11.h
213
+ sub \v3\().h, z12.h, z14.h
214
+ sub \v7\().h, z13.h, z15.h
215
+.endm
216
+
217
+// one vertical hadamard pass and two horizontal
218
+function PFX(satd_8x4v_8x8h_sve), export=0
219
+ HADAMARD4_V z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
220
+ HADAMARD4_V z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
221
+ trn4 z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
222
+ trn4 z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
223
+ SUMSUB_ABCD z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
224
+ SUMSUB_ABCD z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
225
+ trn4 z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
226
+ trn4 z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
227
+ ABS8_SVE z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
228
+ smax z0.h, p0/m, z0.h, z2.h
229
+ smax z1.h, p0/m, z1.h, z3.h
230
+ smax z4.h, p0/m, z4.h, z6.h
231
+ smax z5.h, p0/m, z5.h, z7.h
232
+ ret
233
+endfunc
234
+
235
+function PFX(satd_16x4_sve), export=0
236
+ LOAD_DIFF_16x4_sve z16, z17, z18, z19, z20, z21, z22, z23
237
+ b PFX(satd_8x4v_8x8h_sve)
238
+endfunc
239
+
240
+.macro pixel_satd_32x8_sve
241
+ mov x4, x0
242
+ mov x5, x2
243
+.rept 2
244
+ bl PFX(satd_16x4_sve)
245
+ add z30.h, z30.h, z0.h
246
+ add z31.h, z31.h, z1.h
247
+ add z30.h, z30.h, z4.h
248
+ add z31.h, z31.h, z5.h
249
+.endr
250
+ add x0, x4, #16
251
+ add x2, x5, #16
252
+.rept 2
253
+ bl PFX(satd_16x4_sve)
254
+ add z30.h, z30.h, z0.h
255
+ add z31.h, z31.h, z1.h
256
+ add z30.h, z30.h, z4.h
257
+ add z31.h, z31.h, z5.h
258
+.endr
259
+.endm
260
+
261
+.macro satd_32x16_sve
262
+ movi v30.2d, #0
263
+ movi v31.2d, #0
264
+ pixel_satd_32x8_sve
265
+ sub x0, x0, #16
266
+ sub x2, x2, #16
267
+ pixel_satd_32x8_sve
268
+ add z0.h, z30.h, z31.h
269
+ uaddlv s0, v0.8h
270
+ mov w6, v0.s0
271
+.endm
272
+
273
+function PFX(pixel_satd_32x16_sve)
274
+ ptrue p0.h, vl8
275
+ mov x10, x30
276
+ satd_32x16_sve
277
+ mov x0, x6
278
+ ret x10
279
+endfunc
280
+
281
+function PFX(pixel_satd_32x32_sve)
282
+ ptrue p0.h, vl8
283
+ mov x10, x30
284
+ mov x7, #0
285
+ satd_32x16_sve
286
+ sub x0, x0, #16
287
+ sub x2, x2, #16
288
+ add x7, x7, x6
289
+ satd_32x16_sve
290
+ add x0, x7, x6
291
+ ret x10
292
+endfunc
293
+
294
+.macro satd_64x16_sve
295
+ mov x8, x0
296
+ mov x9, x2
297
+ satd_32x16_sve
298
+ add x7, x7, x6
299
+ add x0, x8, #32
300
+ add x2, x9, #32
301
+ satd_32x16_sve
302
+ add x7, x7, x6
303
+.endm
304
+
305
+function PFX(pixel_satd_64x48_sve)
306
+ ptrue p0.h, vl8
307
+ mov x10, x30
308
+ mov x7, #0
309
+.rept 2
310
+ satd_64x16_sve
311
+ sub x0, x0, #48
312
+ sub x2, x2, #48
313
+.endr
314
+ satd_64x16_sve
315
+ mov x0, x7
316
+ ret x10
317
+endfunc
318
+
319
+/********* ssim ***********/
320
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
321
+// No need to fully use sve instructions for this function
322
+function PFX(quant_sve)
323
+ mov w9, #1
324
+ lsl w9, w9, w4
325
+ mov z0.s, w9
326
+ neg w9, w4
327
+ mov z1.s, w9
328
+ add w9, w9, #8
329
+ mov z2.s, w9
330
+ mov z3.s, w5
331
+
332
+ lsr w6, w6, #2
333
+ eor z4.d, z4.d, z4.d
334
+ eor w10, w10, w10
335
+ eor z17.d, z17.d, z17.d
336
+
337
+.loop_quant_sve:
338
+ ld1 {v18.4h}, x0, #8
339
+ ld1 {v7.4s}, x1, #16
340
+ sxtl v6.4s, v18.4h
341
+
342
+ cmlt v5.4s, v6.4s, #0
343
+
344
+ abs v6.4s, v6.4s
345
+
346
+
347
+ mul v6.4s, v6.4s, v7.4s
348
+
349
+ add v7.4s, v6.4s, v3.4s
350
+ sshl v7.4s, v7.4s, v1.4s
351
+
352
+ mls v6.4s, v7.4s, v0.s0
353
+ sshl v16.4s, v6.4s, v2.4s
354
+ st1 {v16.4s}, x2, #16
355
+
356
+ // numsig
357
+ cmeq v16.4s, v7.4s, v17.4s
358
+ add v4.4s, v4.4s, v16.4s
359
+ add w10, w10, #4
360
+
361
+ // level *= sign
362
+ eor z16.d, z7.d, z5.d
363
+ sub v16.4s, v16.4s, v5.4s
364
+ sqxtn v5.4h, v16.4s
365
+ st1 {v5.4h}, x3, #8
366
+
367
+ subs w6, w6, #1
368
+ b.ne .loop_quant_sve
369
+
370
+ addv s4, v4.4s
371
+ mov w9, v4.s0
372
+ add w0, w10, w9
373
+ ret
374
+endfunc
375
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S
Added
1688
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
41
+function PFX(pixel_var_8x8_sve2)
42
+ ptrue p0.h, vl8
43
+ ld1b {z0.h}, p0/z, x0
44
+ add x0, x0, x1
45
+ mul z31.h, z0.h, z0.h
46
+ uaddlp v1.4s, v31.8h
47
+.rept 7
48
+ ld1b {z4.h}, p0/z, x0
49
+ add x0, x0, x1
50
+ add z0.h, z0.h, z4.h
51
+ mul z31.h, z4.h, z4.h
52
+ uadalp z1.s, p0/m, z31.h
53
+.endr
54
+ uaddlv s0, v0.8h
55
+ uaddlv d1, v1.4s
56
+ fmov w0, s0
57
+ fmov x1, d1
58
+ orr x0, x0, x1, lsl #32
59
+ ret
60
+endfunc
61
+
62
+function PFX(pixel_var_16x16_sve2)
63
+ rdvl x9, #1
64
+ cmp x9, #16
65
+ bgt .vl_gt_16_pixel_var_16x16
66
+ pixel_var_start
67
+ mov w12, #16
68
+.loop_var_16_sve2:
69
+ sub w12, w12, #1
70
+ ld1 {v4.16b}, x0, x1
71
+ pixel_var_1 v4
72
+ cbnz w12, .loop_var_16_sve2
73
+ pixel_var_end
74
+ ret
75
+.vl_gt_16_pixel_var_16x16:
76
+ ptrue p0.h, vl16
77
+ mov z0.d, #0
78
+.rept 16
79
+ ld1b {z4.h}, p0/z, x0
80
+ add x0, x0, x1
81
+ add z0.h, z0.h, z4.h
82
+ mul z30.h, z4.h, z4.h
83
+ uadalp z1.s, p0/m, z30.h
84
+.endr
85
+ uaddv d0, p0, z0.h
86
+ uaddv d1, p0, z1.s
87
+ fmov w0, s0
88
+ fmov x1, d1
89
+ orr x0, x0, x1, lsl #32
90
+ ret
91
+endfunc
92
+
93
+function PFX(pixel_var_32x32_sve2)
94
+ rdvl x9, #1
95
+ cmp x9, #16
96
+ bgt .vl_gt_16_pixel_var_32x32
97
+ pixel_var_start
98
+ mov w12, #32
99
+.loop_var_32_sve2:
100
+ sub w12, w12, #1
101
+ ld1 {v4.16b-v5.16b}, x0, x1
102
+ pixel_var_1 v4
103
+ pixel_var_1 v5
104
+ cbnz w12, .loop_var_32_sve2
105
+ pixel_var_end
106
+ ret
107
+.vl_gt_16_pixel_var_32x32:
108
+ cmp x9, #48
109
+ bgt .vl_gt_48_pixel_var_32x32
110
+ ptrue p0.b, vl32
111
+ mov z0.d, #0
112
+ mov z1.d, #0
113
+.rept 32
114
+ ld1b {z4.b}, p0/z, x0
115
+ add x0, x0, x1
116
+ uaddwb z0.h, z0.h, z4.b
117
+ uaddwt z0.h, z0.h, z4.b
118
+ umullb z28.h, z4.b, z4.b
119
+ umullt z29.h, z4.b, z4.b
120
+ uadalp z1.s, p0/m, z28.h
121
+ uadalp z1.s, p0/m, z29.h
122
+.endr
123
+ uaddv d0, p0, z0.h
124
+ uaddv d1, p0, z1.s
125
+ fmov w0, s0
126
+ fmov x1, d1
127
+ orr x0, x0, x1, lsl #32
128
+ ret
129
+.vl_gt_48_pixel_var_32x32:
130
+ ptrue p0.h, vl32
131
+ mov z0.d, #0
132
+ mov z1.d, #0
133
+.rept 32
134
+ ld1b {z4.h}, p0/z, x0
135
+ add x0, x0, x1
136
+ add z0.h, z0.h, z4.h
137
+ mul z28.h, z4.h, z4.h
138
+ uadalp z1.s, p0/m, z28.h
139
+.endr
140
+ uaddv d0, p0, z0.h
141
+ uaddv d1, p0, z1.s
142
+ fmov w0, s0
143
+ fmov x1, d1
144
+ orr x0, x0, x1, lsl #32
145
+ ret
146
+endfunc
147
+
148
+function PFX(pixel_var_64x64_sve2)
149
+ rdvl x9, #1
150
+ cmp x9, #16
151
+ bgt .vl_gt_16_pixel_var_64x64
152
+ pixel_var_start
153
+ mov w12, #64
154
+.loop_var_64_sve2:
155
+ sub w12, w12, #1
156
+ ld1 {v4.16b-v7.16b}, x0, x1
157
+ pixel_var_1 v4
158
+ pixel_var_1 v5
159
+ pixel_var_1 v6
160
+ pixel_var_1 v7
161
+ cbnz w12, .loop_var_64_sve2
162
+ pixel_var_end
163
+ ret
164
+.vl_gt_16_pixel_var_64x64:
165
+ cmp x9, #48
166
+ bgt .vl_gt_48_pixel_var_64x64
167
+ ptrue p0.b, vl32
168
+ mov z0.d, #0
169
+ mov z2.d, #0
170
+.rept 64
171
+ ld1b {z4.b}, p0/z, x0
172
+ ld1b {z5.b}, p0/z, x0, #1, mul vl
173
+ add x0, x0, x1
174
+ uaddwb z0.h, z0.h, z4.b
175
+ uaddwt z0.h, z0.h, z4.b
176
+ uaddwb z0.h, z0.h, z5.b
177
+ uaddwt z0.h, z0.h, z5.b
178
+ umullb z24.h, z4.b, z4.b
179
+ umullt z25.h, z4.b, z4.b
180
+ umullb z26.h, z5.b, z5.b
181
+ umullt z27.h, z5.b, z5.b
182
+ uadalp z2.s, p0/m, z24.h
183
+ uadalp z2.s, p0/m, z25.h
184
+ uadalp z2.s, p0/m, z26.h
185
+ uadalp z2.s, p0/m, z27.h
186
+.endr
187
+ uaddv d0, p0, z0.h
188
+ uaddv d1, p0, z2.s
189
+ fmov w0, s0
190
+ fmov x1, d1
191
+ orr x0, x0, x1, lsl #32
192
+ ret
193
+.vl_gt_48_pixel_var_64x64:
194
+ cmp x9, #112
195
+ bgt .vl_gt_112_pixel_var_64x64
196
+ ptrue p0.b, vl64
197
+ mov z0.d, #0
198
+ mov z1.d, #0
199
+.rept 64
200
+ ld1b {z4.b}, p0/z, x0
201
+ add x0, x0, x1
202
+ uaddwb z0.h, z0.h, z4.b
203
+ uaddwt z0.h, z0.h, z4.b
204
+ umullb z24.h, z4.b, z4.b
205
+ umullt z25.h, z4.b, z4.b
206
+ uadalp z2.s, p0/m, z24.h
207
+ uadalp z2.s, p0/m, z25.h
208
+.endr
209
+ uaddv d0, p0, z0.h
210
+ uaddv d1, p0, z2.s
211
+ fmov w0, s0
212
+ fmov x1, d1
213
+ orr x0, x0, x1, lsl #32
214
+ ret
215
+.vl_gt_112_pixel_var_64x64:
216
+ ptrue p0.h, vl64
217
+ mov z0.d, #0
218
+ mov z1.d, #0
219
+.rept 64
220
+ ld1b {z4.h}, p0/z, x0
221
+ add x0, x0, x1
222
+ add z0.h, z0.h, z4.h
223
+ mul z24.h, z4.h, z4.h
224
+ uadalp z1.s, p0/m, z24.h
225
+.endr
226
+ uaddv d0, p0, z0.h
227
+ uaddv d1, p0, z1.s
228
+ fmov w0, s0
229
+ fmov x1, d1
230
+ orr x0, x0, x1, lsl #32
231
+ ret
232
+endfunc
233
+
234
+function PFX(getResidual16_sve2)
235
+ rdvl x9, #1
236
+ cmp x9, #16
237
+ bgt .vl_gt_16_getResidual16
238
+ lsl x4, x3, #1
239
+.rept 8
240
+ ld1 {v0.16b}, x0, x3
241
+ ld1 {v1.16b}, x1, x3
242
+ ld1 {v2.16b}, x0, x3
243
+ ld1 {v3.16b}, x1, x3
244
+ usubl v4.8h, v0.8b, v1.8b
245
+ usubl2 v5.8h, v0.16b, v1.16b
246
+ usubl v6.8h, v2.8b, v3.8b
247
+ usubl2 v7.8h, v2.16b, v3.16b
248
+ st1 {v4.8h-v5.8h}, x2, x4
249
+ st1 {v6.8h-v7.8h}, x2, x4
250
+.endr
251
+ ret
252
+.vl_gt_16_getResidual16:
253
+ ptrue p0.h, vl16
254
+.rept 16
255
+ ld1b {z0.h}, p0/z, x0
256
+ ld1b {z2.h}, p0/z, x1
257
+ add x0, x0, x3
258
+ add x1, x1, x3
259
+ sub z4.h, z0.h, z2.h
260
+ st1h {z4.h}, p0, x2
261
+ add x2, x2, x3, lsl #1
262
+.endr
263
+ ret
264
+endfunc
265
+
266
+function PFX(getResidual32_sve2)
267
+ rdvl x9, #1
268
+ cmp x9, #16
269
+ bgt .vl_gt_16_getResidual32
270
+ lsl x4, x3, #1
271
+ mov w12, #4
272
+.loop_residual_32:
273
+ sub w12, w12, #1
274
+.rept 4
275
+ ld1 {v0.16b-v1.16b}, x0, x3
276
+ ld1 {v2.16b-v3.16b}, x1, x3
277
+ ld1 {v4.16b-v5.16b}, x0, x3
278
+ ld1 {v6.16b-v7.16b}, x1, x3
279
+ usubl v16.8h, v0.8b, v2.8b
280
+ usubl2 v17.8h, v0.16b, v2.16b
281
+ usubl v18.8h, v1.8b, v3.8b
282
+ usubl2 v19.8h, v1.16b, v3.16b
283
+ usubl v20.8h, v4.8b, v6.8b
284
+ usubl2 v21.8h, v4.16b, v6.16b
285
+ usubl v22.8h, v5.8b, v7.8b
286
+ usubl2 v23.8h, v5.16b, v7.16b
287
+ st1 {v16.8h-v19.8h}, x2, x4
288
+ st1 {v20.8h-v23.8h}, x2, x4
289
+.endr
290
+ cbnz w12, .loop_residual_32
291
+ ret
292
+.vl_gt_16_getResidual32:
293
+ cmp x9, #48
294
+ bgt .vl_gt_48_getResidual32
295
+ ptrue p0.b, vl32
296
+.rept 32
297
+ ld1b {z0.b}, p0/z, x0
298
+ ld1b {z2.b}, p0/z, x1
299
+ add x0, x0, x3
300
+ add x1, x1, x3
301
+ usublb z4.h, z0.b, z2.b
302
+ usublt z5.h, z0.b, z2.b
303
+ st2h {z4.h, z5.h}, p0, x2
304
+ add x2, x2, x3, lsl #1
305
+.endr
306
+ ret
307
+.vl_gt_48_getResidual32:
308
+ ptrue p0.h, vl32
309
+.rept 32
310
+ ld1b {z0.h}, p0/z, x0
311
+ ld1b {z4.h}, p0/z, x1
312
+ add x0, x0, x3
313
+ add x1, x1, x3
314
+ sub z8.h, z0.h, z4.h
315
+ st1h {z8.h}, p0, x2
316
+ add x2, x2, x3, lsl #1
317
+.endr
318
+ ret
319
+endfunc
320
+
321
+function PFX(pixel_sub_ps_32x32_sve2)
322
+ rdvl x9, #1
323
+ cmp x9, #16
324
+ bgt .vl_gt_16_pixel_sub_ps_32x32
325
+ lsl x1, x1, #1
326
+ mov w12, #4
327
+.loop_sub_ps_32_sve2:
328
+ sub w12, w12, #1
329
+.rept 4
330
+ ld1 {v0.16b-v1.16b}, x2, x4
331
+ ld1 {v2.16b-v3.16b}, x3, x5
332
+ ld1 {v4.16b-v5.16b}, x2, x4
333
+ ld1 {v6.16b-v7.16b}, x3, x5
334
+ usubl v16.8h, v0.8b, v2.8b
335
+ usubl2 v17.8h, v0.16b, v2.16b
336
+ usubl v18.8h, v1.8b, v3.8b
337
+ usubl2 v19.8h, v1.16b, v3.16b
338
+ usubl v20.8h, v4.8b, v6.8b
339
+ usubl2 v21.8h, v4.16b, v6.16b
340
+ usubl v22.8h, v5.8b, v7.8b
341
+ usubl2 v23.8h, v5.16b, v7.16b
342
+ st1 {v16.8h-v19.8h}, x0, x1
343
+ st1 {v20.8h-v23.8h}, x0, x1
344
+.endr
345
+ cbnz w12, .loop_sub_ps_32_sve2
346
+ ret
347
+.vl_gt_16_pixel_sub_ps_32x32:
348
+ cmp x9, #48
349
+ bgt .vl_gt_48_pixel_sub_ps_32x32
350
+ ptrue p0.b, vl32
351
+ mov w12, #8
352
+.vl_gt_16_loop_sub_ps_32_sve2:
353
+ sub w12, w12, #1
354
+.rept 4
355
+ ld1b {z0.b}, p0/z, x2
356
+ ld1b {z2.b}, p0/z, x3
357
+ add x2, x2, x4
358
+ add x3, x3, x5
359
+ usublb z16.h, z0.b, z2.b
360
+ usublt z17.h, z0.b, z2.b
361
+ st2h {z16.h, z17.h}, p0, x0
362
+ add x0, x0, x1, lsl #1
363
+.endr
364
+ cbnz w12, .vl_gt_16_loop_sub_ps_32_sve2
365
+ ret
366
+.vl_gt_48_pixel_sub_ps_32x32:
367
+ ptrue p0.h, vl32
368
+ mov w12, #8
369
+.vl_gt_48_loop_sub_ps_32_sve2:
370
+ sub w12, w12, #1
371
+.rept 4
372
+ ld1b {z0.h}, p0/z, x2
373
+ ld1b {z4.h}, p0/z, x3
374
+ add x2, x2, x4
375
+ add x3, x3, x5
376
+ sub z8.h, z0.h, z4.h
377
+ st1h {z8.h}, p0, x0
378
+ add x0, x0, x1, lsl #1
379
+.endr
380
+ cbnz w12, .vl_gt_48_loop_sub_ps_32_sve2
381
+ ret
382
+endfunc
383
+
384
+function PFX(pixel_sub_ps_64x64_sve2)
385
+ rdvl x9, #1
386
+ cmp x9, #16
387
+ bgt .vl_gt_16_pixel_sub_ps_64x64
388
+ lsl x1, x1, #1
389
+ sub x1, x1, #64
390
+ mov w12, #16
391
+.loop_sub_ps_64_sve2:
392
+ sub w12, w12, #1
393
+.rept 4
394
+ ld1 {v0.16b-v3.16b}, x2, x4
395
+ ld1 {v4.16b-v7.16b}, x3, x5
396
+ usubl v16.8h, v0.8b, v4.8b
397
+ usubl2 v17.8h, v0.16b, v4.16b
398
+ usubl v18.8h, v1.8b, v5.8b
399
+ usubl2 v19.8h, v1.16b, v5.16b
400
+ usubl v20.8h, v2.8b, v6.8b
401
+ usubl2 v21.8h, v2.16b, v6.16b
402
+ usubl v22.8h, v3.8b, v7.8b
403
+ usubl2 v23.8h, v3.16b, v7.16b
404
+ st1 {v16.8h-v19.8h}, x0, #64
405
+ st1 {v20.8h-v23.8h}, x0, x1
406
+.endr
407
+ cbnz w12, .loop_sub_ps_64_sve2
408
+ ret
409
+.vl_gt_16_pixel_sub_ps_64x64:
410
+ rdvl x9, #1
411
+ cmp x9, #16
412
+ bgt .vl_gt_16_pixel_sub_ps_64x64
413
+ ptrue p0.b, vl32
414
+ mov w12, #16
415
+.vl_gt_16_loop_sub_ps_64_sve2:
416
+ sub w12, w12, #1
417
+.rept 4
418
+ ld1b {z0.b}, p0/z, x2
419
+ ld1b {z1.b}, p0/z, x2, #1, mul vl
420
+ ld1b {z4.b}, p0/z, x3
421
+ ld1b {z5.b}, p0/z, x3, #1, mul vl
422
+ add x2, x2, x4
423
+ add x3, x3, x5
424
+ usublb z16.h, z0.b, z4.b
425
+ usublt z17.h, z0.b, z4.b
426
+ usublb z18.h, z1.b, z5.b
427
+ usublt z19.h, z1.b, z5.b
428
+ st2h {z16.h, z17.h}, p0, x0
429
+ st2h {z18.h, z19.h}, p0, x0, #2, mul vl
430
+ add x0, x0, x1, lsl #1
431
+.endr
432
+ cbnz w12, .vl_gt_16_loop_sub_ps_64_sve2
433
+ ret
434
+.vl_gt_48_pixel_sub_ps_64x64:
435
+ cmp x9, #112
436
+ bgt .vl_gt_112_pixel_sub_ps_64x64
437
+ ptrue p0.b, vl64
438
+ mov w12, #16
439
+.vl_gt_48_loop_sub_ps_64_sve2:
440
+ sub w12, w12, #1
441
+.rept 4
442
+ ld1b {z0.b}, p0/z, x2
443
+ ld1b {z4.b}, p0/z, x3
444
+ add x2, x2, x4
445
+ add x3, x3, x5
446
+ usublb z16.h, z0.b, z4.b
447
+ usublt z17.h, z0.b, z4.b
448
+ st2h {z16.h, z17.h}, p0, x0
449
+ add x0, x0, x1, lsl #1
450
+.endr
451
+ cbnz w12, .vl_gt_48_loop_sub_ps_64_sve2
452
+ ret
453
+.vl_gt_112_pixel_sub_ps_64x64:
454
+ ptrue p0.h, vl64
455
+ mov w12, #16
456
+.vl_gt_112_loop_sub_ps_64_sve2:
457
+ sub w12, w12, #1
458
+.rept 4
459
+ ld1b {z0.h}, p0/z, x2
460
+ ld1b {z8.h}, p0/z, x3
461
+ add x2, x2, x4
462
+ add x3, x3, x5
463
+ sub z16.h, z0.h, z8.h
464
+ st1h {z16.h}, p0, x0
465
+ add x0, x0, x1, lsl #1
466
+.endr
467
+ cbnz w12, .vl_gt_112_loop_sub_ps_64_sve2
468
+ ret
469
+endfunc
470
+
471
+function PFX(pixel_sub_ps_32x64_sve2)
472
+ rdvl x9, #1
473
+ cmp x9, #16
474
+ bgt .vl_gt_16_pixel_sub_ps_32x64
475
+ lsl x1, x1, #1
476
+ mov w12, #8
477
+.loop_sub_ps_32x64_sve2:
478
+ sub w12, w12, #1
479
+.rept 4
480
+ ld1 {v0.16b-v1.16b}, x2, x4
481
+ ld1 {v2.16b-v3.16b}, x3, x5
482
+ ld1 {v4.16b-v5.16b}, x2, x4
483
+ ld1 {v6.16b-v7.16b}, x3, x5
484
+ usubl v16.8h, v0.8b, v2.8b
485
+ usubl2 v17.8h, v0.16b, v2.16b
486
+ usubl v18.8h, v1.8b, v3.8b
487
+ usubl2 v19.8h, v1.16b, v3.16b
488
+ usubl v20.8h, v4.8b, v6.8b
489
+ usubl2 v21.8h, v4.16b, v6.16b
490
+ usubl v22.8h, v5.8b, v7.8b
491
+ usubl2 v23.8h, v5.16b, v7.16b
492
+ st1 {v16.8h-v19.8h}, x0, x1
493
+ st1 {v20.8h-v23.8h}, x0, x1
494
+.endr
495
+ cbnz w12, .loop_sub_ps_32x64_sve2
496
+ ret
497
+.vl_gt_16_pixel_sub_ps_32x64:
498
+ cmp x9, #48
499
+ bgt .vl_gt_48_pixel_sub_ps_32x64
500
+ ptrue p0.b, vl32
501
+ mov w12, #8
502
+.vl_gt_16_loop_sub_ps_32x64_sve2:
503
+ sub w12, w12, #1
504
+.rept 8
505
+ ld1b {z0.b}, p0/z, x2
506
+ ld1b {z2.b}, p0/z, x3
507
+ add x2, x2, x4
508
+ add x3, x3, x5
509
+ usublb z16.h, z0.b, z2.b
510
+ usublt z17.h, z0.b, z2.b
511
+ st2h {z16.h, z17.h}, p0, x0
512
+ add x0, x0, x1, lsl #1
513
+.endr
514
+ cbnz w12, .vl_gt_16_loop_sub_ps_32x64_sve2
515
+ ret
516
+.vl_gt_48_pixel_sub_ps_32x64:
517
+ ptrue p0.h, vl32
518
+ mov w12, #8
519
+.vl_gt_48_loop_sub_ps_32x64_sve2:
520
+ sub w12, w12, #1
521
+.rept 8
522
+ ld1b {z0.h}, p0/z, x2
523
+ ld1b {z4.h}, p0/z, x3
524
+ add x2, x2, x4
525
+ add x3, x3, x5
526
+ sub z8.h, z0.h, z4.h
527
+ st1h {z8.h}, p0, x0
528
+ add x0, x0, x1, lsl #1
529
+.endr
530
+ cbnz w12, .vl_gt_48_loop_sub_ps_32x64_sve2
531
+ ret
532
+endfunc
533
+
534
+function PFX(pixel_add_ps_4x4_sve2)
535
+ ptrue p0.h, vl8
536
+ ptrue p1.h, vl4
537
+.rept 4
538
+ ld1b {z0.h}, p0/z, x2
539
+ ld1h {z2.h}, p1/z, x3
540
+ add x2, x2, x4
541
+ add x3, x3, x5, lsl #1
542
+ add z4.h, z0.h, z2.h
543
+ sqxtunb z4.b, z4.h
544
+ st1b {z4.h}, p1, x0
545
+ add x0, x0, x1
546
+.endr
547
+ ret
548
+endfunc
549
+
550
+function PFX(pixel_add_ps_8x8_sve2)
551
+ ptrue p0.h, vl8
552
+.rept 8
553
+ ld1b {z0.h}, p0/z, x2
554
+ ld1h {z2.h}, p0/z, x3
555
+ add x2, x2, x4
556
+ add x3, x3, x5, lsl #1
557
+ add z4.h, z0.h, z2.h
558
+ sqxtunb z4.b, z4.h
559
+ st1b {z4.h}, p0, x0
560
+ add x0, x0, x1
561
+.endr
562
+ ret
563
+endfunc
564
+
565
+.macro pixel_add_ps_16xN_sve2 h
566
+function PFX(pixel_add_ps_16x\h\()_sve2)
567
+ rdvl x9, #1
568
+ cmp x9, #16
569
+ bgt .vl_gt_16_pixel_add_ps_16x\h
570
+ ptrue p0.b, vl16
571
+.rept \h
572
+ ld1b {z0.h}, p0/z, x2
573
+ ld1b {z1.h}, p0/z, x2, #1, mul vl
574
+ ld1h {z2.h}, p0/z, x3
575
+ ld1h {z3.h}, p0/z, x3, #1, mul vl
576
+ add x2, x2, x4
577
+ add x3, x3, x5, lsl #1
578
+ add z24.h, z0.h, z2.h
579
+ add z25.h, z1.h, z3.h
580
+ sqxtunb z6.b, z24.h
581
+ sqxtunb z7.b, z25.h
582
+ st1b {z6.h}, p0, x0
583
+ st1b {z7.h}, p0, x0, #1, mul vl
584
+ add x0, x0, x1
585
+.endr
586
+ ret
587
+.vl_gt_16_pixel_add_ps_16x\h\():
588
+ ptrue p0.b, vl32
589
+.rept \h
590
+ ld1b {z0.h}, p0/z, x2
591
+ ld1h {z2.h}, p0/z, x3
592
+ add x2, x2, x4
593
+ add x3, x3, x5, lsl #1
594
+ add z24.h, z0.h, z2.h
595
+ sqxtunb z6.b, z24.h
596
+ st1b {z6.h}, p0, x0
597
+ add x0, x0, x1
598
+.endr
599
+ ret
600
+endfunc
601
+.endm
602
+
603
+pixel_add_ps_16xN_sve2 16
604
+pixel_add_ps_16xN_sve2 32
605
+
606
+.macro pixel_add_ps_32xN_sve2 h
607
+ function PFX(pixel_add_ps_32x\h\()_sve2)
608
+ rdvl x9, #1
609
+ cmp x9, #16
610
+ bgt .vl_gt_16_pixel_add_ps_32x\h
611
+ lsl x5, x5, #1
612
+ mov w12, #\h / 4
613
+.loop_add_ps__sve2_32x\h\():
614
+ sub w12, w12, #1
615
+.rept 4
616
+ ld1 {v0.16b-v1.16b}, x2, x4
617
+ ld1 {v16.8h-v19.8h}, x3, x5
618
+ uxtl v4.8h, v0.8b
619
+ uxtl2 v5.8h, v0.16b
620
+ uxtl v6.8h, v1.8b
621
+ uxtl2 v7.8h, v1.16b
622
+ add v24.8h, v4.8h, v16.8h
623
+ add v25.8h, v5.8h, v17.8h
624
+ add v26.8h, v6.8h, v18.8h
625
+ add v27.8h, v7.8h, v19.8h
626
+ sqxtun v4.8b, v24.8h
627
+ sqxtun2 v4.16b, v25.8h
628
+ sqxtun v5.8b, v26.8h
629
+ sqxtun2 v5.16b, v27.8h
630
+ st1 {v4.16b-v5.16b}, x0, x1
631
+.endr
632
+ cbnz w12, .loop_add_ps__sve2_32x\h
633
+ ret
634
+.vl_gt_16_pixel_add_ps_32x\h\():
635
+ cmp x9, #48
636
+ bgt .vl_gt_48_pixel_add_ps_32x\h
637
+ ptrue p0.b, vl32
638
+.rept \h
639
+ ld1b {z0.h}, p0/z, x2
640
+ ld1b {z1.h}, p0/z, x2, #1, mul vl
641
+ ld1h {z4.h}, p0/z, x3
642
+ ld1h {z5.h}, p0/z, x3, #1, mul vl
643
+ add x2, x2, x4
644
+ add x3, x3, x5, lsl #1
645
+ add z24.h, z0.h, z4.h
646
+ add z25.h, z1.h, z5.h
647
+ sqxtunb z6.b, z24.h
648
+ sqxtunb z7.b, z25.h
649
+ st1b {z6.h}, p0, x0
650
+ st1b {z7.h}, p0, x0, #1, mul vl
651
+ add x0, x0, x1
652
+.endr
653
+ ret
654
+.vl_gt_48_pixel_add_ps_32x\h\():
655
+ ptrue p0.b, vl64
656
+.rept \h
657
+ ld1b {z0.h}, p0/z, x2
658
+ ld1h {z4.h}, p0/z, x3
659
+ add x2, x2, x4
660
+ add x3, x3, x5, lsl #1
661
+ add z24.h, z0.h, z4.h
662
+ sqxtunb z6.b, z24.h
663
+ st1b {z6.h}, p0, x0
664
+ add x0, x0, x1
665
+.endr
666
+ ret
667
+endfunc
668
+.endm
669
+
670
+pixel_add_ps_32xN_sve2 32
671
+pixel_add_ps_32xN_sve2 64
672
+
673
+function PFX(pixel_add_ps_64x64_sve2)
674
+ rdvl x9, #1
675
+ cmp x9, #16
676
+ bgt .vl_gt_16_pixel_add_ps_64x64
677
+ ptrue p0.b, vl16
678
+.rept 64
679
+ ld1b {z0.h}, p0/z, x2
680
+ ld1b {z1.h}, p0/z, x2, #1, mul vl
681
+ ld1b {z2.h}, p0/z, x2, #2, mul vl
682
+ ld1b {z3.h}, p0/z, x2, #3, mul vl
683
+ ld1b {z4.h}, p0/z, x2, #4 ,mul vl
684
+ ld1b {z5.h}, p0/z, x2, #5, mul vl
685
+ ld1b {z6.h}, p0/z, x2, #6, mul vl
686
+ ld1b {z7.h}, p0/z, x2, #7, mul vl
687
+ ld1h {z8.h}, p0/z, x3
688
+ ld1h {z9.h}, p0/z, x3, #1, mul vl
689
+ ld1h {z10.h}, p0/z, x3, #2, mul vl
690
+ ld1h {z11.h}, p0/z, x3, #3, mul vl
691
+ ld1h {z12.h}, p0/z, x3, #4, mul vl
692
+ ld1h {z13.h}, p0/z, x3, #5, mul vl
693
+ ld1h {z14.h}, p0/z, x3, #6, mul vl
694
+ ld1h {z15.h}, p0/z, x3, #7, mul vl
695
+ add x2, x2, x4
696
+ add x3, x3, x5, lsl #1
697
+ add z24.h, z0.h, z8.h
698
+ add z25.h, z1.h, z9.h
699
+ add z26.h, z2.h, z10.h
700
+ add z27.h, z3.h, z11.h
701
+ add z28.h, z4.h, z12.h
702
+ add z29.h, z5.h, z13.h
703
+ add z30.h, z6.h, z14.h
704
+ add z31.h, z7.h, z15.h
705
+ sqxtunb z6.b, z24.h
706
+ sqxtunb z7.b, z25.h
707
+ sqxtunb z8.b, z26.h
708
+ sqxtunb z9.b, z27.h
709
+ sqxtunb z10.b, z28.h
710
+ sqxtunb z11.b, z29.h
711
+ sqxtunb z12.b, z30.h
712
+ sqxtunb z13.b, z31.h
713
+ st1b {z6.h}, p0, x0
714
+ st1b {z7.h}, p0, x0, #1, mul vl
715
+ st1b {z8.h}, p0, x0, #2, mul vl
716
+ st1b {z9.h}, p0, x0, #3, mul vl
717
+ st1b {z10.h}, p0, x0, #4, mul vl
718
+ st1b {z11.h}, p0, x0, #5, mul vl
719
+ st1b {z12.h}, p0, x0, #6, mul vl
720
+ st1b {z13.h}, p0, x0, #7, mul vl
721
+ add x0, x0, x1
722
+.endr
723
+ ret
724
+.vl_gt_16_pixel_add_ps_64x64:
725
+ cmp x9, #48
726
+ bgt .vl_gt_48_pixel_add_ps_64x64
727
+ ptrue p0.b, vl32
728
+.rept 64
729
+ ld1b {z0.h}, p0/z, x2
730
+ ld1b {z1.h}, p0/z, x2, #1, mul vl
731
+ ld1b {z2.h}, p0/z, x2, #2, mul vl
732
+ ld1b {z3.h}, p0/z, x2, #3, mul vl
733
+ ld1h {z8.h}, p0/z, x3
734
+ ld1h {z9.h}, p0/z, x3, #1, mul vl
735
+ ld1h {z10.h}, p0/z, x3, #2, mul vl
736
+ ld1h {z11.h}, p0/z, x3, #3, mul vl
737
+ add x2, x2, x4
738
+ add x3, x3, x5, lsl #1
739
+ add z24.h, z0.h, z8.h
740
+ add z25.h, z1.h, z9.h
741
+ add z26.h, z2.h, z10.h
742
+ add z27.h, z3.h, z11.h
743
+ sqxtunb z6.b, z24.h
744
+ sqxtunb z7.b, z25.h
745
+ sqxtunb z8.b, z26.h
746
+ sqxtunb z9.b, z27.h
747
+ st1b {z6.h}, p0, x0
748
+ st1b {z7.h}, p0, x0, #1, mul vl
749
+ st1b {z8.h}, p0, x0, #2, mul vl
750
+ st1b {z9.h}, p0, x0, #3, mul vl
751
+ add x0, x0, x1
752
+.endr
753
+ ret
754
+.vl_gt_48_pixel_add_ps_64x64:
755
+ cmp x9, #112
756
+ bgt .vl_gt_112_pixel_add_ps_64x64
757
+ ptrue p0.b, vl64
758
+.rept 64
759
+ ld1b {z0.h}, p0/z, x2
760
+ ld1b {z1.h}, p0/z, x2, #1, mul vl
761
+ ld1h {z8.h}, p0/z, x3
762
+ ld1h {z9.h}, p0/z, x3, #1, mul vl
763
+ add x2, x2, x4
764
+ add x3, x3, x5, lsl #1
765
+ add z24.h, z0.h, z8.h
766
+ add z25.h, z1.h, z9.h
767
+ sqxtunb z6.b, z24.h
768
+ sqxtunb z7.b, z25.h
769
+ st1b {z6.h}, p0, x0
770
+ st1b {z7.h}, p0, x0, #1, mul vl
771
+ add x0, x0, x1
772
+.endr
773
+ ret
774
+.vl_gt_112_pixel_add_ps_64x64:
775
+ ptrue p0.b, vl128
776
+.rept 64
777
+ ld1b {z0.h}, p0/z, x2
778
+ ld1h {z8.h}, p0/z, x3
779
+ add x2, x2, x4
780
+ add x3, x3, x5, lsl #1
781
+ add z24.h, z0.h, z8.h
782
+ sqxtunb z6.b, z24.h
783
+ st1b {z6.h}, p0, x0
784
+ add x0, x0, x1
785
+.endr
786
+ ret
787
+endfunc
788
+
789
+// Chroma add_ps
790
+function PFX(pixel_add_ps_4x8_sve2)
791
+ ptrue p0.h,vl4
792
+.rept 8
793
+ ld1b {z0.h}, p0/z, x2
794
+ ld1h {z2.h}, p0/z, x3
795
+ add x2, x2, x4
796
+ add x3, x3, x5, lsl #1
797
+ add z4.h, z0.h, z2.h
798
+ sqxtunb z4.b, z4.h
799
+ st1b {z4.h}, p0, x0
800
+ add x0, x0, x1
801
+.endr
802
+ ret
803
+endfunc
804
+
805
+function PFX(pixel_add_ps_8x16_sve2)
806
+ ptrue p0.h,vl8
807
+.rept 16
808
+ ld1b {z0.h}, p0/z, x2
809
+ ld1h {z2.h}, p0/z, x3
810
+ add x2, x2, x4
811
+ add x3, x3, x5, lsl #1
812
+ add z4.h, z0.h, z2.h
813
+ sqxtunb z4.b, z4.h
814
+ st1b {z4.h}, p0, x0
815
+ add x0, x0, x1
816
+.endr
817
+ ret
818
+endfunc
819
+
820
+// void scale1D_128to64(pixel *dst, const pixel *src)
821
+function PFX(scale1D_128to64_sve2)
822
+ rdvl x9, #1
823
+ cmp x9, #16
824
+ bgt .vl_gt_16_scale1D_128to64
825
+ ptrue p0.b, vl16
826
+.rept 2
827
+ ld2b {z0.b, z1.b}, p0/z, x1
828
+ ld2b {z2.b, z3.b}, p0/z, x1, #2, mul vl
829
+ ld2b {z4.b, z5.b}, p0/z, x1, #4, mul vl
830
+ ld2b {z6.b, z7.b}, p0/z, x1, #6, mul vl
831
+ add x1, x1, #128
832
+ urhadd z0.b, p0/m, z0.b, z1.b
833
+ urhadd z2.b, p0/m, z2.b, z3.b
834
+ urhadd z4.b, p0/m, z4.b, z5.b
835
+ urhadd z6.b, p0/m, z6.b, z7.b
836
+ st1b {z0.b}, p0, x0
837
+ st1b {z2.b}, p0, x0, #1, mul vl
838
+ st1b {z4.b}, p0, x0, #2, mul vl
839
+ st1b {z6.b}, p0, x0, #3, mul vl
840
+ add x0, x0, #64
841
+.endr
842
+ ret
843
+.vl_gt_16_scale1D_128to64:
844
+ cmp x9, #48
845
+ bgt .vl_gt_48_scale1D_128to64
846
+ ptrue p0.b, vl32
847
+.rept 2
848
+ ld2b {z0.b, z1.b}, p0/z, x1
849
+ ld2b {z2.b, z3.b}, p0/z, x1, #2, mul vl
850
+ add x1, x1, #128
851
+ urhadd z0.b, p0/m, z0.b, z1.b
852
+ urhadd z2.b, p0/m, z2.b, z3.b
853
+ st1b {z0.b}, p0, x0
854
+ st1b {z2.b}, p0, x0, #1, mul vl
855
+ add x0, x0, #64
856
+.endr
857
+ ret
858
+.vl_gt_48_scale1D_128to64:
859
+ ptrue p0.b, vl64
860
+.rept 2
861
+ ld2b {z0.b, z1.b}, p0/z, x1
862
+ add x1, x1, #128
863
+ urhadd z0.b, p0/m, z0.b, z1.b
864
+ st1b {z0.b}, p0, x0
865
+ add x0, x0, #64
866
+.endr
867
+ ret
868
+endfunc
869
+
870
+/***** dequant_scaling*****/
871
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
872
+function PFX(dequant_scaling_sve2)
873
+ ptrue p0.h, vl8
874
+ add x5, x5, #4 // shift + 4
875
+ lsr x3, x3, #3 // num / 8
876
+ cmp x5, x4
877
+ blt .dequant_skip_sve2
878
+
879
+ mov x12, #1
880
+ sub x6, x5, x4 // shift - per
881
+ sub x6, x6, #1 // shift - per - 1
882
+ lsl x6, x12, x6 // 1 << shift - per - 1 (add)
883
+ mov z0.s, w6
884
+ sub x7, x4, x5 // per - shift
885
+ mov z3.s, w7
886
+
887
+.dequant_loop1_sve2:
888
+ ld1h {z19.h}, p0/z, x0
889
+ ld1w {z2.s}, p0/z, x1
890
+ add x1, x1, #16
891
+ ld1w {z20.s}, p0/z, x1
892
+ add x0, x0, #16
893
+ add x1, x1, #16
894
+
895
+ sub x3, x3, #1
896
+ sunpklo z1.s, z19.h
897
+ sunpkhi z19.s, z19.h
898
+
899
+ mul z1.s, z1.s, z2.s // quantCoef * deQuantCoef
900
+ mul z19.s, z19.s, z20.s
901
+ add z1.s, z1.s, z0.s // quantCoef * deQuantCoef + add
902
+ add z19.s, z19.s, z0.s
903
+
904
+ // No equivalent instructions in SVE2 for sshl
905
+ // as sqshl has double latency
906
+ sshl v1.4s, v1.4s, v3.4s
907
+ sshl v19.4s, v19.4s, v3.4s
908
+
909
+ sqxtnb z16.h, z1.s
910
+ sqxtnb z17.h, z19.s
911
+ st1h {z16.s}, p0, x2
912
+ st1h {z17.s}, p0, x2, #1, mul vl
913
+ add x2, x2, #16
914
+ cbnz x3, .dequant_loop1_sve2
915
+ ret
916
+
917
+.dequant_skip_sve2:
918
+ sub x6, x4, x5 // per - shift
919
+ mov z0.h, w6
920
+
921
+.dequant_loop2_sve2:
922
+ ld1h {z19.h}, p0/z, x0
923
+ ld1w {z2.s}, p0/z, x1
924
+ add x1, x1, #16
925
+ ld1w {z20.s}, p0/z, x1
926
+ add x0, x0, #16
927
+ add x1, x1, #16
928
+
929
+
930
+ sub x3, x3, #1
931
+ sunpklo z1.s, z19.h
932
+ sunpkhi z19.s, z19.h
933
+
934
+ mul z1.s, z1.s, z2.s // quantCoef * deQuantCoef
935
+ mul z19.s, z19.s, z20.s
936
+
937
+ // Keeping NEON instructions here in order to have
938
+ // one sqshl later
939
+ sqxtn v16.4h, v1.4s // x265_clip3
940
+ sqxtn2 v16.8h, v19.4s
941
+
942
+ sqshl z16.h, p0/m, z16.h, z0.h // coefQ << per - shift
943
+ st1h {z16.h}, p0, x2
944
+ add x2, x2, #16
945
+ cbnz x3, .dequant_loop2_sve2
946
+ ret
947
+endfunc
948
+
949
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
950
+function PFX(dequant_normal_sve2)
951
+ lsr w2, w2, #4 // num / 16
952
+ neg w4, w4
953
+ mov z0.h, w3
954
+ mov z1.s, w4
955
+ rdvl x9, #1
956
+ cmp x9, #16
957
+ bgt .vl_gt_16_dequant_normal_sve2
958
+.dqn_loop1_sve2:
959
+ ld1 {v2.8h, v3.8h}, x0, #32
960
+ smull v16.4s, v2.4h, v0.4h
961
+ smull2 v17.4s, v2.8h, v0.8h
962
+ smull v18.4s, v3.4h, v0.4h
963
+ smull2 v19.4s, v3.8h, v0.8h
964
+
965
+ srshl v16.4s, v16.4s, v1.4s
966
+ srshl v17.4s, v17.4s, v1.4s
967
+ srshl v18.4s, v18.4s, v1.4s
968
+ srshl v19.4s, v19.4s, v1.4s
969
+
970
+ sqxtn v2.4h, v16.4s
971
+ sqxtn2 v2.8h, v17.4s
972
+ sqxtn v3.4h, v18.4s
973
+ sqxtn2 v3.8h, v19.4s
974
+
975
+ sub w2, w2, #1
976
+ st1 {v2.8h, v3.8h}, x1, #32
977
+ cbnz w2, .dqn_loop1_sve2
978
+ ret
979
+.vl_gt_16_dequant_normal_sve2:
980
+ ptrue p0.h, vl16
981
+.gt_16_dqn_loop1_sve2:
982
+ ld1h {z2.h}, p0/z, x0
983
+ add x0, x0, #32
984
+ smullb z16.s, z2.h, z0.h
985
+ smullt z17.s, z2.h, z0.h
986
+
987
+ srshl z16.s, p0/m, z16.s, z1.s
988
+ srshl z17.s, p0/m, z17.s, z1.s
989
+
990
+ sqxtnb z2.h, z16.s
991
+ sqxtnt z2.h, z17.s
992
+
993
+ sub w2, w2, #1
994
+ st1h {z2.h}, p0, x1
995
+ add x1, x1, #32
996
+ cbnz w2, .gt_16_dqn_loop1_sve2
997
+ ret
998
+
999
+endfunc
1000
+
1001
+// void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24)
1002
+function PFX(ssim_4x4x2_core_sve2)
1003
+ ptrue p0.b, vl16
1004
+ movi v30.2d, #0
1005
+ movi v31.2d, #0
1006
+
1007
+ ld1b {z0.h}, p0/z, x0
1008
+ add x0, x0, x1
1009
+ ld1b {z1.h}, p0/z, x0
1010
+ add x0, x0, x1
1011
+ ld1b {z2.h}, p0/z, x0
1012
+ add x0, x0, x1
1013
+ ld1b {z3.h}, p0/z, x0
1014
+ add x0, x0, x1
1015
+
1016
+ ld1b {z4.h}, p0/z, x2
1017
+ add x2, x2, x3
1018
+ ld1b {z5.h}, p0/z, x2
1019
+ add x2, x2, x3
1020
+ ld1b {z6.h}, p0/z, x2
1021
+ add x2, x2, x3
1022
+ ld1b {z7.h}, p0/z, x2
1023
+ add x2, x2, x3
1024
+
1025
+ mul z16.h, z0.h, z0.h
1026
+ mul z17.h, z1.h, z1.h
1027
+ mul z18.h, z2.h, z2.h
1028
+ uaddlp v30.4s, v16.8h
1029
+
1030
+ mul z19.h, z3.h, z3.h
1031
+ mul z20.h, z4.h, z4.h
1032
+ mul z21.h, z5.h, z5.h
1033
+ uadalp v30.4s, v17.8h
1034
+
1035
+ mul z22.h, z6.h, z6.h
1036
+ mul z23.h, z7.h, z7.h
1037
+ mul z24.h, z0.h, z4.h
1038
+ uadalp v30.4s, v18.8h
1039
+
1040
+ mul z25.h, z1.h, z5.h
1041
+ mul z26.h, z2.h, z6.h
1042
+ mul z27.h, z3.h, z7.h
1043
+ uadalp v30.4s, v19.8h
1044
+
1045
+ add z28.h, z0.h, z1.h
1046
+ add z29.h, z4.h, z5.h
1047
+ uadalp v30.4s, v20.8h
1048
+ uaddlp v31.4s, v24.8h
1049
+
1050
+ add z28.h, z28.h, z2.h
1051
+ add z29.h, z29.h, z6.h
1052
+ uadalp v30.4s, v21.8h
1053
+ uadalp v31.4s, v25.8h
1054
+
1055
+ add z28.h, z28.h, z3.h
1056
+ add z29.h, z29.h, z7.h
1057
+ uadalp v30.4s, v22.8h
1058
+ uadalp v31.4s, v26.8h
1059
+
1060
+ // Better use NEON instructions here
1061
+ uaddlp v28.4s, v28.8h
1062
+ uaddlp v29.4s, v29.8h
1063
+ uadalp v30.4s, v23.8h
1064
+ uadalp v31.4s, v27.8h
1065
+
1066
+ addp v28.4s, v28.4s, v28.4s
1067
+ addp v29.4s, v29.4s, v29.4s
1068
+ addp v30.4s, v30.4s, v30.4s
1069
+ addp v31.4s, v31.4s, v31.4s
1070
+
1071
+ st4 {v28.2s, v29.2s, v30.2s, v31.2s}, x4
1072
+ ret
1073
+endfunc
1074
+
1075
+// void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
1076
+.macro ssimDist_start_sve2
1077
+ mov z0.d, #0
1078
+ mov z1.d, #0
1079
+.endm
1080
+
1081
+.macro ssimDist_1_sve2 z0 z1 z2 z3
1082
+ sub z16.s, \z0\().s, \z2\().s
1083
+ sub z17.s, \z1\().s, \z3\().s
1084
+ mul z18.s, \z0\().s, \z0\().s
1085
+ mul z19.s, \z1\().s, \z1\().s
1086
+ mul z20.s, z16.s, z16.s
1087
+ mul z21.s, z17.s, z17.s
1088
+ add z0.s, z0.s, z18.s
1089
+ add z0.s, z0.s, z19.s
1090
+ add z1.s, z1.s, z20.s
1091
+ add z1.s, z1.s, z21.s
1092
+.endm
1093
+
1094
+.macro ssimDist_end_sve2
1095
+ uaddv d0, p0, z0.s
1096
+ uaddv d1, p0, z1.s
1097
+ str d0, x6
1098
+ str d1, x4
1099
+.endm
1100
+
1101
+function PFX(ssimDist4_sve2)
1102
+ ssimDist_start
1103
+ ptrue p0.s, vl4
1104
+.rept 4
1105
+ ld1b {z4.s}, p0/z, x0
1106
+ add x0, x0, x1
1107
+ ld1b {z5.s}, p0/z, x2
1108
+ add x2, x2, x3
1109
+ sub z2.s, z4.s, z5.s
1110
+ mul z3.s, z4.s, z4.s
1111
+ mul z2.s, z2.s, z2.s
1112
+ add z0.s, z0.s, z3.s
1113
+ add z1.s, z1.s, z2.s
1114
+.endr
1115
+ ssimDist_end
1116
+ ret
1117
+endfunc
1118
+
1119
+function PFX(ssimDist8_sve2)
1120
+ rdvl x9, #1
1121
+ cmp x9, #16
1122
+ bgt .vl_gt_16_ssimDist8
1123
+ ssimDist_start
1124
+ ptrue p0.s, vl4
1125
+.rept 8
1126
+ ld1b {z4.s}, p0/z, x0
1127
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1128
+ add x0, x0, x1
1129
+ ld1b {z6.s}, p0/z, x2
1130
+ ld1b {z7.s}, p0/z, x2, #1, mul vl
1131
+ add x2, x2, x3
1132
+ ssimDist_1_sve2 z4, z5, z6, z7
1133
+.endr
1134
+ ssimDist_end
1135
+ ret
1136
+.vl_gt_16_ssimDist8:
1137
+ ssimDist_start_sve2
1138
+ ptrue p0.s, vl8
1139
+.rept 8
1140
+ ld1b {z4.s}, p0/z, x0
1141
+ add x0, x0, x1
1142
+ ld1b {z6.s}, p0/z, x2
1143
+ add x2, x2, x3
1144
+ sub z20.s, z4.s, z6.s
1145
+ mul z16.s, z4.s, z4.s
1146
+ mul z18.s, z20.s, z20.s
1147
+ add z0.s, z0.s, z16.s
1148
+ add z1.s, z1.s, z18.s
1149
+.endr
1150
+ ssimDist_end_sve2
1151
+ ret
1152
+endfunc
1153
+
1154
+function PFX(ssimDist16_sve2)
1155
+ mov w12, #16
1156
+ rdvl x9, #1
1157
+ cmp x9, #16
1158
+ bgt .vl_gt_16_ssimDist16
1159
+ ssimDist_start
1160
+ ptrue p0.s, vl4
1161
+.loop_ssimDist16_sve2:
1162
+ sub w12, w12, #1
1163
+ ld1b {z4.s}, p0/z, x0
1164
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1165
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1166
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1167
+ add x0, x0, x1
1168
+ ld1b {z8.s}, p0/z, x2
1169
+ ld1b {z9.s}, p0/z, x2, #1, mul vl
1170
+ ld1b {z10.s}, p0/z, x2, #2, mul vl
1171
+ ld1b {z11.s}, p0/z, x2, #3, mul vl
1172
+ add x2, x2, x3
1173
+ ssimDist_1_sve2 z4, z5, z8, z9
1174
+ ssimDist_1_sve2 z6, z7, z10, z11
1175
+ cbnz w12, .loop_ssimDist16_sve2
1176
+ ssimDist_end
1177
+ ret
1178
+.vl_gt_16_ssimDist16:
1179
+ cmp x9, #48
1180
+ bgt .vl_gt_48_ssimDist16
1181
+ ssimDist_start_sve2
1182
+ ptrue p0.s, vl8
1183
+.vl_gt_16_loop_ssimDist16_sve2:
1184
+ sub w12, w12, #1
1185
+ ld1b {z4.s}, p0/z, x0
1186
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1187
+ add x0, x0, x1
1188
+ ld1b {z8.s}, p0/z, x2
1189
+ ld1b {z9.s}, p0/z, x2, #1, mul vl
1190
+ add x2, x2, x3
1191
+ ssimDist_1_sve2 z4, z5, z8, z9
1192
+ cbnz w12, .vl_gt_16_loop_ssimDist16_sve2
1193
+ ssimDist_end_sve2
1194
+ ret
1195
+.vl_gt_48_ssimDist16:
1196
+ ssimDist_start_sve2
1197
+ ptrue p0.s, vl16
1198
+.vl_gt_48_loop_ssimDist16_sve2:
1199
+ sub w12, w12, #1
1200
+ ld1b {z4.s}, p0/z, x0
1201
+ add x0, x0, x1
1202
+ ld1b {z8.s}, p0/z, x2
1203
+ add x2, x2, x3
1204
+ sub z20.s, z4.s, z8.s
1205
+ mul z16.s, z4.s, z4.s
1206
+ mul z18.s, z20.s, z20.s
1207
+ add z0.s, z0.s, z16.s
1208
+ add z1.s, z1.s, z18.s
1209
+ cbnz w12, .vl_gt_48_loop_ssimDist16_sve2
1210
+ ssimDist_end_sve2
1211
+ ret
1212
+endfunc
1213
+
1214
+function PFX(ssimDist32_sve2)
1215
+ mov w12, #32
1216
+ rdvl x9, #1
1217
+ cmp x9, #16
1218
+ bgt .vl_gt_16_ssimDist32
1219
+ ssimDist_start
1220
+ ptrue p0.s, vl4
1221
+.loop_ssimDist32_sve2:
1222
+ sub w12, w12, #1
1223
+ ld1b {z2.s}, p0/z, x0
1224
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1225
+ ld1b {z4.s}, p0/z, x0, #2, mul vl
1226
+ ld1b {z5.s}, p0/z, x0, #3, mul vl
1227
+ ld1b {z6.s}, p0/z, x0, #4, mul vl
1228
+ ld1b {z7.s}, p0/z, x0, #5, mul vl
1229
+ ld1b {z8.s}, p0/z, x0, #6, mul vl
1230
+ ld1b {z9.s}, p0/z, x0, #7, mul vl
1231
+ add x0, x0, x1
1232
+ ld1b {z10.s}, p0/z, x2
1233
+ ld1b {z11.s}, p0/z, x2, #1, mul vl
1234
+ ld1b {z12.s}, p0/z, x2, #2, mul vl
1235
+ ld1b {z13.s}, p0/z, x2, #3, mul vl
1236
+ ld1b {z14.s}, p0/z, x2, #4, mul vl
1237
+ ld1b {z15.s}, p0/z, x2, #5, mul vl
1238
+ ld1b {z30.s}, p0/z, x2, #6, mul vl
1239
+ ld1b {z31.s}, p0/z, x2, #7, mul vl
1240
+ add x2, x2, x3
1241
+ ssimDist_1_sve2 z2, z3, z10, z11
1242
+ ssimDist_1_sve2 z4, z5, z12, z13
1243
+ ssimDist_1_sve2 z6, z7, z14, z15
1244
+ ssimDist_1_sve2 z8, z9, z30, z31
1245
+ cbnz w12, .loop_ssimDist32_sve2
1246
+ ssimDist_end
1247
+ ret
1248
+.vl_gt_16_ssimDist32:
1249
+ cmp x9, #48
1250
+ bgt .vl_gt_48_ssimDist32
1251
+ ssimDist_start_sve2
1252
+ ptrue p0.s, vl8
1253
+.vl_gt_16_loop_ssimDist32_sve2:
1254
+ sub w12, w12, #1
1255
+ ld1b {z2.s}, p0/z, x0
1256
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1257
+ ld1b {z4.s}, p0/z, x0, #2, mul vl
1258
+ ld1b {z5.s}, p0/z, x0, #3, mul vl
1259
+ add x0, x0, x1
1260
+ ld1b {z10.s}, p0/z, x2
1261
+ ld1b {z11.s}, p0/z, x2, #1, mul vl
1262
+ ld1b {z12.s}, p0/z, x2, #2, mul vl
1263
+ ld1b {z13.s}, p0/z, x2, #3, mul vl
1264
+ add x2, x2, x3
1265
+ ssimDist_1_sve2 z2, z3, z10, z11
1266
+ ssimDist_1_sve2 z4, z5, z12, z13
1267
+ cbnz w12, .vl_gt_16_loop_ssimDist32_sve2
1268
+ ssimDist_end_sve2
1269
+ ret
1270
+.vl_gt_48_ssimDist32:
1271
+ cmp x9, #112
1272
+ bgt .vl_gt_112_ssimDist32
1273
+ ssimDist_start_sve2
1274
+ ptrue p0.s, vl16
1275
+.vl_gt_48_loop_ssimDist32_sve2:
1276
+ sub w12, w12, #1
1277
+ ld1b {z2.s}, p0/z, x0
1278
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1279
+ add x0, x0, x1
1280
+ ld1b {z10.s}, p0/z, x2
1281
+ ld1b {z11.s}, p0/z, x2, #1, mul vl
1282
+ add x2, x2, x3
1283
+ ssimDist_1_sve2 z2, z3, z10, z11
1284
+ cbnz w12, .vl_gt_48_loop_ssimDist32_sve2
1285
+ ssimDist_end_sve2
1286
+ ret
1287
+.vl_gt_112_ssimDist32:
1288
+ ssimDist_start_sve2
1289
+ ptrue p0.s, vl32
1290
+.vl_gt_112_loop_ssimDist32_sve2:
1291
+ sub w12, w12, #1
1292
+ ld1b {z2.s}, p0/z, x0
1293
+ add x0, x0, x1
1294
+ ld1b {z10.s}, p0/z, x2
1295
+ add x2, x2, x3
1296
+ sub z20.s, z2.s, z10.s
1297
+ mul z16.s, z2.s, z2.s
1298
+ mul z18.s, z20.s, z20.s
1299
+ add z0.s, z0.s, z16.s
1300
+ add z1.s, z1.s, z18.s
1301
+ cbnz w12, .vl_gt_112_loop_ssimDist32_sve2
1302
+ ssimDist_end_sve2
1303
+ ret
1304
+endfunc
1305
+
1306
+function PFX(ssimDist64_sve2)
1307
+ mov w12, #64
1308
+ rdvl x9, #1
1309
+ cmp x9, #16
1310
+ bgt .vl_gt_16_ssimDist64
1311
+ ssimDist_start
1312
+ ptrue p0.s, vl4
1313
+.loop_ssimDist64_sve2:
1314
+ sub w12, w12, #1
1315
+ ld1b {z2.s}, p0/z, x0
1316
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1317
+ ld1b {z4.s}, p0/z, x0, #2, mul vl
1318
+ ld1b {z5.s}, p0/z, x0, #3, mul vl
1319
+ ld1b {z6.s}, p0/z, x0, #4, mul vl
1320
+ ld1b {z7.s}, p0/z, x0, #5, mul vl
1321
+ ld1b {z8.s}, p0/z, x0, #6, mul vl
1322
+ ld1b {z9.s}, p0/z, x0, #7, mul vl
1323
+ ld1b {z23.s}, p0/z, x2
1324
+ ld1b {z24.s}, p0/z, x2, #1, mul vl
1325
+ ld1b {z25.s}, p0/z, x2, #2, mul vl
1326
+ ld1b {z26.s}, p0/z, x2, #3, mul vl
1327
+ ld1b {z27.s}, p0/z, x2, #4, mul vl
1328
+ ld1b {z28.s}, p0/z, x2, #5, mul vl
1329
+ ld1b {z29.s}, p0/z, x2, #6, mul vl
1330
+ ld1b {z30.s}, p0/z, x2, #7, mul vl
1331
+ ssimDist_1_sve2 z2, z3, z23, z24
1332
+ ssimDist_1_sve2 z4, z5, z25, z26
1333
+ ssimDist_1_sve2 z6, z7, z27, z28
1334
+ ssimDist_1_sve2 z8, z9, z29, z30
1335
+ mov x4, x0
1336
+ mov x5, x2
1337
+ add x4, x4, #32
1338
+ add x5, x5, #32
1339
+ ld1b {z2.s}, p0/z, x4
1340
+ ld1b {z3.s}, p0/z, x4, #1, mul vl
1341
+ ld1b {z4.s}, p0/z, x4, #2, mul vl
1342
+ ld1b {z5.s}, p0/z, x4, #3, mul vl
1343
+ ld1b {z6.s}, p0/z, x4, #4, mul vl
1344
+ ld1b {z7.s}, p0/z, x4, #5, mul vl
1345
+ ld1b {z8.s}, p0/z, x4, #6, mul vl
1346
+ ld1b {z9.s}, p0/z, x4, #7, mul vl
1347
+ ld1b {z23.s}, p0/z, x5
1348
+ ld1b {z24.s}, p0/z, x5, #1, mul vl
1349
+ ld1b {z25.s}, p0/z, x5, #2, mul vl
1350
+ ld1b {z26.s}, p0/z, x5, #3, mul vl
1351
+ ld1b {z27.s}, p0/z, x5, #4, mul vl
1352
+ ld1b {z28.s}, p0/z, x5, #5, mul vl
1353
+ ld1b {z29.s}, p0/z, x5, #6, mul vl
1354
+ ld1b {z30.s}, p0/z, x5, #7, mul vl
1355
+ ssimDist_1_sve2 z2, z3, z23, z24
1356
+ ssimDist_1_sve2 z4, z5, z25, z26
1357
+ ssimDist_1_sve2 z6, z7, z27, z28
1358
+ ssimDist_1_sve2 z8, z9, z29, z30
1359
+ add x0, x0, x1
1360
+ add x2, x2, x3
1361
+ cbnz w12, .loop_ssimDist64_sve2
1362
+ ssimDist_end
1363
+ ret
1364
+.vl_gt_16_ssimDist64:
1365
+ cmp x9, #48
1366
+ bgt .vl_gt_48_ssimDist64
1367
+ ssimDist_start_sve2
1368
+ ptrue p0.s, vl8
1369
+.vl_gt_16_loop_ssimDist64_sve2:
1370
+ sub w12, w12, #1
1371
+ ld1b {z2.s}, p0/z, x0
1372
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1373
+ ld1b {z4.s}, p0/z, x0, #2, mul vl
1374
+ ld1b {z5.s}, p0/z, x0, #3, mul vl
1375
+ ld1b {z6.s}, p0/z, x0, #4, mul vl
1376
+ ld1b {z7.s}, p0/z, x0, #5, mul vl
1377
+ ld1b {z8.s}, p0/z, x0, #6, mul vl
1378
+ ld1b {z9.s}, p0/z, x0, #7, mul vl
1379
+ ld1b {z23.s}, p0/z, x2
1380
+ ld1b {z24.s}, p0/z, x2, #1, mul vl
1381
+ ld1b {z25.s}, p0/z, x2, #2, mul vl
1382
+ ld1b {z26.s}, p0/z, x2, #3, mul vl
1383
+ ld1b {z27.s}, p0/z, x2, #4, mul vl
1384
+ ld1b {z28.s}, p0/z, x2, #5, mul vl
1385
+ ld1b {z29.s}, p0/z, x2, #6, mul vl
1386
+ ld1b {z30.s}, p0/z, x2, #7, mul vl
1387
+ ssimDist_1_sve2 z2, z3, z23, z24
1388
+ ssimDist_1_sve2 z4, z5, z25, z26
1389
+ ssimDist_1_sve2 z6, z7, z27, z28
1390
+ ssimDist_1_sve2 z8, z9, z29, z30
1391
+ add x0, x0, x1
1392
+ add x2, x2, x3
1393
+ cbnz w12, .vl_gt_16_loop_ssimDist64_sve2
1394
+ ssimDist_end_sve2
1395
+ ret
1396
+.vl_gt_48_ssimDist64:
1397
+ cmp x9, #112
1398
+ bgt .vl_gt_112_ssimDist64
1399
+ ssimDist_start_sve2
1400
+ ptrue p0.s, vl16
1401
+.vl_gt_48_loop_ssimDist64_sve2:
1402
+ sub w12, w12, #1
1403
+ ld1b {z2.s}, p0/z, x0
1404
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1405
+ ld1b {z4.s}, p0/z, x0, #2, mul vl
1406
+ ld1b {z5.s}, p0/z, x0, #3, mul vl
1407
+ ld1b {z23.s}, p0/z, x2
1408
+ ld1b {z24.s}, p0/z, x2, #1, mul vl
1409
+ ld1b {z25.s}, p0/z, x2, #2, mul vl
1410
+ ld1b {z26.s}, p0/z, x2, #3, mul vl
1411
+ ssimDist_1_sve2 z2, z3, z23, z24
1412
+ ssimDist_1_sve2 z4, z5, z25, z26
1413
+ add x0, x0, x1
1414
+ add x2, x2, x3
1415
+ cbnz w12, .vl_gt_48_loop_ssimDist64_sve2
1416
+ ssimDist_end_sve2
1417
+ ret
1418
+.vl_gt_112_ssimDist64:
1419
+ ssimDist_start_sve2
1420
+ ptrue p0.s, vl32
1421
+.vl_gt_112_loop_ssimDist64_sve2:
1422
+ sub w12, w12, #1
1423
+ ld1b {z2.s}, p0/z, x0
1424
+ ld1b {z3.s}, p0/z, x0, #1, mul vl
1425
+ ld1b {z23.s}, p0/z, x2
1426
+ ld1b {z24.s}, p0/z, x2, #1, mul vl
1427
+ ssimDist_1_sve2 z2, z3, z23, z24
1428
+ add x0, x0, x1
1429
+ add x2, x2, x3
1430
+ cbnz w12, .vl_gt_112_loop_ssimDist64_sve2
1431
+ ssimDist_end_sve2
1432
+ ret
1433
+endfunc
1434
+
1435
+// void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
1436
+.macro normFact_start_sve2
1437
+ mov z0.d, #0
1438
+.endm
1439
+
1440
+.macro normFact_1_sve2 z0, z1
1441
+ mul z16.s, \z0\().s, \z0\().s
1442
+ mul z17.s, \z1\().s, \z1\().s
1443
+ add z0.s, z0.s, z16.s
1444
+ add z0.s, z0.s, z17.s
1445
+.endm
1446
+
1447
+.macro normFact_end_sve2
1448
+ uaddv d0, p0, z0.s
1449
+ str d0, x3
1450
+.endm
1451
+
1452
+function PFX(normFact8_sve2)
1453
+ rdvl x9, #1
1454
+ cmp x9, #16
1455
+ bgt .vl_gt_16_normFact8
1456
+ normFact_start
1457
+ ptrue p0.s, vl4
1458
+.rept 8
1459
+ ld1b {z4.s}, p0/z, x0
1460
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1461
+ add x0, x0, x1
1462
+ normFact_1_sve2 z4, z5
1463
+.endr
1464
+ normFact_end
1465
+ ret
1466
+.vl_gt_16_normFact8:
1467
+ normFact_start_sve2
1468
+ ptrue p0.s, vl8
1469
+.rept 8
1470
+ ld1b {z4.s}, p0/z, x0
1471
+ add x0, x0, x1
1472
+ mul z16.s, z4.s, z4.s
1473
+ add z0.s, z0.s, z16.s
1474
+.endr
1475
+ normFact_end_sve2
1476
+ ret
1477
+endfunc
1478
+
1479
+function PFX(normFact16_sve2)
1480
+ mov w12, #16
1481
+ rdvl x9, #1
1482
+ cmp x9, #16
1483
+ bgt .vl_gt_16_normFact16
1484
+ normFact_start
1485
+ ptrue p0.s, vl4
1486
+.loop_normFact16_sve2:
1487
+ sub w12, w12, #1
1488
+ ld1b {z4.s}, p0/z, x0
1489
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1490
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1491
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1492
+ add x0, x0, x1
1493
+ normFact_1_sve2 z4, z5
1494
+ normFact_1_sve2 z6, z7
1495
+ cbnz w12, .loop_normFact16_sve2
1496
+ normFact_end
1497
+ ret
1498
+.vl_gt_16_normFact16:
1499
+ cmp x9, #48
1500
+ bgt .vl_gt_48_normFact16
1501
+ normFact_start_sve2
1502
+ ptrue p0.s, vl8
1503
+.vl_gt_16_loop_normFact16_sve2:
1504
+ sub w12, w12, #1
1505
+ ld1b {z4.s}, p0/z, x0
1506
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1507
+ add x0, x0, x1
1508
+ normFact_1_sve2 z4, z5
1509
+ cbnz w12, .vl_gt_16_loop_normFact16_sve2
1510
+ normFact_end_sve2
1511
+ ret
1512
+.vl_gt_48_normFact16:
1513
+ normFact_start_sve2
1514
+ ptrue p0.s, vl16
1515
+.vl_gt_48_loop_normFact16_sve2:
1516
+ sub w12, w12, #1
1517
+ ld1b {z4.s}, p0/z, x0
1518
+ add x0, x0, x1
1519
+ mul z16.s, z4.s, z4.s
1520
+ add z0.s, z0.s, z16.s
1521
+ cbnz w12, .vl_gt_48_loop_normFact16_sve2
1522
+ normFact_end_sve2
1523
+ ret
1524
+endfunc
1525
+
1526
+function PFX(normFact32_sve2)
1527
+ mov w12, #32
1528
+ rdvl x9, #1
1529
+ cmp x9, #16
1530
+ bgt .vl_gt_16_normFact32
1531
+ normFact_start
1532
+ ptrue p0.s, vl4
1533
+.loop_normFact32_sve2:
1534
+ sub w12, w12, #1
1535
+ ld1b {z4.s}, p0/z, x0
1536
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1537
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1538
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1539
+ ld1b {z8.s}, p0/z, x0, #4, mul vl
1540
+ ld1b {z9.s}, p0/z, x0, #5, mul vl
1541
+ ld1b {z10.s}, p0/z, x0, #6, mul vl
1542
+ ld1b {z11.s}, p0/z, x0, #7, mul vl
1543
+ add x0, x0, x1
1544
+ normFact_1_sve2 z4, z5
1545
+ normFact_1_sve2 z6, z7
1546
+ normFact_1_sve2 z8, z9
1547
+ normFact_1_sve2 z10, z11
1548
+ cbnz w12, .loop_normFact32_sve2
1549
+ normFact_end
1550
+ ret
1551
+.vl_gt_16_normFact32:
1552
+ cmp x9, #48
1553
+ bgt .vl_gt_48_normFact32
1554
+ normFact_start_sve2
1555
+ ptrue p0.s, vl8
1556
+.vl_gt_16_loop_normFact32_sve2:
1557
+ sub w12, w12, #1
1558
+ ld1b {z4.s}, p0/z, x0
1559
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1560
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1561
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1562
+ add x0, x0, x1
1563
+ normFact_1_sve2 z4, z5
1564
+ normFact_1_sve2 z6, z7
1565
+ cbnz w12, .vl_gt_16_loop_normFact32_sve2
1566
+ normFact_end_sve2
1567
+ ret
1568
+.vl_gt_48_normFact32:
1569
+ cmp x9, #112
1570
+ bgt .vl_gt_112_normFact32
1571
+ normFact_start_sve2
1572
+ ptrue p0.s, vl16
1573
+.vl_gt_48_loop_normFact32_sve2:
1574
+ sub w12, w12, #1
1575
+ ld1b {z4.s}, p0/z, x0
1576
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1577
+ add x0, x0, x1
1578
+ normFact_1_sve2 z4, z5
1579
+ cbnz w12, .vl_gt_48_loop_normFact32_sve2
1580
+ normFact_end_sve2
1581
+ ret
1582
+.vl_gt_112_normFact32:
1583
+ normFact_start_sve2
1584
+ ptrue p0.s, vl32
1585
+.vl_gt_112_loop_normFact32_sve2:
1586
+ sub w12, w12, #1
1587
+ ld1b {z4.s}, p0/z, x0
1588
+ add x0, x0, x1
1589
+ mul z16.s, z4.s, z4.s
1590
+ add z0.s, z0.s, z16.s
1591
+ cbnz w12, .vl_gt_112_loop_normFact32_sve2
1592
+ normFact_end_sve2
1593
+ ret
1594
+endfunc
1595
+
1596
+function PFX(normFact64_sve2)
1597
+ mov w12, #64
1598
+ rdvl x9, #1
1599
+ cmp x9, #16
1600
+ bgt .vl_gt_16_normFact64
1601
+ normFact_start
1602
+ ptrue p0.s, vl4
1603
+.loop_normFact64_sve2:
1604
+ sub w12, w12, #1
1605
+ ld1b {z4.s}, p0/z, x0
1606
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1607
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1608
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1609
+ ld1b {z8.s}, p0/z, x0, #4, mul vl
1610
+ ld1b {z9.s}, p0/z, x0, #5, mul vl
1611
+ ld1b {z10.s}, p0/z, x0, #6, mul vl
1612
+ ld1b {z11.s}, p0/z, x0, #7, mul vl
1613
+ normFact_1_sve2 z4, z5
1614
+ normFact_1_sve2 z6, z7
1615
+ normFact_1_sve2 z8, z9
1616
+ normFact_1_sve2 z10, z11
1617
+ mov x2, x0
1618
+ add x2, x2, #32
1619
+ ld1b {z4.s}, p0/z, x2
1620
+ ld1b {z5.s}, p0/z, x2, #1, mul vl
1621
+ ld1b {z6.s}, p0/z, x2, #2, mul vl
1622
+ ld1b {z7.s}, p0/z, x2, #3, mul vl
1623
+ ld1b {z8.s}, p0/z, x2, #4, mul vl
1624
+ ld1b {z9.s}, p0/z, x2, #5, mul vl
1625
+ ld1b {z10.s}, p0/z, x2, #6, mul vl
1626
+ ld1b {z11.s}, p0/z, x2, #7, mul vl
1627
+ normFact_1_sve2 z4, z5
1628
+ normFact_1_sve2 z6, z7
1629
+ normFact_1_sve2 z8, z9
1630
+ normFact_1_sve2 z10, z11
1631
+ add x0, x0, x1
1632
+ cbnz w12, .loop_normFact64_sve2
1633
+ normFact_end
1634
+ ret
1635
+.vl_gt_16_normFact64:
1636
+ cmp x9, #48
1637
+ bgt .vl_gt_48_normFact64
1638
+ normFact_start_sve2
1639
+ ptrue p0.s, vl8
1640
+.vl_gt_16_loop_normFact64_sve2:
1641
+ sub w12, w12, #1
1642
+ ld1b {z4.s}, p0/z, x0
1643
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1644
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1645
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1646
+ ld1b {z8.s}, p0/z, x0, #4, mul vl
1647
+ ld1b {z9.s}, p0/z, x0, #5, mul vl
1648
+ ld1b {z10.s}, p0/z, x0, #6, mul vl
1649
+ ld1b {z11.s}, p0/z, x0, #7, mul vl
1650
+ normFact_1_sve2 z4, z5
1651
+ normFact_1_sve2 z6, z7
1652
+ normFact_1_sve2 z8, z9
1653
+ normFact_1_sve2 z10, z11
1654
+ add x0, x0, x1
1655
+ cbnz w12, .vl_gt_16_loop_normFact64_sve2
1656
+ normFact_end_sve2
1657
+ ret
1658
+.vl_gt_48_normFact64:
1659
+ cmp x9, #112
1660
+ bgt .vl_gt_112_normFact64
1661
+ normFact_start_sve2
1662
+ ptrue p0.s, vl16
1663
+.vl_gt_48_loop_normFact64_sve2:
1664
+ sub w12, w12, #1
1665
+ ld1b {z4.s}, p0/z, x0
1666
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1667
+ ld1b {z6.s}, p0/z, x0, #2, mul vl
1668
+ ld1b {z7.s}, p0/z, x0, #3, mul vl
1669
+ normFact_1_sve2 z4, z5
1670
+ normFact_1_sve2 z6, z7
1671
+ add x0, x0, x1
1672
+ cbnz w12, .vl_gt_48_loop_normFact64_sve2
1673
+ normFact_end_sve2
1674
+ ret
1675
+.vl_gt_112_normFact64:
1676
+ normFact_start_sve2
1677
+ ptrue p0.s, vl32
1678
+.vl_gt_112_loop_normFact64_sve2:
1679
+ sub w12, w12, #1
1680
+ ld1b {z4.s}, p0/z, x0
1681
+ ld1b {z5.s}, p0/z, x0, #1, mul vl
1682
+ normFact_1_sve2 z4, z5
1683
+ add x0, x0, x1
1684
+ cbnz w12, .vl_gt_112_loop_normFact64_sve2
1685
+ normFact_end_sve2
1686
+ ret
1687
+endfunc
1688
x265_3.5.tar.gz/source/common/aarch64/pixel-util.S -> x265_3.6.tar.gz/source/common/aarch64/pixel-util.S
Changed
2419
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Yimeng Su <yimeng.su@huawei.com>
7
* Hongbin Liu <liuhongbin1@huawei.com>
8
+ * Sebastian Pop <spop@amazon.com>
9
*
10
* This program is free software; you can redistribute it and/or modify
11
* it under the terms of the GNU General Public License as published by
12
13
*****************************************************************************/
14
15
#include "asm.S"
16
+#include "pixel-util-common.S"
17
18
+#ifdef __APPLE__
19
+.section __RODATA,__rodata
20
+#else
21
.section .rodata
22
+#endif
23
24
.align 4
25
26
.text
27
28
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
29
+function PFX(pixel_var_8x8_neon)
30
+ ld1 {v4.8b}, x0, x1 // pixx
31
+ uxtl v0.8h, v4.8b // sum = pixx
32
+ umull v1.8h, v4.8b, v4.8b
33
+ uaddlp v1.4s, v1.8h // sqr = pixx * pixx
34
+
35
+.rept 7
36
+ ld1 {v4.8b}, x0, x1 // pixx
37
+ umull v31.8h, v4.8b, v4.8b
38
+ uaddw v0.8h, v0.8h, v4.8b // sum += pixx
39
+ uadalp v1.4s, v31.8h // sqr += pixx * pixx
40
+.endr
41
+ uaddlv s0, v0.8h
42
+ uaddlv d1, v1.4s
43
+ fmov w0, s0
44
+ fmov x1, d1
45
+ orr x0, x0, x1, lsl #32 // return sum + ((uint64_t)sqr << 32);
46
+ ret
47
+endfunc
48
+
49
+function PFX(pixel_var_16x16_neon)
50
+ pixel_var_start
51
+ mov w12, #16
52
+.loop_var_16:
53
+ sub w12, w12, #1
54
+ ld1 {v4.16b}, x0, x1
55
+ pixel_var_1 v4
56
+ cbnz w12, .loop_var_16
57
+ pixel_var_end
58
+ ret
59
+endfunc
60
+
61
+function PFX(pixel_var_32x32_neon)
62
+ pixel_var_start
63
+ mov w12, #32
64
+.loop_var_32:
65
+ sub w12, w12, #1
66
+ ld1 {v4.16b-v5.16b}, x0, x1
67
+ pixel_var_1 v4
68
+ pixel_var_1 v5
69
+ cbnz w12, .loop_var_32
70
+ pixel_var_end
71
+ ret
72
+endfunc
73
+
74
+function PFX(pixel_var_64x64_neon)
75
+ pixel_var_start
76
+ mov w12, #64
77
+.loop_var_64:
78
+ sub w12, w12, #1
79
+ ld1 {v4.16b-v7.16b}, x0, x1
80
+ pixel_var_1 v4
81
+ pixel_var_1 v5
82
+ pixel_var_1 v6
83
+ pixel_var_1 v7
84
+ cbnz w12, .loop_var_64
85
+ pixel_var_end
86
+ ret
87
+endfunc
88
+
89
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
90
+function PFX(getResidual4_neon)
91
+ lsl x4, x3, #1
92
+.rept 2
93
+ ld1 {v0.8b}, x0, x3
94
+ ld1 {v1.8b}, x1, x3
95
+ ld1 {v2.8b}, x0, x3
96
+ ld1 {v3.8b}, x1, x3
97
+ usubl v4.8h, v0.8b, v1.8b
98
+ usubl v5.8h, v2.8b, v3.8b
99
+ st1 {v4.8b}, x2, x4
100
+ st1 {v5.8b}, x2, x4
101
+.endr
102
+ ret
103
+endfunc
104
+
105
+function PFX(getResidual8_neon)
106
+ lsl x4, x3, #1
107
+.rept 4
108
+ ld1 {v0.8b}, x0, x3
109
+ ld1 {v1.8b}, x1, x3
110
+ ld1 {v2.8b}, x0, x3
111
+ ld1 {v3.8b}, x1, x3
112
+ usubl v4.8h, v0.8b, v1.8b
113
+ usubl v5.8h, v2.8b, v3.8b
114
+ st1 {v4.16b}, x2, x4
115
+ st1 {v5.16b}, x2, x4
116
+.endr
117
+ ret
118
+endfunc
119
+
120
+function PFX(getResidual16_neon)
121
+ lsl x4, x3, #1
122
+.rept 8
123
+ ld1 {v0.16b}, x0, x3
124
+ ld1 {v1.16b}, x1, x3
125
+ ld1 {v2.16b}, x0, x3
126
+ ld1 {v3.16b}, x1, x3
127
+ usubl v4.8h, v0.8b, v1.8b
128
+ usubl2 v5.8h, v0.16b, v1.16b
129
+ usubl v6.8h, v2.8b, v3.8b
130
+ usubl2 v7.8h, v2.16b, v3.16b
131
+ st1 {v4.8h-v5.8h}, x2, x4
132
+ st1 {v6.8h-v7.8h}, x2, x4
133
+.endr
134
+ ret
135
+endfunc
136
+
137
+function PFX(getResidual32_neon)
138
+ lsl x4, x3, #1
139
+ mov w12, #4
140
+.loop_residual_32:
141
+ sub w12, w12, #1
142
+.rept 4
143
+ ld1 {v0.16b-v1.16b}, x0, x3
144
+ ld1 {v2.16b-v3.16b}, x1, x3
145
+ ld1 {v4.16b-v5.16b}, x0, x3
146
+ ld1 {v6.16b-v7.16b}, x1, x3
147
+ usubl v16.8h, v0.8b, v2.8b
148
+ usubl2 v17.8h, v0.16b, v2.16b
149
+ usubl v18.8h, v1.8b, v3.8b
150
+ usubl2 v19.8h, v1.16b, v3.16b
151
+ usubl v20.8h, v4.8b, v6.8b
152
+ usubl2 v21.8h, v4.16b, v6.16b
153
+ usubl v22.8h, v5.8b, v7.8b
154
+ usubl2 v23.8h, v5.16b, v7.16b
155
+ st1 {v16.8h-v19.8h}, x2, x4
156
+ st1 {v20.8h-v23.8h}, x2, x4
157
+.endr
158
+ cbnz w12, .loop_residual_32
159
+ ret
160
+endfunc
161
+
162
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
163
+function PFX(pixel_sub_ps_4x4_neon)
164
+ lsl x1, x1, #1
165
+.rept 2
166
+ ld1 {v0.8b}, x2, x4
167
+ ld1 {v1.8b}, x3, x5
168
+ ld1 {v2.8b}, x2, x4
169
+ ld1 {v3.8b}, x3, x5
170
+ usubl v4.8h, v0.8b, v1.8b
171
+ usubl v5.8h, v2.8b, v3.8b
172
+ st1 {v4.4h}, x0, x1
173
+ st1 {v5.4h}, x0, x1
174
+.endr
175
+ ret
176
+endfunc
177
+
178
+function PFX(pixel_sub_ps_8x8_neon)
179
+ lsl x1, x1, #1
180
+.rept 4
181
+ ld1 {v0.8b}, x2, x4
182
+ ld1 {v1.8b}, x3, x5
183
+ ld1 {v2.8b}, x2, x4
184
+ ld1 {v3.8b}, x3, x5
185
+ usubl v4.8h, v0.8b, v1.8b
186
+ usubl v5.8h, v2.8b, v3.8b
187
+ st1 {v4.8h}, x0, x1
188
+ st1 {v5.8h}, x0, x1
189
+.endr
190
+ ret
191
+endfunc
192
+
193
+function PFX(pixel_sub_ps_16x16_neon)
194
+ lsl x1, x1, #1
195
+.rept 8
196
+ ld1 {v0.16b}, x2, x4
197
+ ld1 {v1.16b}, x3, x5
198
+ ld1 {v2.16b}, x2, x4
199
+ ld1 {v3.16b}, x3, x5
200
+ usubl v4.8h, v0.8b, v1.8b
201
+ usubl2 v5.8h, v0.16b, v1.16b
202
+ usubl v6.8h, v2.8b, v3.8b
203
+ usubl2 v7.8h, v2.16b, v3.16b
204
+ st1 {v4.8h-v5.8h}, x0, x1
205
+ st1 {v6.8h-v7.8h}, x0, x1
206
+.endr
207
+ ret
208
+endfunc
209
+
210
+function PFX(pixel_sub_ps_32x32_neon)
211
+ lsl x1, x1, #1
212
+ mov w12, #4
213
+.loop_sub_ps_32:
214
+ sub w12, w12, #1
215
+.rept 4
216
+ ld1 {v0.16b-v1.16b}, x2, x4
217
+ ld1 {v2.16b-v3.16b}, x3, x5
218
+ ld1 {v4.16b-v5.16b}, x2, x4
219
+ ld1 {v6.16b-v7.16b}, x3, x5
220
+ usubl v16.8h, v0.8b, v2.8b
221
+ usubl2 v17.8h, v0.16b, v2.16b
222
+ usubl v18.8h, v1.8b, v3.8b
223
+ usubl2 v19.8h, v1.16b, v3.16b
224
+ usubl v20.8h, v4.8b, v6.8b
225
+ usubl2 v21.8h, v4.16b, v6.16b
226
+ usubl v22.8h, v5.8b, v7.8b
227
+ usubl2 v23.8h, v5.16b, v7.16b
228
+ st1 {v16.8h-v19.8h}, x0, x1
229
+ st1 {v20.8h-v23.8h}, x0, x1
230
+.endr
231
+ cbnz w12, .loop_sub_ps_32
232
+ ret
233
+endfunc
234
+
235
+function PFX(pixel_sub_ps_64x64_neon)
236
+ lsl x1, x1, #1
237
+ sub x1, x1, #64
238
+ mov w12, #16
239
+.loop_sub_ps_64:
240
+ sub w12, w12, #1
241
+.rept 4
242
+ ld1 {v0.16b-v3.16b}, x2, x4
243
+ ld1 {v4.16b-v7.16b}, x3, x5
244
+ usubl v16.8h, v0.8b, v4.8b
245
+ usubl2 v17.8h, v0.16b, v4.16b
246
+ usubl v18.8h, v1.8b, v5.8b
247
+ usubl2 v19.8h, v1.16b, v5.16b
248
+ usubl v20.8h, v2.8b, v6.8b
249
+ usubl2 v21.8h, v2.16b, v6.16b
250
+ usubl v22.8h, v3.8b, v7.8b
251
+ usubl2 v23.8h, v3.16b, v7.16b
252
+ st1 {v16.8h-v19.8h}, x0, #64
253
+ st1 {v20.8h-v23.8h}, x0, x1
254
+.endr
255
+ cbnz w12, .loop_sub_ps_64
256
+ ret
257
+endfunc
258
+
259
+// chroma sub_ps
260
+function PFX(pixel_sub_ps_4x8_neon)
261
+ lsl x1, x1, #1
262
+.rept 4
263
+ ld1 {v0.8b}, x2, x4
264
+ ld1 {v1.8b}, x3, x5
265
+ ld1 {v2.8b}, x2, x4
266
+ ld1 {v3.8b}, x3, x5
267
+ usubl v4.8h, v0.8b, v1.8b
268
+ usubl v5.8h, v2.8b, v3.8b
269
+ st1 {v4.4h}, x0, x1
270
+ st1 {v5.4h}, x0, x1
271
+.endr
272
+ ret
273
+endfunc
274
+
275
+function PFX(pixel_sub_ps_8x16_neon)
276
+ lsl x1, x1, #1
277
+.rept 8
278
+ ld1 {v0.8b}, x2, x4
279
+ ld1 {v1.8b}, x3, x5
280
+ ld1 {v2.8b}, x2, x4
281
+ ld1 {v3.8b}, x3, x5
282
+ usubl v4.8h, v0.8b, v1.8b
283
+ usubl v5.8h, v2.8b, v3.8b
284
+ st1 {v4.8h}, x0, x1
285
+ st1 {v5.8h}, x0, x1
286
+.endr
287
+ ret
288
+endfunc
289
+
290
+function PFX(pixel_sub_ps_16x32_neon)
291
+ lsl x1, x1, #1
292
+.rept 16
293
+ ld1 {v0.16b}, x2, x4
294
+ ld1 {v1.16b}, x3, x5
295
+ ld1 {v2.16b}, x2, x4
296
+ ld1 {v3.16b}, x3, x5
297
+ usubl v4.8h, v0.8b, v1.8b
298
+ usubl2 v5.8h, v0.16b, v1.16b
299
+ usubl v6.8h, v2.8b, v3.8b
300
+ usubl2 v7.8h, v2.16b, v3.16b
301
+ st1 {v4.8h-v5.8h}, x0, x1
302
+ st1 {v6.8h-v7.8h}, x0, x1
303
+.endr
304
+ ret
305
+endfunc
306
+
307
+function PFX(pixel_sub_ps_32x64_neon)
308
+ lsl x1, x1, #1
309
+ mov w12, #8
310
+.loop_sub_ps_32x64:
311
+ sub w12, w12, #1
312
+.rept 4
313
+ ld1 {v0.16b-v1.16b}, x2, x4
314
+ ld1 {v2.16b-v3.16b}, x3, x5
315
+ ld1 {v4.16b-v5.16b}, x2, x4
316
+ ld1 {v6.16b-v7.16b}, x3, x5
317
+ usubl v16.8h, v0.8b, v2.8b
318
+ usubl2 v17.8h, v0.16b, v2.16b
319
+ usubl v18.8h, v1.8b, v3.8b
320
+ usubl2 v19.8h, v1.16b, v3.16b
321
+ usubl v20.8h, v4.8b, v6.8b
322
+ usubl2 v21.8h, v4.16b, v6.16b
323
+ usubl v22.8h, v5.8b, v7.8b
324
+ usubl2 v23.8h, v5.16b, v7.16b
325
+ st1 {v16.8h-v19.8h}, x0, x1
326
+ st1 {v20.8h-v23.8h}, x0, x1
327
+.endr
328
+ cbnz w12, .loop_sub_ps_32x64
329
+ ret
330
+endfunc
331
+
332
+// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
333
+function PFX(pixel_add_ps_4x4_neon)
334
+ lsl x5, x5, #1
335
+.rept 2
336
+ ld1 {v0.8b}, x2, x4
337
+ ld1 {v1.8b}, x2, x4
338
+ ld1 {v2.4h}, x3, x5
339
+ ld1 {v3.4h}, x3, x5
340
+ uxtl v0.8h, v0.8b
341
+ uxtl v1.8h, v1.8b
342
+ add v4.8h, v0.8h, v2.8h
343
+ add v5.8h, v1.8h, v3.8h
344
+ sqxtun v4.8b, v4.8h
345
+ sqxtun v5.8b, v5.8h
346
+ st1 {v4.s}0, x0, x1
347
+ st1 {v5.s}0, x0, x1
348
+.endr
349
+ ret
350
+endfunc
351
+
352
+function PFX(pixel_add_ps_8x8_neon)
353
+ lsl x5, x5, #1
354
+.rept 4
355
+ ld1 {v0.8b}, x2, x4
356
+ ld1 {v1.8b}, x2, x4
357
+ ld1 {v2.8h}, x3, x5
358
+ ld1 {v3.8h}, x3, x5
359
+ uxtl v0.8h, v0.8b
360
+ uxtl v1.8h, v1.8b
361
+ add v4.8h, v0.8h, v2.8h
362
+ add v5.8h, v1.8h, v3.8h
363
+ sqxtun v4.8b, v4.8h
364
+ sqxtun v5.8b, v5.8h
365
+ st1 {v4.8b}, x0, x1
366
+ st1 {v5.8b}, x0, x1
367
+.endr
368
+ ret
369
+endfunc
370
+
371
+.macro pixel_add_ps_16xN_neon h
372
+function PFX(pixel_add_ps_16x\h\()_neon)
373
+ lsl x5, x5, #1
374
+ mov w12, #\h / 8
375
+.loop_add_ps_16x\h\():
376
+ sub w12, w12, #1
377
+.rept 4
378
+ ld1 {v0.16b}, x2, x4
379
+ ld1 {v1.16b}, x2, x4
380
+ ld1 {v16.8h-v17.8h}, x3, x5
381
+ ld1 {v18.8h-v19.8h}, x3, x5
382
+ uxtl v4.8h, v0.8b
383
+ uxtl2 v5.8h, v0.16b
384
+ uxtl v6.8h, v1.8b
385
+ uxtl2 v7.8h, v1.16b
386
+ add v24.8h, v4.8h, v16.8h
387
+ add v25.8h, v5.8h, v17.8h
388
+ add v26.8h, v6.8h, v18.8h
389
+ add v27.8h, v7.8h, v19.8h
390
+ sqxtun v4.8b, v24.8h
391
+ sqxtun2 v4.16b, v25.8h
392
+ sqxtun v5.8b, v26.8h
393
+ sqxtun2 v5.16b, v27.8h
394
+ st1 {v4.16b}, x0, x1
395
+ st1 {v5.16b}, x0, x1
396
+.endr
397
+ cbnz w12, .loop_add_ps_16x\h
398
+ ret
399
+endfunc
400
+.endm
401
+
402
+pixel_add_ps_16xN_neon 16
403
+pixel_add_ps_16xN_neon 32
404
+
405
+.macro pixel_add_ps_32xN_neon h
406
+ function PFX(pixel_add_ps_32x\h\()_neon)
407
+ lsl x5, x5, #1
408
+ mov w12, #\h / 4
409
+.loop_add_ps_32x\h\():
410
+ sub w12, w12, #1
411
+.rept 4
412
+ ld1 {v0.16b-v1.16b}, x2, x4
413
+ ld1 {v16.8h-v19.8h}, x3, x5
414
+ uxtl v4.8h, v0.8b
415
+ uxtl2 v5.8h, v0.16b
416
+ uxtl v6.8h, v1.8b
417
+ uxtl2 v7.8h, v1.16b
418
+ add v24.8h, v4.8h, v16.8h
419
+ add v25.8h, v5.8h, v17.8h
420
+ add v26.8h, v6.8h, v18.8h
421
+ add v27.8h, v7.8h, v19.8h
422
+ sqxtun v4.8b, v24.8h
423
+ sqxtun2 v4.16b, v25.8h
424
+ sqxtun v5.8b, v26.8h
425
+ sqxtun2 v5.16b, v27.8h
426
+ st1 {v4.16b-v5.16b}, x0, x1
427
+.endr
428
+ cbnz w12, .loop_add_ps_32x\h
429
+ ret
430
+endfunc
431
+.endm
432
+
433
+pixel_add_ps_32xN_neon 32
434
+pixel_add_ps_32xN_neon 64
435
+
436
+function PFX(pixel_add_ps_64x64_neon)
437
+ lsl x5, x5, #1
438
+ sub x5, x5, #64
439
+ mov w12, #32
440
+.loop_add_ps_64x64:
441
+ sub w12, w12, #1
442
+.rept 2
443
+ ld1 {v0.16b-v3.16b}, x2, x4
444
+ ld1 {v16.8h-v19.8h}, x3, #64
445
+ ld1 {v20.8h-v23.8h}, x3, x5
446
+ uxtl v4.8h, v0.8b
447
+ uxtl2 v5.8h, v0.16b
448
+ uxtl v6.8h, v1.8b
449
+ uxtl2 v7.8h, v1.16b
450
+ uxtl v24.8h, v2.8b
451
+ uxtl2 v25.8h, v2.16b
452
+ uxtl v26.8h, v3.8b
453
+ uxtl2 v27.8h, v3.16b
454
+ add v0.8h, v4.8h, v16.8h
455
+ add v1.8h, v5.8h, v17.8h
456
+ add v2.8h, v6.8h, v18.8h
457
+ add v3.8h, v7.8h, v19.8h
458
+ add v4.8h, v24.8h, v20.8h
459
+ add v5.8h, v25.8h, v21.8h
460
+ add v6.8h, v26.8h, v22.8h
461
+ add v7.8h, v27.8h, v23.8h
462
+ sqxtun v0.8b, v0.8h
463
+ sqxtun2 v0.16b, v1.8h
464
+ sqxtun v1.8b, v2.8h
465
+ sqxtun2 v1.16b, v3.8h
466
+ sqxtun v2.8b, v4.8h
467
+ sqxtun2 v2.16b, v5.8h
468
+ sqxtun v3.8b, v6.8h
469
+ sqxtun2 v3.16b, v7.8h
470
+ st1 {v0.16b-v3.16b}, x0, x1
471
+.endr
472
+ cbnz w12, .loop_add_ps_64x64
473
+ ret
474
+endfunc
475
+
476
+// Chroma add_ps
477
+function PFX(pixel_add_ps_4x8_neon)
478
+ lsl x5, x5, #1
479
+.rept 4
480
+ ld1 {v0.8b}, x2, x4
481
+ ld1 {v1.8b}, x2, x4
482
+ ld1 {v2.4h}, x3, x5
483
+ ld1 {v3.4h}, x3, x5
484
+ uxtl v0.8h, v0.8b
485
+ uxtl v1.8h, v1.8b
486
+ add v4.8h, v0.8h, v2.8h
487
+ add v5.8h, v1.8h, v3.8h
488
+ sqxtun v4.8b, v4.8h
489
+ sqxtun v5.8b, v5.8h
490
+ st1 {v4.s}0, x0, x1
491
+ st1 {v5.s}0, x0, x1
492
+.endr
493
+ ret
494
+endfunc
495
+
496
+function PFX(pixel_add_ps_8x16_neon)
497
+ lsl x5, x5, #1
498
+.rept 8
499
+ ld1 {v0.8b}, x2, x4
500
+ ld1 {v1.8b}, x2, x4
501
+ ld1 {v2.8h}, x3, x5
502
+ ld1 {v3.8h}, x3, x5
503
+ uxtl v0.8h, v0.8b
504
+ uxtl v1.8h, v1.8b
505
+ add v4.8h, v0.8h, v2.8h
506
+ add v5.8h, v1.8h, v3.8h
507
+ sqxtun v4.8b, v4.8h
508
+ sqxtun v5.8b, v5.8h
509
+ st1 {v4.8b}, x0, x1
510
+ st1 {v5.8b}, x0, x1
511
+.endr
512
+ ret
513
+endfunc
514
+
515
+// void scale1D_128to64(pixel *dst, const pixel *src)
516
+function PFX(scale1D_128to64_neon)
517
+.rept 2
518
+ ld2 {v0.16b, v1.16b}, x1, #32
519
+ ld2 {v2.16b, v3.16b}, x1, #32
520
+ ld2 {v4.16b, v5.16b}, x1, #32
521
+ ld2 {v6.16b, v7.16b}, x1, #32
522
+ urhadd v0.16b, v0.16b, v1.16b
523
+ urhadd v1.16b, v2.16b, v3.16b
524
+ urhadd v2.16b, v4.16b, v5.16b
525
+ urhadd v3.16b, v6.16b, v7.16b
526
+ st1 {v0.16b-v3.16b}, x0, #64
527
+.endr
528
+ ret
529
+endfunc
530
+
531
+.macro scale2D_1 v0, v1
532
+ uaddlp \v0\().8h, \v0\().16b
533
+ uaddlp \v1\().8h, \v1\().16b
534
+ add \v0\().8h, \v0\().8h, \v1\().8h
535
+.endm
536
+
537
+// void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
538
+function PFX(scale2D_64to32_neon)
539
+ mov w12, #32
540
+.loop_scale2D:
541
+ ld1 {v0.16b-v3.16b}, x1, x2
542
+ sub w12, w12, #1
543
+ ld1 {v4.16b-v7.16b}, x1, x2
544
+ scale2D_1 v0, v4
545
+ scale2D_1 v1, v5
546
+ scale2D_1 v2, v6
547
+ scale2D_1 v3, v7
548
+ uqrshrn v0.8b, v0.8h, #2
549
+ uqrshrn2 v0.16b, v1.8h, #2
550
+ uqrshrn v1.8b, v2.8h, #2
551
+ uqrshrn2 v1.16b, v3.8h, #2
552
+ st1 {v0.16b-v1.16b}, x0, #32
553
+ cbnz w12, .loop_scale2D
554
+ ret
555
+endfunc
556
+
557
+// void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
558
+function PFX(pixel_planecopy_cp_neon)
559
+ dup v2.16b, w6
560
+ sub x5, x5, #1
561
+.loop_h:
562
+ mov x6, x0
563
+ mov x12, x2
564
+ mov x7, #0
565
+.loop_w:
566
+ ldr q0, x6, #16
567
+ ushl v0.16b, v0.16b, v2.16b
568
+ str q0, x12, #16
569
+ add x7, x7, #16
570
+ cmp x7, x4
571
+ blt .loop_w
572
+
573
+ add x0, x0, x1
574
+ add x2, x2, x3
575
+ sub x5, x5, #1
576
+ cbnz x5, .loop_h
577
+
578
+// handle last row
579
+ mov x5, x4
580
+ lsr x5, x5, #3
581
+.loopW8:
582
+ ldr d0, x0, #8
583
+ ushl v0.8b, v0.8b, v2.8b
584
+ str d0, x2, #8
585
+ sub x4, x4, #8
586
+ sub x5, x5, #1
587
+ cbnz x5, .loopW8
588
+
589
+ mov x5, #8
590
+ sub x5, x5, x4
591
+ sub x0, x0, x5
592
+ sub x2, x2, x5
593
+ ldr d0, x0
594
+ ushl v0.8b, v0.8b, v2.8b
595
+ str d0, x2
596
+ ret
597
+endfunc
598
+
599
+//******* satd *******
600
+.macro satd_4x4_neon
601
+ ld1 {v0.s}0, x0, x1
602
+ ld1 {v0.s}1, x0, x1
603
+ ld1 {v1.s}0, x2, x3
604
+ ld1 {v1.s}1, x2, x3
605
+ ld1 {v2.s}0, x0, x1
606
+ ld1 {v2.s}1, x0, x1
607
+ ld1 {v3.s}0, x2, x3
608
+ ld1 {v3.s}1, x2, x3
609
+
610
+ usubl v4.8h, v0.8b, v1.8b
611
+ usubl v5.8h, v2.8b, v3.8b
612
+
613
+ add v6.8h, v4.8h, v5.8h
614
+ sub v7.8h, v4.8h, v5.8h
615
+
616
+ mov v4.d0, v6.d1
617
+ add v0.4h, v6.4h, v4.4h
618
+ sub v2.4h, v6.4h, v4.4h
619
+
620
+ mov v5.d0, v7.d1
621
+ add v1.4h, v7.4h, v5.4h
622
+ sub v3.4h, v7.4h, v5.4h
623
+
624
+ trn1 v4.4h, v0.4h, v1.4h
625
+ trn2 v5.4h, v0.4h, v1.4h
626
+
627
+ trn1 v6.4h, v2.4h, v3.4h
628
+ trn2 v7.4h, v2.4h, v3.4h
629
+
630
+ add v0.4h, v4.4h, v5.4h
631
+ sub v1.4h, v4.4h, v5.4h
632
+
633
+ add v2.4h, v6.4h, v7.4h
634
+ sub v3.4h, v6.4h, v7.4h
635
+
636
+ trn1 v4.2s, v0.2s, v1.2s
637
+ trn2 v5.2s, v0.2s, v1.2s
638
+
639
+ trn1 v6.2s, v2.2s, v3.2s
640
+ trn2 v7.2s, v2.2s, v3.2s
641
+
642
+ abs v4.4h, v4.4h
643
+ abs v5.4h, v5.4h
644
+ abs v6.4h, v6.4h
645
+ abs v7.4h, v7.4h
646
+
647
+ smax v1.4h, v4.4h, v5.4h
648
+ smax v2.4h, v6.4h, v7.4h
649
+
650
+ add v0.4h, v1.4h, v2.4h
651
+ uaddlp v0.2s, v0.4h
652
+ uaddlp v0.1d, v0.2s
653
+.endm
654
+
655
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
656
+function PFX(pixel_satd_4x4_neon)
657
+ satd_4x4_neon
658
+ fmov x0, d0
659
+ ret
660
+endfunc
661
+
662
.macro x265_satd_4x8_8x4_end_neon
663
add v0.8h, v4.8h, v6.8h
664
add v1.8h, v5.8h, v7.8h
665
666
.endm
667
668
.macro pixel_satd_4x8_neon
669
- ld1r {v1.2s}, x2, x3
670
+ ld1r {v1.2s}, x2, x3
671
ld1r {v0.2s}, x0, x1
672
ld1r {v3.2s}, x2, x3
673
ld1r {v2.2s}, x0, x1
674
675
sub v5.8h, v0.8h, v1.8h
676
ld1 {v6.s}1, x0, x1
677
usubl v3.8h, v6.8b, v7.8b
678
- add v6.8h, v2.8h, v3.8h
679
- sub v7.8h, v2.8h, v3.8h
680
+ add v6.8h, v2.8h, v3.8h
681
+ sub v7.8h, v2.8h, v3.8h
682
x265_satd_4x8_8x4_end_neon
683
.endm
684
685
-// template<int w, int h>
686
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
687
-function x265_pixel_satd_4x8_neon
688
- pixel_satd_4x8_neon
689
- mov w0, v0.s0
690
- ret
691
+// template<int w, int h>
692
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
693
+function PFX(pixel_satd_4x8_neon)
694
+ pixel_satd_4x8_neon
695
+ mov w0, v0.s0
696
+ ret
697
+endfunc
698
+
699
+function PFX(pixel_satd_4x16_neon)
700
+ mov w4, #0
701
+ pixel_satd_4x8_neon
702
+ mov w5, v0.s0
703
+ add w4, w4, w5
704
+ pixel_satd_4x8_neon
705
+ mov w5, v0.s0
706
+ add w0, w5, w4
707
+ ret
708
+endfunc
709
+
710
+function PFX(pixel_satd_4x32_neon)
711
+ mov w4, #0
712
+.rept 4
713
+ pixel_satd_4x8_neon
714
+ mov w5, v0.s0
715
+ add w4, w4, w5
716
+.endr
717
+ mov w0, w4
718
+ ret
719
+endfunc
720
+
721
+function PFX(pixel_satd_12x16_neon)
722
+ mov x4, x0
723
+ mov x5, x2
724
+ mov w7, #0
725
+ pixel_satd_4x8_neon
726
+ mov w6, v0.s0
727
+ add w7, w7, w6
728
+ pixel_satd_4x8_neon
729
+ mov w6, v0.s0
730
+ add w7, w7, w6
731
+
732
+ add x0, x4, #4
733
+ add x2, x5, #4
734
+ pixel_satd_4x8_neon
735
+ mov w6, v0.s0
736
+ add w7, w7, w6
737
+ pixel_satd_4x8_neon
738
+ mov w6, v0.s0
739
+ add w7, w7, w6
740
+
741
+ add x0, x4, #8
742
+ add x2, x5, #8
743
+ pixel_satd_4x8_neon
744
+ mov w6, v0.s0
745
+ add w7, w7, w6
746
+ pixel_satd_4x8_neon
747
+ mov w6, v0.s0
748
+ add w0, w7, w6
749
+ ret
750
+endfunc
751
+
752
+function PFX(pixel_satd_12x32_neon)
753
+ mov x4, x0
754
+ mov x5, x2
755
+ mov w7, #0
756
+.rept 4
757
+ pixel_satd_4x8_neon
758
+ mov w6, v0.s0
759
+ add w7, w7, w6
760
+.endr
761
+
762
+ add x0, x4, #4
763
+ add x2, x5, #4
764
+.rept 4
765
+ pixel_satd_4x8_neon
766
+ mov w6, v0.s0
767
+ add w7, w7, w6
768
+.endr
769
+
770
+ add x0, x4, #8
771
+ add x2, x5, #8
772
+.rept 4
773
+ pixel_satd_4x8_neon
774
+ mov w6, v0.s0
775
+ add w7, w7, w6
776
+.endr
777
+
778
+ mov w0, w7
779
+ ret
780
+endfunc
781
+
782
+function PFX(pixel_satd_8x4_neon)
783
+ mov x4, x0
784
+ mov x5, x2
785
+ satd_4x4_neon
786
+ add x0, x4, #4
787
+ add x2, x5, #4
788
+ umov x6, v0.d0
789
+ satd_4x4_neon
790
+ umov x0, v0.d0
791
+ add x0, x0, x6
792
+ ret
793
+endfunc
794
+
795
+.macro LOAD_DIFF_8x4 v0 v1 v2 v3
796
+ ld1 {v0.8b}, x0, x1
797
+ ld1 {v1.8b}, x2, x3
798
+ ld1 {v2.8b}, x0, x1
799
+ ld1 {v3.8b}, x2, x3
800
+ ld1 {v4.8b}, x0, x1
801
+ ld1 {v5.8b}, x2, x3
802
+ ld1 {v6.8b}, x0, x1
803
+ ld1 {v7.8b}, x2, x3
804
+ usubl \v0, v0.8b, v1.8b
805
+ usubl \v1, v2.8b, v3.8b
806
+ usubl \v2, v4.8b, v5.8b
807
+ usubl \v3, v6.8b, v7.8b
808
+.endm
809
+
810
+.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
811
+ ld1 {v0.16b}, x0, x1
812
+ ld1 {v1.16b}, x2, x3
813
+ ld1 {v2.16b}, x0, x1
814
+ ld1 {v3.16b}, x2, x3
815
+ ld1 {v4.16b}, x0, x1
816
+ ld1 {v5.16b}, x2, x3
817
+ ld1 {v6.16b}, x0, x1
818
+ ld1 {v7.16b}, x2, x3
819
+ usubl \v0, v0.8b, v1.8b
820
+ usubl \v1, v2.8b, v3.8b
821
+ usubl \v2, v4.8b, v5.8b
822
+ usubl \v3, v6.8b, v7.8b
823
+ usubl2 \v4, v0.16b, v1.16b
824
+ usubl2 \v5, v2.16b, v3.16b
825
+ usubl2 \v6, v4.16b, v5.16b
826
+ usubl2 \v7, v6.16b, v7.16b
827
+.endm
828
+
829
+function PFX(satd_16x4_neon), export=0
830
+ LOAD_DIFF_16x4 v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
831
+ b PFX(satd_8x4v_8x8h_neon)
832
+endfunc
833
+
834
+function PFX(satd_8x8_neon), export=0
835
+ LOAD_DIFF_8x4 v16.8h, v17.8h, v18.8h, v19.8h
836
+ LOAD_DIFF_8x4 v20.8h, v21.8h, v22.8h, v23.8h
837
+ b PFX(satd_8x4v_8x8h_neon)
838
+endfunc
839
+
840
+// one vertical hadamard pass and two horizontal
841
+function PFX(satd_8x4v_8x8h_neon), export=0
842
+ HADAMARD4_V v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
843
+ HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
844
+ trn4 v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
845
+ trn4 v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
846
+ SUMSUB_ABCD v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
847
+ SUMSUB_ABCD v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
848
+ trn4 v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
849
+ trn4 v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
850
+ ABS8 v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
851
+ smax v0.8h, v0.8h, v2.8h
852
+ smax v1.8h, v1.8h, v3.8h
853
+ smax v2.8h, v4.8h, v6.8h
854
+ smax v3.8h, v5.8h, v7.8h
855
+ ret
856
+endfunc
857
+
858
+function PFX(pixel_satd_8x8_neon)
859
+ mov x10, x30
860
+ bl PFX(satd_8x8_neon)
861
+ add v0.8h, v0.8h, v1.8h
862
+ add v1.8h, v2.8h, v3.8h
863
+ add v0.8h, v0.8h, v1.8h
864
+ uaddlv s0, v0.8h
865
+ mov w0, v0.s0
866
+ ret x10
867
+endfunc
868
+
869
+function PFX(pixel_satd_8x12_neon)
870
+ mov x4, x0
871
+ mov x5, x2
872
+ mov x7, #0
873
+ satd_4x4_neon
874
+ umov x6, v0.d0
875
+ add x7, x7, x6
876
+ add x0, x4, #4
877
+ add x2, x5, #4
878
+ satd_4x4_neon
879
+ umov x6, v0.d0
880
+ add x7, x7, x6
881
+.rept 2
882
+ sub x0, x0, #4
883
+ sub x2, x2, #4
884
+ mov x4, x0
885
+ mov x5, x2
886
+ satd_4x4_neon
887
+ umov x6, v0.d0
888
+ add x7, x7, x6
889
+ add x0, x4, #4
890
+ add x2, x5, #4
891
+ satd_4x4_neon
892
+ umov x6, v0.d0
893
+ add x7, x7, x6
894
+.endr
895
+ mov x0, x7
896
+ ret
897
+endfunc
898
+
899
+function PFX(pixel_satd_8x16_neon)
900
+ mov x10, x30
901
+ bl PFX(satd_8x8_neon)
902
+ add v30.8h, v0.8h, v1.8h
903
+ add v31.8h, v2.8h, v3.8h
904
+ bl PFX(satd_8x8_neon)
905
+ add v30.8h, v30.8h, v0.8h
906
+ add v31.8h, v31.8h, v1.8h
907
+ add v30.8h, v30.8h, v2.8h
908
+ add v31.8h, v31.8h, v3.8h
909
+ add v0.8h, v30.8h, v31.8h
910
+ uaddlv s0, v0.8h
911
+ mov w0, v0.s0
912
+ ret x10
913
+endfunc
914
+
915
+function PFX(pixel_satd_8x32_neon)
916
+ mov x10, x30
917
+ bl PFX(satd_8x8_neon)
918
+ add v30.8h, v0.8h, v1.8h
919
+ add v31.8h, v2.8h, v3.8h
920
+.rept 3
921
+ bl PFX(satd_8x8_neon)
922
+ add v30.8h, v30.8h, v0.8h
923
+ add v31.8h, v31.8h, v1.8h
924
+ add v30.8h, v30.8h, v2.8h
925
+ add v31.8h, v31.8h, v3.8h
926
+.endr
927
+ add v0.8h, v30.8h, v31.8h
928
+ uaddlv s0, v0.8h
929
+ mov w0, v0.s0
930
+ ret x10
931
+endfunc
932
+
933
+function PFX(pixel_satd_8x64_neon)
934
+ mov x10, x30
935
+ bl PFX(satd_8x8_neon)
936
+ add v30.8h, v0.8h, v1.8h
937
+ add v31.8h, v2.8h, v3.8h
938
+.rept 7
939
+ bl PFX(satd_8x8_neon)
940
+ add v30.8h, v30.8h, v0.8h
941
+ add v31.8h, v31.8h, v1.8h
942
+ add v30.8h, v30.8h, v2.8h
943
+ add v31.8h, v31.8h, v3.8h
944
+.endr
945
+ add v0.8h, v30.8h, v31.8h
946
+ uaddlv s0, v0.8h
947
+ mov w0, v0.s0
948
+ ret x10
949
+endfunc
950
+
951
+function PFX(pixel_satd_16x4_neon)
952
+ mov x10, x30
953
+ bl PFX(satd_16x4_neon)
954
+ add v30.8h, v0.8h, v1.8h
955
+ add v31.8h, v2.8h, v3.8h
956
+ add v0.8h, v30.8h, v31.8h
957
+ uaddlv s0, v0.8h
958
+ mov w0, v0.s0
959
+ ret x10
960
+endfunc
961
+
962
+function PFX(pixel_satd_16x8_neon)
963
+ mov x10, x30
964
+ bl PFX(satd_16x4_neon)
965
+ add v30.8h, v0.8h, v1.8h
966
+ add v31.8h, v2.8h, v3.8h
967
+ bl PFX(satd_16x4_neon)
968
+ add v30.8h, v30.8h, v0.8h
969
+ add v31.8h, v31.8h, v1.8h
970
+ add v30.8h, v30.8h, v2.8h
971
+ add v31.8h, v31.8h, v3.8h
972
+ add v0.8h, v30.8h, v31.8h
973
+ uaddlv s0, v0.8h
974
+ mov w0, v0.s0
975
+ ret x10
976
+endfunc
977
+
978
+function PFX(pixel_satd_16x12_neon)
979
+ mov x10, x30
980
+ bl PFX(satd_16x4_neon)
981
+ add v30.8h, v0.8h, v1.8h
982
+ add v31.8h, v2.8h, v3.8h
983
+.rept 2
984
+ bl PFX(satd_16x4_neon)
985
+ add v30.8h, v30.8h, v0.8h
986
+ add v31.8h, v31.8h, v1.8h
987
+ add v30.8h, v30.8h, v2.8h
988
+ add v31.8h, v31.8h, v3.8h
989
+.endr
990
+ add v0.8h, v30.8h, v31.8h
991
+ uaddlv s0, v0.8h
992
+ mov w0, v0.s0
993
+ ret x10
994
+endfunc
995
+
996
+function PFX(pixel_satd_16x16_neon)
997
+ mov x10, x30
998
+ bl PFX(satd_16x4_neon)
999
+ add v30.8h, v0.8h, v1.8h
1000
+ add v31.8h, v2.8h, v3.8h
1001
+.rept 3
1002
+ bl PFX(satd_16x4_neon)
1003
+ add v30.8h, v30.8h, v0.8h
1004
+ add v31.8h, v31.8h, v1.8h
1005
+ add v30.8h, v30.8h, v2.8h
1006
+ add v31.8h, v31.8h, v3.8h
1007
+.endr
1008
+ add v0.8h, v30.8h, v31.8h
1009
+ uaddlv s0, v0.8h
1010
+ mov w0, v0.s0
1011
+ ret x10
1012
+endfunc
1013
+
1014
+function PFX(pixel_satd_16x24_neon)
1015
+ mov x10, x30
1016
+ bl PFX(satd_16x4_neon)
1017
+ add v30.8h, v0.8h, v1.8h
1018
+ add v31.8h, v2.8h, v3.8h
1019
+.rept 5
1020
+ bl PFX(satd_16x4_neon)
1021
+ add v30.8h, v30.8h, v0.8h
1022
+ add v31.8h, v31.8h, v1.8h
1023
+ add v30.8h, v30.8h, v2.8h
1024
+ add v31.8h, v31.8h, v3.8h
1025
+.endr
1026
+ add v0.8h, v30.8h, v31.8h
1027
+ uaddlv s0, v0.8h
1028
+ mov w0, v0.s0
1029
+ ret x10
1030
+endfunc
1031
+
1032
+.macro pixel_satd_16x32_neon
1033
+ bl PFX(satd_16x4_neon)
1034
+ add v30.8h, v0.8h, v1.8h
1035
+ add v31.8h, v2.8h, v3.8h
1036
+.rept 7
1037
+ bl PFX(satd_16x4_neon)
1038
+ add v30.8h, v30.8h, v0.8h
1039
+ add v31.8h, v31.8h, v1.8h
1040
+ add v30.8h, v30.8h, v2.8h
1041
+ add v31.8h, v31.8h, v3.8h
1042
+.endr
1043
+.endm
1044
+
1045
+function PFX(pixel_satd_16x32_neon)
1046
+ mov x10, x30
1047
+ pixel_satd_16x32_neon
1048
+ add v0.8h, v30.8h, v31.8h
1049
+ uaddlv s0, v0.8h
1050
+ mov w0, v0.s0
1051
+ ret x10
1052
endfunc
1053
1054
-// template<int w, int h>
1055
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1056
-function x265_pixel_satd_4x16_neon
1057
- eor w4, w4, w4
1058
- pixel_satd_4x8_neon
1059
- mov w5, v0.s0
1060
- add w4, w4, w5
1061
- pixel_satd_4x8_neon
1062
- mov w5, v0.s0
1063
- add w0, w5, w4
1064
- ret
1065
+function PFX(pixel_satd_16x64_neon)
1066
+ mov x10, x30
1067
+ bl PFX(satd_16x4_neon)
1068
+ add v30.8h, v0.8h, v1.8h
1069
+ add v31.8h, v2.8h, v3.8h
1070
+.rept 15
1071
+ bl PFX(satd_16x4_neon)
1072
+ add v30.8h, v30.8h, v0.8h
1073
+ add v31.8h, v31.8h, v1.8h
1074
+ add v30.8h, v30.8h, v2.8h
1075
+ add v31.8h, v31.8h, v3.8h
1076
+.endr
1077
+ add v0.8h, v30.8h, v31.8h
1078
+ uaddlv s0, v0.8h
1079
+ mov w0, v0.s0
1080
+ ret x10
1081
endfunc
1082
1083
-// template<int w, int h>
1084
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1085
-function x265_pixel_satd_4x32_neon
1086
- eor w4, w4, w4
1087
+function PFX(pixel_satd_24x32_neon)
1088
+ mov x10, x30
1089
+ mov x7, #0
1090
+ mov x4, x0
1091
+ mov x5, x2
1092
+.rept 3
1093
+ movi v30.8h, #0
1094
+ movi v31.8h, #0
1095
.rept 4
1096
- pixel_satd_4x8_neon
1097
- mov w5, v0.s0
1098
- add w4, w4, w5
1099
+ bl PFX(satd_8x8_neon)
1100
+ add v30.8h, v30.8h, v0.8h
1101
+ add v31.8h, v31.8h, v1.8h
1102
+ add v30.8h, v30.8h, v2.8h
1103
+ add v31.8h, v31.8h, v3.8h
1104
.endr
1105
- mov w0, w4
1106
- ret
1107
+ add v0.8h, v30.8h, v31.8h
1108
+ uaddlv s0, v0.8h
1109
+ mov w6, v0.s0
1110
+ add x7, x7, x6
1111
+ add x4, x4, #8
1112
+ add x5, x5, #8
1113
+ mov x0, x4
1114
+ mov x2, x5
1115
+.endr
1116
+ mov x0, x7
1117
+ ret x10
1118
endfunc
1119
1120
-// template<int w, int h>
1121
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1122
-function x265_pixel_satd_12x16_neon
1123
+function PFX(pixel_satd_24x64_neon)
1124
+ mov x10, x30
1125
+ mov x7, #0
1126
mov x4, x0
1127
mov x5, x2
1128
- eor w7, w7, w7
1129
- pixel_satd_4x8_neon
1130
+.rept 3
1131
+ movi v30.8h, #0
1132
+ movi v31.8h, #0
1133
+.rept 4
1134
+ bl PFX(satd_8x8_neon)
1135
+ add v30.8h, v30.8h, v0.8h
1136
+ add v31.8h, v31.8h, v1.8h
1137
+ add v30.8h, v30.8h, v2.8h
1138
+ add v31.8h, v31.8h, v3.8h
1139
+.endr
1140
+ add v0.8h, v30.8h, v31.8h
1141
+ uaddlv s0, v0.8h
1142
mov w6, v0.s0
1143
- add w7, w7, w6
1144
- pixel_satd_4x8_neon
1145
+ add x7, x7, x6
1146
+ add x4, x4, #8
1147
+ add x5, x5, #8
1148
+ mov x0, x4
1149
+ mov x2, x5
1150
+.endr
1151
+ sub x4, x4, #24
1152
+ sub x5, x5, #24
1153
+ add x0, x4, x1, lsl #5
1154
+ add x2, x5, x3, lsl #5
1155
+ mov x4, x0
1156
+ mov x5, x2
1157
+.rept 3
1158
+ movi v30.8h, #0
1159
+ movi v31.8h, #0
1160
+.rept 4
1161
+ bl PFX(satd_8x8_neon)
1162
+ add v30.8h, v30.8h, v0.8h
1163
+ add v31.8h, v31.8h, v1.8h
1164
+ add v30.8h, v30.8h, v2.8h
1165
+ add v31.8h, v31.8h, v3.8h
1166
+.endr
1167
+ add v0.8h, v30.8h, v31.8h
1168
+ uaddlv s0, v0.8h
1169
mov w6, v0.s0
1170
- add w7, w7, w6
1171
+ add x7, x7, x6
1172
+ add x4, x4, #8
1173
+ add x5, x5, #8
1174
+ mov x0, x4
1175
+ mov x2, x5
1176
+.endr
1177
+ mov x0, x7
1178
+ ret x10
1179
+endfunc
1180
1181
- add x0, x4, #4
1182
- add x2, x5, #4
1183
- pixel_satd_4x8_neon
1184
- mov w6, v0.s0
1185
- add w7, w7, w6
1186
- pixel_satd_4x8_neon
1187
- mov w6, v0.s0
1188
- add w7, w7, w6
1189
+.macro pixel_satd_32x8
1190
+ mov x4, x0
1191
+ mov x5, x2
1192
+.rept 2
1193
+ bl PFX(satd_16x4_neon)
1194
+ add v30.8h, v30.8h, v0.8h
1195
+ add v31.8h, v31.8h, v1.8h
1196
+ add v30.8h, v30.8h, v2.8h
1197
+ add v31.8h, v31.8h, v3.8h
1198
+.endr
1199
+ add x0, x4, #16
1200
+ add x2, x5, #16
1201
+.rept 2
1202
+ bl PFX(satd_16x4_neon)
1203
+ add v30.8h, v30.8h, v0.8h
1204
+ add v31.8h, v31.8h, v1.8h
1205
+ add v30.8h, v30.8h, v2.8h
1206
+ add v31.8h, v31.8h, v3.8h
1207
+.endr
1208
+.endm
1209
1210
- add x0, x4, #8
1211
- add x2, x5, #8
1212
- pixel_satd_4x8_neon
1213
- mov w6, v0.s0
1214
- add w7, w7, w6
1215
- pixel_satd_4x8_neon
1216
+.macro satd_32x16_neon
1217
+ movi v30.8h, #0
1218
+ movi v31.8h, #0
1219
+ pixel_satd_32x8
1220
+ sub x0, x0, #16
1221
+ sub x2, x2, #16
1222
+ pixel_satd_32x8
1223
+ add v0.8h, v30.8h, v31.8h
1224
+ uaddlv s0, v0.8h
1225
mov w6, v0.s0
1226
- add w0, w7, w6
1227
- ret
1228
-endfunc
1229
+.endm
1230
1231
-// template<int w, int h>
1232
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1233
-function x265_pixel_satd_12x32_neon
1234
+.macro satd_64x16_neon
1235
+ mov x8, x0
1236
+ mov x9, x2
1237
+ satd_32x16_neon
1238
+ add x7, x7, x6
1239
+ add x0, x8, #32
1240
+ add x2, x9, #32
1241
+ satd_32x16_neon
1242
+ add x7, x7, x6
1243
+.endm
1244
+
1245
+function PFX(pixel_satd_32x8_neon)
1246
+ mov x10, x30
1247
+ mov x7, #0
1248
mov x4, x0
1249
mov x5, x2
1250
- eor w7, w7, w7
1251
-.rept 4
1252
- pixel_satd_4x8_neon
1253
- mov w6, v0.s0
1254
- add w7, w7, w6
1255
+ movi v30.8h, #0
1256
+ movi v31.8h, #0
1257
+ pixel_satd_32x8
1258
+ add v0.8h, v30.8h, v31.8h
1259
+ uaddlv s0, v0.8h
1260
+ mov w0, v0.s0
1261
+ ret x10
1262
+endfunc
1263
+
1264
+function PFX(pixel_satd_32x16_neon)
1265
+ mov x10, x30
1266
+ satd_32x16_neon
1267
+ mov x0, x6
1268
+ ret x10
1269
+endfunc
1270
+
1271
+function PFX(pixel_satd_32x24_neon)
1272
+ mov x10, x30
1273
+ satd_32x16_neon
1274
+ movi v30.8h, #0
1275
+ movi v31.8h, #0
1276
+ sub x0, x0, #16
1277
+ sub x2, x2, #16
1278
+ pixel_satd_32x8
1279
+ add v0.8h, v30.8h, v31.8h
1280
+ uaddlv s0, v0.8h
1281
+ mov w0, v0.s0
1282
+ add x0, x0, x6
1283
+ ret x10
1284
+endfunc
1285
+
1286
+function PFX(pixel_satd_32x32_neon)
1287
+ mov x10, x30
1288
+ mov x7, #0
1289
+ satd_32x16_neon
1290
+ sub x0, x0, #16
1291
+ sub x2, x2, #16
1292
+ add x7, x7, x6
1293
+ satd_32x16_neon
1294
+ add x0, x7, x6
1295
+ ret x10
1296
+endfunc
1297
+
1298
+function PFX(pixel_satd_32x48_neon)
1299
+ mov x10, x30
1300
+ mov x7, #0
1301
+.rept 2
1302
+ satd_32x16_neon
1303
+ sub x0, x0, #16
1304
+ sub x2, x2, #16
1305
+ add x7, x7, x6
1306
.endr
1307
+ satd_32x16_neon
1308
+ add x0, x7, x6
1309
+ ret x10
1310
+endfunc
1311
1312
- add x0, x4, #4
1313
- add x2, x5, #4
1314
-.rept 4
1315
- pixel_satd_4x8_neon
1316
- mov w6, v0.s0
1317
- add w7, w7, w6
1318
+function PFX(pixel_satd_32x64_neon)
1319
+ mov x10, x30
1320
+ mov x7, #0
1321
+.rept 3
1322
+ satd_32x16_neon
1323
+ sub x0, x0, #16
1324
+ sub x2, x2, #16
1325
+ add x7, x7, x6
1326
.endr
1327
+ satd_32x16_neon
1328
+ add x0, x7, x6
1329
+ ret x10
1330
+endfunc
1331
1332
- add x0, x4, #8
1333
- add x2, x5, #8
1334
-.rept 4
1335
- pixel_satd_4x8_neon
1336
- mov w6, v0.s0
1337
- add w7, w7, w6
1338
+function PFX(pixel_satd_64x16_neon)
1339
+ mov x10, x30
1340
+ mov x7, #0
1341
+ satd_64x16_neon
1342
+ mov x0, x7
1343
+ ret x10
1344
+endfunc
1345
+
1346
+function PFX(pixel_satd_64x32_neon)
1347
+ mov x10, x30
1348
+ mov x7, #0
1349
+ satd_64x16_neon
1350
+ sub x0, x0, #48
1351
+ sub x2, x2, #48
1352
+ satd_64x16_neon
1353
+ mov x0, x7
1354
+ ret x10
1355
+endfunc
1356
+
1357
+function PFX(pixel_satd_64x48_neon)
1358
+ mov x10, x30
1359
+ mov x7, #0
1360
+.rept 2
1361
+ satd_64x16_neon
1362
+ sub x0, x0, #48
1363
+ sub x2, x2, #48
1364
.endr
1365
+ satd_64x16_neon
1366
+ mov x0, x7
1367
+ ret x10
1368
+endfunc
1369
1370
- mov w0, w7
1371
+function PFX(pixel_satd_64x64_neon)
1372
+ mov x10, x30
1373
+ mov x7, #0
1374
+.rept 3
1375
+ satd_64x16_neon
1376
+ sub x0, x0, #48
1377
+ sub x2, x2, #48
1378
+.endr
1379
+ satd_64x16_neon
1380
+ mov x0, x7
1381
+ ret x10
1382
+endfunc
1383
+
1384
+function PFX(pixel_satd_48x64_neon)
1385
+ mov x10, x30
1386
+ mov x7, #0
1387
+ mov x8, x0
1388
+ mov x9, x2
1389
+.rept 3
1390
+ satd_32x16_neon
1391
+ sub x0, x0, #16
1392
+ sub x2, x2, #16
1393
+ add x7, x7, x6
1394
+.endr
1395
+ satd_32x16_neon
1396
+ add x7, x7, x6
1397
+
1398
+ add x0, x8, #32
1399
+ add x2, x9, #32
1400
+ pixel_satd_16x32_neon
1401
+ add v0.8h, v30.8h, v31.8h
1402
+ uaddlv s0, v0.8h
1403
+ mov w6, v0.s0
1404
+ add x7, x7, x6
1405
+
1406
+ movi v30.8h, #0
1407
+ movi v31.8h, #0
1408
+ pixel_satd_16x32_neon
1409
+ add v0.8h, v30.8h, v31.8h
1410
+ uaddlv s0, v0.8h
1411
+ mov w6, v0.s0
1412
+ add x0, x7, x6
1413
+ ret x10
1414
+endfunc
1415
+
1416
+function PFX(sa8d_8x8_neon), export=0
1417
+ LOAD_DIFF_8x4 v16.8h, v17.8h, v18.8h, v19.8h
1418
+ LOAD_DIFF_8x4 v20.8h, v21.8h, v22.8h, v23.8h
1419
+ HADAMARD4_V v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
1420
+ HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
1421
+ SUMSUB_ABCD v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
1422
+ SUMSUB_ABCD v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
1423
+ trn4 v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
1424
+ trn4 v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
1425
+ SUMSUB_ABCD v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
1426
+ SUMSUB_ABCD v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
1427
+ trn4 v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
1428
+ trn4 v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
1429
+ SUMSUB_ABCD v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
1430
+ SUMSUB_ABCD v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
1431
+ trn4 v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
1432
+ trn4 v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
1433
+ ABS8 v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
1434
+ smax v16.8h, v16.8h, v20.8h
1435
+ smax v17.8h, v17.8h, v21.8h
1436
+ smax v18.8h, v18.8h, v22.8h
1437
+ smax v19.8h, v19.8h, v23.8h
1438
+ add v0.8h, v16.8h, v17.8h
1439
+ add v1.8h, v18.8h, v19.8h
1440
ret
1441
endfunc
1442
1443
-// template<int w, int h>
1444
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1445
-function x265_pixel_satd_8x8_neon
1446
- eor w4, w4, w4
1447
- mov x6, x0
1448
- mov x7, x2
1449
- pixel_satd_4x8_neon
1450
- mov w5, v0.s0
1451
- add w4, w4, w5
1452
- add x0, x6, #4
1453
- add x2, x7, #4
1454
- pixel_satd_4x8_neon
1455
+function PFX(pixel_sa8d_8x8_neon)
1456
+ mov x10, x30
1457
+ bl PFX(sa8d_8x8_neon)
1458
+ add v0.8h, v0.8h, v1.8h
1459
+ uaddlv s0, v0.8h
1460
+ mov w0, v0.s0
1461
+ add w0, w0, #1
1462
+ lsr w0, w0, #1
1463
+ ret x10
1464
+endfunc
1465
+
1466
+function PFX(pixel_sa8d_8x16_neon)
1467
+ mov x10, x30
1468
+ bl PFX(sa8d_8x8_neon)
1469
+ add v0.8h, v0.8h, v1.8h
1470
+ uaddlv s0, v0.8h
1471
mov w5, v0.s0
1472
+ add w5, w5, #1
1473
+ lsr w5, w5, #1
1474
+ bl PFX(sa8d_8x8_neon)
1475
+ add v0.8h, v0.8h, v1.8h
1476
+ uaddlv s0, v0.8h
1477
+ mov w4, v0.s0
1478
+ add w4, w4, #1
1479
+ lsr w4, w4, #1
1480
+ add w0, w4, w5
1481
+ ret x10
1482
+endfunc
1483
+
1484
+.macro sa8d_16x16 reg
1485
+ bl PFX(sa8d_8x8_neon)
1486
+ uaddlp v30.4s, v0.8h
1487
+ uaddlp v31.4s, v1.8h
1488
+ bl PFX(sa8d_8x8_neon)
1489
+ uadalp v30.4s, v0.8h
1490
+ uadalp v31.4s, v1.8h
1491
+ sub x0, x0, x1, lsl #4
1492
+ sub x2, x2, x3, lsl #4
1493
+ add x0, x0, #8
1494
+ add x2, x2, #8
1495
+ bl PFX(sa8d_8x8_neon)
1496
+ uadalp v30.4s, v0.8h
1497
+ uadalp v31.4s, v1.8h
1498
+ bl PFX(sa8d_8x8_neon)
1499
+ uadalp v30.4s, v0.8h
1500
+ uadalp v31.4s, v1.8h
1501
+ add v0.4s, v30.4s, v31.4s
1502
+ addv s0, v0.4s
1503
+ mov \reg, v0.s0
1504
+ add \reg, \reg, #1
1505
+ lsr \reg, \reg, #1
1506
+.endm
1507
+
1508
+function PFX(pixel_sa8d_16x16_neon)
1509
+ mov x10, x30
1510
+ sa8d_16x16 w0
1511
+ ret x10
1512
+endfunc
1513
+
1514
+function PFX(pixel_sa8d_16x32_neon)
1515
+ mov x10, x30
1516
+ sa8d_16x16 w4
1517
+ sub x0, x0, #8
1518
+ sub x2, x2, #8
1519
+ sa8d_16x16 w5
1520
add w0, w4, w5
1521
+ ret x10
1522
+endfunc
1523
+
1524
+function PFX(pixel_sa8d_32x32_neon)
1525
+ mov x10, x30
1526
+ sa8d_16x16 w4
1527
+ sub x0, x0, x1, lsl #4
1528
+ sub x2, x2, x3, lsl #4
1529
+ add x0, x0, #8
1530
+ add x2, x2, #8
1531
+ sa8d_16x16 w5
1532
+ sub x0, x0, #24
1533
+ sub x2, x2, #24
1534
+ sa8d_16x16 w6
1535
+ sub x0, x0, x1, lsl #4
1536
+ sub x2, x2, x3, lsl #4
1537
+ add x0, x0, #8
1538
+ add x2, x2, #8
1539
+ sa8d_16x16 w7
1540
+ add w4, w4, w5
1541
+ add w6, w6, w7
1542
+ add w0, w4, w6
1543
+ ret x10
1544
+endfunc
1545
+
1546
+function PFX(pixel_sa8d_32x64_neon)
1547
+ mov x10, x30
1548
+ mov w11, #4
1549
+ mov w9, #0
1550
+.loop_sa8d_32:
1551
+ sub w11, w11, #1
1552
+ sa8d_16x16 w4
1553
+ sub x0, x0, x1, lsl #4
1554
+ sub x2, x2, x3, lsl #4
1555
+ add x0, x0, #8
1556
+ add x2, x2, #8
1557
+ sa8d_16x16 w5
1558
+ add w4, w4, w5
1559
+ add w9, w9, w4
1560
+ sub x0, x0, #24
1561
+ sub x2, x2, #24
1562
+ cbnz w11, .loop_sa8d_32
1563
+ mov w0, w9
1564
+ ret x10
1565
+endfunc
1566
+
1567
+function PFX(pixel_sa8d_64x64_neon)
1568
+ mov x10, x30
1569
+ mov w11, #4
1570
+ mov w9, #0
1571
+.loop_sa8d_64:
1572
+ sub w11, w11, #1
1573
+ sa8d_16x16 w4
1574
+ sub x0, x0, x1, lsl #4
1575
+ sub x2, x2, x3, lsl #4
1576
+ add x0, x0, #8
1577
+ add x2, x2, #8
1578
+ sa8d_16x16 w5
1579
+ sub x0, x0, x1, lsl #4
1580
+ sub x2, x2, x3, lsl #4
1581
+ add x0, x0, #8
1582
+ add x2, x2, #8
1583
+ sa8d_16x16 w6
1584
+ sub x0, x0, x1, lsl #4
1585
+ sub x2, x2, x3, lsl #4
1586
+ add x0, x0, #8
1587
+ add x2, x2, #8
1588
+ sa8d_16x16 w7
1589
+ add w4, w4, w5
1590
+ add w6, w6, w7
1591
+ add w8, w4, w6
1592
+ add w9, w9, w8
1593
+
1594
+ sub x0, x0, #56
1595
+ sub x2, x2, #56
1596
+ cbnz w11, .loop_sa8d_64
1597
+ mov w0, w9
1598
+ ret x10
1599
+endfunc
1600
+
1601
+/***** dequant_scaling*****/
1602
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
1603
+function PFX(dequant_scaling_neon)
1604
+ add x5, x5, #4 // shift + 4
1605
+ lsr x3, x3, #3 // num / 8
1606
+ cmp x5, x4
1607
+ blt .dequant_skip
1608
+
1609
+ mov x12, #1
1610
+ sub x6, x5, x4 // shift - per
1611
+ sub x6, x6, #1 // shift - per - 1
1612
+ lsl x6, x12, x6 // 1 << shift - per - 1 (add)
1613
+ dup v0.4s, w6
1614
+ sub x7, x4, x5 // per - shift
1615
+ dup v3.4s, w7
1616
+
1617
+.dequant_loop1:
1618
+ ld1 {v19.8h}, x0, #16 // quantCoef
1619
+ ld1 {v2.4s}, x1, #16 // deQuantCoef
1620
+ ld1 {v20.4s}, x1, #16
1621
+ sub x3, x3, #1
1622
+ sxtl v1.4s, v19.4h
1623
+ sxtl2 v19.4s, v19.8h
1624
+
1625
+ mul v1.4s, v1.4s, v2.4s // quantCoef * deQuantCoef
1626
+ mul v19.4s, v19.4s, v20.4s
1627
+ add v1.4s, v1.4s, v0.4s // quantCoef * deQuantCoef + add
1628
+ add v19.4s, v19.4s, v0.4s
1629
+
1630
+ sshl v1.4s, v1.4s, v3.4s
1631
+ sshl v19.4s, v19.4s, v3.4s
1632
+ sqxtn v16.4h, v1.4s // x265_clip3
1633
+ sqxtn2 v16.8h, v19.4s
1634
+ st1 {v16.8h}, x2, #16
1635
+ cbnz x3, .dequant_loop1
1636
+ ret
1637
+
1638
+.dequant_skip:
1639
+ sub x6, x4, x5 // per - shift
1640
+ dup v0.8h, w6
1641
+
1642
+.dequant_loop2:
1643
+ ld1 {v19.8h}, x0, #16 // quantCoef
1644
+ ld1 {v2.4s}, x1, #16 // deQuantCoef
1645
+ ld1 {v20.4s}, x1, #16
1646
+ sub x3, x3, #1
1647
+ sxtl v1.4s, v19.4h
1648
+ sxtl2 v19.4s, v19.8h
1649
+
1650
+ mul v1.4s, v1.4s, v2.4s // quantCoef * deQuantCoef
1651
+ mul v19.4s, v19.4s, v20.4s
1652
+ sqxtn v16.4h, v1.4s // x265_clip3
1653
+ sqxtn2 v16.8h, v19.4s
1654
+
1655
+ sqshl v16.8h, v16.8h, v0.8h // coefQ << per - shift
1656
+ st1 {v16.8h}, x2, #16
1657
+ cbnz x3, .dequant_loop2
1658
+ ret
1659
+endfunc
1660
+
1661
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
1662
+function PFX(dequant_normal_neon)
1663
+ lsr w2, w2, #4 // num / 16
1664
+ neg w4, w4
1665
+ dup v0.8h, w3
1666
+ dup v1.4s, w4
1667
+
1668
+.dqn_loop1:
1669
+ ld1 {v2.8h, v3.8h}, x0, #32
1670
+ smull v16.4s, v2.4h, v0.4h
1671
+ smull2 v17.4s, v2.8h, v0.8h
1672
+ smull v18.4s, v3.4h, v0.4h
1673
+ smull2 v19.4s, v3.8h, v0.8h
1674
+
1675
+ srshl v16.4s, v16.4s, v1.4s
1676
+ srshl v17.4s, v17.4s, v1.4s
1677
+ srshl v18.4s, v18.4s, v1.4s
1678
+ srshl v19.4s, v19.4s, v1.4s
1679
+
1680
+ sqxtn v2.4h, v16.4s
1681
+ sqxtn2 v2.8h, v17.4s
1682
+ sqxtn v3.4h, v18.4s
1683
+ sqxtn2 v3.8h, v19.4s
1684
+
1685
+ sub w2, w2, #1
1686
+ st1 {v2.8h, v3.8h}, x1, #32
1687
+ cbnz w2, .dqn_loop1
1688
+ ret
1689
+endfunc
1690
+
1691
+/********* ssim ***********/
1692
+// void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24)
1693
+function PFX(ssim_4x4x2_core_neon)
1694
+ ld1 {v0.8b}, x0, x1
1695
+ ld1 {v1.8b}, x0, x1
1696
+ ld1 {v2.8b}, x0, x1
1697
+ ld1 {v3.8b}, x0, x1
1698
+
1699
+ ld1 {v4.8b}, x2, x3
1700
+ ld1 {v5.8b}, x2, x3
1701
+ ld1 {v6.8b}, x2, x3
1702
+ ld1 {v7.8b}, x2, x3
1703
+
1704
+ umull v16.8h, v0.8b, v0.8b
1705
+ umull v17.8h, v1.8b, v1.8b
1706
+ umull v18.8h, v2.8b, v2.8b
1707
+ uaddlp v30.4s, v16.8h
1708
+ umull v19.8h, v3.8b, v3.8b
1709
+ umull v20.8h, v4.8b, v4.8b
1710
+ umull v21.8h, v5.8b, v5.8b
1711
+ uadalp v30.4s, v17.8h
1712
+ umull v22.8h, v6.8b, v6.8b
1713
+ umull v23.8h, v7.8b, v7.8b
1714
+
1715
+ umull v24.8h, v0.8b, v4.8b
1716
+ uadalp v30.4s, v18.8h
1717
+ umull v25.8h, v1.8b, v5.8b
1718
+ umull v26.8h, v2.8b, v6.8b
1719
+ umull v27.8h, v3.8b, v7.8b
1720
+ uadalp v30.4s, v19.8h
1721
+
1722
+ uaddl v28.8h, v0.8b, v1.8b
1723
+ uaddl v29.8h, v4.8b, v5.8b
1724
+ uadalp v30.4s, v20.8h
1725
+ uaddlp v31.4s, v24.8h
1726
+
1727
+ uaddw v28.8h, v28.8h, v2.8b
1728
+ uaddw v29.8h, v29.8h, v6.8b
1729
+ uadalp v30.4s, v21.8h
1730
+ uadalp v31.4s, v25.8h
1731
+
1732
+ uaddw v28.8h, v28.8h, v3.8b
1733
+ uaddw v29.8h, v29.8h, v7.8b
1734
+ uadalp v30.4s, v22.8h
1735
+ uadalp v31.4s, v26.8h
1736
+
1737
+ uaddlp v28.4s, v28.8h
1738
+ uaddlp v29.4s, v29.8h
1739
+ uadalp v30.4s, v23.8h
1740
+ uadalp v31.4s, v27.8h
1741
+
1742
+ addp v28.4s, v28.4s, v28.4s
1743
+ addp v29.4s, v29.4s, v29.4s
1744
+ addp v30.4s, v30.4s, v30.4s
1745
+ addp v31.4s, v31.4s, v31.4s
1746
+
1747
+ st4 {v28.2s, v29.2s, v30.2s, v31.2s}, x4
1748
ret
1749
endfunc
1750
1751
// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
1752
-function x265_psyCost_4x4_neon
1753
+function PFX(psyCost_4x4_neon)
1754
ld1r {v4.2s}, x0, x1
1755
ld1r {v5.2s}, x0, x1
1756
ld1 {v4.s}1, x0, x1
1757
1758
endfunc
1759
1760
// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
1761
-function x265_quant_neon
1762
+function PFX(quant_neon)
1763
mov w9, #1
1764
lsl w9, w9, w4
1765
dup v0.2s, w9
1766
1767
ret
1768
endfunc
1769
1770
-.macro satd_4x4_neon
1771
- ld1 {v1.s}0, x2, x3
1772
- ld1 {v0.s}0, x0, x1
1773
- ld1 {v3.s}0, x2, x3
1774
- ld1 {v2.s}0, x0, x1
1775
+// uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
1776
+function PFX(nquant_neon)
1777
+ neg x12, x3
1778
+ dup v0.4s, w12 // q0= -qbits
1779
+ dup v1.4s, w4 // add
1780
1781
- ld1 {v1.s}1, x2, x3
1782
- ld1 {v0.s}1, x0, x1
1783
- ld1 {v3.s}1, x2, x3
1784
- ld1 {v2.s}1, x0, x1
1785
+ lsr w5, w5, #2
1786
+ movi v4.4s, #0 // v4= accumulate numsig
1787
+ mov x4, #0
1788
+ movi v22.4s, #0
1789
1790
- usubl v4.8h, v0.8b, v1.8b
1791
- usubl v5.8h, v2.8b, v3.8b
1792
+.loop_nquant:
1793
+ ld1 {v16.4h}, x0, #8
1794
+ sub w5, w5, #1
1795
+ sxtl v19.4s, v16.4h // v19 = coefblockpos
1796
1797
- add v6.8h, v4.8h, v5.8h
1798
- sub v7.8h, v4.8h, v5.8h
1799
+ cmlt v18.4s, v19.4s, #0 // v18 = sign
1800
1801
- mov v4.d0, v6.d1
1802
- add v0.8h, v6.8h, v4.8h
1803
- sub v2.8h, v6.8h, v4.8h
1804
+ abs v19.4s, v19.4s // v19 = level=abs(coefblockpos)
1805
+ ld1 {v20.4s}, x1, #16 // v20 = quantCoeffblockpos
1806
+ mul v19.4s, v19.4s, v20.4s // v19 = tmplevel = abs(level) * quantCoeffblockpos;
1807
1808
- mov v5.d0, v7.d1
1809
- add v1.8h, v7.8h, v5.8h
1810
- sub v3.8h, v7.8h, v5.8h
1811
+ add v20.4s, v19.4s, v1.4s // v20 = tmplevel+add
1812
+ sshl v20.4s, v20.4s, v0.4s // v20 = level =(tmplevel+add) >> qbits
1813
1814
- trn1 v4.4h, v0.4h, v1.4h
1815
- trn2 v5.4h, v0.4h, v1.4h
1816
+ // numsig
1817
+ cmeq v21.4s, v20.4s, v22.4s
1818
+ add v4.4s, v4.4s, v21.4s
1819
+ add x4, x4, #4
1820
1821
- trn1 v6.4h, v2.4h, v3.4h
1822
- trn2 v7.4h, v2.4h, v3.4h
1823
+ eor v21.16b, v20.16b, v18.16b
1824
+ sub v21.4s, v21.4s, v18.4s
1825
+ sqxtn v16.4h, v21.4s
1826
+ abs v17.4h, v16.4h
1827
+ st1 {v17.4h}, x2, #8
1828
1829
- add v0.4h, v4.4h, v5.4h
1830
- sub v1.4h, v4.4h, v5.4h
1831
+ cbnz w5, .loop_nquant
1832
1833
- add v2.4h, v6.4h, v7.4h
1834
- sub v3.4h, v6.4h, v7.4h
1835
+ uaddlv d4, v4.4s
1836
+ fmov x12, d4
1837
+ add x0, x4, x12
1838
+ ret
1839
+endfunc
1840
1841
- trn1 v4.2s, v0.2s, v1.2s
1842
- trn2 v5.2s, v0.2s, v1.2s
1843
+// void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
1844
+.macro ssimDist_1 v4 v5
1845
+ sub v20.8h, \v4\().8h, \v5\().8h
1846
+ smull v16.4s, \v4\().4h, \v4\().4h
1847
+ smull2 v17.4s, \v4\().8h, \v4\().8h
1848
+ smull v18.4s, v20.4h, v20.4h
1849
+ smull2 v19.4s, v20.8h, v20.8h
1850
+ add v0.4s, v0.4s, v16.4s
1851
+ add v0.4s, v0.4s, v17.4s
1852
+ add v1.4s, v1.4s, v18.4s
1853
+ add v1.4s, v1.4s, v19.4s
1854
+.endm
1855
1856
- trn1 v6.2s, v2.2s, v3.2s
1857
- trn2 v7.2s, v2.2s, v3.2s
1858
+function PFX(ssimDist4_neon)
1859
+ ssimDist_start
1860
+.rept 4
1861
+ ld1 {v4.s}0, x0, x1
1862
+ ld1 {v5.s}0, x2, x3
1863
+ uxtl v4.8h, v4.8b
1864
+ uxtl v5.8h, v5.8b
1865
+ sub v2.4h, v4.4h, v5.4h
1866
+ smull v3.4s, v4.4h, v4.4h
1867
+ smull v2.4s, v2.4h, v2.4h
1868
+ add v0.4s, v0.4s, v3.4s
1869
+ add v1.4s, v1.4s, v2.4s
1870
+.endr
1871
+ ssimDist_end
1872
+ ret
1873
+endfunc
1874
1875
- abs v4.4h, v4.4h
1876
- abs v5.4h, v5.4h
1877
- abs v6.4h, v6.4h
1878
- abs v7.4h, v7.4h
1879
+function PFX(ssimDist8_neon)
1880
+ ssimDist_start
1881
+.rept 8
1882
+ ld1 {v4.8b}, x0, x1
1883
+ ld1 {v5.8b}, x2, x3
1884
+ uxtl v4.8h, v4.8b
1885
+ uxtl v5.8h, v5.8b
1886
+ ssimDist_1 v4, v5
1887
+.endr
1888
+ ssimDist_end
1889
+ ret
1890
+endfunc
1891
1892
- smax v1.4h, v4.4h, v5.4h
1893
- smax v2.4h, v6.4h, v7.4h
1894
+function PFX(ssimDist16_neon)
1895
+ mov w12, #16
1896
+ ssimDist_start
1897
+.loop_ssimDist16:
1898
+ sub w12, w12, #1
1899
+ ld1 {v4.16b}, x0, x1
1900
+ ld1 {v5.16b}, x2, x3
1901
+ uxtl v6.8h, v4.8b
1902
+ uxtl v7.8h, v5.8b
1903
+ uxtl2 v4.8h, v4.16b
1904
+ uxtl2 v5.8h, v5.16b
1905
+ ssimDist_1 v6, v7
1906
+ ssimDist_1 v4, v5
1907
+ cbnz w12, .loop_ssimDist16
1908
+ ssimDist_end
1909
+ ret
1910
+endfunc
1911
1912
- add v0.4h, v1.4h, v2.4h
1913
- uaddlp v0.2s, v0.4h
1914
- uaddlp v0.1d, v0.2s
1915
+function PFX(ssimDist32_neon)
1916
+ mov w12, #32
1917
+ ssimDist_start
1918
+.loop_ssimDist32:
1919
+ sub w12, w12, #1
1920
+ ld1 {v4.16b-v5.16b}, x0, x1
1921
+ ld1 {v6.16b-v7.16b}, x2, x3
1922
+ uxtl v21.8h, v4.8b
1923
+ uxtl v22.8h, v6.8b
1924
+ uxtl v23.8h, v5.8b
1925
+ uxtl v24.8h, v7.8b
1926
+ uxtl2 v25.8h, v4.16b
1927
+ uxtl2 v26.8h, v6.16b
1928
+ uxtl2 v27.8h, v5.16b
1929
+ uxtl2 v28.8h, v7.16b
1930
+ ssimDist_1 v21, v22
1931
+ ssimDist_1 v23, v24
1932
+ ssimDist_1 v25, v26
1933
+ ssimDist_1 v27, v28
1934
+ cbnz w12, .loop_ssimDist32
1935
+ ssimDist_end
1936
+ ret
1937
+endfunc
1938
+
1939
+function PFX(ssimDist64_neon)
1940
+ mov w12, #64
1941
+ ssimDist_start
1942
+.loop_ssimDist64:
1943
+ sub w12, w12, #1
1944
+ ld1 {v4.16b-v7.16b}, x0, x1
1945
+ ld1 {v16.16b-v19.16b}, x2, x3
1946
+ uxtl v21.8h, v4.8b
1947
+ uxtl v22.8h, v16.8b
1948
+ uxtl v23.8h, v5.8b
1949
+ uxtl v24.8h, v17.8b
1950
+ uxtl2 v25.8h, v4.16b
1951
+ uxtl2 v26.8h, v16.16b
1952
+ uxtl2 v27.8h, v5.16b
1953
+ uxtl2 v28.8h, v17.16b
1954
+ ssimDist_1 v21, v22
1955
+ ssimDist_1 v23, v24
1956
+ ssimDist_1 v25, v26
1957
+ ssimDist_1 v27, v28
1958
+ uxtl v21.8h, v6.8b
1959
+ uxtl v22.8h, v18.8b
1960
+ uxtl v23.8h, v7.8b
1961
+ uxtl v24.8h, v19.8b
1962
+ uxtl2 v25.8h, v6.16b
1963
+ uxtl2 v26.8h, v18.16b
1964
+ uxtl2 v27.8h, v7.16b
1965
+ uxtl2 v28.8h, v19.16b
1966
+ ssimDist_1 v21, v22
1967
+ ssimDist_1 v23, v24
1968
+ ssimDist_1 v25, v26
1969
+ ssimDist_1 v27, v28
1970
+ cbnz w12, .loop_ssimDist64
1971
+ ssimDist_end
1972
+ ret
1973
+endfunc
1974
+
1975
+// void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
1976
+
1977
+.macro normFact_1 v4
1978
+ smull v16.4s, \v4\().4h, \v4\().4h
1979
+ smull2 v17.4s, \v4\().8h, \v4\().8h
1980
+ add v0.4s, v0.4s, v16.4s
1981
+ add v0.4s, v0.4s, v17.4s
1982
.endm
1983
1984
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1985
-function x265_pixel_satd_4x4_neon
1986
- satd_4x4_neon
1987
- umov x0, v0.d0
1988
+function PFX(normFact8_neon)
1989
+ normFact_start
1990
+.rept 8
1991
+ ld1 {v4.8b}, x0, x1
1992
+ uxtl v4.8h, v4.8b
1993
+ normFact_1 v4
1994
+.endr
1995
+ normFact_end
1996
ret
1997
endfunc
1998
1999
-// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2000
-function x265_pixel_satd_8x4_neon
2001
- mov x4, x0
2002
- mov x5, x2
2003
- satd_4x4_neon
2004
- add x0, x4, #4
2005
- add x2, x5, #4
2006
- umov x6, v0.d0
2007
- satd_4x4_neon
2008
- umov x0, v0.d0
2009
- add x0, x0, x6
2010
+function PFX(normFact16_neon)
2011
+ mov w12, #16
2012
+ normFact_start
2013
+.loop_normFact16:
2014
+ sub w12, w12, #1
2015
+ ld1 {v4.16b}, x0, x1
2016
+ uxtl v5.8h, v4.8b
2017
+ uxtl2 v4.8h, v4.16b
2018
+ normFact_1 v5
2019
+ normFact_1 v4
2020
+ cbnz w12, .loop_normFact16
2021
+ normFact_end
2022
+ ret
2023
+endfunc
2024
+
2025
+function PFX(normFact32_neon)
2026
+ mov w12, #32
2027
+ normFact_start
2028
+.loop_normFact32:
2029
+ sub w12, w12, #1
2030
+ ld1 {v4.16b-v5.16b}, x0, x1
2031
+ uxtl v6.8h, v4.8b
2032
+ uxtl2 v4.8h, v4.16b
2033
+ uxtl v7.8h, v5.8b
2034
+ uxtl2 v5.8h, v5.16b
2035
+ normFact_1 v4
2036
+ normFact_1 v5
2037
+ normFact_1 v6
2038
+ normFact_1 v7
2039
+ cbnz w12, .loop_normFact32
2040
+ normFact_end
2041
+ ret
2042
+endfunc
2043
+
2044
+function PFX(normFact64_neon)
2045
+ mov w12, #64
2046
+ normFact_start
2047
+.loop_normFact64:
2048
+ sub w12, w12, #1
2049
+ ld1 {v4.16b-v7.16b}, x0, x1
2050
+ uxtl v26.8h, v4.8b
2051
+ uxtl2 v24.8h, v4.16b
2052
+ uxtl v27.8h, v5.8b
2053
+ uxtl2 v25.8h, v5.16b
2054
+ normFact_1 v24
2055
+ normFact_1 v25
2056
+ normFact_1 v26
2057
+ normFact_1 v27
2058
+ uxtl v26.8h, v6.8b
2059
+ uxtl2 v24.8h, v6.16b
2060
+ uxtl v27.8h, v7.8b
2061
+ uxtl2 v25.8h, v7.16b
2062
+ normFact_1 v24
2063
+ normFact_1 v25
2064
+ normFact_1 v26
2065
+ normFact_1 v27
2066
+ cbnz w12, .loop_normFact64
2067
+ normFact_end
2068
+ ret
2069
+endfunc
2070
+
2071
+// void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
2072
+function PFX(weight_pp_neon)
2073
+ sub x2, x2, x3
2074
+ ldr w9, sp // offset
2075
+ lsl w5, w5, #6 // w0 << correction
2076
+
2077
+ // count trailing zeros in w5 and compare against shift right amount.
2078
+ rbit w10, w5
2079
+ clz w10, w10
2080
+ cmp w10, w7
2081
+ b.lt .unfoldedShift
2082
+
2083
+ // shift right only removes trailing zeros: hoist LSR out of the loop.
2084
+ lsr w10, w5, w7 // w0 << correction >> shift
2085
+ dup v25.16b, w10
2086
+ lsr w6, w6, w7 // round >> shift
2087
+ add w6, w6, w9 // round >> shift + offset
2088
+ dup v26.8h, w6
2089
+
2090
+ // Check arithmetic range.
2091
+ mov w11, #255
2092
+ madd w11, w11, w10, w6
2093
+ add w11, w11, w9
2094
+ lsr w11, w11, #16
2095
+ cbnz w11, .widenTo32Bit
2096
+
2097
+ // 16-bit arithmetic is enough.
2098
+.loopHpp:
2099
+ mov x12, x3
2100
+.loopWpp:
2101
+ ldr q0, x0, #16
2102
+ sub x12, x12, #16
2103
+ umull v1.8h, v0.8b, v25.8b // val *= w0 << correction >> shift
2104
+ umull2 v2.8h, v0.16b, v25.16b
2105
+ add v1.8h, v1.8h, v26.8h // val += round >> shift + offset
2106
+ add v2.8h, v2.8h, v26.8h
2107
+ sqxtun v0.8b, v1.8h // val = x265_clip(val)
2108
+ sqxtun2 v0.16b, v2.8h
2109
+ str q0, x1, #16
2110
+ cbnz x12, .loopWpp
2111
+ add x1, x1, x2
2112
+ add x0, x0, x2
2113
+ sub x4, x4, #1
2114
+ cbnz x4, .loopHpp
2115
+ ret
2116
+
2117
+ // 32-bit arithmetic is needed.
2118
+.widenTo32Bit:
2119
+.loopHpp32:
2120
+ mov x12, x3
2121
+.loopWpp32:
2122
+ ldr d0, x0, #8
2123
+ sub x12, x12, #8
2124
+ uxtl v0.8h, v0.8b
2125
+ umull v1.4s, v0.4h, v25.4h // val *= w0 << correction >> shift
2126
+ umull2 v2.4s, v0.8h, v25.8h
2127
+ add v1.4s, v1.4s, v26.4s // val += round >> shift + offset
2128
+ add v2.4s, v2.4s, v26.4s
2129
+ sqxtn v0.4h, v1.4s // val = x265_clip(val)
2130
+ sqxtn2 v0.8h, v2.4s
2131
+ sqxtun v0.8b, v0.8h
2132
+ str d0, x1, #8
2133
+ cbnz x12, .loopWpp32
2134
+ add x1, x1, x2
2135
+ add x0, x0, x2
2136
+ sub x4, x4, #1
2137
+ cbnz x4, .loopHpp32
2138
+ ret
2139
+
2140
+ // The shift right cannot be moved out of the loop.
2141
+.unfoldedShift:
2142
+ dup v25.8h, w5 // w0 << correction
2143
+ dup v26.4s, w6 // round
2144
+ neg w7, w7 // -shift
2145
+ dup v27.4s, w7
2146
+ dup v29.4s, w9 // offset
2147
+.loopHppUS:
2148
+ mov x12, x3
2149
+.loopWppUS:
2150
+ ldr d0, x0, #8
2151
+ sub x12, x12, #8
2152
+ uxtl v0.8h, v0.8b
2153
+ umull v1.4s, v0.4h, v25.4h // val *= w0
2154
+ umull2 v2.4s, v0.8h, v25.8h
2155
+ add v1.4s, v1.4s, v26.4s // val += round
2156
+ add v2.4s, v2.4s, v26.4s
2157
+ sshl v1.4s, v1.4s, v27.4s // val >>= shift
2158
+ sshl v2.4s, v2.4s, v27.4s
2159
+ add v1.4s, v1.4s, v29.4s // val += offset
2160
+ add v2.4s, v2.4s, v29.4s
2161
+ sqxtn v0.4h, v1.4s // val = x265_clip(val)
2162
+ sqxtn2 v0.8h, v2.4s
2163
+ sqxtun v0.8b, v0.8h
2164
+ str d0, x1, #8
2165
+ cbnz x12, .loopWppUS
2166
+ add x1, x1, x2
2167
+ add x0, x0, x2
2168
+ sub x4, x4, #1
2169
+ cbnz x4, .loopHppUS
2170
+ ret
2171
+endfunc
2172
+
2173
+// int scanPosLast(
2174
+// const uint16_t *scan, // x0
2175
+// const coeff_t *coeff, // x1
2176
+// uint16_t *coeffSign, // x2
2177
+// uint16_t *coeffFlag, // x3
2178
+// uint8_t *coeffNum, // x4
2179
+// int numSig, // x5
2180
+// const uint16_t* scanCG4x4, // x6
2181
+// const int trSize) // x7
2182
+function PFX(scanPosLast_neon)
2183
+ // convert unit of Stride(trSize) to int16_t
2184
+ add x7, x7, x7
2185
+
2186
+ // load scan table and convert to Byte
2187
+ ldp q0, q1, x6
2188
+ xtn v0.8b, v0.8h
2189
+ xtn2 v0.16b, v1.8h // v0 - Zigzag scan table
2190
+
2191
+ movrel x10, g_SPL_and_mask
2192
+ ldr q28, x10 // v28 = mask for pmovmskb
2193
+ movi v31.16b, #0 // v31 = {0, ..., 0}
2194
+ add x10, x7, x7 // 2*x7
2195
+ add x11, x10, x7 // 3*x7
2196
+ add x9, x4, #1 // CG count
2197
+
2198
+.loop_spl:
2199
+ // position of current CG
2200
+ ldrh w6, x0, #32
2201
+ add x6, x1, x6, lsl #1
2202
+
2203
+ // loading current CG
2204
+ ldr d2, x6
2205
+ ldr d3, x6, x7
2206
+ ldr d4, x6, x10
2207
+ ldr d5, x6, x11
2208
+ mov v2.d1, v3.d0
2209
+ mov v4.d1, v5.d0
2210
+ sqxtn v2.8b, v2.8h
2211
+ sqxtn2 v2.16b, v4.8h
2212
+
2213
+ // Zigzag
2214
+ tbl v3.16b, {v2.16b}, v0.16b
2215
+
2216
+ // get sign
2217
+ cmhi v5.16b, v3.16b, v31.16b // v5 = non-zero
2218
+ cmlt v3.16b, v3.16b, #0 // v3 = negative
2219
+
2220
+ // val - w13 = pmovmskb(v3)
2221
+ and v3.16b, v3.16b, v28.16b
2222
+ mov d4, v3.d1
2223
+ addv b23, v3.8b
2224
+ addv b24, v4.8b
2225
+ mov v23.b1, v24.b0
2226
+ fmov w13, s23
2227
+
2228
+ // mask - w15 = pmovmskb(v5)
2229
+ and v5.16b, v5.16b, v28.16b
2230
+ mov d6, v5.d1
2231
+ addv b25, v5.8b
2232
+ addv b26, v6.8b
2233
+ mov v25.b1, v26.b0
2234
+ fmov w15, s25
2235
+
2236
+ // coeffFlag = reverse_bit(w15) in 16-bit
2237
+ rbit w12, w15
2238
+ lsr w12, w12, #16
2239
+ fmov s30, w12
2240
+ strh w12, x3, #2
2241
+
2242
+ // accelerate by preparing w13 = w13 & w15
2243
+ and w13, w13, w15
2244
+ mov x14, xzr
2245
+.loop_spl_1:
2246
+ cbz w15, .pext_end
2247
+ clz w6, w15
2248
+ lsl w13, w13, w6
2249
+ lsl w15, w15, w6
2250
+ extr w14, w14, w13, #31
2251
+ bfm w15, wzr, #1, #0
2252
+ b .loop_spl_1
2253
+.pext_end:
2254
+ strh w14, x2, #2
2255
+
2256
+ // compute coeffNum = popcount(coeffFlag)
2257
+ cnt v30.8b, v30.8b
2258
+ addp v30.8b, v30.8b, v30.8b
2259
+ fmov w6, s30
2260
+ sub x5, x5, x6
2261
+ strb w6, x4, #1
2262
+
2263
+ cbnz x5, .loop_spl
2264
+
2265
+ // count trailing zeros
2266
+ rbit w13, w12
2267
+ clz w13, w13
2268
+ lsr w12, w12, w13
2269
+ strh w12, x3, #-2
2270
+
2271
+ // get last pos
2272
+ sub x9, x4, x9
2273
+ lsl x0, x9, #4
2274
+ eor w13, w13, #15
2275
+ add x0, x0, x13
2276
+ ret
2277
+endfunc
2278
+
2279
+// uint32_t costCoeffNxN(
2280
+// uint16_t *scan, // x0
2281
+// coeff_t *coeff, // x1
2282
+// intptr_t trSize, // x2
2283
+// uint16_t *absCoeff, // x3
2284
+// uint8_t *tabSigCtx, // x4
2285
+// uint16_t scanFlagMask, // x5
2286
+// uint8_t *baseCtx, // x6
2287
+// int offset, // x7
2288
+// int scanPosSigOff, // sp
2289
+// int subPosBase) // sp + 8
2290
+function PFX(costCoeffNxN_neon)
2291
+ // abs(coeff)
2292
+ add x2, x2, x2
2293
+ ld1 {v1.d}0, x1, x2
2294
+ ld1 {v1.d}1, x1, x2
2295
+ ld1 {v2.d}0, x1, x2
2296
+ ld1 {v2.d}1, x1, x2
2297
+ abs v1.8h, v1.8h
2298
+ abs v2.8h, v2.8h
2299
+
2300
+ // WARNING: beyond-bound read here!
2301
+ // loading scan table
2302
+ ldr w2, sp
2303
+ eor w15, w2, #15
2304
+ add x1, x0, x15, lsl #1
2305
+ ldp q20, q21, x1
2306
+ uzp1 v20.16b, v20.16b, v21.16b
2307
+ movi v21.16b, #15
2308
+ eor v0.16b, v20.16b, v21.16b
2309
+
2310
+ // reorder coeff
2311
+ uzp1 v22.16b, v1.16b, v2.16b
2312
+ uzp2 v23.16b, v1.16b, v2.16b
2313
+ tbl v24.16b, {v22.16b}, v0.16b
2314
+ tbl v25.16b, {v23.16b}, v0.16b
2315
+ zip1 v2.16b, v24.16b, v25.16b
2316
+ zip2 v3.16b, v24.16b, v25.16b
2317
+
2318
+ // loading tabSigCtx (+offset)
2319
+ ldr q1, x4
2320
+ tbl v1.16b, {v1.16b}, v0.16b
2321
+ dup v4.16b, w7
2322
+ movi v5.16b, #0
2323
+ tbl v4.16b, {v4.16b}, v5.16b
2324
+ add v1.16b, v1.16b, v4.16b
2325
+
2326
+ // register mapping
2327
+ // x0 - sum
2328
+ // x1 - entropyStateBits
2329
+ // v1 - sigCtx
2330
+ // {v3,v2} - abs(coeff)
2331
+ // x2 - scanPosSigOff
2332
+ // x3 - absCoeff
2333
+ // x4 - numNonZero
2334
+ // x5 - scanFlagMask
2335
+ // x6 - baseCtx
2336
+ mov x0, #0
2337
+ movrel x1, PFX_C(entropyStateBits)
2338
+ mov x4, #0
2339
+ mov x11, #0
2340
+ movi v31.16b, #0
2341
+ cbz x2, .idx_zero
2342
+.loop_ccnn:
2343
+// {
2344
+// const uint32_t cnt = tabSigCtxblkPos + offset + posOffset;
2345
+// ctxSig = cnt & posZeroMask;
2346
+// const uint32_t mstate = baseCtxctxSig;
2347
+// const uint32_t mps = mstate & 1;
2348
+// const uint32_t stateBits = x265_entropyStateBitsmstate ^ sig;
2349
+// uint32_t nextState = (stateBits >> 24) + mps;
2350
+// if ((mstate ^ sig) == 1)
2351
+// nextState = sig;
2352
+// baseCtxctxSig = (uint8_t)nextState;
2353
+// sum += stateBits;
2354
+// }
2355
+// absCoeffnumNonZero = tmpCoeffblkPos;
2356
+// numNonZero += sig;
2357
+// scanPosSigOff--;
2358
+
2359
+ add x13, x3, x4, lsl #1
2360
+ sub x2, x2, #1
2361
+ str h2, x13 // absCoeffnumNonZero = tmpCoeffblkPos
2362
+ fmov w14, s1 // x14 = ctxSig
2363
+ uxtb w14, w14
2364
+ ubfx w11, w5, #0, #1 // x11 = sig
2365
+ lsr x5, x5, #1
2366
+ add x4, x4, x11 // numNonZero += sig
2367
+ ext v1.16b, v1.16b, v31.16b, #1
2368
+ ext v2.16b, v2.16b, v3.16b, #2
2369
+ ext v3.16b, v3.16b, v31.16b, #2
2370
+ ldrb w9, x6, x14 // mstate = baseCtxctxSig
2371
+ and w10, w9, #1 // mps = mstate & 1
2372
+ eor w9, w9, w11 // x9 = mstate ^ sig
2373
+ add x12, x1, x9, lsl #2
2374
+ ldr w13, x12
2375
+ add w0, w0, w13 // sum += x265_entropyStateBitsmstate ^ sig
2376
+ ldrb w13, x12, #3
2377
+ add w10, w10, w13 // nextState = (stateBits >> 24) + mps
2378
+ cmp w9, #1
2379
+ csel w10, w11, w10, eq
2380
+ strb w10, x6, x14
2381
+ cbnz x2, .loop_ccnn
2382
+.idx_zero:
2383
+
2384
+ add x13, x3, x4, lsl #1
2385
+ add x4, x4, x15
2386
+ str h2, x13 // absCoeffnumNonZero = tmpCoeffblkPos
2387
+
2388
+ ldr x9, sp, #8 // subPosBase
2389
+ uxth w9, w9
2390
+ cmp w9, #0
2391
+ cset x2, eq
2392
+ add x4, x4, x2
2393
+ cbz x4, .exit_ccnn
2394
+
2395
+ sub w2, w2, #1
2396
+ uxtb w2, w2
2397
+ fmov w3, s1
2398
+ and w2, w2, w3
2399
+
2400
+ ldrb w3, x6, x2 // mstate = baseCtxctxSig
2401
+ eor w4, w5, w3 // x5 = mstate ^ sig
2402
+ and w3, w3, #1 // mps = mstate & 1
2403
+ add x1, x1, x4, lsl #2
2404
+ ldr w11, x1
2405
+ ldrb w12, x1, #3
2406
+ add w0, w0, w11 // sum += x265_entropyStateBitsmstate ^ sig
2407
+ add w3, w3, w12 // nextState = (stateBits >> 24) + mps
2408
+ cmp w4, #1
2409
+ csel w3, w5, w3, eq
2410
+ strb w3, x6, x2
2411
+.exit_ccnn:
2412
+ ubfx w0, w0, #0, #24
2413
ret
2414
endfunc
2415
+
2416
+const g_SPL_and_mask, align=8
2417
+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
2418
+endconst
2419
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S
Added
516
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch armv8-a
31
+
32
+#ifdef __APPLE__
33
+.section __RODATA,__rodata
34
+#else
35
+.section .rodata
36
+#endif
37
+
38
+.align 4
39
+
40
+.macro SAD_START_4 f
41
+ ld1 {v0.s}0, x0, x1
42
+ ld1 {v0.s}1, x0, x1
43
+ ld1 {v1.s}0, x2, x3
44
+ ld1 {v1.s}1, x2, x3
45
+ \f v16.8h, v0.8b, v1.8b
46
+.endm
47
+
48
+.macro SAD_4 h
49
+.rept \h / 2 - 1
50
+ SAD_START_4 uabal
51
+.endr
52
+.endm
53
+
54
+.macro SAD_START_8 f
55
+ ld1 {v0.8b}, x0, x1
56
+ ld1 {v1.8b}, x2, x3
57
+ ld1 {v2.8b}, x0, x1
58
+ ld1 {v3.8b}, x2, x3
59
+ \f v16.8h, v0.8b, v1.8b
60
+ \f v17.8h, v2.8b, v3.8b
61
+.endm
62
+
63
+.macro SAD_8 h
64
+.rept \h / 2 - 1
65
+ SAD_START_8 uabal
66
+.endr
67
+.endm
68
+
69
+.macro SAD_START_16 f
70
+ ld1 {v0.16b}, x0, x1
71
+ ld1 {v1.16b}, x2, x3
72
+ ld1 {v2.16b}, x0, x1
73
+ ld1 {v3.16b}, x2, x3
74
+ \f v16.8h, v0.8b, v1.8b
75
+ \f\()2 v17.8h, v0.16b, v1.16b
76
+ uabal v16.8h, v2.8b, v3.8b
77
+ uabal2 v17.8h, v2.16b, v3.16b
78
+.endm
79
+
80
+.macro SAD_16 h
81
+.rept \h / 2 - 1
82
+ SAD_START_16 uabal
83
+.endr
84
+.endm
85
+
86
+.macro SAD_START_32
87
+ movi v16.16b, #0
88
+ movi v17.16b, #0
89
+ movi v18.16b, #0
90
+ movi v19.16b, #0
91
+.endm
92
+
93
+.macro SAD_32
94
+ ld1 {v0.16b-v1.16b}, x0, x1
95
+ ld1 {v2.16b-v3.16b}, x2, x3
96
+ ld1 {v4.16b-v5.16b}, x0, x1
97
+ ld1 {v6.16b-v7.16b}, x2, x3
98
+ uabal v16.8h, v0.8b, v2.8b
99
+ uabal2 v17.8h, v0.16b, v2.16b
100
+ uabal v18.8h, v1.8b, v3.8b
101
+ uabal2 v19.8h, v1.16b, v3.16b
102
+ uabal v16.8h, v4.8b, v6.8b
103
+ uabal2 v17.8h, v4.16b, v6.16b
104
+ uabal v18.8h, v5.8b, v7.8b
105
+ uabal2 v19.8h, v5.16b, v7.16b
106
+.endm
107
+
108
+.macro SAD_END_32
109
+ add v16.8h, v16.8h, v17.8h
110
+ add v17.8h, v18.8h, v19.8h
111
+ add v16.8h, v16.8h, v17.8h
112
+ uaddlv s0, v16.8h
113
+ fmov w0, s0
114
+ ret
115
+.endm
116
+
117
+.macro SAD_START_64
118
+ movi v16.16b, #0
119
+ movi v17.16b, #0
120
+ movi v18.16b, #0
121
+ movi v19.16b, #0
122
+ movi v20.16b, #0
123
+ movi v21.16b, #0
124
+ movi v22.16b, #0
125
+ movi v23.16b, #0
126
+.endm
127
+
128
+.macro SAD_64
129
+ ld1 {v0.16b-v3.16b}, x0, x1
130
+ ld1 {v4.16b-v7.16b}, x2, x3
131
+ ld1 {v24.16b-v27.16b}, x0, x1
132
+ ld1 {v28.16b-v31.16b}, x2, x3
133
+ uabal v16.8h, v0.8b, v4.8b
134
+ uabal2 v17.8h, v0.16b, v4.16b
135
+ uabal v18.8h, v1.8b, v5.8b
136
+ uabal2 v19.8h, v1.16b, v5.16b
137
+ uabal v20.8h, v2.8b, v6.8b
138
+ uabal2 v21.8h, v2.16b, v6.16b
139
+ uabal v22.8h, v3.8b, v7.8b
140
+ uabal2 v23.8h, v3.16b, v7.16b
141
+
142
+ uabal v16.8h, v24.8b, v28.8b
143
+ uabal2 v17.8h, v24.16b, v28.16b
144
+ uabal v18.8h, v25.8b, v29.8b
145
+ uabal2 v19.8h, v25.16b, v29.16b
146
+ uabal v20.8h, v26.8b, v30.8b
147
+ uabal2 v21.8h, v26.16b, v30.16b
148
+ uabal v22.8h, v27.8b, v31.8b
149
+ uabal2 v23.8h, v27.16b, v31.16b
150
+.endm
151
+
152
+.macro SAD_END_64
153
+ add v16.8h, v16.8h, v17.8h
154
+ add v17.8h, v18.8h, v19.8h
155
+ add v16.8h, v16.8h, v17.8h
156
+ uaddlp v16.4s, v16.8h
157
+ add v18.8h, v20.8h, v21.8h
158
+ add v19.8h, v22.8h, v23.8h
159
+ add v17.8h, v18.8h, v19.8h
160
+ uaddlp v17.4s, v17.8h
161
+ add v16.4s, v16.4s, v17.4s
162
+ uaddlv d0, v16.4s
163
+ fmov x0, d0
164
+ ret
165
+.endm
166
+
167
+.macro SAD_START_12
168
+ movrel x12, sad12_mask
169
+ ld1 {v31.16b}, x12
170
+ movi v16.16b, #0
171
+ movi v17.16b, #0
172
+.endm
173
+
174
+.macro SAD_12
175
+ ld1 {v0.16b}, x0, x1
176
+ and v0.16b, v0.16b, v31.16b
177
+ ld1 {v1.16b}, x2, x3
178
+ and v1.16b, v1.16b, v31.16b
179
+ ld1 {v2.16b}, x0, x1
180
+ and v2.16b, v2.16b, v31.16b
181
+ ld1 {v3.16b}, x2, x3
182
+ and v3.16b, v3.16b, v31.16b
183
+ uabal v16.8h, v0.8b, v1.8b
184
+ uabal2 v17.8h, v0.16b, v1.16b
185
+ uabal v16.8h, v2.8b, v3.8b
186
+ uabal2 v17.8h, v2.16b, v3.16b
187
+.endm
188
+
189
+.macro SAD_END_12
190
+ add v16.8h, v16.8h, v17.8h
191
+ uaddlv s0, v16.8h
192
+ fmov w0, s0
193
+ ret
194
+.endm
195
+
196
+.macro SAD_START_24
197
+ movi v16.16b, #0
198
+ movi v17.16b, #0
199
+ movi v18.16b, #0
200
+ sub x1, x1, #16
201
+ sub x3, x3, #16
202
+.endm
203
+
204
+.macro SAD_24
205
+ ld1 {v0.16b}, x0, #16
206
+ ld1 {v1.8b}, x0, x1
207
+ ld1 {v2.16b}, x2, #16
208
+ ld1 {v3.8b}, x2, x3
209
+ ld1 {v4.16b}, x0, #16
210
+ ld1 {v5.8b}, x0, x1
211
+ ld1 {v6.16b}, x2, #16
212
+ ld1 {v7.8b}, x2, x3
213
+ uabal v16.8h, v0.8b, v2.8b
214
+ uabal2 v17.8h, v0.16b, v2.16b
215
+ uabal v18.8h, v1.8b, v3.8b
216
+ uabal v16.8h, v4.8b, v6.8b
217
+ uabal2 v17.8h, v4.16b, v6.16b
218
+ uabal v18.8h, v5.8b, v7.8b
219
+.endm
220
+
221
+.macro SAD_END_24
222
+ add v16.8h, v16.8h, v17.8h
223
+ add v16.8h, v16.8h, v18.8h
224
+ uaddlv s0, v16.8h
225
+ fmov w0, s0
226
+ ret
227
+.endm
228
+
229
+.macro SAD_START_48
230
+ movi v16.16b, #0
231
+ movi v17.16b, #0
232
+ movi v18.16b, #0
233
+ movi v19.16b, #0
234
+ movi v20.16b, #0
235
+ movi v21.16b, #0
236
+.endm
237
+
238
+.macro SAD_48
239
+ ld1 {v0.16b-v2.16b}, x0, x1
240
+ ld1 {v4.16b-v6.16b}, x2, x3
241
+ ld1 {v24.16b-v26.16b}, x0, x1
242
+ ld1 {v28.16b-v30.16b}, x2, x3
243
+ uabal v16.8h, v0.8b, v4.8b
244
+ uabal2 v17.8h, v0.16b, v4.16b
245
+ uabal v18.8h, v1.8b, v5.8b
246
+ uabal2 v19.8h, v1.16b, v5.16b
247
+ uabal v20.8h, v2.8b, v6.8b
248
+ uabal2 v21.8h, v2.16b, v6.16b
249
+
250
+ uabal v16.8h, v24.8b, v28.8b
251
+ uabal2 v17.8h, v24.16b, v28.16b
252
+ uabal v18.8h, v25.8b, v29.8b
253
+ uabal2 v19.8h, v25.16b, v29.16b
254
+ uabal v20.8h, v26.8b, v30.8b
255
+ uabal2 v21.8h, v26.16b, v30.16b
256
+.endm
257
+
258
+.macro SAD_END_48
259
+ add v16.8h, v16.8h, v17.8h
260
+ add v17.8h, v18.8h, v19.8h
261
+ add v16.8h, v16.8h, v17.8h
262
+ uaddlv s0, v16.8h
263
+ fmov w0, s0
264
+ add v18.8h, v20.8h, v21.8h
265
+ uaddlv s1, v18.8h
266
+ fmov w1, s1
267
+ add w0, w0, w1
268
+ ret
269
+.endm
270
+
271
+.macro SAD_X_START_4 h, x, f
272
+ ld1 {v0.s}0, x0, x9
273
+ ld1 {v0.s}1, x0, x9
274
+ ld1 {v1.s}0, x1, x5
275
+ ld1 {v1.s}1, x1, x5
276
+ ld1 {v2.s}0, x2, x5
277
+ ld1 {v2.s}1, x2, x5
278
+ ld1 {v3.s}0, x3, x5
279
+ ld1 {v3.s}1, x3, x5
280
+ \f v16.8h, v0.8b, v1.8b
281
+ \f v17.8h, v0.8b, v2.8b
282
+ \f v18.8h, v0.8b, v3.8b
283
+.if \x == 4
284
+ ld1 {v4.s}0, x4, x5
285
+ ld1 {v4.s}1, x4, x5
286
+ \f v19.8h, v0.8b, v4.8b
287
+.endif
288
+.endm
289
+
290
+.macro SAD_X_4 h, x
291
+.rept \h/2 - 1
292
+ SAD_X_START_4 \h, \x, uabal
293
+.endr
294
+.endm
295
+
296
+.macro SAD_X_END_4 x
297
+ uaddlv s0, v16.8h
298
+ uaddlv s1, v17.8h
299
+ uaddlv s2, v18.8h
300
+ stp s0, s1, x6
301
+.if \x == 3
302
+ str s2, x6, #8
303
+.elseif \x == 4
304
+ uaddlv s3, v19.8h
305
+ stp s2, s3, x6, #8
306
+.endif
307
+ ret
308
+.endm
309
+
310
+.macro SAD_X_START_8 h, x, f
311
+ ld1 {v0.8b}, x0, x9
312
+ ld1 {v1.8b}, x1, x5
313
+ ld1 {v2.8b}, x2, x5
314
+ ld1 {v3.8b}, x3, x5
315
+ \f v16.8h, v0.8b, v1.8b
316
+ \f v17.8h, v0.8b, v2.8b
317
+ \f v18.8h, v0.8b, v3.8b
318
+.if \x == 4
319
+ ld1 {v4.8b}, x4, x5
320
+ \f v19.8h, v0.8b, v4.8b
321
+.endif
322
+.endm
323
+
324
+.macro SAD_X_8 h x
325
+.rept \h - 1
326
+ SAD_X_START_8 \h, \x, uabal
327
+.endr
328
+.endm
329
+
330
+.macro SAD_X_END_8 x
331
+ SAD_X_END_4 \x
332
+.endm
333
+
334
+.macro SAD_X_START_12 h, x, f
335
+ ld1 {v0.16b}, x0, x9
336
+ and v0.16b, v0.16b, v31.16b
337
+ ld1 {v1.16b}, x1, x5
338
+ and v1.16b, v1.16b, v31.16b
339
+ ld1 {v2.16b}, x2, x5
340
+ and v2.16b, v2.16b, v31.16b
341
+ ld1 {v3.16b}, x3, x5
342
+ and v3.16b, v3.16b, v31.16b
343
+ \f v16.8h, v1.8b, v0.8b
344
+ \f\()2 v20.8h, v1.16b, v0.16b
345
+ \f v17.8h, v2.8b, v0.8b
346
+ \f\()2 v21.8h, v2.16b, v0.16b
347
+ \f v18.8h, v3.8b, v0.8b
348
+ \f\()2 v22.8h, v3.16b, v0.16b
349
+.if \x == 4
350
+ ld1 {v4.16b}, x4, x5
351
+ and v4.16b, v4.16b, v31.16b
352
+ \f v19.8h, v4.8b, v0.8b
353
+ \f\()2 v23.8h, v4.16b, v0.16b
354
+.endif
355
+.endm
356
+
357
+.macro SAD_X_12 h x
358
+.rept \h - 1
359
+ SAD_X_START_12 \h, \x, uabal
360
+.endr
361
+.endm
362
+
363
+.macro SAD_X_END_12 x
364
+ SAD_X_END_16 \x
365
+.endm
366
+
367
+.macro SAD_X_START_16 h, x, f
368
+ ld1 {v0.16b}, x0, x9
369
+ ld1 {v1.16b}, x1, x5
370
+ ld1 {v2.16b}, x2, x5
371
+ ld1 {v3.16b}, x3, x5
372
+ \f v16.8h, v1.8b, v0.8b
373
+ \f\()2 v20.8h, v1.16b, v0.16b
374
+ \f v17.8h, v2.8b, v0.8b
375
+ \f\()2 v21.8h, v2.16b, v0.16b
376
+ \f v18.8h, v3.8b, v0.8b
377
+ \f\()2 v22.8h, v3.16b, v0.16b
378
+.if \x == 4
379
+ ld1 {v4.16b}, x4, x5
380
+ \f v19.8h, v4.8b, v0.8b
381
+ \f\()2 v23.8h, v4.16b, v0.16b
382
+.endif
383
+.endm
384
+
385
+.macro SAD_X_16 h x
386
+.rept \h - 1
387
+ SAD_X_START_16 \h, \x, uabal
388
+.endr
389
+.endm
390
+
391
+.macro SAD_X_END_16 x
392
+ add v16.8h, v16.8h, v20.8h
393
+ add v17.8h, v17.8h, v21.8h
394
+ add v18.8h, v18.8h, v22.8h
395
+.if \x == 4
396
+ add v19.8h, v19.8h, v23.8h
397
+.endif
398
+
399
+ SAD_X_END_4 \x
400
+.endm
401
+
402
+.macro SAD_X_START_24 x
403
+ SAD_X_START_32 \x
404
+ sub x5, x5, #16
405
+ sub x9, x9, #16
406
+.endm
407
+
408
+.macro SAD_X_24 base v1 v2
409
+ ld1 {v0.16b}, \base , #16
410
+ ld1 {v1.8b}, \base , x5
411
+ uabal \v1\().8h, v0.8b, v6.8b
412
+ uabal \v1\().8h, v1.8b, v7.8b
413
+ uabal2 \v2\().8h, v0.16b, v6.16b
414
+.endm
415
+
416
+.macro SAD_X_END_24 x
417
+ SAD_X_END_16 \x
418
+.endm
419
+
420
+.macro SAD_X_START_32 x
421
+ movi v16.16b, #0
422
+ movi v17.16b, #0
423
+ movi v18.16b, #0
424
+ movi v20.16b, #0
425
+ movi v21.16b, #0
426
+ movi v22.16b, #0
427
+.if \x == 4
428
+ movi v19.16b, #0
429
+ movi v23.16b, #0
430
+.endif
431
+.endm
432
+
433
+.macro SAD_X_32 base v1 v2
434
+ ld1 {v0.16b-v1.16b}, \base , x5
435
+ uabal \v1\().8h, v0.8b, v6.8b
436
+ uabal \v1\().8h, v1.8b, v7.8b
437
+ uabal2 \v2\().8h, v0.16b, v6.16b
438
+ uabal2 \v2\().8h, v1.16b, v7.16b
439
+.endm
440
+
441
+.macro SAD_X_END_32 x
442
+ SAD_X_END_16 \x
443
+.endm
444
+
445
+.macro SAD_X_START_48 x
446
+ SAD_X_START_32 \x
447
+.endm
448
+
449
+.macro SAD_X_48 x1 v1 v2
450
+ ld1 {v0.16b-v2.16b}, \x1 , x5
451
+ uabal \v1\().8h, v0.8b, v4.8b
452
+ uabal \v1\().8h, v1.8b, v5.8b
453
+ uabal \v1\().8h, v2.8b, v6.8b
454
+ uabal2 \v2\().8h, v0.16b, v4.16b
455
+ uabal2 \v2\().8h, v1.16b, v5.16b
456
+ uabal2 \v2\().8h, v2.16b, v6.16b
457
+.endm
458
+
459
+.macro SAD_X_END_48 x
460
+ SAD_X_END_64 \x
461
+.endm
462
+
463
+.macro SAD_X_START_64 x
464
+ SAD_X_START_32 \x
465
+.endm
466
+
467
+.macro SAD_X_64 x1 v1 v2
468
+ ld1 {v0.16b-v3.16b}, \x1 , x5
469
+ uabal \v1\().8h, v0.8b, v4.8b
470
+ uabal \v1\().8h, v1.8b, v5.8b
471
+ uabal \v1\().8h, v2.8b, v6.8b
472
+ uabal \v1\().8h, v3.8b, v7.8b
473
+ uabal2 \v2\().8h, v0.16b, v4.16b
474
+ uabal2 \v2\().8h, v1.16b, v5.16b
475
+ uabal2 \v2\().8h, v2.16b, v6.16b
476
+ uabal2 \v2\().8h, v3.16b, v7.16b
477
+.endm
478
+
479
+.macro SAD_X_END_64 x
480
+ uaddlp v16.4s, v16.8h
481
+ uaddlp v17.4s, v17.8h
482
+ uaddlp v18.4s, v18.8h
483
+ uaddlp v20.4s, v20.8h
484
+ uaddlp v21.4s, v21.8h
485
+ uaddlp v22.4s, v22.8h
486
+ add v16.4s, v16.4s, v20.4s
487
+ add v17.4s, v17.4s, v21.4s
488
+ add v18.4s, v18.4s, v22.4s
489
+ trn2 v20.2d, v16.2d, v16.2d
490
+ trn2 v21.2d, v17.2d, v17.2d
491
+ trn2 v22.2d, v18.2d, v18.2d
492
+ add v16.2s, v16.2s, v20.2s
493
+ add v17.2s, v17.2s, v21.2s
494
+ add v18.2s, v18.2s, v22.2s
495
+ uaddlp v16.1d, v16.2s
496
+ uaddlp v17.1d, v17.2s
497
+ uaddlp v18.1d, v18.2s
498
+ stp s16, s17, x6, #8
499
+.if \x == 3
500
+ str s18, x6
501
+.elseif \x == 4
502
+ uaddlp v19.4s, v19.8h
503
+ uaddlp v23.4s, v23.8h
504
+ add v19.4s, v19.4s, v23.4s
505
+ trn2 v23.2d, v19.2d, v19.2d
506
+ add v19.2s, v19.2s, v23.2s
507
+ uaddlp v19.1d, v19.2s
508
+ stp s18, s19, x6
509
+.endif
510
+ ret
511
+.endm
512
+
513
+const sad12_mask, align=8
514
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
515
+endconst
516
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S
Added
513
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "sad-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.macro SAD_SVE2_16 h
41
+ mov z16.d, #0
42
+ ptrue p0.h, vl16
43
+.rept \h
44
+ ld1b {z0.h}, p0/z, x0
45
+ ld1b {z2.h}, p0/z, x2
46
+ add x0, x0, x1
47
+ add x2, x2, x3
48
+ uaba z16.h, z0.h, z2.h
49
+.endr
50
+ uaddv d0, p0, z16.h
51
+ fmov w0, s0
52
+ ret
53
+.endm
54
+
55
+.macro SAD_SVE2_32 h
56
+ ptrue p0.b, vl32
57
+.rept \h
58
+ ld1b {z0.b}, p0/z, x0
59
+ ld1b {z4.b}, p0/z, x2
60
+ add x0, x0, x1
61
+ add x2, x2, x3
62
+ uabalb z16.h, z0.b, z4.b
63
+ uabalt z16.h, z0.b, z4.b
64
+.endr
65
+ uaddv d0, p0, z16.h
66
+ fmov w0, s0
67
+ ret
68
+.endm
69
+
70
+.macro SAD_SVE2_64 h
71
+ cmp x9, #48
72
+ bgt .vl_gt_48_pixel_sad_64x\h
73
+ mov z16.d, #0
74
+ mov z17.d, #0
75
+ mov z18.d, #0
76
+ mov z19.d, #0
77
+ ptrue p0.b, vl32
78
+.rept \h
79
+ ld1b {z0.b}, p0/z, x0
80
+ ld1b {z1.b}, p0/z, x0, #1, mul vl
81
+ ld1b {z4.b}, p0/z, x2
82
+ ld1b {z5.b}, p0/z, x2, #1, mul vl
83
+ add x0, x0, x1
84
+ add x2, x2, x3
85
+ uabalb z16.h, z0.b, z4.b
86
+ uabalt z17.h, z0.b, z4.b
87
+ uabalb z18.h, z1.b, z5.b
88
+ uabalt z19.h, z1.b, z5.b
89
+.endr
90
+ add z16.h, z16.h, z17.h
91
+ add z17.h, z18.h, z19.h
92
+ add z16.h, z16.h, z17.h
93
+ uadalp z24.s, p0/m, z16.h
94
+ uaddv d5, p0, z24.s
95
+ fmov x0, d5
96
+ ret
97
+.vl_gt_48_pixel_sad_64x\h\():
98
+ mov z16.d, #0
99
+ mov z17.d, #0
100
+ mov z24.d, #0
101
+ ptrue p0.b, vl64
102
+.rept \h
103
+ ld1b {z0.b}, p0/z, x0
104
+ ld1b {z4.b}, p0/z, x2
105
+ add x0, x0, x1
106
+ add x2, x2, x3
107
+ uabalb z16.h, z0.b, z4.b
108
+ uabalt z17.h, z0.b, z4.b
109
+.endr
110
+ add z16.h, z16.h, z17.h
111
+ uadalp z24.s, p0/m, z16.h
112
+ uaddv d5, p0, z24.s
113
+ fmov x0, d5
114
+ ret
115
+.endm
116
+
117
+.macro SAD_SVE2_24 h
118
+ mov z16.d, #0
119
+ mov x10, #24
120
+ mov x11, #0
121
+ whilelt p0.b, x11, x10
122
+.rept \h
123
+ ld1b {z0.b}, p0/z, x0
124
+ ld1b {z8.b}, p0/z, x2
125
+ add x0, x0, x1
126
+ add x2, x2, x3
127
+ uabalb z16.h, z0.b, z8.b
128
+ uabalt z16.h, z0.b, z8.b
129
+.endr
130
+ uaddv d5, p0, z16.h
131
+ fmov w0, s5
132
+ ret
133
+.endm
134
+
135
+.macro SAD_SVE2_48 h
136
+ cmp x9, #48
137
+ bgt .vl_gt_48_pixel_sad_48x\h
138
+ mov z16.d, #0
139
+ mov z17.d, #0
140
+ mov z18.d, #0
141
+ mov z19.d, #0
142
+ ptrue p0.b, vl32
143
+ ptrue p1.b, vl16
144
+.rept \h
145
+ ld1b {z0.b}, p0/z, x0
146
+ ld1b {z1.b}, p1/z, x0, #1, mul vl
147
+ ld1b {z8.b}, p0/z, x2
148
+ ld1b {z9.b}, p1/z, x2, #1, mul vl
149
+ add x0, x0, x1
150
+ add x2, x2, x3
151
+ uabalb z16.h, z0.b, z8.b
152
+ uabalt z17.h, z0.b, z8.b
153
+ uabalb z18.h, z1.b, z9.b
154
+ uabalt z19.h, z1.b, z9.b
155
+.endr
156
+ add z16.h, z16.h, z17.h
157
+ add z17.h, z18.h, z19.h
158
+ add z16.h, z16.h, z17.h
159
+ uaddv d5, p0, z16.h
160
+ fmov w0, s5
161
+ ret
162
+.vl_gt_48_pixel_sad_48x\h\():
163
+ mov z16.d, #0
164
+ mov z17.d, #0
165
+ mov x10, #48
166
+ mov x11, #0
167
+ whilelt p0.b, x11, x10
168
+.rept \h
169
+ ld1b {z0.b}, p0/z, x0
170
+ ld1b {z8.b}, p0/z, x2
171
+ add x0, x0, x1
172
+ add x2, x2, x3
173
+ uabalb z16.h, z0.b, z8.b
174
+ uabalt z17.h, z0.b, z8.b
175
+.endr
176
+ add z16.h, z16.h, z17.h
177
+ uaddv d5, p0, z16.h
178
+ fmov w0, s5
179
+ ret
180
+.endm
181
+
182
+// Fully unrolled.
183
+.macro SAD_FUNC_SVE2 w, h
184
+function PFX(pixel_sad_\w\()x\h\()_sve2)
185
+ rdvl x9, #1
186
+ cmp x9, #16
187
+ bgt .vl_gt_16_pixel_sad_\w\()x\h
188
+ SAD_START_\w uabdl
189
+ SAD_\w \h
190
+.if \w > 4
191
+ add v16.8h, v16.8h, v17.8h
192
+.endif
193
+ uaddlv s0, v16.8h
194
+ fmov w0, s0
195
+ ret
196
+.vl_gt_16_pixel_sad_\w\()x\h\():
197
+.if \w == 4 || \w == 8 || \w == 12
198
+ SAD_START_\w uabdl
199
+ SAD_\w \h
200
+.if \w > 4
201
+ add v16.8h, v16.8h, v17.8h
202
+.endif
203
+ uaddlv s0, v16.8h
204
+ fmov w0, s0
205
+ ret
206
+.else
207
+ SAD_SVE2_\w \h
208
+.endif
209
+endfunc
210
+.endm
211
+
212
+// Loop unrolled 4.
213
+.macro SAD_FUNC_LOOP_SVE2 w, h
214
+function PFX(pixel_sad_\w\()x\h\()_sve2)
215
+ rdvl x9, #1
216
+ cmp x9, #16
217
+ bgt .vl_gt_16_pixel_sad_loop_\w\()x\h
218
+ SAD_START_\w
219
+
220
+ mov w9, #\h/8
221
+.loop_sve2_\w\()x\h:
222
+ sub w9, w9, #1
223
+.rept 4
224
+ SAD_\w
225
+.endr
226
+ cbnz w9, .loop_sve2_\w\()x\h
227
+
228
+ SAD_END_\w
229
+
230
+.vl_gt_16_pixel_sad_loop_\w\()x\h\():
231
+.if \w == 4 || \w == 8 || \w == 12
232
+ SAD_START_\w
233
+
234
+ mov w9, #\h/8
235
+.loop_sve2_loop_\w\()x\h:
236
+ sub w9, w9, #1
237
+.rept 4
238
+ SAD_\w
239
+.endr
240
+ cbnz w9, .loop_sve2_loop_\w\()x\h
241
+
242
+ SAD_END_\w
243
+.else
244
+ SAD_SVE2_\w \h
245
+.endif
246
+endfunc
247
+.endm
248
+
249
+SAD_FUNC_SVE2 4, 4
250
+SAD_FUNC_SVE2 4, 8
251
+SAD_FUNC_SVE2 4, 16
252
+SAD_FUNC_SVE2 8, 4
253
+SAD_FUNC_SVE2 8, 8
254
+SAD_FUNC_SVE2 8, 16
255
+SAD_FUNC_SVE2 8, 32
256
+SAD_FUNC_SVE2 16, 4
257
+SAD_FUNC_SVE2 16, 8
258
+SAD_FUNC_SVE2 16, 12
259
+SAD_FUNC_SVE2 16, 16
260
+SAD_FUNC_SVE2 16, 32
261
+SAD_FUNC_SVE2 16, 64
262
+
263
+SAD_FUNC_LOOP_SVE2 32, 8
264
+SAD_FUNC_LOOP_SVE2 32, 16
265
+SAD_FUNC_LOOP_SVE2 32, 24
266
+SAD_FUNC_LOOP_SVE2 32, 32
267
+SAD_FUNC_LOOP_SVE2 32, 64
268
+SAD_FUNC_LOOP_SVE2 64, 16
269
+SAD_FUNC_LOOP_SVE2 64, 32
270
+SAD_FUNC_LOOP_SVE2 64, 48
271
+SAD_FUNC_LOOP_SVE2 64, 64
272
+SAD_FUNC_LOOP_SVE2 12, 16
273
+SAD_FUNC_LOOP_SVE2 24, 32
274
+SAD_FUNC_LOOP_SVE2 48, 64
275
+
276
+// SAD_X3 and SAD_X4 code start
277
+
278
+.macro SAD_X_SVE2_24_INNER_GT_16 base z
279
+ ld1b {z4.b}, p0/z, \base
280
+ add \base, \base, x5
281
+ uabalb \z\().h, z4.b, z0.b
282
+ uabalt \z\().h, z4.b, z0.b
283
+.endm
284
+
285
+.macro SAD_X_SVE2_24 h x
286
+ mov z20.d, #0
287
+ mov z21.d, #0
288
+ mov z22.d, #0
289
+ mov z23.d, #0
290
+ mov x10, #24
291
+ mov x11, #0
292
+ whilelt p0.b, x11, x10
293
+.rept \h
294
+ ld1b {z0.b}, p0/z, x0
295
+ add x0, x0, x9
296
+ SAD_X_SVE2_24_INNER_GT_16 x1, z20
297
+ SAD_X_SVE2_24_INNER_GT_16 x2, z21
298
+ SAD_X_SVE2_24_INNER_GT_16 x3, z22
299
+.if \x == 4
300
+ SAD_X_SVE2_24_INNER_GT_16 x4, z23
301
+.endif
302
+.endr
303
+ uaddlv s0, v20.8h
304
+ uaddlv s1, v21.8h
305
+ uaddlv s2, v22.8h
306
+ stp s0, s1, x6
307
+.if \x == 3
308
+ str s2, x6, #8
309
+.elseif \x == 4
310
+ uaddv d0, p0, z20.h
311
+ uaddv d1, p0, z21.h
312
+ uaddv d2, p0, z22.h
313
+ stp s2, s3, x6, #8
314
+.endif
315
+ ret
316
+.endm
317
+
318
+.macro SAD_X_SVE2_32_INNER_GT_16 base z
319
+ ld1b {z4.b}, p0/z, \base
320
+ add \base, \base, x5
321
+ uabalb \z\().h, z4.b, z0.b
322
+ uabalt \z\().h, z4.b, z0.b
323
+.endm
324
+
325
+.macro SAD_X_SVE2_32 h x
326
+ mov z20.d, #0
327
+ mov z21.d, #0
328
+ mov z22.d, #0
329
+ mov z23.d, #0
330
+ ptrue p0.b, vl32
331
+.rept \h
332
+ ld1b {z0.b}, p0/z, x0
333
+ add x0, x0, x9
334
+ SAD_X_SVE2_32_INNER_GT_16 x1, z20
335
+ SAD_X_SVE2_32_INNER_GT_16 x2, z21
336
+ SAD_X_SVE2_32_INNER_GT_16 x3, z22
337
+.if \x == 4
338
+ SAD_X_SVE2_32_INNER_GT_16 x4, z23
339
+.endif
340
+.endr
341
+ uaddv d0, p0, z20.h
342
+ uaddv d1, p0, z21.h
343
+ uaddv d2, p0, z22.h
344
+ stp s0, s1, x6
345
+.if \x == 3
346
+ str s2, x6, #8
347
+.elseif \x == 4
348
+ uaddv d3, p0, z23.h
349
+ stp s2, s3, x6, #8
350
+.endif
351
+ ret
352
+.endm
353
+
354
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores3)
355
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores4)
356
+.macro SAD_X_FUNC_SVE2 x, w, h
357
+function PFX(sad_x\x\()_\w\()x\h\()_sve2)
358
+ mov x9, #FENC_STRIDE
359
+
360
+// Make function arguments for x == 3 look like x == 4.
361
+.if \x == 3
362
+ mov x6, x5
363
+ mov x5, x4
364
+.endif
365
+ rdvl x11, #1
366
+ cmp x11, #16
367
+ bgt .vl_gt_16_sad_x\x\()_\w\()x\h
368
+.if \w == 12
369
+ movrel x12, sad12_mask
370
+ ld1 {v31.16b}, x12
371
+.endif
372
+
373
+ SAD_X_START_\w \h, \x, uabdl
374
+ SAD_X_\w \h, \x
375
+ SAD_X_END_\w \x
376
+.vl_gt_16_sad_x\x\()_\w\()x\h\():
377
+.if \w == 24 || \w == 32
378
+ SAD_X_SVE2_\w \h, \x
379
+.else
380
+.if \w == 12
381
+ movrel x12, sad12_mask
382
+ ld1 {v31.16b}, x12
383
+.endif
384
+
385
+ SAD_X_START_\w \h, \x, uabdl
386
+ SAD_X_\w \h, \x
387
+ SAD_X_END_\w \x
388
+.endif
389
+endfunc
390
+.endm
391
+
392
+.macro SAD_X_LOOP_SVE2 x, w, h
393
+function PFX(sad_x\x\()_\w\()x\h\()_sve2)
394
+ mov x9, #FENC_STRIDE
395
+
396
+// Make function arguments for x == 3 look like x == 4.
397
+.if \x == 3
398
+ mov x6, x5
399
+ mov x5, x4
400
+.endif
401
+ rdvl x11, #1
402
+ cmp x11, #16
403
+ bgt .vl_gt_16_sad_x_loop_\x\()_\w\()x\h
404
+ SAD_X_START_\w \x
405
+ mov w12, #\h/4
406
+.loop_sad_sve2_x\x\()_\w\()x\h:
407
+ sub w12, w12, #1
408
+ .rept 4
409
+ .if \w == 24
410
+ ld1 {v6.16b}, x0, #16
411
+ ld1 {v7.8b}, x0, x9
412
+ .elseif \w == 32
413
+ ld1 {v6.16b-v7.16b}, x0, x9
414
+ .elseif \w == 48
415
+ ld1 {v4.16b-v6.16b}, x0, x9
416
+ .elseif \w == 64
417
+ ld1 {v4.16b-v7.16b}, x0, x9
418
+ .endif
419
+ SAD_X_\w x1, v16, v20
420
+ SAD_X_\w x2, v17, v21
421
+ SAD_X_\w x3, v18, v22
422
+ .if \x == 4
423
+ SAD_X_\w x4, v19, v23
424
+ .endif
425
+ .endr
426
+ cbnz w12, .loop_sad_sve2_x\x\()_\w\()x\h
427
+ SAD_X_END_\w \x
428
+.vl_gt_16_sad_x_loop_\x\()_\w\()x\h\():
429
+.if \w == 24 || \w == 32
430
+ SAD_X_SVE2_\w \h, \x
431
+ ret
432
+.else
433
+ SAD_X_START_\w \x
434
+ mov w12, #\h/4
435
+.loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
436
+ sub w12, w12, #1
437
+ .rept 4
438
+ .if \w == 24
439
+ ld1 {v6.16b}, x0, #16
440
+ ld1 {v7.8b}, x0, x9
441
+ .elseif \w == 32
442
+ ld1 {v6.16b-v7.16b}, x0, x9
443
+ .elseif \w == 48
444
+ ld1 {v4.16b-v6.16b}, x0, x9
445
+ .elseif \w == 64
446
+ ld1 {v4.16b-v7.16b}, x0, x9
447
+ .endif
448
+ SAD_X_\w x1, v16, v20
449
+ SAD_X_\w x2, v17, v21
450
+ SAD_X_\w x3, v18, v22
451
+ .if \x == 4
452
+ SAD_X_\w x4, v19, v23
453
+ .endif
454
+ .endr
455
+ cbnz w12, .loop_sad_sve2_gt_16_x\x\()_\w\()x\h
456
+ SAD_X_END_\w \x
457
+.endif
458
+endfunc
459
+.endm
460
+
461
+
462
+SAD_X_FUNC_SVE2 3, 4, 4
463
+SAD_X_FUNC_SVE2 3, 4, 8
464
+SAD_X_FUNC_SVE2 3, 4, 16
465
+SAD_X_FUNC_SVE2 3, 8, 4
466
+SAD_X_FUNC_SVE2 3, 8, 8
467
+SAD_X_FUNC_SVE2 3, 8, 16
468
+SAD_X_FUNC_SVE2 3, 8, 32
469
+SAD_X_FUNC_SVE2 3, 12, 16
470
+SAD_X_FUNC_SVE2 3, 16, 4
471
+SAD_X_FUNC_SVE2 3, 16, 8
472
+SAD_X_FUNC_SVE2 3, 16, 12
473
+SAD_X_FUNC_SVE2 3, 16, 16
474
+SAD_X_FUNC_SVE2 3, 16, 32
475
+SAD_X_FUNC_SVE2 3, 16, 64
476
+SAD_X_LOOP_SVE2 3, 24, 32
477
+SAD_X_LOOP_SVE2 3, 32, 8
478
+SAD_X_LOOP_SVE2 3, 32, 16
479
+SAD_X_LOOP_SVE2 3, 32, 24
480
+SAD_X_LOOP_SVE2 3, 32, 32
481
+SAD_X_LOOP_SVE2 3, 32, 64
482
+SAD_X_LOOP_SVE2 3, 48, 64
483
+SAD_X_LOOP_SVE2 3, 64, 16
484
+SAD_X_LOOP_SVE2 3, 64, 32
485
+SAD_X_LOOP_SVE2 3, 64, 48
486
+SAD_X_LOOP_SVE2 3, 64, 64
487
+
488
+SAD_X_FUNC_SVE2 4, 4, 4
489
+SAD_X_FUNC_SVE2 4, 4, 8
490
+SAD_X_FUNC_SVE2 4, 4, 16
491
+SAD_X_FUNC_SVE2 4, 8, 4
492
+SAD_X_FUNC_SVE2 4, 8, 8
493
+SAD_X_FUNC_SVE2 4, 8, 16
494
+SAD_X_FUNC_SVE2 4, 8, 32
495
+SAD_X_FUNC_SVE2 4, 12, 16
496
+SAD_X_FUNC_SVE2 4, 16, 4
497
+SAD_X_FUNC_SVE2 4, 16, 8
498
+SAD_X_FUNC_SVE2 4, 16, 12
499
+SAD_X_FUNC_SVE2 4, 16, 16
500
+SAD_X_FUNC_SVE2 4, 16, 32
501
+SAD_X_FUNC_SVE2 4, 16, 64
502
+SAD_X_LOOP_SVE2 4, 24, 32
503
+SAD_X_LOOP_SVE2 4, 32, 8
504
+SAD_X_LOOP_SVE2 4, 32, 16
505
+SAD_X_LOOP_SVE2 4, 32, 24
506
+SAD_X_LOOP_SVE2 4, 32, 32
507
+SAD_X_LOOP_SVE2 4, 32, 64
508
+SAD_X_LOOP_SVE2 4, 48, 64
509
+SAD_X_LOOP_SVE2 4, 64, 16
510
+SAD_X_LOOP_SVE2 4, 64, 32
511
+SAD_X_LOOP_SVE2 4, 64, 48
512
+SAD_X_LOOP_SVE2 4, 64, 64
513
x265_3.5.tar.gz/source/common/aarch64/sad-a.S -> x265_3.6.tar.gz/source/common/aarch64/sad-a.S
Changed
256
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ * Sebastian Pop <spop@amazon.com>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
12
*****************************************************************************/
13
14
#include "asm.S"
15
+#include "sad-a-common.S"
16
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
.section .rodata
21
+#endif
22
23
.align 4
24
25
.text
26
27
-.macro SAD_X_START_8 x
28
- ld1 {v0.8b}, x0, x9
29
-.if \x == 3
30
- ld1 {v1.8b}, x1, x4
31
- ld1 {v2.8b}, x2, x4
32
- ld1 {v3.8b}, x3, x4
33
-.elseif \x == 4
34
- ld1 {v1.8b}, x1, x5
35
- ld1 {v2.8b}, x2, x5
36
- ld1 {v3.8b}, x3, x5
37
- ld1 {v4.8b}, x4, x5
38
-.endif
39
- uabdl v16.8h, v0.8b, v1.8b
40
- uabdl v17.8h, v0.8b, v2.8b
41
- uabdl v18.8h, v0.8b, v3.8b
42
-.if \x == 4
43
- uabdl v19.8h, v0.8b, v4.8b
44
+// Fully unrolled.
45
+.macro SAD_FUNC w, h
46
+function PFX(pixel_sad_\w\()x\h\()_neon)
47
+ SAD_START_\w uabdl
48
+ SAD_\w \h
49
+.if \w > 4
50
+ add v16.8h, v16.8h, v17.8h
51
.endif
52
+ uaddlv s0, v16.8h
53
+ fmov w0, s0
54
+ ret
55
+endfunc
56
+.endm
57
+
58
+// Loop unrolled 4.
59
+.macro SAD_FUNC_LOOP w, h
60
+function PFX(pixel_sad_\w\()x\h\()_neon)
61
+ SAD_START_\w
62
+
63
+ mov w9, #\h/8
64
+.loop_\w\()x\h:
65
+ sub w9, w9, #1
66
+.rept 4
67
+ SAD_\w
68
+.endr
69
+ cbnz w9, .loop_\w\()x\h
70
+
71
+ SAD_END_\w
72
+endfunc
73
.endm
74
75
-.macro SAD_X_8 x
76
- ld1 {v0.8b}, x0, x9
77
+SAD_FUNC 4, 4
78
+SAD_FUNC 4, 8
79
+SAD_FUNC 4, 16
80
+SAD_FUNC 8, 4
81
+SAD_FUNC 8, 8
82
+SAD_FUNC 8, 16
83
+SAD_FUNC 8, 32
84
+SAD_FUNC 16, 4
85
+SAD_FUNC 16, 8
86
+SAD_FUNC 16, 12
87
+SAD_FUNC 16, 16
88
+SAD_FUNC 16, 32
89
+SAD_FUNC 16, 64
90
+
91
+SAD_FUNC_LOOP 32, 8
92
+SAD_FUNC_LOOP 32, 16
93
+SAD_FUNC_LOOP 32, 24
94
+SAD_FUNC_LOOP 32, 32
95
+SAD_FUNC_LOOP 32, 64
96
+SAD_FUNC_LOOP 64, 16
97
+SAD_FUNC_LOOP 64, 32
98
+SAD_FUNC_LOOP 64, 48
99
+SAD_FUNC_LOOP 64, 64
100
+SAD_FUNC_LOOP 12, 16
101
+SAD_FUNC_LOOP 24, 32
102
+SAD_FUNC_LOOP 48, 64
103
+
104
+// SAD_X3 and SAD_X4 code start
105
+
106
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores3)
107
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores4)
108
+.macro SAD_X_FUNC x, w, h
109
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
110
+ mov x9, #FENC_STRIDE
111
+
112
+// Make function arguments for x == 3 look like x == 4.
113
.if \x == 3
114
- ld1 {v1.8b}, x1, x4
115
- ld1 {v2.8b}, x2, x4
116
- ld1 {v3.8b}, x3, x4
117
-.elseif \x == 4
118
- ld1 {v1.8b}, x1, x5
119
- ld1 {v2.8b}, x2, x5
120
- ld1 {v3.8b}, x3, x5
121
- ld1 {v4.8b}, x4, x5
122
+ mov x6, x5
123
+ mov x5, x4
124
.endif
125
- uabal v16.8h, v0.8b, v1.8b
126
- uabal v17.8h, v0.8b, v2.8b
127
- uabal v18.8h, v0.8b, v3.8b
128
-.if \x == 4
129
- uabal v19.8h, v0.8b, v4.8b
130
+
131
+.if \w == 12
132
+ movrel x12, sad12_mask
133
+ ld1 {v31.16b}, x12
134
.endif
135
+
136
+ SAD_X_START_\w \h, \x, uabdl
137
+ SAD_X_\w \h, \x
138
+ SAD_X_END_\w \x
139
+endfunc
140
.endm
141
142
-.macro SAD_X_8xN x, h
143
-function x265_sad_x\x\()_8x\h\()_neon
144
+.macro SAD_X_LOOP x, w, h
145
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
146
mov x9, #FENC_STRIDE
147
- SAD_X_START_8 \x
148
-.rept \h - 1
149
- SAD_X_8 \x
150
-.endr
151
- uaddlv s0, v16.8h
152
- uaddlv s1, v17.8h
153
- uaddlv s2, v18.8h
154
-.if \x == 4
155
- uaddlv s3, v19.8h
156
-.endif
157
158
+// Make function arguments for x == 3 look like x == 4.
159
.if \x == 3
160
- stp s0, s1, x5
161
- str s2, x5, #8
162
-.elseif \x == 4
163
- stp s0, s1, x6
164
- stp s2, s3, x6, #8
165
+ mov x6, x5
166
+ mov x5, x4
167
.endif
168
- ret
169
+ SAD_X_START_\w \x
170
+ mov w12, #\h/4
171
+.loop_sad_x\x\()_\w\()x\h:
172
+ sub w12, w12, #1
173
+ .rept 4
174
+ .if \w == 24
175
+ ld1 {v6.16b}, x0, #16
176
+ ld1 {v7.8b}, x0, x9
177
+ .elseif \w == 32
178
+ ld1 {v6.16b-v7.16b}, x0, x9
179
+ .elseif \w == 48
180
+ ld1 {v4.16b-v6.16b}, x0, x9
181
+ .elseif \w == 64
182
+ ld1 {v4.16b-v7.16b}, x0, x9
183
+ .endif
184
+ SAD_X_\w x1, v16, v20
185
+ SAD_X_\w x2, v17, v21
186
+ SAD_X_\w x3, v18, v22
187
+ .if \x == 4
188
+ SAD_X_\w x4, v19, v23
189
+ .endif
190
+ .endr
191
+ cbnz w12, .loop_sad_x\x\()_\w\()x\h
192
+ SAD_X_END_\w \x
193
endfunc
194
.endm
195
196
-SAD_X_8xN 3 4
197
-SAD_X_8xN 3 8
198
-SAD_X_8xN 3 16
199
-SAD_X_8xN 3 32
200
201
-SAD_X_8xN 4 4
202
-SAD_X_8xN 4 8
203
-SAD_X_8xN 4 16
204
-SAD_X_8xN 4 32
205
+SAD_X_FUNC 3, 4, 4
206
+SAD_X_FUNC 3, 4, 8
207
+SAD_X_FUNC 3, 4, 16
208
+SAD_X_FUNC 3, 8, 4
209
+SAD_X_FUNC 3, 8, 8
210
+SAD_X_FUNC 3, 8, 16
211
+SAD_X_FUNC 3, 8, 32
212
+SAD_X_FUNC 3, 12, 16
213
+SAD_X_FUNC 3, 16, 4
214
+SAD_X_FUNC 3, 16, 8
215
+SAD_X_FUNC 3, 16, 12
216
+SAD_X_FUNC 3, 16, 16
217
+SAD_X_FUNC 3, 16, 32
218
+SAD_X_FUNC 3, 16, 64
219
+SAD_X_LOOP 3, 24, 32
220
+SAD_X_LOOP 3, 32, 8
221
+SAD_X_LOOP 3, 32, 16
222
+SAD_X_LOOP 3, 32, 24
223
+SAD_X_LOOP 3, 32, 32
224
+SAD_X_LOOP 3, 32, 64
225
+SAD_X_LOOP 3, 48, 64
226
+SAD_X_LOOP 3, 64, 16
227
+SAD_X_LOOP 3, 64, 32
228
+SAD_X_LOOP 3, 64, 48
229
+SAD_X_LOOP 3, 64, 64
230
+
231
+SAD_X_FUNC 4, 4, 4
232
+SAD_X_FUNC 4, 4, 8
233
+SAD_X_FUNC 4, 4, 16
234
+SAD_X_FUNC 4, 8, 4
235
+SAD_X_FUNC 4, 8, 8
236
+SAD_X_FUNC 4, 8, 16
237
+SAD_X_FUNC 4, 8, 32
238
+SAD_X_FUNC 4, 12, 16
239
+SAD_X_FUNC 4, 16, 4
240
+SAD_X_FUNC 4, 16, 8
241
+SAD_X_FUNC 4, 16, 12
242
+SAD_X_FUNC 4, 16, 16
243
+SAD_X_FUNC 4, 16, 32
244
+SAD_X_FUNC 4, 16, 64
245
+SAD_X_LOOP 4, 24, 32
246
+SAD_X_LOOP 4, 32, 8
247
+SAD_X_LOOP 4, 32, 16
248
+SAD_X_LOOP 4, 32, 24
249
+SAD_X_LOOP 4, 32, 32
250
+SAD_X_LOOP 4, 32, 64
251
+SAD_X_LOOP 4, 48, 64
252
+SAD_X_LOOP 4, 64, 16
253
+SAD_X_LOOP 4, 64, 32
254
+SAD_X_LOOP 4, 64, 48
255
+SAD_X_LOOP 4, 64, 64
256
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S
Added
39
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch armv8-a
31
+
32
+.macro ret_v0_w0
33
+ trn2 v1.2d, v0.2d, v0.2d
34
+ add v0.2s, v0.2s, v1.2s
35
+ addp v0.2s, v0.2s, v0.2s
36
+ fmov w0, s0
37
+ ret
38
+.endm
39
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S
Added
80
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+function PFX(pixel_sse_pp_4x4_sve)
40
+ ptrue p0.s, vl4
41
+ ld1b {z0.s}, p0/z, x0
42
+ ld1b {z17.s}, p0/z, x2
43
+ add x0, x0, x1
44
+ add x2, x2, x3
45
+ sub z0.s, p0/m, z0.s, z17.s
46
+ mul z0.s, p0/m, z0.s, z0.s
47
+.rept 3
48
+ ld1b {z16.s}, p0/z, x0
49
+ ld1b {z17.s}, p0/z, x2
50
+ add x0, x0, x1
51
+ add x2, x2, x3
52
+ sub z16.s, p0/m, z16.s, z17.s
53
+ mla z0.s, p0/m, z16.s, z16.s
54
+.endr
55
+ uaddv d0, p0, z0.s
56
+ fmov w0, s0
57
+ ret
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_sve)
61
+ ptrue p0.s, vl4
62
+ ld1b {z0.s}, p0/z, x0
63
+ ld1b {z17.s}, p0/z, x2
64
+ add x0, x0, x1
65
+ add x2, x2, x3
66
+ sub z0.s, p0/m, z0.s, z17.s
67
+ mul z0.s, p0/m, z0.s, z0.s
68
+.rept 7
69
+ ld1b {z16.s}, p0/z, x0
70
+ ld1b {z17.s}, p0/z, x2
71
+ add x0, x0, x1
72
+ add x2, x2, x3
73
+ sub z16.s, p0/m, z16.s, z17.s
74
+ mla z0.s, p0/m, z16.s, z16.s
75
+.endr
76
+ uaddv d0, p0, z0.s
77
+ fmov w0, s0
78
+ ret
79
+endfunc
80
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S
Added
889
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "ssd-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sse_pp_32x32_sve2)
41
+ rdvl x9, #1
42
+ cmp x9, #16
43
+ bgt .vl_gt_16_pixel_sse_pp_32x32
44
+ mov w12, #8
45
+ movi v0.16b, #0
46
+ movi v1.16b, #0
47
+.loop_sse_pp_32_sve2:
48
+ sub w12, w12, #1
49
+.rept 4
50
+ ld1 {v16.16b,v17.16b}, x0, x1
51
+ ld1 {v18.16b,v19.16b}, x2, x3
52
+ usubl v2.8h, v16.8b, v18.8b
53
+ usubl2 v3.8h, v16.16b, v18.16b
54
+ usubl v4.8h, v17.8b, v19.8b
55
+ usubl2 v5.8h, v17.16b, v19.16b
56
+ smlal v0.4s, v2.4h, v2.4h
57
+ smlal2 v1.4s, v2.8h, v2.8h
58
+ smlal v0.4s, v3.4h, v3.4h
59
+ smlal2 v1.4s, v3.8h, v3.8h
60
+ smlal v0.4s, v4.4h, v4.4h
61
+ smlal2 v1.4s, v4.8h, v4.8h
62
+ smlal v0.4s, v5.4h, v5.4h
63
+ smlal2 v1.4s, v5.8h, v5.8h
64
+.endr
65
+ cbnz w12, .loop_sse_pp_32_sve2
66
+ add v0.4s, v0.4s, v1.4s
67
+ ret_v0_w0
68
+.vl_gt_16_pixel_sse_pp_32x32:
69
+ ptrue p0.b, vl32
70
+ ld1b {z16.b}, p0/z, x0
71
+ ld1b {z18.b}, p0/z, x2
72
+ add x0, x0, x1
73
+ add x2, x2, x3
74
+ usublb z1.h, z16.b, z18.b
75
+ usublt z2.h, z16.b, z18.b
76
+ smullb z0.s, z1.h, z1.h
77
+ smlalt z0.s, z1.h, z1.h
78
+ smlalb z0.s, z2.h, z2.h
79
+ smlalt z0.s, z2.h, z2.h
80
+.rept 31
81
+ ld1b {z16.b}, p0/z, x0
82
+ ld1b {z18.b}, p0/z, x2
83
+ add x0, x0, x1
84
+ add x2, x2, x3
85
+ usublb z1.h, z16.b, z18.b
86
+ usublt z2.h, z16.b, z18.b
87
+ smullb z0.s, z1.h, z1.h
88
+ smlalt z0.s, z1.h, z1.h
89
+ smlalb z0.s, z2.h, z2.h
90
+ smlalt z0.s, z2.h, z2.h
91
+.endr
92
+ uaddv d3, p0, z0.s
93
+ fmov w0, s3
94
+ ret
95
+endfunc
96
+
97
+function PFX(pixel_sse_pp_32x64_sve2)
98
+ rdvl x9, #1
99
+ cmp x9, #16
100
+ bgt .vl_gt_16_pixel_sse_pp_32x64
101
+ ptrue p0.b, vl16
102
+ ld1b {z16.b}, p0/z, x0
103
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
104
+ ld1b {z18.b}, p0/z, x2
105
+ ld1b {z19.b}, p0/z, x2, #1, mul vl
106
+ add x0, x0, x1
107
+ add x2, x2, x3
108
+ usublb z1.h, z16.b, z18.b
109
+ usublt z2.h, z16.b, z18.b
110
+ usublb z3.h, z17.b, z19.b
111
+ usublt z4.h, z17.b, z19.b
112
+ smullb z20.s, z1.h, z1.h
113
+ smullt z21.s, z1.h, z1.h
114
+ smlalb z20.s, z2.h, z2.h
115
+ smlalt z21.s, z2.h, z2.h
116
+ smlalb z20.s, z3.h, z3.h
117
+ smlalt z21.s, z3.h, z3.h
118
+ smlalb z20.s, z4.h, z4.h
119
+ smlalt z21.s, z4.h, z4.h
120
+.rept 63
121
+ ld1b {z16.b}, p0/z, x0
122
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
123
+ ld1b {z18.b}, p0/z, x2
124
+ ld1b {z19.b}, p0/z, x2, #1, mul vl
125
+ add x0, x0, x1
126
+ add x2, x2, x3
127
+ usublb z1.h, z16.b, z18.b
128
+ usublt z2.h, z16.b, z18.b
129
+ usublb z3.h, z17.b, z19.b
130
+ usublt z4.h, z17.b, z19.b
131
+ smlalb z20.s, z1.h, z1.h
132
+ smlalt z21.s, z1.h, z1.h
133
+ smlalb z20.s, z2.h, z2.h
134
+ smlalt z21.s, z2.h, z2.h
135
+ smlalb z20.s, z3.h, z3.h
136
+ smlalt z21.s, z3.h, z3.h
137
+ smlalb z20.s, z4.h, z4.h
138
+ smlalt z21.s, z4.h, z4.h
139
+.endr
140
+ uaddv d3, p0, z20.s
141
+ fmov w0, s3
142
+ uaddv d4, p0, z21.s
143
+ fmov w1, s4
144
+ add w0, w0, w1
145
+ ret
146
+.vl_gt_16_pixel_sse_pp_32x64:
147
+ ptrue p0.b, vl32
148
+ ld1b {z16.b}, p0/z, x0
149
+ ld1b {z18.b}, p0/z, x2
150
+ add x0, x0, x1
151
+ add x2, x2, x3
152
+ usublb z1.h, z16.b, z18.b
153
+ usublt z2.h, z16.b, z18.b
154
+ smullb z20.s, z1.h, z1.h
155
+ smullt z21.s, z1.h, z1.h
156
+ smlalb z20.s, z2.h, z2.h
157
+ smlalt z21.s, z2.h, z2.h
158
+.rept 63
159
+ ld1b {z16.b}, p0/z, x0
160
+ ld1b {z18.b}, p0/z, x2
161
+ add x0, x0, x1
162
+ add x2, x2, x3
163
+ usublb z1.h, z16.b, z18.b
164
+ usublt z2.h, z16.b, z18.b
165
+ smlalb z20.s, z1.h, z1.h
166
+ smlalt z21.s, z1.h, z1.h
167
+ smlalb z20.s, z2.h, z2.h
168
+ smlalt z21.s, z2.h, z2.h
169
+.endr
170
+ uaddv d3, p0, z20.s
171
+ fmov w0, s3
172
+ uaddv d4, p0, z21.s
173
+ fmov w1, s4
174
+ add w0, w0, w1
175
+ ret
176
+endfunc
177
+
178
+function PFX(pixel_sse_pp_64x64_sve2)
179
+ rdvl x9, #1
180
+ cmp x9, #16
181
+ bgt .vl_gt_16_pixel_sse_pp_64x64
182
+ mov w12, #16
183
+ movi v0.16b, #0
184
+ movi v1.16b, #0
185
+
186
+.loop_sse_pp_64_sve2:
187
+ sub w12, w12, #1
188
+.rept 4
189
+ ld1 {v16.16b-v19.16b}, x0, x1
190
+ ld1 {v20.16b-v23.16b}, x2, x3
191
+
192
+ usubl v2.8h, v16.8b, v20.8b
193
+ usubl2 v3.8h, v16.16b, v20.16b
194
+ usubl v4.8h, v17.8b, v21.8b
195
+ usubl2 v5.8h, v17.16b, v21.16b
196
+ smlal v0.4s, v2.4h, v2.4h
197
+ smlal2 v1.4s, v2.8h, v2.8h
198
+ smlal v0.4s, v3.4h, v3.4h
199
+ smlal2 v1.4s, v3.8h, v3.8h
200
+ smlal v0.4s, v4.4h, v4.4h
201
+ smlal2 v1.4s, v4.8h, v4.8h
202
+ smlal v0.4s, v5.4h, v5.4h
203
+ smlal2 v1.4s, v5.8h, v5.8h
204
+
205
+ usubl v2.8h, v18.8b, v22.8b
206
+ usubl2 v3.8h, v18.16b, v22.16b
207
+ usubl v4.8h, v19.8b, v23.8b
208
+ usubl2 v5.8h, v19.16b, v23.16b
209
+ smlal v0.4s, v2.4h, v2.4h
210
+ smlal2 v1.4s, v2.8h, v2.8h
211
+ smlal v0.4s, v3.4h, v3.4h
212
+ smlal2 v1.4s, v3.8h, v3.8h
213
+ smlal v0.4s, v4.4h, v4.4h
214
+ smlal2 v1.4s, v4.8h, v4.8h
215
+ smlal v0.4s, v5.4h, v5.4h
216
+ smlal2 v1.4s, v5.8h, v5.8h
217
+.endr
218
+ cbnz w12, .loop_sse_pp_64_sve2
219
+ add v0.4s, v0.4s, v1.4s
220
+ ret_v0_w0
221
+.vl_gt_16_pixel_sse_pp_64x64:
222
+ cmp x9, #48
223
+ bgt .vl_gt_48_pixel_sse_pp_64x64
224
+ ptrue p0.b, vl32
225
+ ld1b {z16.b}, p0/z, x0
226
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
227
+ ld1b {z20.b}, p0/z, x2
228
+ ld1b {z21.b}, p0/z, x2, #1, mul vl
229
+ add x0, x0, x1
230
+ add x2, x2, x3
231
+ usublb z1.h, z16.b, z20.b
232
+ usublt z2.h, z16.b, z20.b
233
+ usublb z3.h, z17.b, z21.b
234
+ usublt z4.h, z17.b, z21.b
235
+ smullb z24.s, z1.h, z1.h
236
+ smullt z25.s, z1.h, z1.h
237
+ smlalb z24.s, z2.h, z2.h
238
+ smlalt z25.s, z2.h, z2.h
239
+ smlalb z24.s, z3.h, z3.h
240
+ smlalt z25.s, z3.h, z3.h
241
+ smlalb z24.s, z4.h, z4.h
242
+ smlalt z25.s, z4.h, z4.h
243
+.rept 63
244
+ ld1b {z16.b}, p0/z, x0
245
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
246
+ ld1b {z20.b}, p0/z, x2
247
+ ld1b {z21.b}, p0/z, x2, #1, mul vl
248
+ add x0, x0, x1
249
+ add x2, x2, x3
250
+ usublb z1.h, z16.b, z20.b
251
+ usublt z2.h, z16.b, z20.b
252
+ usublb z3.h, z17.b, z21.b
253
+ usublt z4.h, z17.b, z21.b
254
+ smlalb z24.s, z1.h, z1.h
255
+ smlalt z25.s, z1.h, z1.h
256
+ smlalb z24.s, z2.h, z2.h
257
+ smlalt z25.s, z2.h, z2.h
258
+ smlalb z24.s, z3.h, z3.h
259
+ smlalt z25.s, z3.h, z3.h
260
+ smlalb z24.s, z4.h, z4.h
261
+ smlalt z25.s, z4.h, z4.h
262
+.endr
263
+ uaddv d3, p0, z24.s
264
+ fmov w0, s3
265
+ uaddv d4, p0, z25.s
266
+ fmov w1, s4
267
+ add w0, w0, w1
268
+ ret
269
+.vl_gt_48_pixel_sse_pp_64x64:
270
+ ptrue p0.b, vl64
271
+ ld1b {z16.b}, p0/z, x0
272
+ ld1b {z20.b}, p0/z, x2
273
+ add x0, x0, x1
274
+ add x2, x2, x3
275
+ usublb z1.h, z16.b, z20.b
276
+ usublt z2.h, z16.b, z20.b
277
+ smullb z24.s, z1.h, z1.h
278
+ smullt z25.s, z1.h, z1.h
279
+ smlalb z24.s, z2.h, z2.h
280
+ smlalt z25.s, z2.h, z2.h
281
+.rept 63
282
+ ld1b {z16.b}, p0/z, x0
283
+ ld1b {z20.b}, p0/z, x2
284
+ add x0, x0, x1
285
+ add x2, x2, x3
286
+ usublb z1.h, z16.b, z20.b
287
+ usublt z2.h, z16.b, z20.b
288
+ smlalb z24.s, z1.h, z1.h
289
+ smlalt z25.s, z1.h, z1.h
290
+ smlalb z24.s, z2.h, z2.h
291
+ smlalt z25.s, z2.h, z2.h
292
+.endr
293
+ uaddv d3, p0, z24.s
294
+ fmov w0, s3
295
+ uaddv d4, p0, z25.s
296
+ fmov w1, s4
297
+ add w0, w0, w1
298
+ ret
299
+endfunc
300
+
301
+function PFX(pixel_sse_ss_4x4_sve2)
302
+ ptrue p0.b, vl8
303
+ ld1b {z16.b}, p0/z, x0
304
+ ld1b {z17.b}, p0/z, x2
305
+ add x0, x0, x1, lsl #1
306
+ add x2, x2, x3, lsl #1
307
+ sub z1.h, z16.h, z17.h
308
+ smullb z3.s, z1.h, z1.h
309
+ smullt z4.s, z1.h, z1.h
310
+.rept 3
311
+ ld1b {z16.b}, p0/z, x0
312
+ ld1b {z17.b}, p0/z, x2
313
+ add x0, x0, x1, lsl #1
314
+ add x2, x2, x3, lsl #1
315
+ sub z1.h, z16.h, z17.h
316
+ smlalb z3.s, z1.h, z1.h
317
+ smlalt z4.s, z1.h, z1.h
318
+.endr
319
+ uaddv d3, p0, z3.s
320
+ fmov w0, s3
321
+ uaddv d4, p0, z4.s
322
+ fmov w1, s4
323
+ add w0, w0, w1
324
+ ret
325
+endfunc
326
+
327
+function PFX(pixel_sse_ss_8x8_sve2)
328
+ ptrue p0.b, vl16
329
+ ld1b {z16.b}, p0/z, x0
330
+ ld1b {z17.b}, p0/z, x2
331
+ add x0, x0, x1, lsl #1
332
+ add x2, x2, x3, lsl #1
333
+ sub z1.h, z16.h, z17.h
334
+ smullb z3.s, z1.h, z1.h
335
+ smullt z4.s, z1.h, z1.h
336
+.rept 7
337
+ ld1b {z16.b}, p0/z, x0
338
+ ld1b {z17.b}, p0/z, x2
339
+ add x0, x0, x1, lsl #1
340
+ add x2, x2, x3, lsl #1
341
+ sub z1.h, z16.h, z17.h
342
+ smlalb z3.s, z1.h, z1.h
343
+ smlalt z4.s, z1.h, z1.h
344
+.endr
345
+ uaddv d3, p0, z3.s
346
+ fmov w0, s3
347
+ uaddv d4, p0, z4.s
348
+ fmov w1, s4
349
+ add w0, w0, w1
350
+ ret
351
+endfunc
352
+
353
+function PFX(pixel_sse_ss_16x16_sve2)
354
+ rdvl x9, #1
355
+ cmp x9, #16
356
+ bgt .vl_gt_16_pixel_sse_ss_16x16
357
+ ptrue p0.b, vl16
358
+ ld1b {z16.b}, p0/z, x0
359
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
360
+ ld1b {z18.b}, p0/z, x2
361
+ ld1b {z19.b}, p0/z, x2, #1, mul vl
362
+ add x0, x0, x1, lsl #1
363
+ add x2, x2, x3, lsl #1
364
+ sub z1.h, z16.h, z18.h
365
+ sub z2.h, z17.h, z19.h
366
+ smullb z3.s, z1.h, z1.h
367
+ smullt z4.s, z1.h, z1.h
368
+ smlalb z3.s, z2.h, z2.h
369
+ smlalt z4.s, z2.h, z2.h
370
+.rept 15
371
+ ld1b {z16.b}, p0/z, x0
372
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
373
+ ld1b {z18.b}, p0/z, x2
374
+ ld1b {z19.b}, p0/z, x2, #1, mul vl
375
+ add x0, x0, x1, lsl #1
376
+ add x2, x2, x3, lsl #1
377
+ sub z1.h, z16.h, z18.h
378
+ sub z2.h, z17.h, z19.h
379
+ smlalb z3.s, z1.h, z1.h
380
+ smlalt z4.s, z1.h, z1.h
381
+ smlalb z3.s, z2.h, z2.h
382
+ smlalt z4.s, z2.h, z2.h
383
+.endr
384
+ uaddv d3, p0, z3.s
385
+ fmov w0, s3
386
+ uaddv d4, p0, z4.s
387
+ fmov w1, s4
388
+ add w0, w0, w1
389
+ ret
390
+.vl_gt_16_pixel_sse_ss_16x16:
391
+ ptrue p0.b, vl32
392
+ ld1b {z16.b}, p0/z, x0
393
+ ld1b {z18.b}, p0/z, x2
394
+ add x0, x0, x1, lsl #1
395
+ add x2, x2, x3, lsl #1
396
+ sub z1.h, z16.h, z18.h
397
+ smullb z3.s, z1.h, z1.h
398
+ smullt z4.s, z1.h, z1.h
399
+.rept 15
400
+ ld1b {z16.b}, p0/z, x0
401
+ ld1b {z18.b}, p0/z, x2
402
+ add x0, x0, x1, lsl #1
403
+ add x2, x2, x3, lsl #1
404
+ sub z1.h, z16.h, z18.h
405
+ smlalb z3.s, z1.h, z1.h
406
+ smlalt z4.s, z1.h, z1.h
407
+.endr
408
+ uaddv d3, p0, z3.s
409
+ fmov w0, s3
410
+ uaddv d4, p0, z4.s
411
+ fmov w1, s4
412
+ add w0, w0, w1
413
+ ret
414
+endfunc
415
+
416
+function PFX(pixel_sse_ss_32x32_sve2)
417
+ rdvl x9, #1
418
+ cmp x9, #16
419
+ bgt .vl_gt_16_pixel_sse_ss_32x32
420
+ ptrue p0.b, vl16
421
+ ld1b {z16.b}, p0/z, x0
422
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
423
+ ld1b {z18.b}, p0/z, x0, #2, mul vl
424
+ ld1b {z19.b}, p0/z, x0, #3, mul vl
425
+ ld1b {z20.b}, p0/z, x2
426
+ ld1b {z21.b}, p0/z, x2, #1, mul vl
427
+ ld1b {z22.b}, p0/z, x2, #2, mul vl
428
+ ld1b {z23.b}, p0/z, x2, #3, mul vl
429
+ add x0, x0, x1, lsl #1
430
+ add x2, x2, x3, lsl #1
431
+ sub z1.h, z16.h, z20.h
432
+ sub z2.h, z17.h, z21.h
433
+ sub z3.h, z18.h, z22.h
434
+ sub z4.h, z19.h, z23.h
435
+ smullb z5.s, z1.h, z1.h
436
+ smullt z6.s, z1.h, z1.h
437
+ smlalb z5.s, z2.h, z2.h
438
+ smlalt z6.s, z2.h, z2.h
439
+ smlalb z5.s, z3.h, z3.h
440
+ smlalt z6.s, z3.h, z3.h
441
+ smlalb z5.s, z4.h, z4.h
442
+ smlalt z6.s, z4.h, z4.h
443
+.rept 31
444
+ ld1b {z16.b}, p0/z, x0
445
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
446
+ ld1b {z18.b}, p0/z, x0, #2, mul vl
447
+ ld1b {z19.b}, p0/z, x0, #3, mul vl
448
+ ld1b {z20.b}, p0/z, x2
449
+ ld1b {z21.b}, p0/z, x2, #1, mul vl
450
+ ld1b {z22.b}, p0/z, x2, #2, mul vl
451
+ ld1b {z23.b}, p0/z, x2, #3, mul vl
452
+ add x0, x0, x1, lsl #1
453
+ add x2, x2, x3, lsl #1
454
+ sub z1.h, z16.h, z20.h
455
+ sub z2.h, z17.h, z21.h
456
+ sub z3.h, z18.h, z22.h
457
+ sub z4.h, z19.h, z23.h
458
+ smlalb z5.s, z1.h, z1.h
459
+ smlalt z6.s, z1.h, z1.h
460
+ smlalb z5.s, z2.h, z2.h
461
+ smlalt z6.s, z2.h, z2.h
462
+ smlalb z5.s, z3.h, z3.h
463
+ smlalt z6.s, z3.h, z3.h
464
+ smlalb z5.s, z4.h, z4.h
465
+ smlalt z6.s, z4.h, z4.h
466
+.endr
467
+ uaddv d3, p0, z5.s
468
+ fmov w0, s3
469
+ uaddv d4, p0, z6.s
470
+ fmov w1, s4
471
+ add w0, w0, w1
472
+ ret
473
+.vl_gt_16_pixel_sse_ss_32x32:
474
+ cmp x9, #48
475
+ bgt .vl_gt_48_pixel_sse_ss_32x32
476
+ ptrue p0.b, vl32
477
+ ld1b {z16.b}, p0/z, x0
478
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
479
+ ld1b {z20.b}, p0/z, x2
480
+ ld1b {z21.b}, p0/z, x2, #1, mul vl
481
+ add x0, x0, x1, lsl #1
482
+ add x2, x2, x3, lsl #1
483
+ sub z1.h, z16.h, z20.h
484
+ sub z2.h, z17.h, z21.h
485
+ smullb z5.s, z1.h, z1.h
486
+ smullt z6.s, z1.h, z1.h
487
+ smlalb z5.s, z2.h, z2.h
488
+ smlalt z6.s, z2.h, z2.h
489
+.rept 31
490
+ ld1b {z16.b}, p0/z, x0
491
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
492
+ ld1b {z20.b}, p0/z, x2
493
+ ld1b {z21.b}, p0/z, x2, #1, mul vl
494
+ add x0, x0, x1, lsl #1
495
+ add x2, x2, x3, lsl #1
496
+ sub z1.h, z16.h, z20.h
497
+ sub z2.h, z17.h, z21.h
498
+ smlalb z5.s, z1.h, z1.h
499
+ smlalt z6.s, z1.h, z1.h
500
+ smlalb z5.s, z2.h, z2.h
501
+ smlalt z6.s, z2.h, z2.h
502
+.endr
503
+ uaddv d3, p0, z5.s
504
+ fmov w0, s3
505
+ uaddv d4, p0, z6.s
506
+ fmov w1, s4
507
+ add w0, w0, w1
508
+ ret
509
+.vl_gt_48_pixel_sse_ss_32x32:
510
+ ptrue p0.b, vl64
511
+ ld1b {z16.b}, p0/z, x0
512
+ ld1b {z20.b}, p0/z, x2
513
+ add x0, x0, x1, lsl #1
514
+ add x2, x2, x3, lsl #1
515
+ sub z1.h, z16.h, z20.h
516
+ smullb z5.s, z1.h, z1.h
517
+ smullt z6.s, z1.h, z1.h
518
+.rept 31
519
+ ld1b {z16.b}, p0/z, x0
520
+ ld1b {z20.b}, p0/z, x2
521
+ add x0, x0, x1, lsl #1
522
+ add x2, x2, x3, lsl #1
523
+ sub z1.h, z16.h, z20.h
524
+ smlalb z5.s, z1.h, z1.h
525
+ smlalt z6.s, z1.h, z1.h
526
+.endr
527
+ uaddv d3, p0, z5.s
528
+ fmov w0, s3
529
+ uaddv d4, p0, z6.s
530
+ fmov w1, s4
531
+ add w0, w0, w1
532
+ ret
533
+endfunc
534
+
535
+function PFX(pixel_sse_ss_64x64_sve2)
536
+ rdvl x9, #1
537
+ cmp x9, #16
538
+ bgt .vl_gt_16_pixel_sse_ss_64x64
539
+ ptrue p0.b, vl16
540
+ ld1b {z24.b}, p0/z, x0
541
+ ld1b {z25.b}, p0/z, x0, #1, mul vl
542
+ ld1b {z26.b}, p0/z, x0, #2, mul vl
543
+ ld1b {z27.b}, p0/z, x0, #3, mul vl
544
+ ld1b {z28.b}, p0/z, x2
545
+ ld1b {z29.b}, p0/z, x2, #1, mul vl
546
+ ld1b {z30.b}, p0/z, x2, #2, mul vl
547
+ ld1b {z31.b}, p0/z, x2, #3, mul vl
548
+ sub z0.h, z24.h, z28.h
549
+ sub z1.h, z25.h, z29.h
550
+ sub z2.h, z26.h, z30.h
551
+ sub z3.h, z27.h, z31.h
552
+ smullb z5.s, z0.h, z0.h
553
+ smullt z6.s, z0.h, z0.h
554
+ smlalb z5.s, z1.h, z1.h
555
+ smlalt z6.s, z1.h, z1.h
556
+ smlalb z5.s, z2.h, z2.h
557
+ smlalt z6.s, z2.h, z2.h
558
+ smlalb z5.s, z3.h, z3.h
559
+ smlalt z6.s, z3.h, z3.h
560
+ ld1b {z24.b}, p0/z, x0, #4, mul vl
561
+ ld1b {z25.b}, p0/z, x0, #5, mul vl
562
+ ld1b {z26.b}, p0/z, x0, #6, mul vl
563
+ ld1b {z27.b}, p0/z, x0, #7, mul vl
564
+ ld1b {z28.b}, p0/z, x2, #4, mul vl
565
+ ld1b {z29.b}, p0/z, x2, #5, mul vl
566
+ ld1b {z30.b}, p0/z, x2, #6, mul vl
567
+ ld1b {z31.b}, p0/z, x2, #7, mul vl
568
+ sub z0.h, z24.h, z28.h
569
+ sub z1.h, z25.h, z29.h
570
+ sub z2.h, z26.h, z30.h
571
+ sub z3.h, z27.h, z31.h
572
+ smlalb z5.s, z0.h, z0.h
573
+ smlalt z6.s, z0.h, z0.h
574
+ smlalb z5.s, z1.h, z1.h
575
+ smlalt z6.s, z1.h, z1.h
576
+ smlalb z5.s, z2.h, z2.h
577
+ smlalt z6.s, z2.h, z2.h
578
+ smlalb z5.s, z3.h, z3.h
579
+ smlalt z6.s, z3.h, z3.h
580
+ add x0, x0, x1, lsl #1
581
+ add x2, x2, x3, lsl #1
582
+.rept 63
583
+ ld1b {z24.b}, p0/z, x0
584
+ ld1b {z25.b}, p0/z, x0, #1, mul vl
585
+ ld1b {z26.b}, p0/z, x0, #2, mul vl
586
+ ld1b {z27.b}, p0/z, x0, #3, mul vl
587
+ ld1b {z28.b}, p0/z, x2
588
+ ld1b {z29.b}, p0/z, x2, #1, mul vl
589
+ ld1b {z30.b}, p0/z, x2, #2, mul vl
590
+ ld1b {z31.b}, p0/z, x2, #3, mul vl
591
+ sub z0.h, z24.h, z28.h
592
+ sub z1.h, z25.h, z29.h
593
+ sub z2.h, z26.h, z30.h
594
+ sub z3.h, z27.h, z31.h
595
+ smlalb z5.s, z0.h, z0.h
596
+ smlalt z6.s, z0.h, z0.h
597
+ smlalb z5.s, z1.h, z1.h
598
+ smlalt z6.s, z1.h, z1.h
599
+ smlalb z5.s, z2.h, z2.h
600
+ smlalt z6.s, z2.h, z2.h
601
+ smlalb z5.s, z3.h, z3.h
602
+ smlalt z6.s, z3.h, z3.h
603
+ ld1b {z24.b}, p0/z, x0, #4, mul vl
604
+ ld1b {z25.b}, p0/z, x0, #5, mul vl
605
+ ld1b {z26.b}, p0/z, x0, #6, mul vl
606
+ ld1b {z27.b}, p0/z, x0, #7, mul vl
607
+ ld1b {z28.b}, p0/z, x2, #4, mul vl
608
+ ld1b {z29.b}, p0/z, x2, #5, mul vl
609
+ ld1b {z30.b}, p0/z, x2, #6, mul vl
610
+ ld1b {z31.b}, p0/z, x2, #7, mul vl
611
+ sub z0.h, z24.h, z28.h
612
+ sub z1.h, z25.h, z29.h
613
+ sub z2.h, z26.h, z30.h
614
+ sub z3.h, z27.h, z31.h
615
+ smlalb z5.s, z0.h, z0.h
616
+ smlalt z6.s, z0.h, z0.h
617
+ smlalb z5.s, z1.h, z1.h
618
+ smlalt z6.s, z1.h, z1.h
619
+ smlalb z5.s, z2.h, z2.h
620
+ smlalt z6.s, z2.h, z2.h
621
+ smlalb z5.s, z3.h, z3.h
622
+ smlalt z6.s, z3.h, z3.h
623
+ add x0, x0, x1, lsl #1
624
+ add x2, x2, x3, lsl #1
625
+.endr
626
+ uaddv d3, p0, z5.s
627
+ fmov w0, s3
628
+ uaddv d4, p0, z6.s
629
+ fmov w1, s4
630
+ add w0, w0, w1
631
+ ret
632
+.vl_gt_16_pixel_sse_ss_64x64:
633
+ cmp x9, #48
634
+ bgt .vl_gt_48_pixel_sse_ss_64x64
635
+ ptrue p0.b, vl32
636
+ ld1b {z24.b}, p0/z, x0
637
+ ld1b {z25.b}, p0/z, x0, #1, mul vl
638
+ ld1b {z28.b}, p0/z, x2
639
+ ld1b {z29.b}, p0/z, x2, #1, mul vl
640
+ sub z0.h, z24.h, z28.h
641
+ sub z1.h, z25.h, z29.h
642
+ smullb z5.s, z0.h, z0.h
643
+ smullt z6.s, z0.h, z0.h
644
+ smlalb z5.s, z1.h, z1.h
645
+ smlalt z6.s, z1.h, z1.h
646
+ ld1b {z24.b}, p0/z, x0, #1, mul vl
647
+ ld1b {z25.b}, p0/z, x0, #2, mul vl
648
+ ld1b {z28.b}, p0/z, x2, #1, mul vl
649
+ ld1b {z29.b}, p0/z, x2, #2, mul vl
650
+ sub z0.h, z24.h, z28.h
651
+ sub z1.h, z25.h, z29.h
652
+ smlalb z5.s, z0.h, z0.h
653
+ smlalt z6.s, z0.h, z0.h
654
+ smlalb z5.s, z1.h, z1.h
655
+ smlalt z6.s, z1.h, z1.h
656
+ add x0, x0, x1, lsl #1
657
+ add x2, x2, x3, lsl #1
658
+.rept 63
659
+ ld1b {z24.b}, p0/z, x0
660
+ ld1b {z25.b}, p0/z, x0, #1, mul vl
661
+ ld1b {z28.b}, p0/z, x2
662
+ ld1b {z29.b}, p0/z, x2, #1, mul vl
663
+ sub z0.h, z24.h, z28.h
664
+ sub z1.h, z25.h, z29.h
665
+ smlalb z5.s, z0.h, z0.h
666
+ smlalt z6.s, z0.h, z0.h
667
+ smlalb z5.s, z1.h, z1.h
668
+ smlalt z6.s, z1.h, z1.h
669
+ ld1b {z24.b}, p0/z, x0, #1, mul vl
670
+ ld1b {z25.b}, p0/z, x0, #2, mul vl
671
+ ld1b {z28.b}, p0/z, x2, #1, mul vl
672
+ ld1b {z29.b}, p0/z, x2, #2, mul vl
673
+ sub z0.h, z24.h, z28.h
674
+ sub z1.h, z25.h, z29.h
675
+ smlalb z5.s, z0.h, z0.h
676
+ smlalt z6.s, z0.h, z0.h
677
+ smlalb z5.s, z1.h, z1.h
678
+ smlalt z6.s, z1.h, z1.h
679
+ add x0, x0, x1, lsl #1
680
+ add x2, x2, x3, lsl #1
681
+.endr
682
+ uaddv d3, p0, z5.s
683
+ fmov w0, s3
684
+ uaddv d4, p0, z6.s
685
+ fmov w1, s4
686
+ add w0, w0, w1
687
+ ret
688
+.vl_gt_48_pixel_sse_ss_64x64:
689
+ cmp x9, #112
690
+ bgt .vl_gt_112_pixel_sse_ss_64x64
691
+ ptrue p0.b, vl64
692
+ ld1b {z24.b}, p0/z, x0
693
+ ld1b {z28.b}, p0/z, x2
694
+ sub z0.h, z24.h, z28.h
695
+ smullb z5.s, z0.h, z0.h
696
+ smullt z6.s, z0.h, z0.h
697
+ ld1b {z24.b}, p0/z, x0, #1, mul vl
698
+ ld1b {z28.b}, p0/z, x2, #1, mul vl
699
+ sub z0.h, z24.h, z28.h
700
+ smlalb z5.s, z0.h, z0.h
701
+ smlalt z6.s, z0.h, z0.h
702
+ add x0, x0, x1, lsl #1
703
+ add x2, x2, x3, lsl #1
704
+.rept 63
705
+ ld1b {z24.b}, p0/z, x0
706
+ ld1b {z28.b}, p0/z, x2
707
+ sub z0.h, z24.h, z28.h
708
+ smlalb z5.s, z0.h, z0.h
709
+ smlalt z6.s, z0.h, z0.h
710
+ ld1b {z24.b}, p0/z, x0, #1, mul vl
711
+ ld1b {z28.b}, p0/z, x2, #1, mul vl
712
+ sub z0.h, z24.h, z28.h
713
+ smlalb z5.s, z0.h, z0.h
714
+ smlalt z6.s, z0.h, z0.h
715
+ add x0, x0, x1, lsl #1
716
+ add x2, x2, x3, lsl #1
717
+.endr
718
+ uaddv d3, p0, z5.s
719
+ fmov w0, s3
720
+ uaddv d4, p0, z6.s
721
+ fmov w1, s4
722
+ add w0, w0, w1
723
+ ret
724
+.vl_gt_112_pixel_sse_ss_64x64:
725
+ ptrue p0.b, vl128
726
+ ld1b {z24.b}, p0/z, x0
727
+ ld1b {z28.b}, p0/z, x2
728
+ sub z0.h, z24.h, z28.h
729
+ smullb z5.s, z0.h, z0.h
730
+ smullt z6.s, z0.h, z0.h
731
+ add x0, x0, x1, lsl #1
732
+ add x2, x2, x3, lsl #1
733
+.rept 63
734
+ ld1b {z24.b}, p0/z, x0
735
+ ld1b {z28.b}, p0/z, x2
736
+ sub z0.h, z24.h, z28.h
737
+ smlalb z5.s, z0.h, z0.h
738
+ smlalt z6.s, z0.h, z0.h
739
+ add x0, x0, x1, lsl #1
740
+ add x2, x2, x3, lsl #1
741
+.endr
742
+ uaddv d3, p0, z5.s
743
+ fmov w0, s3
744
+ uaddv d4, p0, z6.s
745
+ fmov w1, s4
746
+ add w0, w0, w1
747
+ ret
748
+endfunc
749
+
750
+function PFX(pixel_ssd_s_4x4_sve2)
751
+ ptrue p0.b, vl8
752
+ ld1b {z16.b}, p0/z, x0
753
+ add x0, x0, x1, lsl #1
754
+ smullb z0.s, z16.h, z16.h
755
+ smlalt z0.s, z16.h, z16.h
756
+.rept 3
757
+ ld1b {z16.b}, p0/z, x0
758
+ add x0, x0, x1, lsl #1
759
+ smlalb z0.s, z16.h, z16.h
760
+ smlalt z0.s, z16.h, z16.h
761
+.endr
762
+ uaddv d3, p0, z0.s
763
+ fmov w0, s3
764
+ ret
765
+endfunc
766
+
767
+function PFX(pixel_ssd_s_8x8_sve2)
768
+ ptrue p0.b, vl16
769
+ ld1b {z16.b}, p0/z, x0
770
+ add x0, x0, x1, lsl #1
771
+ smullb z0.s, z16.h, z16.h
772
+ smlalt z0.s, z16.h, z16.h
773
+.rept 7
774
+ ld1b {z16.b}, p0/z, x0
775
+ add x0, x0, x1, lsl #1
776
+ smlalb z0.s, z16.h, z16.h
777
+ smlalt z0.s, z16.h, z16.h
778
+.endr
779
+ uaddv d3, p0, z0.s
780
+ fmov w0, s3
781
+ ret
782
+endfunc
783
+
784
+function PFX(pixel_ssd_s_16x16_sve2)
785
+ rdvl x9, #1
786
+ cmp x9, #16
787
+ bgt .vl_gt_16_pixel_ssd_s_16x16
788
+ add x1, x1, x1
789
+ mov w12, #4
790
+ movi v0.16b, #0
791
+ movi v1.16b, #0
792
+.loop_ssd_s_16_sve2:
793
+ sub w12, w12, #1
794
+.rept 2
795
+ ld1 {v4.16b,v5.16b}, x0, x1
796
+ ld1 {v6.16b,v7.16b}, x0, x1
797
+ smlal v0.4s, v4.4h, v4.4h
798
+ smlal2 v1.4s, v4.8h, v4.8h
799
+ smlal v0.4s, v5.4h, v5.4h
800
+ smlal2 v1.4s, v5.8h, v5.8h
801
+ smlal v0.4s, v6.4h, v6.4h
802
+ smlal2 v1.4s, v6.8h, v6.8h
803
+ smlal v0.4s, v7.4h, v7.4h
804
+ smlal2 v1.4s, v7.8h, v7.8h
805
+.endr
806
+ cbnz w12, .loop_ssd_s_16_sve2
807
+ add v0.4s, v0.4s, v1.4s
808
+ ret_v0_w0
809
+.vl_gt_16_pixel_ssd_s_16x16:
810
+ ptrue p0.b, vl32
811
+ ld1b {z16.b}, p0/z, x0
812
+ add x0, x0, x1, lsl #1
813
+ smullb z0.s, z16.h, z16.h
814
+ smlalt z0.s, z16.h, z16.h
815
+.rept 15
816
+ ld1b {z16.b}, p0/z, x0
817
+ add x0, x0, x1, lsl #1
818
+ smlalb z0.s, z16.h, z16.h
819
+ smlalt z0.s, z16.h, z16.h
820
+.endr
821
+ uaddv d3, p0, z0.s
822
+ fmov w0, s3
823
+ ret
824
+endfunc
825
+
826
+function PFX(pixel_ssd_s_32x32_sve2)
827
+ rdvl x9, #1
828
+ cmp x9, #16
829
+ bgt .vl_gt_16_pixel_ssd_s_32x32
830
+ add x1, x1, x1
831
+ mov w12, #8
832
+ movi v0.16b, #0
833
+ movi v1.16b, #0
834
+.loop_ssd_s_32:
835
+ sub w12, w12, #1
836
+.rept 4
837
+ ld1 {v4.16b-v7.16b}, x0, x1
838
+ smlal v0.4s, v4.4h, v4.4h
839
+ smlal2 v1.4s, v4.8h, v4.8h
840
+ smlal v0.4s, v5.4h, v5.4h
841
+ smlal2 v1.4s, v5.8h, v5.8h
842
+ smlal v0.4s, v6.4h, v6.4h
843
+ smlal2 v1.4s, v6.8h, v6.8h
844
+ smlal v0.4s, v7.4h, v7.4h
845
+ smlal2 v1.4s, v7.8h, v7.8h
846
+.endr
847
+ cbnz w12, .loop_ssd_s_32
848
+ add v0.4s, v0.4s, v1.4s
849
+ ret_v0_w0
850
+.vl_gt_16_pixel_ssd_s_32x32:
851
+ cmp x9, #48
852
+ bgt .vl_gt_48_pixel_ssd_s_32x32
853
+ ptrue p0.b, vl32
854
+ ld1b {z16.b}, p0/z, x0
855
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
856
+ add x0, x0, x1, lsl #1
857
+ smullb z0.s, z16.h, z16.h
858
+ smlalt z0.s, z16.h, z16.h
859
+ smlalb z0.s, z17.h, z17.h
860
+ smlalt z0.s, z17.h, z17.h
861
+.rept 31
862
+ ld1b {z16.b}, p0/z, x0
863
+ ld1b {z17.b}, p0/z, x0, #1, mul vl
864
+ add x0, x0, x1, lsl #1
865
+ smlalb z0.s, z16.h, z16.h
866
+ smlalt z0.s, z16.h, z16.h
867
+ smlalb z0.s, z17.h, z17.h
868
+ smlalt z0.s, z17.h, z17.h
869
+.endr
870
+ uaddv d3, p0, z0.s
871
+ fmov w0, s3
872
+ ret
873
+.vl_gt_48_pixel_ssd_s_32x32:
874
+ ptrue p0.b, vl64
875
+ ld1b {z16.b}, p0/z, x0
876
+ add x0, x0, x1, lsl #1
877
+ smullb z0.s, z16.h, z16.h
878
+ smlalt z0.s, z16.h, z16.h
879
+.rept 31
880
+ ld1b {z16.b}, p0/z, x0
881
+ add x0, x0, x1, lsl #1
882
+ smlalb z0.s, z16.h, z16.h
883
+ smlalt z0.s, z16.h, z16.h
884
+.endr
885
+ uaddv d3, p0, z0.s
886
+ fmov w0, s3
887
+ ret
888
+endfunc
889
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S
Added
478
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "ssd-a-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+function PFX(pixel_sse_pp_4x4_neon)
39
+ ld1 {v16.s}0, x0, x1
40
+ ld1 {v17.s}0, x2, x3
41
+ ld1 {v18.s}0, x0, x1
42
+ ld1 {v19.s}0, x2, x3
43
+ ld1 {v20.s}0, x0, x1
44
+ ld1 {v21.s}0, x2, x3
45
+ ld1 {v22.s}0, x0, x1
46
+ ld1 {v23.s}0, x2, x3
47
+
48
+ usubl v1.8h, v16.8b, v17.8b
49
+ usubl v2.8h, v18.8b, v19.8b
50
+ usubl v3.8h, v20.8b, v21.8b
51
+ usubl v4.8h, v22.8b, v23.8b
52
+
53
+ smull v0.4s, v1.4h, v1.4h
54
+ smlal v0.4s, v2.4h, v2.4h
55
+ smlal v0.4s, v3.4h, v3.4h
56
+ smlal v0.4s, v4.4h, v4.4h
57
+ ret_v0_w0
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_neon)
61
+ ld1 {v16.s}0, x0, x1
62
+ ld1 {v17.s}0, x2, x3
63
+ usubl v1.8h, v16.8b, v17.8b
64
+ ld1 {v16.s}0, x0, x1
65
+ ld1 {v17.s}0, x2, x3
66
+ smull v0.4s, v1.4h, v1.4h
67
+.rept 6
68
+ usubl v1.8h, v16.8b, v17.8b
69
+ ld1 {v16.s}0, x0, x1
70
+ smlal v0.4s, v1.4h, v1.4h
71
+ ld1 {v17.s}0, x2, x3
72
+.endr
73
+ usubl v1.8h, v16.8b, v17.8b
74
+ smlal v0.4s, v1.4h, v1.4h
75
+ ret_v0_w0
76
+endfunc
77
+
78
+function PFX(pixel_sse_pp_8x8_neon)
79
+ ld1 {v16.8b}, x0, x1
80
+ ld1 {v17.8b}, x2, x3
81
+ usubl v1.8h, v16.8b, v17.8b
82
+ ld1 {v16.8b}, x0, x1
83
+ smull v0.4s, v1.4h, v1.4h
84
+ smlal2 v0.4s, v1.8h, v1.8h
85
+ ld1 {v17.8b}, x2, x3
86
+
87
+.rept 6
88
+ usubl v1.8h, v16.8b, v17.8b
89
+ ld1 {v16.8b}, x0, x1
90
+ smlal v0.4s, v1.4h, v1.4h
91
+ smlal2 v0.4s, v1.8h, v1.8h
92
+ ld1 {v17.8b}, x2, x3
93
+.endr
94
+ usubl v1.8h, v16.8b, v17.8b
95
+ smlal v0.4s, v1.4h, v1.4h
96
+ smlal2 v0.4s, v1.8h, v1.8h
97
+ ret_v0_w0
98
+endfunc
99
+
100
+function PFX(pixel_sse_pp_8x16_neon)
101
+ ld1 {v16.8b}, x0, x1
102
+ ld1 {v17.8b}, x2, x3
103
+ usubl v1.8h, v16.8b, v17.8b
104
+ ld1 {v16.8b}, x0, x1
105
+ smull v0.4s, v1.4h, v1.4h
106
+ smlal2 v0.4s, v1.8h, v1.8h
107
+ ld1 {v17.8b}, x2, x3
108
+
109
+.rept 14
110
+ usubl v1.8h, v16.8b, v17.8b
111
+ ld1 {v16.8b}, x0, x1
112
+ smlal v0.4s, v1.4h, v1.4h
113
+ smlal2 v0.4s, v1.8h, v1.8h
114
+ ld1 {v17.8b}, x2, x3
115
+.endr
116
+ usubl v1.8h, v16.8b, v17.8b
117
+ smlal v0.4s, v1.4h, v1.4h
118
+ smlal2 v0.4s, v1.8h, v1.8h
119
+ ret_v0_w0
120
+endfunc
121
+
122
+.macro sse_pp_16xN h
123
+function PFX(pixel_sse_pp_16x\h\()_neon)
124
+ ld1 {v16.16b}, x0, x1
125
+ ld1 {v17.16b}, x2, x3
126
+ usubl v1.8h, v16.8b, v17.8b
127
+ usubl2 v2.8h, v16.16b, v17.16b
128
+ ld1 {v16.16b}, x0, x1
129
+ ld1 {v17.16b}, x2, x3
130
+ smull v0.4s, v1.4h, v1.4h
131
+ smlal2 v0.4s, v1.8h, v1.8h
132
+ smlal v0.4s, v2.4h, v2.4h
133
+ smlal2 v0.4s, v2.8h, v2.8h
134
+.rept \h - 2
135
+ usubl v1.8h, v16.8b, v17.8b
136
+ usubl2 v2.8h, v16.16b, v17.16b
137
+ ld1 {v16.16b}, x0, x1
138
+ smlal v0.4s, v1.4h, v1.4h
139
+ smlal2 v0.4s, v1.8h, v1.8h
140
+ ld1 {v17.16b}, x2, x3
141
+ smlal v0.4s, v2.4h, v2.4h
142
+ smlal2 v0.4s, v2.8h, v2.8h
143
+.endr
144
+ usubl v1.8h, v16.8b, v17.8b
145
+ usubl2 v2.8h, v16.16b, v17.16b
146
+ smlal v0.4s, v1.4h, v1.4h
147
+ smlal2 v0.4s, v1.8h, v1.8h
148
+ smlal v0.4s, v2.4h, v2.4h
149
+ smlal2 v0.4s, v2.8h, v2.8h
150
+ ret_v0_w0
151
+endfunc
152
+.endm
153
+
154
+sse_pp_16xN 16
155
+sse_pp_16xN 32
156
+
157
+function PFX(pixel_sse_pp_32x32_neon)
158
+ mov w12, #8
159
+ movi v0.16b, #0
160
+ movi v1.16b, #0
161
+.loop_sse_pp_32:
162
+ sub w12, w12, #1
163
+.rept 4
164
+ ld1 {v16.16b,v17.16b}, x0, x1
165
+ ld1 {v18.16b,v19.16b}, x2, x3
166
+ usubl v2.8h, v16.8b, v18.8b
167
+ usubl2 v3.8h, v16.16b, v18.16b
168
+ usubl v4.8h, v17.8b, v19.8b
169
+ usubl2 v5.8h, v17.16b, v19.16b
170
+ smlal v0.4s, v2.4h, v2.4h
171
+ smlal2 v1.4s, v2.8h, v2.8h
172
+ smlal v0.4s, v3.4h, v3.4h
173
+ smlal2 v1.4s, v3.8h, v3.8h
174
+ smlal v0.4s, v4.4h, v4.4h
175
+ smlal2 v1.4s, v4.8h, v4.8h
176
+ smlal v0.4s, v5.4h, v5.4h
177
+ smlal2 v1.4s, v5.8h, v5.8h
178
+.endr
179
+ cbnz w12, .loop_sse_pp_32
180
+ add v0.4s, v0.4s, v1.4s
181
+ ret_v0_w0
182
+endfunc
183
+
184
+function PFX(pixel_sse_pp_32x64_neon)
185
+ mov w12, #16
186
+ movi v0.16b, #0
187
+ movi v1.16b, #0
188
+.loop_sse_pp_32x64:
189
+ sub w12, w12, #1
190
+.rept 4
191
+ ld1 {v16.16b,v17.16b}, x0, x1
192
+ ld1 {v18.16b,v19.16b}, x2, x3
193
+ usubl v2.8h, v16.8b, v18.8b
194
+ usubl2 v3.8h, v16.16b, v18.16b
195
+ usubl v4.8h, v17.8b, v19.8b
196
+ usubl2 v5.8h, v17.16b, v19.16b
197
+ smlal v0.4s, v2.4h, v2.4h
198
+ smlal2 v1.4s, v2.8h, v2.8h
199
+ smlal v0.4s, v3.4h, v3.4h
200
+ smlal2 v1.4s, v3.8h, v3.8h
201
+ smlal v0.4s, v4.4h, v4.4h
202
+ smlal2 v1.4s, v4.8h, v4.8h
203
+ smlal v0.4s, v5.4h, v5.4h
204
+ smlal2 v1.4s, v5.8h, v5.8h
205
+.endr
206
+ cbnz w12, .loop_sse_pp_32x64
207
+ add v0.4s, v0.4s, v1.4s
208
+ ret_v0_w0
209
+endfunc
210
+
211
+function PFX(pixel_sse_pp_64x64_neon)
212
+ mov w12, #16
213
+ movi v0.16b, #0
214
+ movi v1.16b, #0
215
+
216
+.loop_sse_pp_64:
217
+ sub w12, w12, #1
218
+.rept 4
219
+ ld1 {v16.16b-v19.16b}, x0, x1
220
+ ld1 {v20.16b-v23.16b}, x2, x3
221
+
222
+ usubl v2.8h, v16.8b, v20.8b
223
+ usubl2 v3.8h, v16.16b, v20.16b
224
+ usubl v4.8h, v17.8b, v21.8b
225
+ usubl2 v5.8h, v17.16b, v21.16b
226
+ smlal v0.4s, v2.4h, v2.4h
227
+ smlal2 v1.4s, v2.8h, v2.8h
228
+ smlal v0.4s, v3.4h, v3.4h
229
+ smlal2 v1.4s, v3.8h, v3.8h
230
+ smlal v0.4s, v4.4h, v4.4h
231
+ smlal2 v1.4s, v4.8h, v4.8h
232
+ smlal v0.4s, v5.4h, v5.4h
233
+ smlal2 v1.4s, v5.8h, v5.8h
234
+
235
+ usubl v2.8h, v18.8b, v22.8b
236
+ usubl2 v3.8h, v18.16b, v22.16b
237
+ usubl v4.8h, v19.8b, v23.8b
238
+ usubl2 v5.8h, v19.16b, v23.16b
239
+ smlal v0.4s, v2.4h, v2.4h
240
+ smlal2 v1.4s, v2.8h, v2.8h
241
+ smlal v0.4s, v3.4h, v3.4h
242
+ smlal2 v1.4s, v3.8h, v3.8h
243
+ smlal v0.4s, v4.4h, v4.4h
244
+ smlal2 v1.4s, v4.8h, v4.8h
245
+ smlal v0.4s, v5.4h, v5.4h
246
+ smlal2 v1.4s, v5.8h, v5.8h
247
+.endr
248
+ cbnz w12, .loop_sse_pp_64
249
+ add v0.4s, v0.4s, v1.4s
250
+ ret_v0_w0
251
+endfunc
252
+
253
+function PFX(pixel_sse_ss_4x4_neon)
254
+ add x1, x1, x1
255
+ add x3, x3, x3
256
+ ld1 {v16.8b}, x0, x1
257
+ ld1 {v17.8b}, x2, x3
258
+ sub v2.4h, v16.4h, v17.4h
259
+ ld1 {v16.8b}, x0, x1
260
+ ld1 {v17.8b}, x2, x3
261
+ smull v0.4s, v2.4h, v2.4h
262
+ sub v2.4h, v16.4h, v17.4h
263
+ ld1 {v16.8b}, x0, x1
264
+ ld1 {v17.8b}, x2, x3
265
+ smlal v0.4s, v2.4h, v2.4h
266
+ sub v2.4h, v16.4h, v17.4h
267
+ ld1 {v16.8b}, x0, x1
268
+ smlal v0.4s, v2.4h, v2.4h
269
+ ld1 {v17.8b}, x2, x3
270
+ sub v2.4h, v16.4h, v17.4h
271
+ smlal v0.4s, v2.4h, v2.4h
272
+ ret_v0_w0
273
+endfunc
274
+
275
+function PFX(pixel_sse_ss_8x8_neon)
276
+ add x1, x1, x1
277
+ add x3, x3, x3
278
+ ld1 {v16.16b}, x0, x1
279
+ ld1 {v17.16b}, x2, x3
280
+ sub v2.8h, v16.8h, v17.8h
281
+ ld1 {v16.16b}, x0, x1
282
+ ld1 {v17.16b}, x2, x3
283
+ smull v0.4s, v2.4h, v2.4h
284
+ smull2 v1.4s, v2.8h, v2.8h
285
+ sub v2.8h, v16.8h, v17.8h
286
+.rept 6
287
+ ld1 {v16.16b}, x0, x1
288
+ ld1 {v17.16b}, x2, x3
289
+ smlal v0.4s, v2.4h, v2.4h
290
+ smlal2 v1.4s, v2.8h, v2.8h
291
+ sub v2.8h, v16.8h, v17.8h
292
+.endr
293
+ smlal v0.4s, v2.4h, v2.4h
294
+ smlal2 v1.4s, v2.8h, v2.8h
295
+ add v0.4s, v0.4s, v1.4s
296
+ ret_v0_w0
297
+endfunc
298
+
299
+function PFX(pixel_sse_ss_16x16_neon)
300
+ add x1, x1, x1
301
+ add x3, x3, x3
302
+ mov w12, #4
303
+ movi v0.16b, #0
304
+ movi v1.16b, #0
305
+.loop_sse_ss_16:
306
+ sub w12, w12, #1
307
+.rept 4
308
+ ld1 {v16.16b, v17.16b}, x0, x1
309
+ ld1 {v18.16b, v19.16b}, x2, x3
310
+ sub v2.8h, v16.8h, v18.8h
311
+ sub v3.8h, v17.8h, v19.8h
312
+ smlal v0.4s, v2.4h, v2.4h
313
+ smlal2 v1.4s, v2.8h, v2.8h
314
+ smlal v0.4s, v3.4h, v3.4h
315
+ smlal2 v1.4s, v3.8h, v3.8h
316
+.endr
317
+ cbnz w12, .loop_sse_ss_16
318
+ add v0.4s, v0.4s, v1.4s
319
+ ret_v0_w0
320
+endfunc
321
+
322
+function PFX(pixel_sse_ss_32x32_neon)
323
+ add x1, x1, x1
324
+ add x3, x3, x3
325
+
326
+ mov w12, #8
327
+ movi v0.16b, #0
328
+ movi v1.16b, #0
329
+.loop_sse_ss_32:
330
+ sub w12, w12, #1
331
+.rept 4
332
+ ld1 {v16.16b-v19.16b}, x0, x1
333
+ ld1 {v20.16b-v23.16b}, x2, x3
334
+ sub v2.8h, v16.8h, v20.8h
335
+ sub v3.8h, v17.8h, v21.8h
336
+ sub v4.8h, v18.8h, v22.8h
337
+ sub v5.8h, v19.8h, v23.8h
338
+ smlal v0.4s, v2.4h, v2.4h
339
+ smlal2 v1.4s, v2.8h, v2.8h
340
+ smlal v0.4s, v3.4h, v3.4h
341
+ smlal2 v1.4s, v3.8h, v3.8h
342
+ smlal v0.4s, v4.4h, v4.4h
343
+ smlal2 v1.4s, v4.8h, v4.8h
344
+ smlal v0.4s, v5.4h, v5.4h
345
+ smlal2 v1.4s, v5.8h, v5.8h
346
+.endr
347
+ cbnz w12, .loop_sse_ss_32
348
+ add v0.4s, v0.4s, v1.4s
349
+ ret_v0_w0
350
+endfunc
351
+
352
+function PFX(pixel_sse_ss_64x64_neon)
353
+ add x1, x1, x1
354
+ add x3, x3, x3
355
+ sub x1, x1, #64
356
+ sub x3, x3, #64
357
+
358
+ mov w12, #32
359
+ movi v0.16b, #0
360
+ movi v1.16b, #0
361
+.loop_sse_ss_64:
362
+ sub w12, w12, #1
363
+.rept 2
364
+ ld1 {v16.16b-v19.16b}, x0, #64
365
+ ld1 {v20.16b-v23.16b}, x2, #64
366
+ sub v2.8h, v16.8h, v20.8h
367
+ sub v3.8h, v17.8h, v21.8h
368
+ sub v4.8h, v18.8h, v22.8h
369
+ sub v5.8h, v19.8h, v23.8h
370
+ ld1 {v16.16b-v19.16b}, x0, x1
371
+ ld1 {v20.16b-v23.16b}, x2, x3
372
+ smlal v0.4s, v2.4h, v2.4h
373
+ smlal2 v1.4s, v2.8h, v2.8h
374
+ smlal v0.4s, v3.4h, v3.4h
375
+ smlal2 v1.4s, v3.8h, v3.8h
376
+ smlal v0.4s, v4.4h, v4.4h
377
+ smlal2 v1.4s, v4.8h, v4.8h
378
+ smlal v0.4s, v5.4h, v5.4h
379
+ smlal2 v1.4s, v5.8h, v5.8h
380
+ sub v2.8h, v16.8h, v20.8h
381
+ sub v3.8h, v17.8h, v21.8h
382
+ sub v4.8h, v18.8h, v22.8h
383
+ sub v5.8h, v19.8h, v23.8h
384
+ smlal v0.4s, v2.4h, v2.4h
385
+ smlal2 v1.4s, v2.8h, v2.8h
386
+ smlal v0.4s, v3.4h, v3.4h
387
+ smlal2 v1.4s, v3.8h, v3.8h
388
+ smlal v0.4s, v4.4h, v4.4h
389
+ smlal2 v1.4s, v4.8h, v4.8h
390
+ smlal v0.4s, v5.4h, v5.4h
391
+ smlal2 v1.4s, v5.8h, v5.8h
392
+.endr
393
+ cbnz w12, .loop_sse_ss_64
394
+ add v0.4s, v0.4s, v1.4s
395
+ ret_v0_w0
396
+endfunc
397
+
398
+function PFX(pixel_ssd_s_4x4_neon)
399
+ add x1, x1, x1
400
+ ld1 {v4.8b}, x0, x1
401
+ ld1 {v5.8b}, x0, x1
402
+ ld1 {v6.8b}, x0, x1
403
+ ld1 {v7.8b}, x0
404
+ smull v0.4s, v4.4h, v4.4h
405
+ smull v1.4s, v5.4h, v5.4h
406
+ smlal v0.4s, v6.4h, v6.4h
407
+ smlal v1.4s, v7.4h, v7.4h
408
+ add v0.4s, v0.4s, v1.4s
409
+ ret_v0_w0
410
+endfunc
411
+
412
+function PFX(pixel_ssd_s_8x8_neon)
413
+ add x1, x1, x1
414
+ ld1 {v4.16b}, x0, x1
415
+ ld1 {v5.16b}, x0, x1
416
+ smull v0.4s, v4.4h, v4.4h
417
+ smull2 v1.4s, v4.8h, v4.8h
418
+ smlal v0.4s, v5.4h, v5.4h
419
+ smlal2 v1.4s, v5.8h, v5.8h
420
+.rept 3
421
+ ld1 {v4.16b}, x0, x1
422
+ ld1 {v5.16b}, x0, x1
423
+ smlal v0.4s, v4.4h, v4.4h
424
+ smlal2 v1.4s, v4.8h, v4.8h
425
+ smlal v0.4s, v5.4h, v5.4h
426
+ smlal2 v1.4s, v5.8h, v5.8h
427
+.endr
428
+ add v0.4s, v0.4s, v1.4s
429
+ ret_v0_w0
430
+endfunc
431
+
432
+function PFX(pixel_ssd_s_16x16_neon)
433
+ add x1, x1, x1
434
+ mov w12, #4
435
+ movi v0.16b, #0
436
+ movi v1.16b, #0
437
+.loop_ssd_s_16:
438
+ sub w12, w12, #1
439
+.rept 2
440
+ ld1 {v4.16b,v5.16b}, x0, x1
441
+ ld1 {v6.16b,v7.16b}, x0, x1
442
+ smlal v0.4s, v4.4h, v4.4h
443
+ smlal2 v1.4s, v4.8h, v4.8h
444
+ smlal v0.4s, v5.4h, v5.4h
445
+ smlal2 v1.4s, v5.8h, v5.8h
446
+ smlal v0.4s, v6.4h, v6.4h
447
+ smlal2 v1.4s, v6.8h, v6.8h
448
+ smlal v0.4s, v7.4h, v7.4h
449
+ smlal2 v1.4s, v7.8h, v7.8h
450
+.endr
451
+ cbnz w12, .loop_ssd_s_16
452
+ add v0.4s, v0.4s, v1.4s
453
+ ret_v0_w0
454
+endfunc
455
+
456
+function PFX(pixel_ssd_s_32x32_neon)
457
+ add x1, x1, x1
458
+ mov w12, #8
459
+ movi v0.16b, #0
460
+ movi v1.16b, #0
461
+.loop_ssd_s_32:
462
+ sub w12, w12, #1
463
+.rept 4
464
+ ld1 {v4.16b-v7.16b}, x0, x1
465
+ smlal v0.4s, v4.4h, v4.4h
466
+ smlal2 v1.4s, v4.8h, v4.8h
467
+ smlal v0.4s, v5.4h, v5.4h
468
+ smlal2 v1.4s, v5.8h, v5.8h
469
+ smlal v0.4s, v6.4h, v6.4h
470
+ smlal2 v1.4s, v6.8h, v6.8h
471
+ smlal v0.4s, v7.4h, v7.4h
472
+ smlal2 v1.4s, v7.8h, v7.8h
473
+.endr
474
+ cbnz w12, .loop_ssd_s_32
475
+ add v0.4s, v0.4s, v1.4s
476
+ ret_v0_w0
477
+endfunc
478
x265_3.5.tar.gz/source/common/common.h -> x265_3.6.tar.gz/source/common/common.h
Changed
51
1
2
typedef uint64_t pixel4;
3
typedef int64_t ssum2_t;
4
#define SHIFT_TO_BITPLANE 9
5
-#define HISTOGRAM_BINS 1024
6
#else
7
typedef uint8_t pixel;
8
typedef uint16_t sum_t;
9
10
typedef uint32_t pixel4;
11
typedef int32_t ssum2_t; // Signed sum
12
#define SHIFT_TO_BITPLANE 7
13
-#define HISTOGRAM_BINS 256
14
#endif // if HIGH_BIT_DEPTH
15
16
#if X265_DEPTH < 10
17
18
19
#define MIN_QPSCALE 0.21249999999999999
20
#define MAX_MAX_QPSCALE 615.46574234477100
21
+#define FRAME_BRIGHTNESS_THRESHOLD 50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
22
+#define FRAME_EDGE_THRESHOLD 10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
23
24
25
template<typename T>
26
27
#define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
28
29
#define MAX_NUM_DYN_REFINE (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
30
+#define X265_BYTE 8
31
+
32
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
33
34
namespace X265_NS {
35
36
37
#define x265_unlink(fileName) unlink(fileName)
38
#define x265_rename(oldName, newName) rename(oldName, newName)
39
#endif
40
+/* Close a file */
41
+#define x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
42
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
43
+ if (fread(val, size, readSize, fileOffset) != readSize)\
44
+ {\
45
+ x265_log(NULL, X265_LOG_ERROR, errorMessage); \
46
+ return; \
47
+ }
48
int x265_exp2fix8(double x);
49
50
double x265_ssim2dB(double ssim);
51
x265_3.5.tar.gz/source/common/cpu.cpp -> x265_3.6.tar.gz/source/common/cpu.cpp
Changed
58
1
2
* Steve Borho <steve@borho.org>
3
* Hongbin Liu <liuhongbin1@huawei.com>
4
* Yimeng Su <yimeng.su@huawei.com>
5
+ * Josh Dekker <josh@itanimul.li>
6
+ * Jean-Baptiste Kempf <jb@videolan.org>
7
*
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License as published by
10
11
{ "NEON", X265_CPU_NEON },
12
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
13
14
+#elif X265_ARCH_ARM64
15
+ { "NEON", X265_CPU_NEON },
16
+#if defined(HAVE_SVE)
17
+ { "SVE", X265_CPU_SVE },
18
+#endif
19
+#if defined(HAVE_SVE2)
20
+ { "SVE2", X265_CPU_SVE2 },
21
+#endif
22
#elif X265_ARCH_POWER8
23
{ "Altivec", X265_CPU_ALTIVEC },
24
25
26
flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
27
#endif
28
// TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
29
-#elif X265_ARCH_ARM64
30
- flags |= X265_CPU_NEON;
31
#endif // if HAVE_ARMV6
32
return flags;
33
}
34
35
+#elif X265_ARCH_ARM64
36
+
37
+uint32_t cpu_detect(bool benableavx512)
38
+{
39
+ int flags = 0;
40
+
41
+ #if defined(HAVE_SVE2)
42
+ flags |= X265_CPU_SVE2;
43
+ flags |= X265_CPU_SVE;
44
+ flags |= X265_CPU_NEON;
45
+ #elif defined(HAVE_SVE)
46
+ flags |= X265_CPU_SVE;
47
+ flags |= X265_CPU_NEON;
48
+ #elif HAVE_NEON
49
+ flags |= X265_CPU_NEON;
50
+ #endif
51
+
52
+ return flags;
53
+}
54
+
55
#elif X265_ARCH_POWER8
56
57
uint32_t cpu_detect(bool benableavx512)
58
x265_3.5.tar.gz/source/common/frame.cpp -> x265_3.6.tar.gz/source/common/frame.cpp
Changed
102
1
2
m_edgeBitPlane = NULL;
3
m_edgeBitPic = NULL;
4
m_isInsideWindow = 0;
5
+
6
+ // mcstf
7
+ m_isSubSampled = NULL;
8
+ m_mcstf = NULL;
9
+ m_refPicCnt0 = 0;
10
+ m_refPicCnt1 = 0;
11
+ m_nextMCSTF = NULL;
12
+ m_prevMCSTF = NULL;
13
+
14
+ m_tempLayer = 0;
15
+ m_sameLayerRefPic = false;
16
}
17
18
bool Frame::create(x265_param *param, float* quantOffsets)
19
{
20
m_fencPic = new PicYuv;
21
m_param = param;
22
+
23
+ if (m_param->bEnableTemporalFilter)
24
+ {
25
+ m_mcstf = new TemporalFilter;
26
+ m_mcstf->init(param);
27
+
28
+ m_fencPicSubsampled2 = new PicYuv;
29
+ m_fencPicSubsampled4 = new PicYuv;
30
+
31
+ if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
32
+ return false;
33
+ if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
34
+ return false;
35
+
36
+ CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
37
+ }
38
+
39
CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
40
41
if (param->bCTUInfo)
42
43
return false;
44
}
45
46
+bool Frame::createSubSample()
47
+{
48
+
49
+ m_fencPicSubsampled2 = new PicYuv;
50
+ m_fencPicSubsampled4 = new PicYuv;
51
+
52
+ if (!m_fencPicSubsampled2->createScaledPicYUV(m_param, 2))
53
+ return false;
54
+ if (!m_fencPicSubsampled4->createScaledPicYUV(m_param, 4))
55
+ return false;
56
+ CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
57
+ return true;
58
+fail:
59
+ return false;
60
+}
61
+
62
bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
63
{
64
m_encData = new FrameData;
65
66
m_fencPic = NULL;
67
}
68
69
+ if (m_param->bEnableTemporalFilter)
70
+ {
71
+
72
+ if (m_fencPicSubsampled2)
73
+ {
74
+ m_fencPicSubsampled2->destroy();
75
+ delete m_fencPicSubsampled2;
76
+ m_fencPicSubsampled2 = NULL;
77
+ }
78
+
79
+ if (m_fencPicSubsampled4)
80
+ {
81
+ m_fencPicSubsampled4->destroy();
82
+ delete m_fencPicSubsampled4;
83
+ m_fencPicSubsampled4 = NULL;
84
+ }
85
+ delete m_mcstf;
86
+ X265_FREE(m_isSubSampled);
87
+ }
88
+
89
if (m_reconPic)
90
{
91
m_reconPic->destroy();
92
93
X265_FREE(m_addOnPrevChange);
94
m_addOnPrevChange = NULL;
95
}
96
- m_lowres.destroy();
97
+
98
+ m_lowres.destroy(m_param);
99
X265_FREE(m_rcData);
100
101
if (m_param->bDynamicRefine)
102
x265_3.5.tar.gz/source/common/frame.h -> x265_3.6.tar.gz/source/common/frame.h
Changed
60
1
2
#include "common.h"
3
#include "lowres.h"
4
#include "threading.h"
5
+#include "temporalfilter.h"
6
7
namespace X265_NS {
8
// private namespace
9
10
double count4;
11
double offset4;
12
double bufferFillFinal;
13
+ int64_t currentSatd;
14
};
15
16
class Frame
17
18
19
/* Data associated with x265_picture */
20
PicYuv* m_fencPic;
21
+ PicYuv* m_fencPicSubsampled2;
22
+ PicYuv* m_fencPicSubsampled4;
23
+
24
int m_poc;
25
int m_encodeOrder;
26
+ int m_gopOffset;
27
int64_t m_pts; // user provided presentation time stamp
28
int64_t m_reorderedPts;
29
int64_t m_dts;
30
31
bool m_classifyFrame;
32
int m_fieldNum;
33
34
+ /*MCSTF*/
35
+ TemporalFilter* m_mcstf;
36
+ int m_refPicCnt2;
37
+ Frame* m_nextMCSTF; // PicList doubly linked list pointers
38
+ Frame* m_prevMCSTF;
39
+ int* m_isSubSampled;
40
+
41
/* aq-mode 4 : Gaussian, edge and theta frames for edge information */
42
pixel* m_edgePic;
43
pixel* m_gaussianPic;
44
45
46
int m_isInsideWindow;
47
48
+ /*Frame's temporal layer info*/
49
+ uint8_t m_tempLayer;
50
+ int8_t m_gopId;
51
+ bool m_sameLayerRefPic;
52
+
53
Frame();
54
55
bool create(x265_param *param, float* quantOffsets);
56
+ bool createSubSample();
57
bool allocEncodeData(x265_param *param, const SPS& sps);
58
void reinit(const SPS& sps);
59
void destroy();
60
x265_3.5.tar.gz/source/common/framedata.cpp -> x265_3.6.tar.gz/source/common/framedata.cpp
Changed
10
1
2
}
3
else
4
return false;
5
- CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
6
+ CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame + 1);
7
CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
8
reinit(sps);
9
10
x265_3.5.tar.gz/source/common/lowres.cpp -> x265_3.6.tar.gz/source/common/lowres.cpp
Changed
154
1
2
3
using namespace X265_NS;
4
5
+/*
6
+ * Down Sample input picture
7
+ */
8
+static
9
+void frame_lowres_core(const pixel* src0, pixel* dst0,
10
+ intptr_t src_stride, intptr_t dst_stride, int width, int height)
11
+{
12
+ for (int y = 0; y < height; y++)
13
+ {
14
+ const pixel* src1 = src0 + src_stride;
15
+ for (int x = 0; x < width; x++)
16
+ {
17
+ // slower than naive bilinear, but matches asm
18
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
19
+ dst0x = FILTER(src02 * x, src12 * x, src02 * x + 1, src12 * x + 1);
20
+#undef FILTER
21
+ }
22
+ src0 += src_stride * 2;
23
+ dst0 += dst_stride;
24
+ }
25
+}
26
+
27
bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
28
{
29
aqPartWidth = partWidth;
30
31
32
size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
33
size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
34
- if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
35
+ if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
36
{
37
CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
38
CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
39
40
}
41
}
42
43
+ if (param->bHistBasedSceneCut)
44
+ {
45
+ quarterSampleLowResWidth = widthFullRes / 4;
46
+ quarterSampleLowResHeight = heightFullRes / 4;
47
+ quarterSampleLowResOriginX = 16;
48
+ quarterSampleLowResOriginY = 16;
49
+ quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
50
+
51
+ size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
52
+ /* allocate quarter sampled lowres buffers */
53
+ CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
54
+
55
+ // Allocate memory for Histograms
56
+ picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
57
+ picHistogram0 = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
58
+ for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
59
+ picHistogramwd = picHistogram0 + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
60
+ }
61
+
62
+ for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
63
+ {
64
+ for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
65
+ {
66
+ picHistogramregionInPictureWidthIndexregionInPictureHeightIndex = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
67
+ picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
68
+ for (uint32_t wd = 1; wd < 3; wd++) {
69
+ picHistogramregionInPictureWidthIndexregionInPictureHeightIndexwd = picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 + wd * HISTOGRAM_NUMBER_OF_BINS;
70
+ }
71
+ }
72
+ }
73
+ }
74
+
75
return true;
76
77
fail:
78
return false;
79
}
80
81
-void Lowres::destroy()
82
+void Lowres::destroy(x265_param* param)
83
{
84
X265_FREE(buffer0);
85
if(bEnableHME)
86
87
X265_FREE(invQscaleFactor8x8);
88
X265_FREE(edgeInclined);
89
X265_FREE(qpAqMotionOffset);
90
- X265_FREE(blockVariance);
91
+ if (param->bDynamicRefine || param->bEnableFades)
92
+ X265_FREE(blockVariance);
93
if (maxAQDepth > 0)
94
{
95
for (uint32_t d = 0; d < 4; d++)
96
97
98
delete pAQLayer;
99
}
100
+
101
+ // Histograms
102
+ if (param->bHistBasedSceneCut)
103
+ {
104
+ for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
105
+ {
106
+ if (picHistogramsegmentInFrameWidthIdx)
107
+ {
108
+ for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
109
+ {
110
+ if (picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx)
111
+ X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx0);
112
+ X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx);
113
+ }
114
+ }
115
+ }
116
+ if (picHistogram)
117
+ X265_FREE(picHistogram0);
118
+ X265_FREE(picHistogram);
119
+
120
+ X265_FREE(quarterSampleLowResBuffer);
121
+
122
+ }
123
}
124
// (re) initialize lowres state
125
void Lowres::init(PicYuv *origPic, int poc)
126
127
indB = 0;
128
memset(costEst, -1, sizeof(costEst));
129
memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
130
- interPCostPercDiff = 0.0;
131
- intraCostPercDiff = 0.0;
132
- m_bIsMaxThres = false;
133
- m_bIsHardScenecut = false;
134
135
if (qpAqOffset && invQscaleFactor)
136
memset(costEstAq, -1, sizeof(costEstAq));
137
138
}
139
140
fpelPlane0 = lowresPlane0;
141
+
142
+ if (origPic->m_param->bHistBasedSceneCut)
143
+ {
144
+ // Quarter Sampled Input Picture Formation
145
+ // TO DO: Replace with ASM function
146
+ frame_lowres_core(
147
+ lowresPlane0,
148
+ quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
149
+ lumaStride,
150
+ quarterSampleLowResStrideY,
151
+ widthFullRes / 4, heightFullRes / 4);
152
+ }
153
}
154
x265_3.5.tar.gz/source/common/lowres.h -> x265_3.6.tar.gz/source/common/lowres.h
Changed
73
1
2
namespace X265_NS {
3
// private namespace
4
5
+#define HISTOGRAM_NUMBER_OF_BINS 256
6
+#define NUMBER_OF_SEGMENTS_IN_WIDTH 4
7
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT 4
8
+
9
struct ReferencePlanes
10
{
11
ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
12
13
14
int frameNum; // Presentation frame number
15
int sliceType; // Slice type decided by lookahead
16
+ int sliceTypeReq; // Slice type required as per the QP file
17
int width; // width of lowres frame in pixels
18
int lines; // height of lowres frame in pixel lines
19
int leadingBframes; // number of leading B frames for P or I
20
21
double* qpAqOffset; // AQ QP offset values for each 16x16 CU
22
double* qpCuTreeOffset; // cuTree QP offset values for each 16x16 CU
23
double* qpAqMotionOffset;
24
- int* invQscaleFactor; // qScale values for qp Aq Offsets
25
+ int* invQscaleFactor; // qScale values for qp Aq Offsets
26
int* invQscaleFactor8x8; // temporary buffer for qg-size 8
27
uint32_t* blockVariance;
28
uint64_t wp_ssd3; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
29
uint64_t wp_sum3;
30
double frameVariance;
31
- int* edgeInclined;
32
+ int* edgeInclined;
33
34
35
/* cutree intermediate data */
36
37
uint32_t heightFullRes;
38
uint32_t m_maxCUSize;
39
uint32_t m_qgSize;
40
-
41
+
42
uint16_t* propagateCost;
43
double weightedCostDeltaX265_BFRAME_MAX + 2;
44
ReferencePlanes weightedRefX265_BFRAME_MAX + 2;
45
+
46
/* For hist-based scenecut */
47
- bool m_bIsMaxThres;
48
- double interPCostPercDiff;
49
- double intraCostPercDiff;
50
- bool m_bIsHardScenecut;
51
+ int quarterSampleLowResWidth; // width of 1/4 lowres frame in pixels
52
+ int quarterSampleLowResHeight; // height of 1/4 lowres frame in pixels
53
+ int quarterSampleLowResStrideY;
54
+ int quarterSampleLowResOriginX;
55
+ int quarterSampleLowResOriginY;
56
+ pixel *quarterSampleLowResBuffer;
57
+ bool bHistScenecutAnalyzed;
58
+
59
+ uint16_t picAvgVariance;
60
+ uint16_t picAvgVarianceCb;
61
+ uint16_t picAvgVarianceCr;
62
+
63
+ uint32_t ****picHistogram;
64
+ uint64_t averageIntensityPerSegmentNUMBER_OF_SEGMENTS_IN_WIDTHNUMBER_OF_SEGMENTS_IN_HEIGHT3;
65
+ uint8_t averageIntensity3;
66
67
bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
68
- void destroy();
69
+ void destroy(x265_param* param);
70
void init(PicYuv *origPic, int poc);
71
};
72
}
73
x265_3.5.tar.gz/source/common/mv.h -> x265_3.6.tar.gz/source/common/mv.h
Changed
10
1
2
{
3
return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y;
4
}
5
+
6
+ void set(int32_t _x, int32_t _y) { x = _x; y = _y; }
7
};
8
}
9
10
x265_3.5.tar.gz/source/common/param.cpp -> x265_3.6.tar.gz/source/common/param.cpp
Changed
668
1
2
param->bAnnexB = 1;
3
param->bRepeatHeaders = 0;
4
param->bEnableAccessUnitDelimiters = 0;
5
+ param->bEnableEndOfBitstream = 0;
6
+ param->bEnableEndOfSequence = 0;
7
param->bEmitHRDSEI = 0;
8
param->bEmitInfoSEI = 1;
9
param->bEmitHDRSEI = 0; /*Deprecated*/
10
11
param->keyframeMax = 250;
12
param->gopLookahead = 0;
13
param->bOpenGOP = 1;
14
+ param->craNal = 0;
15
param->bframes = 4;
16
param->lookaheadDepth = 20;
17
param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
18
param->bBPyramid = 1;
19
param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
20
- param->edgeTransitionThreshold = 0.03;
21
param->bHistBasedSceneCut = 0;
22
param->lookaheadSlices = 8;
23
param->lookaheadThreads = 0;
24
25
param->bEnableHRDConcatFlag = 0;
26
param->bEnableFades = 0;
27
param->bEnableSceneCutAwareQp = 0;
28
- param->fwdScenecutWindow = 500;
29
- param->fwdRefQpDelta = 5;
30
- param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
31
- param->bwdScenecutWindow = 100;
32
- param->bwdRefQpDelta = -1;
33
- param->bwdNonRefQpDelta = -1;
34
+ param->fwdMaxScenecutWindow = 1200;
35
+ param->bwdMaxScenecutWindow = 600;
36
+ for (int i = 0; i < 6; i++)
37
+ {
38
+ int deltas6 = { 5, 4, 3, 2, 1, 0 };
39
+
40
+ param->fwdScenecutWindowi = 200;
41
+ param->fwdRefQpDeltai = deltasi;
42
+ param->fwdNonRefQpDeltai = param->fwdRefQpDeltai + (SLICE_TYPE_DELTA * param->fwdRefQpDeltai);
43
+
44
+ param->bwdScenecutWindowi = 100;
45
+ param->bwdRefQpDeltai = -1;
46
+ param->bwdNonRefQpDeltai = -1;
47
+ }
48
49
/* Intra Coding Tools */
50
param->bEnableConstrainedIntra = 0;
51
52
param->rc.rfConstantMin = 0;
53
param->rc.bStatRead = 0;
54
param->rc.bStatWrite = 0;
55
+ param->rc.dataShareMode = X265_SHARE_MODE_FILE;
56
param->rc.statFileName = NULL;
57
+ param->rc.sharedMemName = NULL;
58
+ param->rc.bEncFocusedFramesOnly = 0;
59
param->rc.complexityBlur = 20;
60
param->rc.qblur = 0.5;
61
param->rc.zoneCount = 0;
62
63
param->maxLuma = PIXEL_MAX;
64
param->log2MaxPocLsb = 8;
65
param->maxSlices = 1;
66
+ param->videoSignalTypePreset = NULL;
67
68
/*Conformance window*/
69
param->confWinRightOffset = 0;
70
71
param->bEnableSvtHevc = 0;
72
param->svtHevcParam = NULL;
73
74
+ /* MCSTF */
75
+ param->bEnableTemporalFilter = 0;
76
+ param->temporalFilterStrength = 0.95;
77
+
78
#ifdef SVT_HEVC
79
param->svtHevcParam = svtParam;
80
svt_param_default(param);
81
#endif
82
+ /* Film grain characteristics model filename */
83
+ param->filmGrain = NULL;
84
+ param->bEnableSBRC = 0;
85
}
86
87
int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
88
89
#define atof(str) x265_atof(str, bError)
90
#define atobool(str) (x265_atobool(str, bError))
91
92
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
93
+{
94
+ bool bError = false;
95
+ char nameBuf64;
96
+ if (!name)
97
+ return X265_PARAM_BAD_NAME;
98
+ // skip -- prefix if provided
99
+ if (name0 == '-' && name1 == '-')
100
+ name += 2;
101
+ // s/_/-/g
102
+ if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
103
+ {
104
+ char *c;
105
+ strcpy(nameBuf, name);
106
+ while ((c = strchr(nameBuf, '_')) != 0)
107
+ *c = '-';
108
+ name = nameBuf;
109
+ }
110
+ if (!value)
111
+ value = "true";
112
+ else if (value0 == '=')
113
+ value++;
114
+#define OPT(STR) else if (!strcmp(name, STR))
115
+ if (0);
116
+ OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
117
+ OPT("masking-strength") bError = parseMaskingStrength(p, value);
118
+ else
119
+ return X265_PARAM_BAD_NAME;
120
+#undef OPT
121
+ return bError ? X265_PARAM_BAD_VALUE : 0;
122
+}
123
+
124
+
125
+/* internal versions of string-to-int with additional error checking */
126
+#undef atoi
127
+#undef atof
128
+#define atoi(str) x265_atoi(str, bError)
129
+#define atof(str) x265_atof(str, bError)
130
+#define atobool(str) (x265_atobool(str, bError))
131
+
132
int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
133
{
134
bool bError = false;
135
136
{
137
bError = false;
138
p->scenecutThreshold = atoi(value);
139
- p->bHistBasedSceneCut = 0;
140
}
141
}
142
- OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
143
+ OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
144
OPT("keyint") p->keyframeMax = atoi(value);
145
OPT("min-keyint") p->keyframeMin = atoi(value);
146
OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
147
148
int pass = x265_clip3(0, 3, atoi(value));
149
p->rc.bStatWrite = pass & 1;
150
p->rc.bStatRead = pass & 2;
151
+ p->rc.dataShareMode = X265_SHARE_MODE_FILE;
152
}
153
OPT("stats") p->rc.statFileName = strdup(value);
154
OPT("scaling-list") p->scalingLists = strdup(value);
155
156
OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
157
OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
158
OPT("scenecut-bias") p->scenecutBias = atof(value);
159
- OPT("hist-scenecut")
160
- {
161
- p->bHistBasedSceneCut = atobool(value);
162
- if (bError)
163
- {
164
- bError = false;
165
- p->bHistBasedSceneCut = 0;
166
- }
167
- if (p->bHistBasedSceneCut)
168
- {
169
- bError = false;
170
- p->scenecutThreshold = 0;
171
- }
172
- }
173
- OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
174
+ OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
175
OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
176
OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
177
OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
178
179
OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
180
OPT("aq-motion") p->bAQMotion = atobool(value);
181
OPT("dynamic-rd") p->dynamicRd = atof(value);
182
+ OPT("cra-nal") p->craNal = atobool(value);
183
OPT("analysis-reuse-level")
184
{
185
p->analysisReuseLevel = atoi(value);
186
187
}
188
OPT("fades") p->bEnableFades = atobool(value);
189
OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
190
- OPT("masking-strength")
191
- {
192
- int window1;
193
- double refQpDelta1, nonRefQpDelta1;
194
-
195
- if (p->bEnableSceneCutAwareQp == FORWARD)
196
- {
197
- if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
198
- {
199
- if (window1 > 0)
200
- p->fwdScenecutWindow = window1;
201
- if (refQpDelta1 > 0)
202
- p->fwdRefQpDelta = refQpDelta1;
203
- if (nonRefQpDelta1 > 0)
204
- p->fwdNonRefQpDelta = nonRefQpDelta1;
205
- }
206
- else
207
- {
208
- x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
209
- bError = true;
210
- }
211
- }
212
- else if (p->bEnableSceneCutAwareQp == BACKWARD)
213
- {
214
- if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
215
- {
216
- if (window1 > 0)
217
- p->bwdScenecutWindow = window1;
218
- if (refQpDelta1 > 0)
219
- p->bwdRefQpDelta = refQpDelta1;
220
- if (nonRefQpDelta1 > 0)
221
- p->bwdNonRefQpDelta = nonRefQpDelta1;
222
- }
223
- else
224
- {
225
- x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
226
- bError = true;
227
- }
228
- }
229
- else if (p->bEnableSceneCutAwareQp == BI_DIRECTIONAL)
230
- {
231
- int window2;
232
- double refQpDelta2, nonRefQpDelta2;
233
- if (6 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1, &window2, &refQpDelta2, &nonRefQpDelta2))
234
- {
235
- if (window1 > 0)
236
- p->fwdScenecutWindow = window1;
237
- if (refQpDelta1 > 0)
238
- p->fwdRefQpDelta = refQpDelta1;
239
- if (nonRefQpDelta1 > 0)
240
- p->fwdNonRefQpDelta = nonRefQpDelta1;
241
- if (window2 > 0)
242
- p->bwdScenecutWindow = window2;
243
- if (refQpDelta2 > 0)
244
- p->bwdRefQpDelta = refQpDelta2;
245
- if (nonRefQpDelta2 > 0)
246
- p->bwdNonRefQpDelta = nonRefQpDelta2;
247
- }
248
- else
249
- {
250
- x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
251
- bError = true;
252
- }
253
- }
254
- }
255
+ OPT("masking-strength") bError |= parseMaskingStrength(p, value);
256
OPT("field") p->bField = atobool( value );
257
OPT("cll") p->bEmitCLL = atobool(value);
258
OPT("frame-dup") p->bEnableFrameDuplication = atobool(value);
259
260
OPT("vbv-live-multi-pass") p->bliveVBV2pass = atobool(value);
261
OPT("min-vbv-fullness") p->minVbvFullness = atof(value);
262
OPT("max-vbv-fullness") p->maxVbvFullness = atof(value);
263
+ OPT("video-signal-type-preset") p->videoSignalTypePreset = strdup(value);
264
+ OPT("eob") p->bEnableEndOfBitstream = atobool(value);
265
+ OPT("eos") p->bEnableEndOfSequence = atobool(value);
266
+ /* Film grain characterstics model filename */
267
+ OPT("film-grain") p->filmGrain = (char* )value;
268
+ OPT("mcstf") p->bEnableTemporalFilter = atobool(value);
269
+ OPT("sbrc") p->bEnableSBRC = atobool(value);
270
else
271
return X265_PARAM_BAD_NAME;
272
}
273
274
"scenecutThreshold must be greater than 0");
275
CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
276
"scenecut-bias must be between 0 and 100");
277
- CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 < param->edgeTransitionThreshold,
278
- "hist-threshold must be between 0.0 and 1.0");
279
CHECK(param->radl < 0 || param->radl > param->bframes,
280
"radl must be between 0 and bframes");
281
CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
282
283
"Invalid refine-ctu-distortion value, must be either 0 or 1");
284
CHECK(param->maxAUSizeFactor < 0.5 || param->maxAUSizeFactor > 1.0,
285
"Supported factor for controlling max AU size is from 0.5 to 1");
286
- CHECK((param->dolbyProfile != 0) && (param->dolbyProfile != 50) && (param->dolbyProfile != 81) && (param->dolbyProfile != 82),
287
- "Unsupported Dolby Vision profile, only profile 5, profile 8.1 and profile 8.2 enabled");
288
+ CHECK((param->dolbyProfile != 0) && (param->dolbyProfile != 50) && (param->dolbyProfile != 81) && (param->dolbyProfile != 82) && (param->dolbyProfile != 84),
289
+ "Unsupported Dolby Vision profile, only profile 5, profile 8.1, profile 8.2 and profile 8.4 enabled");
290
CHECK(param->dupThreshold < 1 || 99 < param->dupThreshold,
291
"Invalid frame-duplication threshold. Value must be between 1 and 99.");
292
if (param->dolbyProfile)
293
{
294
CHECK((param->rc.vbvMaxBitrate <= 0 || param->rc.vbvBufferSize <= 0), "Dolby Vision requires VBV settings to enable HRD.\n");
295
- CHECK((param->internalBitDepth != 10), "Dolby Vision profile - 5, profile - 8.1 and profile - 8.2 is Main10 only\n");
296
- CHECK((param->internalCsp != X265_CSP_I420), "Dolby Vision profile - 5, profile - 8.1 and profile - 8.2 requires YCbCr 4:2:0 color space\n");
297
+ CHECK((param->internalBitDepth != 10), "Dolby Vision profile - 5, profile - 8.1, profile - 8.2 and profile - 8.4 are Main10 only\n");
298
+ CHECK((param->internalCsp != X265_CSP_I420), "Dolby Vision profile - 5, profile - 8.1, profile - 8.2 and profile - 8.4 requires YCbCr 4:2:0 color space\n");
299
if (param->dolbyProfile == 81)
300
CHECK(!(param->masteringDisplayColorVolume), "Dolby Vision profile - 8.1 requires Mastering display color volume information\n");
301
}
302
303
{
304
CHECK(param->bEnableSceneCutAwareQp < 0 || param->bEnableSceneCutAwareQp > 3,
305
"Invalid masking direction. Value must be between 0 and 3(inclusive)");
306
- CHECK(param->fwdScenecutWindow < 0 || param->fwdScenecutWindow > 1000,
307
- "Invalid forward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
308
- CHECK(param->fwdRefQpDelta < 0 || param->fwdRefQpDelta > 10,
309
- "Invalid fwdRefQpDelta value. Value must be between 0 and 10 (inclusive)");
310
- CHECK(param->fwdNonRefQpDelta < 0 || param->fwdNonRefQpDelta > 10,
311
- "Invalid fwdNonRefQpDelta value. Value must be between 0 and 10 (inclusive)");
312
-
313
- CHECK(param->bwdScenecutWindow < 0 || param->bwdScenecutWindow > 1000,
314
- "Invalid backward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
315
- CHECK(param->bwdRefQpDelta < -1 || param->bwdRefQpDelta > 10,
316
- "Invalid bwdRefQpDelta value. Value must be between 0 and 10 (inclusive)");
317
- CHECK(param->bwdNonRefQpDelta < -1 || param->bwdNonRefQpDelta > 10,
318
- "Invalid bwdNonRefQpDelta value. Value must be between 0 and 10 (inclusive)");
319
+ for (int i = 0; i < 6; i++)
320
+ {
321
+ CHECK(param->fwdScenecutWindowi < 0 || param->fwdScenecutWindowi > 1000,
322
+ "Invalid forward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
323
+ CHECK(param->fwdRefQpDeltai < 0 || param->fwdRefQpDeltai > 20,
324
+ "Invalid fwdRefQpDelta value. Value must be between 0 and 20 (inclusive)");
325
+ CHECK(param->fwdNonRefQpDeltai < 0 || param->fwdNonRefQpDeltai > 20,
326
+ "Invalid fwdNonRefQpDelta value. Value must be between 0 and 20 (inclusive)");
327
+
328
+ CHECK(param->bwdScenecutWindowi < 0 || param->bwdScenecutWindowi > 1000,
329
+ "Invalid backward scenecut Window duration. Value must be between 0 and 1000(inclusive)");
330
+ CHECK(param->bwdRefQpDeltai < -1 || param->bwdRefQpDeltai > 20,
331
+ "Invalid bwdRefQpDelta value. Value must be between 0 and 20 (inclusive)");
332
+ CHECK(param->bwdNonRefQpDeltai < -1 || param->bwdNonRefQpDeltai > 20,
333
+ "Invalid bwdNonRefQpDelta value. Value must be between 0 and 20 (inclusive)");
334
+ }
335
}
336
}
337
if (param->bEnableHME)
338
339
param->bSingleSeiNal = 0;
340
x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
341
}
342
+ if (param->bEnableTemporalFilter && (param->frameNumThreads > 1))
343
+ {
344
+ param->bEnableTemporalFilter = 0;
345
+ x265_log(param, X265_LOG_WARNING, "MCSTF can be enabled with frame thread = 1 only. Disabling MCSTF\n");
346
+ }
347
CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
348
CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
349
CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
350
351
x265_log(param, X265_LOG_WARNING, "Live VBV enabled without VBV settings.Disabling live VBV in 2 pass\n");
352
}
353
}
354
+ CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. It must be one of the X265_DATA_SHARE_MODES enum values\n" );
355
return check_failed;
356
}
357
358
359
x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias : %d / %d / %d / %.2lf \n",
360
param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
361
else if (param->bHistBasedSceneCut && param->keyframeMax != INT_MAX)
362
- x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / edge threshold : %d / %d / %d / %.2lf\n",
363
- param->keyframeMin, param->keyframeMax, param->bHistBasedSceneCut, param->edgeTransitionThreshold);
364
+ x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut : %d / %d / %d\n",
365
+ param->keyframeMin, param->keyframeMax, param->bHistBasedSceneCut);
366
else if (param->keyframeMax == INT_MAX)
367
x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut : disabled\n");
368
369
370
bufSize += strlen(p->numaPools);
371
if (p->masteringDisplayColorVolume)
372
bufSize += strlen(p->masteringDisplayColorVolume);
373
+ if (p->videoSignalTypePreset)
374
+ bufSize += strlen(p->videoSignalTypePreset);
375
376
buf = s = X265_MALLOC(char, bufSize);
377
if (!buf)
378
379
BOOL(p->bRepeatHeaders, "repeat-headers");
380
BOOL(p->bAnnexB, "annexb");
381
BOOL(p->bEnableAccessUnitDelimiters, "aud");
382
+ BOOL(p->bEnableEndOfBitstream, "eob");
383
+ BOOL(p->bEnableEndOfSequence, "eos");
384
BOOL(p->bEmitHRDSEI, "hrd");
385
BOOL(p->bEmitInfoSEI, "info");
386
s += sprintf(s, " hash=%d", p->decodedPictureHashSEI);
387
- BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
388
+ s += sprintf(s, " temporal-layers=%d", p->bEnableTemporalSubLayers);
389
BOOL(p->bOpenGOP, "open-gop");
390
s += sprintf(s, " min-keyint=%d", p->keyframeMin);
391
s += sprintf(s, " keyint=%d", p->keyframeMax);
392
393
s += sprintf(s, " rc-lookahead=%d", p->lookaheadDepth);
394
s += sprintf(s, " lookahead-slices=%d", p->lookaheadSlices);
395
s += sprintf(s, " scenecut=%d", p->scenecutThreshold);
396
- s += sprintf(s, " hist-scenecut=%d", p->bHistBasedSceneCut);
397
+ BOOL(p->bHistBasedSceneCut, "hist-scenecut");
398
s += sprintf(s, " radl=%d", p->radl);
399
BOOL(p->bEnableHRDConcatFlag, "splice");
400
BOOL(p->bIntraRefresh, "intra-refresh");
401
402
BOOL(p->bOptRefListLengthPPS, "opt-ref-list-length-pps");
403
BOOL(p->bMultiPassOptRPS, "multi-pass-opt-rps");
404
s += sprintf(s, " scenecut-bias=%.2f", p->scenecutBias);
405
- s += sprintf(s, " hist-threshold=%.2f", p->edgeTransitionThreshold);
406
BOOL(p->bOptCUDeltaQP, "opt-cu-delta-qp");
407
BOOL(p->bAQMotion, "aq-motion");
408
BOOL(p->bEmitHDR10SEI, "hdr10");
409
410
s += sprintf(s, " qp-adaptation-range=%.2f", p->rc.qpAdaptationRange);
411
s += sprintf(s, " scenecut-aware-qp=%d", p->bEnableSceneCutAwareQp);
412
if (p->bEnableSceneCutAwareQp)
413
- s += sprintf(s, " fwd-scenecut-window=%d fwd-ref-qp-delta=%f fwd-nonref-qp-delta=%f bwd-scenecut-window=%d bwd-ref-qp-delta=%f bwd-nonref-qp-delta=%f", p->fwdScenecutWindow, p->fwdRefQpDelta, p->fwdNonRefQpDelta, p->bwdScenecutWindow, p->bwdRefQpDelta, p->bwdNonRefQpDelta);
414
+ s += sprintf(s, " fwd-scenecut-window=%d fwd-ref-qp-delta=%f fwd-nonref-qp-delta=%f bwd-scenecut-window=%d bwd-ref-qp-delta=%f bwd-nonref-qp-delta=%f", p->fwdMaxScenecutWindow, p->fwdRefQpDelta0, p->fwdNonRefQpDelta0, p->bwdMaxScenecutWindow, p->bwdRefQpDelta0, p->bwdNonRefQpDelta0);
415
s += sprintf(s, "conformance-window-offsets right=%d bottom=%d", p->confWinRightOffset, p->confWinBottomOffset);
416
s += sprintf(s, " decoder-max-rate=%d", p->decoderVbvMaxRate);
417
BOOL(p->bliveVBV2pass, "vbv-live-multi-pass");
418
+ if (p->filmGrain)
419
+ s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename
420
+ BOOL(p->bEnableTemporalFilter, "mcstf");
421
+ BOOL(p->bEnableSBRC, "sbrc");
422
#undef BOOL
423
return buf;
424
}
425
426
return false;
427
}
428
429
+bool parseMaskingStrength(x265_param* p, const char* value)
430
+{
431
+ bool bError = false;
432
+ int window16;
433
+ double refQpDelta16, nonRefQpDelta16;
434
+ if (p->bEnableSceneCutAwareQp == FORWARD)
435
+ {
436
+ if (3 == sscanf(value, "%d,%lf,%lf", &window10, &refQpDelta10, &nonRefQpDelta10))
437
+ {
438
+ if (window10 > 0)
439
+ p->fwdMaxScenecutWindow = window10;
440
+ if (refQpDelta10 > 0)
441
+ p->fwdRefQpDelta0 = refQpDelta10;
442
+ if (nonRefQpDelta10 > 0)
443
+ p->fwdNonRefQpDelta0 = nonRefQpDelta10;
444
+
445
+ p->fwdScenecutWindow0 = p->fwdMaxScenecutWindow / 6;
446
+ for (int i = 1; i < 6; i++)
447
+ {
448
+ p->fwdScenecutWindowi = p->fwdMaxScenecutWindow / 6;
449
+ p->fwdRefQpDeltai = p->fwdRefQpDeltai - 1 - (0.15 * p->fwdRefQpDeltai - 1);
450
+ p->fwdNonRefQpDeltai = p->fwdNonRefQpDeltai - 1 - (0.15 * p->fwdNonRefQpDeltai - 1);
451
+ }
452
+ }
453
+ else if (18 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
454
+ , &window10, &refQpDelta10, &nonRefQpDelta10, &window11, &refQpDelta11, &nonRefQpDelta11
455
+ , &window12, &refQpDelta12, &nonRefQpDelta12, &window13, &refQpDelta13, &nonRefQpDelta13
456
+ , &window14, &refQpDelta14, &nonRefQpDelta14, &window15, &refQpDelta15, &nonRefQpDelta15))
457
+ {
458
+ p->fwdMaxScenecutWindow = 0;
459
+ for (int i = 0; i < 6; i++)
460
+ {
461
+ p->fwdScenecutWindowi = window1i;
462
+ p->fwdRefQpDeltai = refQpDelta1i;
463
+ p->fwdNonRefQpDeltai = nonRefQpDelta1i;
464
+ p->fwdMaxScenecutWindow += p->fwdScenecutWindowi;
465
+ }
466
+ }
467
+ else
468
+ {
469
+ x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
470
+ bError = true;
471
+ }
472
+ }
473
+ else if (p->bEnableSceneCutAwareQp == BACKWARD)
474
+ {
475
+ if (3 == sscanf(value, "%d,%lf,%lf", &window10, &refQpDelta10, &nonRefQpDelta10))
476
+ {
477
+ if (window10 > 0)
478
+ p->bwdMaxScenecutWindow = window10;
479
+ if (refQpDelta10 > 0)
480
+ p->bwdRefQpDelta0 = refQpDelta10;
481
+ if (nonRefQpDelta10 > 0)
482
+ p->bwdNonRefQpDelta0 = nonRefQpDelta10;
483
+
484
+ p->bwdScenecutWindow0 = p->bwdMaxScenecutWindow / 6;
485
+ for (int i = 1; i < 6; i++)
486
+ {
487
+ p->bwdScenecutWindowi = p->bwdMaxScenecutWindow / 6;
488
+ p->bwdRefQpDeltai = p->bwdRefQpDeltai - 1 - (0.15 * p->bwdRefQpDeltai - 1);
489
+ p->bwdNonRefQpDeltai = p->bwdNonRefQpDeltai - 1 - (0.15 * p->bwdNonRefQpDeltai - 1);
490
+ }
491
+ }
492
+ else if (18 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
493
+ , &window10, &refQpDelta10, &nonRefQpDelta10, &window11, &refQpDelta11, &nonRefQpDelta11
494
+ , &window12, &refQpDelta12, &nonRefQpDelta12, &window13, &refQpDelta13, &nonRefQpDelta13
495
+ , &window14, &refQpDelta14, &nonRefQpDelta14, &window15, &refQpDelta15, &nonRefQpDelta15))
496
+ {
497
+ p->bwdMaxScenecutWindow = 0;
498
+ for (int i = 0; i < 6; i++)
499
+ {
500
+ p->bwdScenecutWindowi = window1i;
501
+ p->bwdRefQpDeltai = refQpDelta1i;
502
+ p->bwdNonRefQpDeltai = nonRefQpDelta1i;
503
+ p->bwdMaxScenecutWindow += p->bwdScenecutWindowi;
504
+ }
505
+ }
506
+ else
507
+ {
508
+ x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
509
+ bError = true;
510
+ }
511
+ }
512
+ else if (p->bEnableSceneCutAwareQp == BI_DIRECTIONAL)
513
+ {
514
+ int window26;
515
+ double refQpDelta26, nonRefQpDelta26;
516
+ if (6 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf", &window10, &refQpDelta10, &nonRefQpDelta10, &window20, &refQpDelta20, &nonRefQpDelta20))
517
+ {
518
+ if (window10 > 0)
519
+ p->fwdMaxScenecutWindow = window10;
520
+ if (refQpDelta10 > 0)
521
+ p->fwdRefQpDelta0 = refQpDelta10;
522
+ if (nonRefQpDelta10 > 0)
523
+ p->fwdNonRefQpDelta0 = nonRefQpDelta10;
524
+ if (window20 > 0)
525
+ p->bwdMaxScenecutWindow = window20;
526
+ if (refQpDelta20 > 0)
527
+ p->bwdRefQpDelta0 = refQpDelta20;
528
+ if (nonRefQpDelta20 > 0)
529
+ p->bwdNonRefQpDelta0 = nonRefQpDelta20;
530
+
531
+ p->fwdScenecutWindow0 = p->fwdMaxScenecutWindow / 6;
532
+ p->bwdScenecutWindow0 = p->bwdMaxScenecutWindow / 6;
533
+ for (int i = 1; i < 6; i++)
534
+ {
535
+ p->fwdScenecutWindowi = p->fwdMaxScenecutWindow / 6;
536
+ p->bwdScenecutWindowi = p->bwdMaxScenecutWindow / 6;
537
+ p->fwdRefQpDeltai = p->fwdRefQpDeltai - 1 - (0.15 * p->fwdRefQpDeltai - 1);
538
+ p->fwdNonRefQpDeltai = p->fwdNonRefQpDeltai - 1 - (0.15 * p->fwdNonRefQpDeltai - 1);
539
+ p->bwdRefQpDeltai = p->bwdRefQpDeltai - 1 - (0.15 * p->bwdRefQpDeltai - 1);
540
+ p->bwdNonRefQpDeltai = p->bwdNonRefQpDeltai - 1 - (0.15 * p->bwdNonRefQpDeltai - 1);
541
+ }
542
+ }
543
+ else if (36 == sscanf(value, "%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf,%d,%lf,%lf"
544
+ , &window10, &refQpDelta10, &nonRefQpDelta10, &window11, &refQpDelta11, &nonRefQpDelta11
545
+ , &window12, &refQpDelta12, &nonRefQpDelta12, &window13, &refQpDelta13, &nonRefQpDelta13
546
+ , &window14, &refQpDelta14, &nonRefQpDelta14, &window15, &refQpDelta15, &nonRefQpDelta15
547
+ , &window20, &refQpDelta20, &nonRefQpDelta20, &window21, &refQpDelta21, &nonRefQpDelta21
548
+ , &window22, &refQpDelta22, &nonRefQpDelta22, &window23, &refQpDelta23, &nonRefQpDelta23
549
+ , &window24, &refQpDelta24, &nonRefQpDelta24, &window25, &refQpDelta25, &nonRefQpDelta25))
550
+ {
551
+ p->fwdMaxScenecutWindow = 0;
552
+ p->bwdMaxScenecutWindow = 0;
553
+ for (int i = 0; i < 6; i++)
554
+ {
555
+ p->fwdScenecutWindowi = window1i;
556
+ p->fwdRefQpDeltai = refQpDelta1i;
557
+ p->fwdNonRefQpDeltai = nonRefQpDelta1i;
558
+ p->bwdScenecutWindowi = window2i;
559
+ p->bwdRefQpDeltai = refQpDelta2i;
560
+ p->bwdNonRefQpDeltai = nonRefQpDelta2i;
561
+ p->fwdMaxScenecutWindow += p->fwdScenecutWindowi;
562
+ p->bwdMaxScenecutWindow += p->bwdScenecutWindowi;
563
+ }
564
+ }
565
+ else
566
+ {
567
+ x265_log(NULL, X265_LOG_ERROR, "Specify all the necessary offsets for masking-strength \n");
568
+ bError = true;
569
+ }
570
+ }
571
+ return bError;
572
+}
573
+
574
void x265_copy_params(x265_param* dst, x265_param* src)
575
{
576
dst->cpuid = src->cpuid;
577
578
dst->bRepeatHeaders = src->bRepeatHeaders;
579
dst->bAnnexB = src->bAnnexB;
580
dst->bEnableAccessUnitDelimiters = src->bEnableAccessUnitDelimiters;
581
+ dst->bEnableEndOfBitstream = src->bEnableEndOfBitstream;
582
+ dst->bEnableEndOfSequence = src->bEnableEndOfSequence;
583
dst->bEmitInfoSEI = src->bEmitInfoSEI;
584
dst->decodedPictureHashSEI = src->decodedPictureHashSEI;
585
dst->bEnableTemporalSubLayers = src->bEnableTemporalSubLayers;
586
dst->bOpenGOP = src->bOpenGOP;
587
+ dst->craNal = src->craNal;
588
dst->keyframeMax = src->keyframeMax;
589
dst->keyframeMin = src->keyframeMin;
590
dst->bframes = src->bframes;
591
592
dst->rc.rfConstantMin = src->rc.rfConstantMin;
593
dst->rc.bStatWrite = src->rc.bStatWrite;
594
dst->rc.bStatRead = src->rc.bStatRead;
595
+ dst->rc.dataShareMode = src->rc.dataShareMode;
596
if (src->rc.statFileName) dst->rc.statFileName=strdup(src->rc.statFileName);
597
else dst->rc.statFileName = NULL;
598
+ if (src->rc.sharedMemName) dst->rc.sharedMemName = strdup(src->rc.sharedMemName);
599
+ else dst->rc.sharedMemName = NULL;
600
dst->rc.qblur = src->rc.qblur;
601
dst->rc.complexityBlur = src->rc.complexityBlur;
602
dst->rc.bEnableSlowFirstPass = src->rc.bEnableSlowFirstPass;
603
604
dst->rc.zonefileCount = src->rc.zonefileCount;
605
dst->reconfigWindowSize = src->reconfigWindowSize;
606
dst->bResetZoneConfig = src->bResetZoneConfig;
607
+ dst->bNoResetZoneConfig = src->bNoResetZoneConfig;
608
dst->decoderVbvMaxRate = src->decoderVbvMaxRate;
609
610
if (src->rc.zonefileCount && src->rc.zones && src->bResetZoneConfig)
611
612
for (int i = 0; i < src->rc.zonefileCount; i++)
613
{
614
dst->rc.zonesi.startFrame = src->rc.zonesi.startFrame;
615
+ dst->rc.zones0.keyframeMax = src->rc.zones0.keyframeMax;
616
memcpy(dst->rc.zonesi.zoneParam, src->rc.zonesi.zoneParam, sizeof(x265_param));
617
}
618
}
619
620
dst->bOptRefListLengthPPS = src->bOptRefListLengthPPS;
621
dst->bMultiPassOptRPS = src->bMultiPassOptRPS;
622
dst->scenecutBias = src->scenecutBias;
623
- dst->edgeTransitionThreshold = src->edgeTransitionThreshold;
624
dst->gopLookahead = src->lookaheadDepth;
625
dst->bOptCUDeltaQP = src->bOptCUDeltaQP;
626
dst->analysisMultiPassDistortion = src->analysisMultiPassDistortion;
627
628
dst->bEnableSvtHevc = src->bEnableSvtHevc;
629
dst->bEnableFades = src->bEnableFades;
630
dst->bEnableSceneCutAwareQp = src->bEnableSceneCutAwareQp;
631
- dst->fwdScenecutWindow = src->fwdScenecutWindow;
632
- dst->fwdRefQpDelta = src->fwdRefQpDelta;
633
- dst->fwdNonRefQpDelta = src->fwdNonRefQpDelta;
634
- dst->bwdScenecutWindow = src->bwdScenecutWindow;
635
- dst->bwdRefQpDelta = src->bwdRefQpDelta;
636
- dst->bwdNonRefQpDelta = src->bwdNonRefQpDelta;
637
+ dst->fwdMaxScenecutWindow = src->fwdMaxScenecutWindow;
638
+ dst->bwdMaxScenecutWindow = src->bwdMaxScenecutWindow;
639
+ for (int i = 0; i < 6; i++)
640
+ {
641
+ dst->fwdScenecutWindowi = src->fwdScenecutWindowi;
642
+ dst->fwdRefQpDeltai = src->fwdRefQpDeltai;
643
+ dst->fwdNonRefQpDeltai = src->fwdNonRefQpDeltai;
644
+ dst->bwdScenecutWindowi = src->bwdScenecutWindowi;
645
+ dst->bwdRefQpDeltai = src->bwdRefQpDeltai;
646
+ dst->bwdNonRefQpDeltai = src->bwdNonRefQpDeltai;
647
+ }
648
dst->bField = src->bField;
649
-
650
+ dst->bEnableTemporalFilter = src->bEnableTemporalFilter;
651
+ dst->temporalFilterStrength = src->temporalFilterStrength;
652
dst->confWinRightOffset = src->confWinRightOffset;
653
dst->confWinBottomOffset = src->confWinBottomOffset;
654
dst->bliveVBV2pass = src->bliveVBV2pass;
655
+
656
+ if (src->videoSignalTypePreset) dst->videoSignalTypePreset = strdup(src->videoSignalTypePreset);
657
+ else dst->videoSignalTypePreset = NULL;
658
#ifdef SVT_HEVC
659
memcpy(dst->svtHevcParam, src->svtHevcParam, sizeof(EB_H265_ENC_CONFIGURATION));
660
#endif
661
+ /* Film grain */
662
+ if (src->filmGrain)
663
+ dst->filmGrain = src->filmGrain;
664
+ dst->bEnableSBRC = src->bEnableSBRC;
665
}
666
667
#ifdef SVT_HEVC
668
x265_3.5.tar.gz/source/common/param.h -> x265_3.6.tar.gz/source/common/param.h
Changed
17
1
2
void getParamAspectRatio(x265_param *p, int& width, int& height);
3
bool parseLambdaFile(x265_param *param);
4
void x265_copy_params(x265_param* dst, x265_param* src);
5
+bool parseMaskingStrength(x265_param* p, const char* value);
6
7
/* this table is kept internal to avoid confusion, since log level indices start at -1 */
8
static const char * const logLevelNames = { "none", "error", "warning", "info", "debug", "full", 0 };
9
10
int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
11
int x265_param_apply_profile(x265_param *, const char *profile);
12
int x265_param_parse(x265_param *p, const char *name, const char *value);
13
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
14
int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
15
#define PARAM_NS X265_NS
16
#endif
17
x265_3.5.tar.gz/source/common/piclist.cpp -> x265_3.6.tar.gz/source/common/piclist.cpp
Changed
134
1
2
m_count++;
3
}
4
5
+void PicList::pushFrontMCSTF(Frame& curFrame)
6
+{
7
+ X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_nextMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
8
+ curFrame.m_nextMCSTF = m_start;
9
+ curFrame.m_prevMCSTF = NULL;
10
+
11
+ if (m_count)
12
+ {
13
+ m_start->m_prevMCSTF = &curFrame;
14
+ m_start = &curFrame;
15
+ }
16
+ else
17
+ {
18
+ m_start = m_end = &curFrame;
19
+ }
20
+ m_count++;
21
+
22
+}
23
+
24
void PicList::pushBack(Frame& curFrame)
25
{
26
X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
27
28
m_count++;
29
}
30
31
+void PicList::pushBackMCSTF(Frame& curFrame)
32
+{
33
+ X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
34
+ curFrame.m_nextMCSTF = NULL;
35
+ curFrame.m_prevMCSTF = m_end;
36
+
37
+ if (m_count)
38
+ {
39
+ m_end->m_nextMCSTF = &curFrame;
40
+ m_end = &curFrame;
41
+ }
42
+ else
43
+ {
44
+ m_start = m_end = &curFrame;
45
+ }
46
+ m_count++;
47
+}
48
+
49
Frame *PicList::popFront()
50
{
51
if (m_start)
52
53
return curFrame;
54
}
55
56
+Frame* PicList::getPOCMCSTF(int poc)
57
+{
58
+ Frame *curFrame = m_start;
59
+ while (curFrame && curFrame->m_poc != poc)
60
+ curFrame = curFrame->m_nextMCSTF;
61
+ return curFrame;
62
+}
63
+
64
Frame *PicList::popBack()
65
{
66
if (m_end)
67
68
return NULL;
69
}
70
71
+Frame *PicList::popBackMCSTF()
72
+{
73
+ if (m_end)
74
+ {
75
+ Frame* temp = m_end;
76
+ m_count--;
77
+
78
+ if (m_count)
79
+ {
80
+ m_end = m_end->m_prevMCSTF;
81
+ m_end->m_nextMCSTF = NULL;
82
+ }
83
+ else
84
+ {
85
+ m_start = m_end = NULL;
86
+ }
87
+ temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
88
+ return temp;
89
+ }
90
+ else
91
+ return NULL;
92
+}
93
+
94
Frame* PicList::getCurFrame(void)
95
{
96
Frame *curFrame = m_start;
97
98
99
curFrame.m_next = curFrame.m_prev = NULL;
100
}
101
+
102
+void PicList::removeMCSTF(Frame& curFrame)
103
+{
104
+#if _DEBUG
105
+ Frame *tmp = m_start;
106
+ while (tmp && tmp != &curFrame)
107
+ {
108
+ tmp = tmp->m_nextMCSTF;
109
+ }
110
+
111
+ X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
112
+#endif
113
+
114
+ m_count--;
115
+ if (m_count)
116
+ {
117
+ if (m_start == &curFrame)
118
+ m_start = curFrame.m_nextMCSTF;
119
+ if (m_end == &curFrame)
120
+ m_end = curFrame.m_prevMCSTF;
121
+
122
+ if (curFrame.m_nextMCSTF)
123
+ curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
124
+ if (curFrame.m_prevMCSTF)
125
+ curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
126
+ }
127
+ else
128
+ {
129
+ m_start = m_end = NULL;
130
+ }
131
+
132
+ curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
133
+}
134
x265_3.5.tar.gz/source/common/piclist.h -> x265_3.6.tar.gz/source/common/piclist.h
Changed
33
1
2
3
/** Push picture to end of the list */
4
void pushBack(Frame& pic);
5
+ void pushBackMCSTF(Frame& pic);
6
7
/** Push picture to beginning of the list */
8
void pushFront(Frame& pic);
9
+ void pushFrontMCSTF(Frame& pic);
10
11
/** Pop picture from end of the list */
12
Frame* popBack();
13
+ Frame* popBackMCSTF();
14
15
/** Pop picture from beginning of the list */
16
Frame* popFront();
17
18
/** Find frame with specified POC */
19
Frame* getPOC(int poc);
20
+ /* Find next MCSTF frame with specified POC */
21
+ Frame* getPOCMCSTF(int poc);
22
23
/** Get the current Frame from the list **/
24
Frame* getCurFrame(void);
25
26
/** Remove picture from list */
27
void remove(Frame& pic);
28
+ /* Remove MCSTF picture from list */
29
+ void removeMCSTF(Frame& pic);
30
31
Frame* first() { return m_start; }
32
33
x265_3.5.tar.gz/source/common/picyuv.cpp -> x265_3.6.tar.gz/source/common/picyuv.cpp
Changed
60
1
2
return false;
3
}
4
5
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
6
+void PicYuv::copyFromFrame(PicYuv* source)
7
+{
8
+ uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
9
+
10
+ int maxHeight = numCuInHeight * m_param->maxCUSize;
11
+ memcpy(m_picBuf0, source->m_picBuf0, sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
12
+ m_picOrg0 = m_picBuf0 + m_lumaMarginY * m_stride + m_lumaMarginX;
13
+
14
+ if (m_picCsp != X265_CSP_I400)
15
+ {
16
+ memcpy(m_picBuf1, source->m_picBuf1, sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
17
+ memcpy(m_picBuf2, source->m_picBuf2, sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
18
+
19
+ m_picOrg1 = m_picBuf1 + m_chromaMarginY * m_strideC + m_chromaMarginX;
20
+ m_picOrg2 = m_picBuf2 + m_chromaMarginY * m_strideC + m_chromaMarginX;
21
+ }
22
+ else
23
+ {
24
+ m_picBuf1 = m_picBuf2 = NULL;
25
+ m_picOrg1 = m_picOrg2 = NULL;
26
+ }
27
+}
28
+
29
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
30
+{
31
+ m_param = param;
32
+ m_picWidth = m_param->sourceWidth / scaleFactor;
33
+ m_picHeight = m_param->sourceHeight / scaleFactor;
34
+
35
+ m_picCsp = m_param->internalCsp;
36
+ m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
37
+ m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
38
+
39
+ uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
40
+ uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
41
+
42
+ m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
43
+ m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
44
+ m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
45
+
46
+ int maxHeight = numCuInHeight * param->maxCUSize;
47
+ CHECKED_MALLOC_ZERO(m_picBuf0, pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
48
+ m_picOrg0 = m_picBuf0 + m_lumaMarginY * m_stride + m_lumaMarginX;
49
+ m_picBuf1 = m_picBuf2 = NULL;
50
+ m_picOrg1 = m_picOrg2 = NULL;
51
+ return true;
52
+
53
+fail:
54
+ return false;
55
+}
56
+
57
int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
58
{
59
m_picWidth = picWidth;
60
x265_3.5.tar.gz/source/common/picyuv.h -> x265_3.6.tar.gz/source/common/picyuv.h
Changed
15
1
2
PicYuv();
3
4
bool create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
5
+ bool createScaledPicYUV(x265_param* param, uint8_t scaleFactor);
6
bool createOffsets(const SPS& sps);
7
void destroy();
8
int getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
9
10
void copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
11
+ void copyFromFrame(PicYuv* source);
12
13
intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetCctuAddr + m_buOffsetCabsPartIdx; }
14
15
x265_3.5.tar.gz/source/common/pixel.cpp -> x265_3.6.tar.gz/source/common/pixel.cpp
Changed
51
1
2
{
3
int satd = 0;
4
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
6
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
7
pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
8
#endif
9
10
11
{
12
int satd = 0;
13
14
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
15
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
16
pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
17
#endif
18
19
20
}
21
}
22
23
+static
24
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
25
+{
26
+ for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
27
+ {
28
+ const pixel *inRow = src0;
29
+ const pixel *inRowBelow = src0 + src_stride;
30
+ pixel *target = dst0;
31
+ for (int x = 0; x < width; x++)
32
+ {
33
+ targetx = (((inRow0 + inRowBelow0 + 1) >> 1) + ((inRow1 + inRowBelow1 + 1) >> 1) + 1) >> 1;
34
+ inRow += 2;
35
+ inRowBelow += 2;
36
+ }
37
+ }
38
+}
39
+
40
/* structural similarity metric */
41
static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24)
42
{
43
44
p.cuBLOCK_16x16.normFact = normFact_c;
45
p.cuBLOCK_32x32.normFact = normFact_c;
46
p.cuBLOCK_64x64.normFact = normFact_c;
47
+ /* SubSample Luma*/
48
+ p.frameSubSampleLuma = frame_subsample_luma;
49
}
50
}
51
x265_3.5.tar.gz/source/common/ppc/intrapred_altivec.cpp -> x265_3.6.tar.gz/source/common/ppc/intrapred_altivec.cpp
Changed
10
1
2
#include <assert.h>
3
#include <math.h>
4
#include <cmath>
5
-#include <linux/types.h>
6
+#include <sys/types.h>
7
#include <stdlib.h>
8
#include <stdio.h>
9
#include <stdint.h>
10
x265_3.5.tar.gz/source/common/primitives.h -> x265_3.6.tar.gz/source/common/primitives.h
Changed
28
1
2
typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
3
typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
4
typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
5
+/* SubSampling Luma */
6
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
/* Function pointers to optimized encoder primitives. Each pointer can reference
8
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
9
struct EncoderPrimitives
10
11
12
downscale_t frameInitLowres;
13
downscale_t frameInitLowerRes;
14
+ /* Sub Sample Luma */
15
+ downscaleluma_t frameSubSampleLuma;
16
cutree_propagate_cost propagateCost;
17
cutree_fix8_unpack fix8Unpack;
18
cutree_fix8_pack fix8Pack;
19
20
21
#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
22
extern "C" {
23
-#include "aarch64/pixel-util.h"
24
+#include "aarch64/fun-decls.h"
25
}
26
#endif
27
28
x265_3.6.tar.gz/source/common/ringmem.cpp
Added
359
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#include "ringmem.h"
26
+
27
+#ifndef _WIN32
28
+#include <sys/mman.h>
29
+#endif ////< _WIN32
30
+
31
+#ifdef _WIN32
32
+#define X265_SHARED_MEM_NAME "Local\\_x265_shr_mem_"
33
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME "_x265_semW_"
34
+#define X265_SEMAPHORE_RINGMEM_READER_NAME "_x265_semR_"
35
+#else /* POSIX / pthreads */
36
+#define X265_SHARED_MEM_NAME "/tmp/_x265_shr_mem_"
37
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME "/tmp/_x265_semW_"
38
+#define X265_SEMAPHORE_RINGMEM_READER_NAME "/tmp/_x265_semR_"
39
+#endif
40
+
41
+#define RINGMEM_ALLIGNMENT 64
42
+
43
+namespace X265_NS {
44
+ RingMem::RingMem()
45
+ : m_initialized(false)
46
+ , m_protectRW(false)
47
+ , m_itemSize(0)
48
+ , m_itemCnt(0)
49
+ , m_dataPool(NULL)
50
+ , m_shrMem(NULL)
51
+#ifdef _WIN32
52
+ , m_handle(NULL)
53
+#else //_WIN32
54
+ , m_filepath(NULL)
55
+#endif //_WIN32
56
+ , m_writeSem(NULL)
57
+ , m_readSem(NULL)
58
+ {
59
+ }
60
+
61
+
62
+ RingMem::~RingMem()
63
+ {
64
+ }
65
+
66
+ bool RingMem::skipRead(int32_t cnt) {
67
+ if (!m_initialized)
68
+ {
69
+ return false;
70
+ }
71
+
72
+ if (m_protectRW)
73
+ {
74
+ for (int i = 0; i < cnt; i++)
75
+ {
76
+ m_readSem->take();
77
+ }
78
+ }
79
+
80
+ ATOMIC_ADD(&m_shrMem->m_read, cnt);
81
+
82
+ if (m_protectRW)
83
+ {
84
+ m_writeSem->give(cnt);
85
+ }
86
+
87
+ return true;
88
+ }
89
+
90
+ bool RingMem::skipWrite(int32_t cnt) {
91
+ if (!m_initialized)
92
+ {
93
+ return false;
94
+ }
95
+
96
+ if (m_protectRW)
97
+ {
98
+ for (int i = 0; i < cnt; i++)
99
+ {
100
+ m_writeSem->take();
101
+ }
102
+ }
103
+
104
+ ATOMIC_ADD(&m_shrMem->m_write, cnt);
105
+
106
+ if (m_protectRW)
107
+ {
108
+ m_readSem->give(cnt);
109
+ }
110
+
111
+ return true;
112
+ }
113
+
114
+ ///< initialize
115
+ bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
116
+ {
117
+ ///< check parameters
118
+ if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
119
+ {
120
+ ///< invalid parameters
121
+ return false;
122
+ }
123
+
124
+ if (!m_initialized)
125
+ {
126
+ ///< formating names
127
+ char nameBufMAX_SHR_NAME_LEN = { 0 };
128
+
129
+ ///< shared memory name
130
+ snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
131
+
132
+ ///< create or open shared memory
133
+ bool newCreated = false;
134
+
135
+ ///< calculate the size of the shared memory
136
+ int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
137
+
138
+#ifdef _WIN32
139
+ HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
140
+ if (!h)
141
+ {
142
+ h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
143
+
144
+ if (!h)
145
+ {
146
+ return false;
147
+ }
148
+
149
+ newCreated = true;
150
+ }
151
+
152
+ void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
153
+
154
+ ///< should not close the handle here, otherwise the OpenFileMapping would fail
155
+ //CloseHandle(h);
156
+ m_handle = h;
157
+
158
+ if (!pool)
159
+ {
160
+ return false;
161
+ }
162
+
163
+#else /* POSIX / pthreads */
164
+ mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
165
+ int flag = O_RDWR;
166
+ int shrfd = -1;
167
+ if ((shrfd = open(nameBuf, flag, mode)) < 0)
168
+ {
169
+ flag |= O_CREAT;
170
+
171
+ shrfd = open(nameBuf, flag, mode);
172
+ if (shrfd < 0)
173
+ {
174
+ return false;
175
+ }
176
+ newCreated = true;
177
+
178
+ lseek(shrfd, shrMemSize - 1, SEEK_SET);
179
+
180
+ if (-1 == write(shrfd, "\0", 1))
181
+ {
182
+ close(shrfd);
183
+ return false;
184
+ }
185
+
186
+ if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
187
+ {
188
+ close(shrfd);
189
+ return false;
190
+ }
191
+ }
192
+
193
+ void *pool = mmap(0,
194
+ shrMemSize,
195
+ PROT_READ | PROT_WRITE,
196
+ MAP_SHARED,
197
+ shrfd,
198
+ 0);
199
+
200
+ close(shrfd);
201
+ if (pool == MAP_FAILED)
202
+ {
203
+ return false;
204
+ }
205
+
206
+ m_filepath = strdup(nameBuf);
207
+#endif ///< _WIN32
208
+
209
+ if (newCreated)
210
+ {
211
+ memset(pool, 0, shrMemSize);
212
+ }
213
+
214
+ m_shrMem = reinterpret_cast<ShrMemCtrl *>(pool);
215
+ m_dataPool = reinterpret_cast<uint8_t *>(pool) + sizeof(ShrMemCtrl);
216
+ m_itemSize = itemSize;
217
+ m_itemCnt = itemCnt;
218
+ m_initialized = true;
219
+
220
+ if (protectRW)
221
+ {
222
+ m_protectRW = true;
223
+ m_writeSem = new NamedSemaphore();
224
+ if (!m_writeSem)
225
+ {
226
+ release();
227
+ return false;
228
+ }
229
+
230
+ ///< shared memory name
231
+ snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SEMAPHORE_RINGMEM_WRITER_NAME, name);
232
+ if (!m_writeSem->create(nameBuf, m_itemCnt, m_itemCnt))
233
+ {
234
+ release();
235
+ return false;
236
+ }
237
+
238
+ m_readSem = new NamedSemaphore();
239
+ if (!m_readSem)
240
+ {
241
+ release();
242
+ return false;
243
+ }
244
+
245
+ ///< shared memory name
246
+ snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SEMAPHORE_RINGMEM_READER_NAME, name);
247
+ if (!m_readSem->create(nameBuf, 0, m_itemCnt))
248
+ {
249
+ release();
250
+ return false;
251
+ }
252
+ }
253
+ }
254
+
255
+ return true;
256
+ }
257
+ ///< finalize
258
+ void RingMem::release()
259
+ {
260
+ if (m_initialized)
261
+ {
262
+ m_initialized = false;
263
+
264
+ if (m_shrMem)
265
+ {
266
+#ifdef _WIN32
267
+ UnmapViewOfFile(m_shrMem);
268
+ CloseHandle(m_handle);
269
+ m_handle = NULL;
270
+#else /* POSIX / pthreads */
271
+ int32_t shrMemSize = (m_itemSize * m_itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & (~RINGMEM_ALLIGNMENT - 1);
272
+ munmap(m_shrMem, shrMemSize);
273
+ unlink(m_filepath);
274
+ free(m_filepath);
275
+ m_filepath = NULL;
276
+#endif ///< _WIN32
277
+ m_shrMem = NULL;
278
+ m_dataPool = NULL;
279
+ m_itemSize = 0;
280
+ m_itemCnt = 0;
281
+ }
282
+
283
+ if (m_protectRW)
284
+ {
285
+ m_protectRW = false;
286
+ if (m_writeSem)
287
+ {
288
+ m_writeSem->release();
289
+
290
+ delete m_writeSem;
291
+ m_writeSem = NULL;
292
+ }
293
+
294
+ if (m_readSem)
295
+ {
296
+ m_readSem->release();
297
+
298
+ delete m_readSem;
299
+ m_readSem = NULL;
300
+ }
301
+ }
302
+
303
+ }
304
+ }
305
+
306
+ ///< data read
307
+ bool RingMem::readNext(void* dst, fnRWSharedData callback)
308
+ {
309
+ if (!m_initialized || !callback || !dst)
310
+ {
311
+ return false;
312
+ }
313
+
314
+ if (m_protectRW)
315
+ {
316
+ if (!m_readSem->take())
317
+ {
318
+ return false;
319
+ }
320
+ }
321
+
322
+ int32_t index = ATOMIC_ADD(&m_shrMem->m_read, 1) % m_itemCnt;
323
+ (*callback)(dst, reinterpret_cast<uint8_t *>(m_dataPool) + index * m_itemSize, m_itemSize);
324
+
325
+ if (m_protectRW)
326
+ {
327
+ m_writeSem->give(1);
328
+ }
329
+
330
+ return true;
331
+ }
332
+ ///< data write
333
+ bool RingMem::writeData(void *data, fnRWSharedData callback)
334
+ {
335
+ if (!m_initialized || !data || !callback)
336
+ {
337
+ return false;
338
+ }
339
+
340
+ if (m_protectRW)
341
+ {
342
+ if (!m_writeSem->take())
343
+ {
344
+ return false;
345
+ }
346
+ }
347
+
348
+ int32_t index = ATOMIC_ADD(&m_shrMem->m_write, 1) % m_itemCnt;
349
+ (*callback)(reinterpret_cast<uint8_t *>(m_dataPool) + index * m_itemSize, data, m_itemSize);
350
+
351
+ if (m_protectRW)
352
+ {
353
+ m_readSem->give(1);
354
+ }
355
+
356
+ return true;
357
+ }
358
+}
359
x265_3.6.tar.gz/source/common/ringmem.h
Added
92
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_RINGMEM_H
26
+#define X265_RINGMEM_H
27
+
28
+#include "common.h"
29
+#include "threading.h"
30
+
31
+#if _MSC_VER
32
+#define snprintf _snprintf
33
+#define strdup _strdup
34
+#endif
35
+
36
+namespace X265_NS {
37
+
38
+#define MAX_SHR_NAME_LEN 256
39
+
40
+ class RingMem {
41
+ public:
42
+ RingMem();
43
+ ~RingMem();
44
+
45
+ bool skipRead(int32_t cnt);
46
+
47
+ bool skipWrite(int32_t cnt);
48
+
49
+ ///< initialize
50
+ ///< protectRW: if use the semaphore the protect the write and read operation.
51
+ bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
52
+ ///< finalize
53
+ void release();
54
+
55
+ typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
56
+
57
+ ///< data read
58
+ bool readNext(void* dst, fnRWSharedData callback);
59
+ ///< data write
60
+ bool writeData(void *data, fnRWSharedData callback);
61
+
62
+ private:
63
+ bool m_initialized;
64
+ bool m_protectRW;
65
+
66
+ int32_t m_itemSize;
67
+ int32_t m_itemCnt;
68
+ ///< data pool
69
+ void *m_dataPool;
70
+ typedef struct {
71
+ ///< index to write
72
+ int32_t m_write;
73
+ ///< index to read
74
+ int32_t m_read;
75
+
76
+ }ShrMemCtrl;
77
+
78
+ ShrMemCtrl *m_shrMem;
79
+#ifdef _WIN32
80
+ void *m_handle;
81
+#else // _WIN32
82
+ char *m_filepath;
83
+#endif // _WIN32
84
+
85
+ ///< Semaphores
86
+ NamedSemaphore *m_writeSem;
87
+ NamedSemaphore *m_readSem;
88
+ };
89
+};
90
+
91
+#endif // ifndef X265_RINGMEM_H
92
x265_3.5.tar.gz/source/common/slice.h -> x265_3.6.tar.gz/source/common/slice.h
Changed
35
1
2
HRDInfo hrdParameters;
3
ProfileTierLevel ptl;
4
uint32_t maxTempSubLayers;
5
- uint32_t numReorderPics;
6
- uint32_t maxDecPicBuffering;
7
- uint32_t maxLatencyIncrease;
8
+ uint32_t numReorderPicsMAX_T_LAYERS;
9
+ uint32_t maxDecPicBufferingMAX_T_LAYERS;
10
+ uint32_t maxLatencyIncreaseMAX_T_LAYERS;
11
};
12
13
struct Window
14
15
uint32_t maxAMPDepth;
16
17
uint32_t maxTempSubLayers; // max number of Temporal Sub layers
18
- uint32_t maxDecPicBuffering; // these are dups of VPS values
19
- uint32_t maxLatencyIncrease;
20
- int numReorderPics;
21
+ uint32_t maxDecPicBufferingMAX_T_LAYERS; // these are dups of VPS values
22
+ uint32_t maxLatencyIncreaseMAX_T_LAYERS;
23
+ int numReorderPicsMAX_T_LAYERS;
24
25
RPS spsrpsMAX_NUM_SHORT_TERM_RPS;
26
int spsrpsNum;
27
28
int m_iNumRPSInSPS;
29
const x265_param *m_param;
30
int m_fieldNum;
31
+ Frame* m_mcstfRefFrameList2MAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
32
33
Slice()
34
{
35
x265_3.6.tar.gz/source/common/temporalfilter.cpp
Added
1019
1
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+#include "common.h"
25
+#include "temporalfilter.h"
26
+#include "primitives.h"
27
+
28
+#include "frame.h"
29
+#include "slice.h"
30
+#include "framedata.h"
31
+#include "analysis.h"
32
+
33
+using namespace X265_NS;
34
+
35
+void OrigPicBuffer::addPicture(Frame* inFrame)
36
+{
37
+ m_mcstfPicList.pushFrontMCSTF(*inFrame);
38
+}
39
+
40
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
41
+{
42
+ m_mcstfOrigPicFreeList.pushFrontMCSTF(*inFrame);
43
+}
44
+
45
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
46
+{
47
+ m_mcstfOrigPicList.pushFrontMCSTF(*inFrame);
48
+}
49
+
50
+OrigPicBuffer::~OrigPicBuffer()
51
+{
52
+ while (!m_mcstfOrigPicList.empty())
53
+ {
54
+ Frame* curFrame = m_mcstfOrigPicList.popBackMCSTF();
55
+ curFrame->destroy();
56
+ delete curFrame;
57
+ }
58
+
59
+ while (!m_mcstfOrigPicFreeList.empty())
60
+ {
61
+ Frame* curFrame = m_mcstfOrigPicFreeList.popBackMCSTF();
62
+ curFrame->destroy();
63
+ delete curFrame;
64
+ }
65
+}
66
+
67
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
68
+{
69
+ Slice* slice = inFrame->m_encData->m_slice;
70
+ uint8_t j = 0;
71
+ for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
72
+ iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
73
+ {
74
+ if (iterPOC != inFrame->m_poc)
75
+ {
76
+ if (iterPOC < 0)
77
+ continue;
78
+ if (iterPOC >= frameCnt)
79
+ break;
80
+
81
+ Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
82
+ X265_CHECK(iterFrame, "Reference frame not found in OPB");
83
+ if (iterFrame != NULL)
84
+ {
85
+ slice->m_mcstfRefFrameList1j = iterFrame;
86
+ iterFrame->m_refPicCnt1--;
87
+ }
88
+
89
+ iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
90
+ if (iterFrame != NULL)
91
+ {
92
+
93
+ slice->m_mcstfRefFrameList1j = iterFrame;
94
+
95
+ iterFrame->m_refPicCnt1--;
96
+ Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
97
+ X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
98
+ cFrame->m_refPicCnt1--;
99
+ }
100
+ j++;
101
+ }
102
+ }
103
+}
104
+
105
+void OrigPicBuffer::recycleOrigPicList()
106
+{
107
+ Frame *iterFrame = m_mcstfPicList.first();
108
+
109
+ while (iterFrame)
110
+ {
111
+ Frame *curFrame = iterFrame;
112
+ iterFrame = iterFrame->m_nextMCSTF;
113
+ if (!curFrame->m_refPicCnt1)
114
+ {
115
+ m_mcstfPicList.removeMCSTF(*curFrame);
116
+ iterFrame = m_mcstfPicList.first();
117
+ }
118
+ }
119
+
120
+ iterFrame = m_mcstfOrigPicList.first();
121
+
122
+ while (iterFrame)
123
+ {
124
+ Frame *curFrame = iterFrame;
125
+ iterFrame = iterFrame->m_nextMCSTF;
126
+ if (!curFrame->m_refPicCnt1)
127
+ {
128
+ m_mcstfOrigPicList.removeMCSTF(*curFrame);
129
+ *curFrame->m_isSubSampled = false;
130
+ m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
131
+ iterFrame = m_mcstfOrigPicList.first();
132
+ }
133
+ }
134
+}
135
+
136
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
137
+{
138
+ m_mcstfOrigPicFreeList.pushBack(*inFrame);
139
+}
140
+
141
+TemporalFilter::TemporalFilter()
142
+{
143
+ m_sourceWidth = 0;
144
+ m_sourceHeight = 0,
145
+ m_QP = 0;
146
+ m_sliceTypeConfig = 3;
147
+ m_numRef = 0;
148
+ m_useSADinME = 1;
149
+
150
+ m_range = 2;
151
+ m_chromaFactor = 0.55;
152
+ m_sigmaMultiplier = 9.0;
153
+ m_sigmaZeroPoint = 10.0;
154
+ m_motionVectorFactor = 16;
155
+}
156
+
157
+void TemporalFilter::init(const x265_param* param)
158
+{
159
+ m_param = param;
160
+ m_bitDepth = param->internalBitDepth;
161
+ m_sourceWidth = param->sourceWidth;
162
+ m_sourceHeight = param->sourceHeight;
163
+ m_internalCsp = param->internalCsp;
164
+ m_numComponents = (m_internalCsp != X265_CSP_I400) ? MAX_NUM_COMPONENT : 1;
165
+
166
+ m_metld = new MotionEstimatorTLD;
167
+
168
+ predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
169
+}
170
+
171
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
172
+{
173
+ CHECKED_MALLOC_ZERO(refFrame->mvs, MV, sizeof(MV)* ((m_sourceWidth ) / 4) * ((m_sourceHeight ) / 4));
174
+ refFrame->mvsStride = m_sourceWidth / 4;
175
+ CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
176
+ refFrame->mvsStride0 = m_sourceWidth / 16;
177
+ CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
178
+ refFrame->mvsStride1 = m_sourceWidth / 16;
179
+ CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, sizeof(MV)* ((m_sourceWidth ) / 16)*((m_sourceHeight ) / 16));
180
+ refFrame->mvsStride2 = m_sourceWidth / 16;
181
+
182
+ CHECKED_MALLOC_ZERO(refFrame->noise, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
183
+ CHECKED_MALLOC_ZERO(refFrame->error, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
184
+
185
+ refFrame->slicetype = X265_TYPE_AUTO;
186
+
187
+ refFrame->compensatedPic = new PicYuv;
188
+ refFrame->compensatedPic->create(param, true);
189
+
190
+ return 1;
191
+fail:
192
+ return 0;
193
+}
194
+
195
+int TemporalFilter::motionErrorLumaSAD(
196
+ PicYuv *orig,
197
+ PicYuv *buffer,
198
+ int x,
199
+ int y,
200
+ int dx,
201
+ int dy,
202
+ int bs,
203
+ int besterror)
204
+{
205
+
206
+ pixel* origOrigin = orig->m_picOrg0;
207
+ intptr_t origStride = orig->m_stride;
208
+ pixel *buffOrigin = buffer->m_picOrg0;
209
+ intptr_t buffStride = buffer->m_stride;
210
+ int error = 0;// dx * 10 + dy * 10;
211
+ if (((dx | dy) & 0xF) == 0)
212
+ {
213
+ dx /= m_motionVectorFactor;
214
+ dy /= m_motionVectorFactor;
215
+
216
+ const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);
217
+#if 0
218
+ const pixel* origRowStart = origOrigin + y *origStride + x;
219
+
220
+ for (int y1 = 0; y1 < bs; y1++)
221
+ {
222
+ for (int x1 = 0; x1 < bs; x1++)
223
+ {
224
+ int diff = origRowStartx1 - bufferRowStartx1;
225
+ error += abs(diff);
226
+ }
227
+
228
+ origRowStart += origStride;
229
+ bufferRowStart += buffStride;
230
+ }
231
+#else
232
+ int partEnum = partitionFromSizes(bs, bs);
233
+ /* copy PU block into cache */
234
+ primitives.pupartEnum.copy_pp(predPUYuv.m_buf0, FENC_STRIDE, bufferRowStart, buffStride);
235
+
236
+ error = m_metld->me.bufSAD(predPUYuv.m_buf0, FENC_STRIDE);
237
+#endif
238
+ if (error > besterror)
239
+ {
240
+ return error;
241
+ }
242
+ }
243
+ else
244
+ {
245
+ const int *xFilter = s_interpolationFilterdx & 0xF;
246
+ const int *yFilter = s_interpolationFilterdy & 0xF;
247
+ int tempArray64 + 864;
248
+
249
+ int iSum, iBase;
250
+ for (int y1 = 1; y1 < bs + 7; y1++)
251
+ {
252
+ const int yOffset = y + y1 + (dy >> 4) - 3;
253
+ const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
254
+ for (int x1 = 0; x1 < bs; x1++)
255
+ {
256
+ iSum = 0;
257
+ iBase = x + x1 + (dx >> 4) - 3;
258
+ const pixel *rowStart = sourceRow + iBase;
259
+
260
+ iSum += xFilter1 * rowStart1;
261
+ iSum += xFilter2 * rowStart2;
262
+ iSum += xFilter3 * rowStart3;
263
+ iSum += xFilter4 * rowStart4;
264
+ iSum += xFilter5 * rowStart5;
265
+ iSum += xFilter6 * rowStart6;
266
+
267
+ tempArrayy1x1 = iSum;
268
+ }
269
+ }
270
+
271
+ const pixel maxSampleValue = (1 << m_bitDepth) - 1;
272
+ for (int y1 = 0; y1 < bs; y1++)
273
+ {
274
+ const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
275
+ for (int x1 = 0; x1 < bs; x1++)
276
+ {
277
+ iSum = 0;
278
+ iSum += yFilter1 * tempArrayy1 + 1x1;
279
+ iSum += yFilter2 * tempArrayy1 + 2x1;
280
+ iSum += yFilter3 * tempArrayy1 + 3x1;
281
+ iSum += yFilter4 * tempArrayy1 + 4x1;
282
+ iSum += yFilter5 * tempArrayy1 + 5x1;
283
+ iSum += yFilter6 * tempArrayy1 + 6x1;
284
+
285
+ iSum = (iSum + (1 << 11)) >> 12;
286
+ iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);
287
+
288
+ error += abs(iSum - origRowx + x1);
289
+ }
290
+ if (error > besterror)
291
+ {
292
+ return error;
293
+ }
294
+ }
295
+ }
296
+ return error;
297
+}
298
+
299
+int TemporalFilter::motionErrorLumaSSD(
300
+ PicYuv *orig,
301
+ PicYuv *buffer,
302
+ int x,
303
+ int y,
304
+ int dx,
305
+ int dy,
306
+ int bs,
307
+ int besterror)
308
+{
309
+
310
+ pixel* origOrigin = orig->m_picOrg0;
311
+ intptr_t origStride = orig->m_stride;
312
+ pixel *buffOrigin = buffer->m_picOrg0;
313
+ intptr_t buffStride = buffer->m_stride;
314
+ int error = 0;// dx * 10 + dy * 10;
315
+ if (((dx | dy) & 0xF) == 0)
316
+ {
317
+ dx /= m_motionVectorFactor;
318
+ dy /= m_motionVectorFactor;
319
+
320
+ const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);
321
+#if 0
322
+ const pixel* origRowStart = origOrigin + y * origStride + x;
323
+
324
+ for (int y1 = 0; y1 < bs; y1++)
325
+ {
326
+ for (int x1 = 0; x1 < bs; x1++)
327
+ {
328
+ int diff = origRowStartx1 - bufferRowStartx1;
329
+ error += diff * diff;
330
+ }
331
+
332
+ origRowStart += origStride;
333
+ bufferRowStart += buffStride;
334
+ }
335
+#else
336
+ int partEnum = partitionFromSizes(bs, bs);
337
+ /* copy PU block into cache */
338
+ primitives.pupartEnum.copy_pp(predPUYuv.m_buf0, FENC_STRIDE, bufferRowStart, buffStride);
339
+
340
+ error = (int)primitives.cupartEnum.sse_pp(m_metld->me.fencPUYuv.m_buf0, FENC_STRIDE, predPUYuv.m_buf0, FENC_STRIDE);
341
+
342
+#endif
343
+ if (error > besterror)
344
+ {
345
+ return error;
346
+ }
347
+ }
348
+ else
349
+ {
350
+ const int *xFilter = s_interpolationFilterdx & 0xF;
351
+ const int *yFilter = s_interpolationFilterdy & 0xF;
352
+ int tempArray64 + 864;
353
+
354
+ int iSum, iBase;
355
+ for (int y1 = 1; y1 < bs + 7; y1++)
356
+ {
357
+ const int yOffset = y + y1 + (dy >> 4) - 3;
358
+ const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
359
+ for (int x1 = 0; x1 < bs; x1++)
360
+ {
361
+ iSum = 0;
362
+ iBase = x + x1 + (dx >> 4) - 3;
363
+ const pixel *rowStart = sourceRow + iBase;
364
+
365
+ iSum += xFilter1 * rowStart1;
366
+ iSum += xFilter2 * rowStart2;
367
+ iSum += xFilter3 * rowStart3;
368
+ iSum += xFilter4 * rowStart4;
369
+ iSum += xFilter5 * rowStart5;
370
+ iSum += xFilter6 * rowStart6;
371
+
372
+ tempArrayy1x1 = iSum;
373
+ }
374
+ }
375
+
376
+ const pixel maxSampleValue = (1 << m_bitDepth) - 1;
377
+ for (int y1 = 0; y1 < bs; y1++)
378
+ {
379
+ const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
380
+ for (int x1 = 0; x1 < bs; x1++)
381
+ {
382
+ iSum = 0;
383
+ iSum += yFilter1 * tempArrayy1 + 1x1;
384
+ iSum += yFilter2 * tempArrayy1 + 2x1;
385
+ iSum += yFilter3 * tempArrayy1 + 3x1;
386
+ iSum += yFilter4 * tempArrayy1 + 4x1;
387
+ iSum += yFilter5 * tempArrayy1 + 5x1;
388
+ iSum += yFilter6 * tempArrayy1 + 6x1;
389
+
390
+ iSum = (iSum + (1 << 11)) >> 12;
391
+ iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);
392
+
393
+ error += (iSum - origRowx + x1) * (iSum - origRowx + x1);
394
+ }
395
+ if (error > besterror)
396
+ {
397
+ return error;
398
+ }
399
+ }
400
+ }
401
+ return error;
402
+}
403
+
404
+void TemporalFilter::applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output)
405
+{
406
+ static const int lumaBlockSize = 8;
407
+ int srcStride = 0;
408
+ int dstStride = 0;
409
+ int csx = 0, csy = 0;
410
+ for (int c = 0; c < m_numComponents; c++)
411
+ {
412
+ const pixel maxValue = (1 << X265_DEPTH) - 1;
413
+
414
+ const pixel *pSrcImage = input->m_picOrgc;
415
+ pixel *pDstImage = output->m_picOrgc;
416
+
417
+ if (!c)
418
+ {
419
+ srcStride = (int)input->m_stride;
420
+ dstStride = (int)output->m_stride;
421
+ }
422
+ else
423
+ {
424
+ srcStride = (int)input->m_strideC;
425
+ dstStride = (int)output->m_strideC;
426
+ csx = CHROMA_H_SHIFT(m_internalCsp);
427
+ csy = CHROMA_V_SHIFT(m_internalCsp);
428
+ }
429
+ const int blockSizeX = lumaBlockSize >> csx;
430
+ const int blockSizeY = lumaBlockSize >> csy;
431
+ const int height = input->m_picHeight >> csy;
432
+ const int width = input->m_picWidth >> csx;
433
+
434
+ for (int y = 0, blockNumY = 0; y + blockSizeY <= height; y += blockSizeY, blockNumY++)
435
+ {
436
+ for (int x = 0, blockNumX = 0; x + blockSizeX <= width; x += blockSizeX, blockNumX++)
437
+ {
438
+ int mvIdx = blockNumY * mvsStride + blockNumX;
439
+ const MV &mv = mvsmvIdx;
440
+ const int dx = mv.x >> csx;
441
+ const int dy = mv.y >> csy;
442
+ const int xInt = mv.x >> (4 + csx);
443
+ const int yInt = mv.y >> (4 + csy);
444
+
445
+ const int *xFilter = s_interpolationFilterdx & 0xf;
446
+ const int *yFilter = s_interpolationFilterdy & 0xf; // will add 6 bit.
447
+ const int numFilterTaps = 7;
448
+ const int centreTapOffset = 3;
449
+
450
+ int tempArraylumaBlockSize + numFilterTapslumaBlockSize;
451
+
452
+ for (int by = 1; by < blockSizeY + numFilterTaps; by++)
453
+ {
454
+ const int yOffset = y + by + yInt - centreTapOffset;
455
+ const pixel *sourceRow = pSrcImage + yOffset * srcStride;
456
+ for (int bx = 0; bx < blockSizeX; bx++)
457
+ {
458
+ int iBase = x + bx + xInt - centreTapOffset;
459
+ const pixel *rowStart = sourceRow + iBase;
460
+
461
+ int iSum = 0;
462
+ iSum += xFilter1 * rowStart1;
463
+ iSum += xFilter2 * rowStart2;
464
+ iSum += xFilter3 * rowStart3;
465
+ iSum += xFilter4 * rowStart4;
466
+ iSum += xFilter5 * rowStart5;
467
+ iSum += xFilter6 * rowStart6;
468
+
469
+ tempArraybybx = iSum;
470
+ }
471
+ }
472
+
473
+ pixel *pDstRow = pDstImage + y * dstStride;
474
+ for (int by = 0; by < blockSizeY; by++, pDstRow += dstStride)
475
+ {
476
+ pixel *pDstPel = pDstRow + x;
477
+ for (int bx = 0; bx < blockSizeX; bx++, pDstPel++)
478
+ {
479
+ int iSum = 0;
480
+
481
+ iSum += yFilter1 * tempArrayby + 1bx;
482
+ iSum += yFilter2 * tempArrayby + 2bx;
483
+ iSum += yFilter3 * tempArrayby + 3bx;
484
+ iSum += yFilter4 * tempArrayby + 4bx;
485
+ iSum += yFilter5 * tempArrayby + 5bx;
486
+ iSum += yFilter6 * tempArrayby + 6bx;
487
+
488
+ iSum = (iSum + (1 << 11)) >> 12;
489
+ iSum = iSum < 0 ? 0 : (iSum > maxValue ? maxValue : iSum);
490
+ *pDstPel = (pixel)iSum;
491
+ }
492
+ }
493
+ }
494
+ }
495
+ }
496
+}
497
+
498
+void TemporalFilter::bilateralFilter(Frame* frame,
499
+ TemporalFilterRefPicInfo* m_mcstfRefList,
500
+ double overallStrength)
501
+{
502
+
503
+ const int numRefs = frame->m_mcstf->m_numRef;
504
+
505
+ for (int i = 0; i < numRefs; i++)
506
+ {
507
+ TemporalFilterRefPicInfo *ref = &m_mcstfRefListi;
508
+ applyMotion(m_mcstfRefListi.mvs, m_mcstfRefListi.mvsStride, m_mcstfRefListi.picBuffer, ref->compensatedPic);
509
+ }
510
+
511
+ int refStrengthRow = 2;
512
+ if (numRefs == m_range * 2)
513
+ {
514
+ refStrengthRow = 0;
515
+ }
516
+ else if (numRefs == m_range)
517
+ {
518
+ refStrengthRow = 1;
519
+ }
520
+
521
+ const double lumaSigmaSq = (m_QP - m_sigmaZeroPoint) * (m_QP - m_sigmaZeroPoint) * m_sigmaMultiplier;
522
+ const double chromaSigmaSq = 30 * 30;
523
+
524
+ PicYuv* orgPic = frame->m_fencPic;
525
+
526
+ for (int c = 0; c < m_numComponents; c++)
527
+ {
528
+ int height, width;
529
+ pixel *srcPelRow = NULL;
530
+ intptr_t srcStride, correctedPicsStride = 0;
531
+
532
+ if (!c)
533
+ {
534
+ height = orgPic->m_picHeight;
535
+ width = orgPic->m_picWidth;
536
+ srcPelRow = orgPic->m_picOrgc;
537
+ srcStride = orgPic->m_stride;
538
+ }
539
+ else
540
+ {
541
+ int csx = CHROMA_H_SHIFT(m_internalCsp);
542
+ int csy = CHROMA_V_SHIFT(m_internalCsp);
543
+
544
+ height = orgPic->m_picHeight >> csy;
545
+ width = orgPic->m_picWidth >> csx;
546
+ srcPelRow = orgPic->m_picOrgc;
547
+ srcStride = (int)orgPic->m_strideC;
548
+ }
549
+
550
+ const double sigmaSq = (!c) ? lumaSigmaSq : chromaSigmaSq;
551
+ const double weightScaling = overallStrength * ( (!c) ? 0.4 : m_chromaFactor);
552
+
553
+ const double maxSampleValue = (1 << m_bitDepth) - 1;
554
+ const double bitDepthDiffWeighting = 1024.0 / (maxSampleValue + 1);
555
+
556
+ const int blkSize = (!c) ? 8 : 4;
557
+
558
+ for (int y = 0; y < height; y++, srcPelRow += srcStride)
559
+ {
560
+ pixel *srcPel = srcPelRow;
561
+
562
+ for (int x = 0; x < width; x++, srcPel++)
563
+ {
564
+ const int orgVal = (int)*srcPel;
565
+ double temporalWeightSum = 1.0;
566
+ double newVal = (double)orgVal;
567
+
568
+ if ((y % blkSize == 0) && (x % blkSize == 0))
569
+ {
570
+ for (int i = 0; i < numRefs; i++)
571
+ {
572
+ TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefListi;
573
+
574
+ if (!c)
575
+ correctedPicsStride = refPicInfo->compensatedPic->m_stride;
576
+ else
577
+ correctedPicsStride = refPicInfo->compensatedPic->m_strideC;
578
+
579
+ double variance = 0, diffsum = 0;
580
+ for (int y1 = 0; y1 < blkSize - 1; y1++)
581
+ {
582
+ for (int x1 = 0; x1 < blkSize - 1; x1++)
583
+ {
584
+ int pix = *(srcPel + x1);
585
+ int pixR = *(srcPel + x1 + 1);
586
+ int pixD = *(srcPel + x1 + srcStride);
587
+
588
+ int ref = *(refPicInfo->compensatedPic->m_picOrgc + ((y + y1) * correctedPicsStride + x + x1));
589
+ int refR = *(refPicInfo->compensatedPic->m_picOrgc + ((y + y1) * correctedPicsStride + x + x1 + 1));
590
+ int refD = *(refPicInfo->compensatedPic->m_picOrgc + ((y + y1 + 1) * correctedPicsStride + x + x1));
591
+
592
+ int diff = pix - ref;
593
+ int diffR = pixR - refR;
594
+ int diffD = pixD - refD;
595
+
596
+ variance += diff * diff;
597
+ diffsum += (diffR - diff) * (diffR - diff);
598
+ diffsum += (diffD - diff) * (diffD - diff);
599
+ }
600
+ }
601
+
602
+ refPicInfo->noise(y / blkSize) * refPicInfo->mvsStride + (x / blkSize) = (int)round((300 * variance + 50) / (10 * diffsum + 50));
603
+ }
604
+ }
605
+
606
+ double minError = 9999999;
607
+ for (int i = 0; i < numRefs; i++)
608
+ {
609
+ TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefListi;
610
+ minError = X265_MIN(minError, (double)refPicInfo->error(y / blkSize) * refPicInfo->mvsStride + (x / blkSize));
611
+ }
612
+
613
+ for (int i = 0; i < numRefs; i++)
614
+ {
615
+ TemporalFilterRefPicInfo *refPicInfo = &m_mcstfRefListi;
616
+
617
+ const int error = refPicInfo->error(y / blkSize) * refPicInfo->mvsStride + (x / blkSize);
618
+ const int noise = refPicInfo->noise(y / blkSize) * refPicInfo->mvsStride + (x / blkSize);
619
+
620
+ const pixel *pCorrectedPelPtr = refPicInfo->compensatedPic->m_picOrgc + (y * correctedPicsStride + x);
621
+ const int refVal = (int)*pCorrectedPelPtr;
622
+ double diff = (double)(refVal - orgVal);
623
+ diff *= bitDepthDiffWeighting;
624
+ double diffSq = diff * diff;
625
+
626
+ const int index = X265_MIN(3, std::abs(refPicInfo->origOffset) - 1);
627
+ double ww = 1, sw = 1;
628
+ ww *= (noise < 25) ? 1 : 1.2;
629
+ sw *= (noise < 25) ? 1.3 : 0.8;
630
+ ww *= (error < 50) ? 1.2 : ((error > 100) ? 0.8 : 1);
631
+ sw *= (error < 50) ? 1.3 : 1;
632
+ ww *= ((minError + 1) / (error + 1));
633
+ const double weight = weightScaling * s_refStrengthsrefStrengthRowindex * ww * exp(-diffSq / (2 * sw * sigmaSq));
634
+
635
+ newVal += weight * refVal;
636
+ temporalWeightSum += weight;
637
+ }
638
+ newVal /= temporalWeightSum;
639
+ double sampleVal = round(newVal);
640
+ sampleVal = (sampleVal < 0 ? 0 : (sampleVal > maxSampleValue ? maxSampleValue : sampleVal));
641
+ *srcPel = (pixel)sampleVal;
642
+ }
643
+ }
644
+ }
645
+}
646
+
647
+void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
648
+ MV *previous, uint32_t prevMvStride, int factor)
649
+{
650
+
651
+ int range = 5;
652
+
653
+
654
+ const int stepSize = blockSize;
655
+
656
+ const int origWidth = orig->m_picWidth;
657
+ const int origHeight = orig->m_picHeight;
658
+
659
+ int error;
660
+
661
+ for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)
662
+ {
663
+ for (int blockX = 0; blockX + blockSize <= origWidth; blockX += stepSize)
664
+ {
665
+ const intptr_t pelOffset = blockY * orig->m_stride + blockX;
666
+ m_metld->me.setSourcePU(orig->m_picOrg0, orig->m_stride, pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
667
+
668
+
669
+ MV best(0, 0);
670
+ int leastError = INT_MAX;
671
+
672
+ if (previous == NULL)
673
+ {
674
+ range = 8;
675
+ }
676
+ else
677
+ {
678
+
679
+ for (int py = -1; py <= 1; py++)
680
+ {
681
+ int testy = blockY / (2 * blockSize) + py;
682
+
683
+ for (int px = -1; px <= 1; px++)
684
+ {
685
+
686
+ int testx = blockX / (2 * blockSize) + px;
687
+ if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize)))
688
+ {
689
+ int mvIdx = testy * prevMvStride + testx;
690
+ MV old = previousmvIdx;
691
+
692
+ if (m_useSADinME)
693
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
694
+ else
695
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
696
+
697
+ if (error < leastError)
698
+ {
699
+ best.set(old.x * factor, old.y * factor);
700
+ leastError = error;
701
+ }
702
+ }
703
+ }
704
+ }
705
+
706
+ if (m_useSADinME)
707
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
708
+ else
709
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
710
+
711
+ if (error < leastError)
712
+ {
713
+ best.set(0, 0);
714
+ leastError = error;
715
+ }
716
+
717
+ }
718
+
719
+ MV prevBest = best;
720
+ for (int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2++)
721
+ {
722
+ for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
723
+ {
724
+ if (m_useSADinME)
725
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
726
+ else
727
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
728
+ if (error < leastError)
729
+ {
730
+ best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);
731
+ leastError = error;
732
+ }
733
+ }
734
+ }
735
+
736
+ if (blockY > 0)
737
+ {
738
+ int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);
739
+ MV aboveMV = mvsidx;
740
+
741
+ if (m_useSADinME)
742
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
743
+ else
744
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
745
+
746
+ if (error < leastError)
747
+ {
748
+ best.set(aboveMV.x, aboveMV.y);
749
+ leastError = error;
750
+ }
751
+ }
752
+
753
+ if (blockX > 0)
754
+ {
755
+ int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);
756
+ MV leftMV = mvsidx;
757
+
758
+ if (m_useSADinME)
759
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
760
+ else
761
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
762
+
763
+ if (error < leastError)
764
+ {
765
+ best.set(leftMV.x, leftMV.y);
766
+ leastError = error;
767
+ }
768
+ }
769
+
770
+ // calculate average
771
+ double avg = 0.0;
772
+ for (int x1 = 0; x1 < blockSize; x1++)
773
+ {
774
+ for (int y1 = 0; y1 < blockSize; y1++)
775
+ {
776
+ avg = avg + *(orig->m_picOrg0 + (blockX + x1 + orig->m_stride * (blockY + y1)));
777
+ }
778
+ }
779
+ avg = avg / (blockSize * blockSize);
780
+
781
+ // calculate variance
782
+ double variance = 0;
783
+ for (int x1 = 0; x1 < blockSize; x1++)
784
+ {
785
+ for (int y1 = 0; y1 < blockSize; y1++)
786
+ {
787
+ int pix = *(orig->m_picOrg0 + (blockX + x1 + orig->m_stride * (blockY + y1)));
788
+ variance = variance + (pix - avg) * (pix - avg);
789
+ }
790
+ }
791
+
792
+ leastError = (int)(20 * ((leastError + 5.0) / (variance + 5.0)) + (leastError / (blockSize * blockSize)) / 50);
793
+
794
+ int mvIdx = (blockY / stepSize) * mvStride + (blockX / stepSize);
795
+ mvsmvIdx = best;
796
+ }
797
+ }
798
+}
799
+
800
+
801
+void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
802
+ MV *previous, uint32_t prevMvStride, int factor, int* minError)
803
+{
804
+
805
+ int range = 0;
806
+
807
+
808
+ const int stepSize = blockSize;
809
+
810
+ const int origWidth = orig->m_picWidth;
811
+ const int origHeight = orig->m_picHeight;
812
+
813
+ int error;
814
+
815
+ for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)
816
+ {
817
+ for (int blockX = 0; blockX + blockSize <= origWidth; blockX += stepSize)
818
+ {
819
+
820
+ const intptr_t pelOffset = blockY * orig->m_stride + blockX;
821
+ m_metld->me.setSourcePU(orig->m_picOrg0, orig->m_stride, pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
822
+
823
+ MV best(0, 0);
824
+ int leastError = INT_MAX;
825
+
826
+ if (previous == NULL)
827
+ {
828
+ range = 8;
829
+ }
830
+ else
831
+ {
832
+
833
+ for (int py = -1; py <= 1; py++)
834
+ {
835
+ int testy = blockY / (2 * blockSize) + py;
836
+
837
+ for (int px = -1; px <= 1; px++)
838
+ {
839
+
840
+ int testx = blockX / (2 * blockSize) + px;
841
+ if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize)))
842
+ {
843
+ int mvIdx = testy * prevMvStride + testx;
844
+ MV old = previousmvIdx;
845
+
846
+ if (m_useSADinME)
847
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
848
+ else
849
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
850
+
851
+ if (error < leastError)
852
+ {
853
+ best.set(old.x * factor, old.y * factor);
854
+ leastError = error;
855
+ }
856
+ }
857
+ }
858
+ }
859
+
860
+ if (m_useSADinME)
861
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
862
+ else
863
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);
864
+
865
+ if (error < leastError)
866
+ {
867
+ best.set(0, 0);
868
+ leastError = error;
869
+ }
870
+
871
+ }
872
+
873
+ MV prevBest = best;
874
+ for (int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2++)
875
+ {
876
+ for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
877
+ {
878
+ if (m_useSADinME)
879
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
880
+ else
881
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);
882
+
883
+ if (error < leastError)
884
+ {
885
+ best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);
886
+ leastError = error;
887
+ }
888
+ }
889
+ }
890
+
891
+ prevBest = best;
892
+ int doubleRange = 3 * 4;
893
+ for (int y2 = prevBest.y - doubleRange; y2 <= prevBest.y + doubleRange; y2 += 4)
894
+ {
895
+ for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2 += 4)
896
+ {
897
+ if (m_useSADinME)
898
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
899
+ else
900
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
901
+
902
+ if (error < leastError)
903
+ {
904
+ best.set(x2, y2);
905
+ leastError = error;
906
+ }
907
+ }
908
+ }
909
+
910
+ prevBest = best;
911
+ doubleRange = 3;
912
+ for (int y2 = prevBest.y - doubleRange; y2 <= prevBest.y + doubleRange; y2++)
913
+ {
914
+ for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2++)
915
+ {
916
+ if (m_useSADinME)
917
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
918
+ else
919
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);
920
+
921
+ if (error < leastError)
922
+ {
923
+ best.set(x2, y2);
924
+ leastError = error;
925
+ }
926
+ }
927
+ }
928
+
929
+
930
+ if (blockY > 0)
931
+ {
932
+ int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);
933
+ MV aboveMV = mvsidx;
934
+
935
+ if (m_useSADinME)
936
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
937
+ else
938
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
939
+
940
+ if (error < leastError)
941
+ {
942
+ best.set(aboveMV.x, aboveMV.y);
943
+ leastError = error;
944
+ }
945
+ }
946
+
947
+ if (blockX > 0)
948
+ {
949
+ int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);
950
+ MV leftMV = mvsidx;
951
+
952
+ if (m_useSADinME)
953
+ error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
954
+ else
955
+ error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);
956
+
957
+ if (error < leastError)
958
+ {
959
+ best.set(leftMV.x, leftMV.y);
960
+ leastError = error;
961
+ }
962
+ }
963
+
964
+ // calculate average
965
+ double avg = 0.0;
966
+ for (int x1 = 0; x1 < blockSize; x1++)
967
+ {
968
+ for (int y1 = 0; y1 < blockSize; y1++)
969
+ {
970
+ avg = avg + *(orig->m_picOrg0 + (blockX + x1 + orig->m_stride * (blockY + y1)));
971
+ }
972
+ }
973
+ avg = avg / (blockSize * blockSize);
974
+
975
+ // calculate variance
976
+ double variance = 0;
977
+ for (int x1 = 0; x1 < blockSize; x1++)
978
+ {
979
+ for (int y1 = 0; y1 < blockSize; y1++)
980
+ {
981
+ int pix = *(orig->m_picOrg0 + (blockX + x1 + orig->m_stride * (blockY + y1)));
982
+ variance = variance + (pix - avg) * (pix - avg);
983
+ }
984
+ }
985
+
986
+ leastError = (int)(20 * ((leastError + 5.0) / (variance + 5.0)) + (leastError / (blockSize * blockSize)) / 50);
987
+
988
+ int mvIdx = (blockY / stepSize) * mvStride + (blockX / stepSize);
989
+ mvsmvIdx = best;
990
+ minErrormvIdx = leastError;
991
+ }
992
+ }
993
+}
994
+
995
+void TemporalFilter::destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame)
996
+{
997
+ if (curFrame)
998
+ {
999
+ if (curFrame->compensatedPic)
1000
+ {
1001
+ curFrame->compensatedPic->destroy();
1002
+ delete curFrame->compensatedPic;
1003
+ }
1004
+
1005
+ if (curFrame->mvs)
1006
+ X265_FREE(curFrame->mvs);
1007
+ if (curFrame->mvs0)
1008
+ X265_FREE(curFrame->mvs0);
1009
+ if (curFrame->mvs1)
1010
+ X265_FREE(curFrame->mvs1);
1011
+ if (curFrame->mvs2)
1012
+ X265_FREE(curFrame->mvs2);
1013
+ if (curFrame->noise)
1014
+ X265_FREE(curFrame->noise);
1015
+ if (curFrame->error)
1016
+ X265_FREE(curFrame->error);
1017
+ }
1018
+}
1019
x265_3.6.tar.gz/source/common/temporalfilter.h
Added
187
1
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+
25
+#ifndef X265_TEMPORAL_FILTER_H
26
+#define X265_TEMPORAL_FILTER_H
27
+
28
+#include "x265.h"
29
+#include "picyuv.h"
30
+#include "mv.h"
31
+#include "piclist.h"
32
+#include "yuv.h"
33
+#include "motion.h"
34
+
35
+const int s_interpolationFilter168 =
36
+{
37
+ { 0, 0, 0, 64, 0, 0, 0, 0 }, //0
38
+ { 0, 1, -3, 64, 4, -2, 0, 0 }, //1 -->-->
39
+ { 0, 1, -6, 62, 9, -3, 1, 0 }, //2 -->
40
+ { 0, 2, -8, 60, 14, -5, 1, 0 }, //3 -->-->
41
+ { 0, 2, -9, 57, 19, -7, 2, 0 }, //4
42
+ { 0, 3, -10, 53, 24, -8, 2, 0 }, //5 -->-->
43
+ { 0, 3, -11, 50, 29, -9, 2, 0 }, //6 -->
44
+ { 0, 3, -11, 44, 35, -10, 3, 0 }, //7 -->-->
45
+ { 0, 1, -7, 38, 38, -7, 1, 0 }, //8
46
+ { 0, 3, -10, 35, 44, -11, 3, 0 }, //9 -->-->
47
+ { 0, 2, -9, 29, 50, -11, 3, 0 }, //10-->
48
+ { 0, 2, -8, 24, 53, -10, 3, 0 }, //11-->-->
49
+ { 0, 2, -7, 19, 57, -9, 2, 0 }, //12
50
+ { 0, 1, -5, 14, 60, -8, 2, 0 }, //13-->-->
51
+ { 0, 1, -3, 9, 62, -6, 1, 0 }, //14-->
52
+ { 0, 0, -2, 4, 64, -3, 1, 0 } //15-->-->
53
+};
54
+
55
+const double s_refStrengths34 =
56
+{ // abs(POC offset)
57
+ // 1, 2 3 4
58
+ {0.85, 0.57, 0.41, 0.33}, // m_range * 2
59
+ {1.13, 0.97, 0.81, 0.57}, // m_range
60
+ {0.30, 0.30, 0.30, 0.30} // otherwise
61
+};
62
+
63
+namespace X265_NS {
64
+ class OrigPicBuffer
65
+ {
66
+ public:
67
+ PicList m_mcstfPicList;
68
+ PicList m_mcstfOrigPicFreeList;
69
+ PicList m_mcstfOrigPicList;
70
+
71
+ ~OrigPicBuffer();
72
+ void addPicture(Frame*);
73
+ void addEncPicture(Frame*);
74
+ void setOrigPicList(Frame*, int);
75
+ void recycleOrigPicList();
76
+ void addPictureToFreelist(Frame*);
77
+ void addEncPictureToPicList(Frame*);
78
+ };
79
+
80
+ struct MotionEstimatorTLD
81
+ {
82
+ MotionEstimate me;
83
+
84
+ MotionEstimatorTLD()
85
+ {
86
+ me.init(X265_CSP_I400);
87
+ me.setQP(X265_LOOKAHEAD_QP);
88
+ }
89
+
90
+ ~MotionEstimatorTLD() {}
91
+ };
92
+
93
+ struct TemporalFilterRefPicInfo
94
+ {
95
+ PicYuv* picBuffer;
96
+ PicYuv* picBufferSubSampled2;
97
+ PicYuv* picBufferSubSampled4;
98
+ MV* mvs;
99
+ MV* mvs0;
100
+ MV* mvs1;
101
+ MV* mvs2;
102
+ uint32_t mvsStride;
103
+ uint32_t mvsStride0;
104
+ uint32_t mvsStride1;
105
+ uint32_t mvsStride2;
106
+ int* error;
107
+ int* noise;
108
+
109
+ int16_t origOffset;
110
+ bool isFilteredFrame;
111
+ PicYuv* compensatedPic;
112
+
113
+ int* isSubsampled;
114
+
115
+ int slicetype;
116
+ };
117
+
118
+ class TemporalFilter
119
+ {
120
+ public:
121
+ TemporalFilter();
122
+ ~TemporalFilter() {}
123
+
124
+ void init(const x265_param* param);
125
+
126
+ //private:
127
+ // Private static member variables
128
+ const x265_param *m_param;
129
+ int32_t m_bitDepth;
130
+ int m_range;
131
+ uint8_t m_numRef;
132
+ double m_chromaFactor;
133
+ double m_sigmaMultiplier;
134
+ double m_sigmaZeroPoint;
135
+ int m_motionVectorFactor;
136
+ int m_padding;
137
+
138
+ // Private member variables
139
+
140
+ int m_sourceWidth;
141
+ int m_sourceHeight;
142
+ int m_QP;
143
+
144
+ int m_internalCsp;
145
+ int m_numComponents;
146
+ uint8_t m_sliceTypeConfig;
147
+
148
+ MotionEstimatorTLD* m_metld;
149
+ Yuv predPUYuv;
150
+ int m_useSADinME;
151
+
152
+ int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
153
+
154
+ void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
155
+
156
+ void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
157
+ MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
158
+
159
+ void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
160
+ MV *previous, uint32_t prevMvStride, int factor, int* minError);
161
+
162
+ int motionErrorLumaSSD(PicYuv *orig,
163
+ PicYuv *buffer,
164
+ int x,
165
+ int y,
166
+ int dx,
167
+ int dy,
168
+ int bs,
169
+ int besterror = 8 * 8 * 1024 * 1024);
170
+
171
+ int motionErrorLumaSAD(PicYuv *orig,
172
+ PicYuv *buffer,
173
+ int x,
174
+ int y,
175
+ int dx,
176
+ int dy,
177
+ int bs,
178
+ int besterror = 8 * 8 * 1024 * 1024);
179
+
180
+ void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
181
+
182
+ void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
183
+
184
+ };
185
+}
186
+#endif
187
x265_3.5.tar.gz/source/common/threading.h -> x265_3.6.tar.gz/source/common/threading.h
Changed
340
1
2
*
3
* Authors: Steve Borho <steve@borho.org>
4
* Min Chen <chenm003@163.com>
5
+ liwei <liwei@multicorewareinc.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
10
int m_val;
11
};
12
13
+class NamedSemaphore
14
+{
15
+public:
16
+ NamedSemaphore() : m_sem(NULL)
17
+ {
18
+ }
19
+
20
+ ~NamedSemaphore()
21
+ {
22
+ }
23
+
24
+ bool create(const char* name, const int initcnt, const int maxcnt)
25
+ {
26
+ if(!m_sem)
27
+ {
28
+ m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
29
+ }
30
+ return m_sem != NULL;
31
+ }
32
+
33
+ bool give(const int32_t cnt)
34
+ {
35
+ return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
36
+ }
37
+
38
+ bool take(const uint32_t time_out = INFINITE)
39
+ {
40
+ int32_t rt = WaitForSingleObject(m_sem, time_out);
41
+ return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
42
+ }
43
+
44
+ void release()
45
+ {
46
+ CloseHandle(m_sem);
47
+ m_sem = NULL;
48
+ }
49
+
50
+private:
51
+ HANDLE m_sem;
52
+};
53
+
54
#else /* POSIX / pthreads */
55
56
typedef pthread_t ThreadHandle;
57
58
int m_val;
59
};
60
61
+#define TIMEOUT_INFINITE 0xFFFFFFFF
62
+
63
+class NamedSemaphore
64
+{
65
+public:
66
+ NamedSemaphore()
67
+ : m_sem(NULL)
68
+#ifndef __APPLE__
69
+ , m_name(NULL)
70
+#endif //__APPLE__
71
+ {
72
+ }
73
+
74
+ ~NamedSemaphore()
75
+ {
76
+ }
77
+
78
+ bool create(const char* name, const int initcnt, const int maxcnt)
79
+ {
80
+ bool ret = false;
81
+
82
+ if (initcnt >= maxcnt)
83
+ {
84
+ return false;
85
+ }
86
+
87
+#ifdef __APPLE__
88
+ do
89
+ {
90
+ int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
91
+
92
+ m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
93
+ if (!m_sem)
94
+ {
95
+ break;
96
+ }
97
+
98
+ if (pthread_mutexattr_init(&m_sem->mutexAttr))
99
+ {
100
+ break;
101
+ }
102
+
103
+ if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
104
+ {
105
+ break;
106
+ }
107
+
108
+ if (pthread_condattr_init(&m_sem->condAttr))
109
+ {
110
+ break;
111
+ }
112
+
113
+ if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
114
+ {
115
+ break;
116
+ }
117
+
118
+ if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
119
+ {
120
+ break;
121
+ }
122
+
123
+ if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
124
+ {
125
+ break;
126
+ }
127
+
128
+ m_sem->curCnt = initcnt;
129
+ m_sem->maxCnt = maxcnt;
130
+
131
+ ret = true;
132
+ } while (0);
133
+
134
+ if (!ret)
135
+ {
136
+ release();
137
+ }
138
+
139
+#else //__APPLE__
140
+ m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
141
+ if (m_sem != SEM_FAILED)
142
+ {
143
+ m_name = strdup(name);
144
+ ret = true;
145
+ }
146
+ else
147
+ {
148
+ if (EEXIST == errno)
149
+ {
150
+ m_sem = sem_open(name, 0);
151
+ if (m_sem != SEM_FAILED)
152
+ {
153
+ m_name = strdup(name);
154
+ ret = true;
155
+ }
156
+ }
157
+ }
158
+#endif //__APPLE__
159
+
160
+ return ret;
161
+ }
162
+
163
+ bool give(const int32_t cnt)
164
+ {
165
+ if (!m_sem)
166
+ {
167
+ return false;
168
+ }
169
+
170
+#ifdef __APPLE__
171
+ if (pthread_mutex_lock(&m_sem->mutex))
172
+ {
173
+ return false;
174
+ }
175
+
176
+ int oldCnt = m_sem->curCnt;
177
+ m_sem->curCnt += cnt;
178
+ if (m_sem->curCnt > m_sem->maxCnt)
179
+ {
180
+ m_sem->curCnt = m_sem->maxCnt;
181
+ }
182
+
183
+ bool ret = true;
184
+ if (!oldCnt)
185
+ {
186
+ ret = 0 == pthread_cond_broadcast(&m_sem->cond);
187
+ }
188
+
189
+ if (pthread_mutex_unlock(&m_sem->mutex))
190
+ {
191
+ return false;
192
+ }
193
+
194
+ return ret;
195
+#else //__APPLE__
196
+ int ret = 0;
197
+ int32_t curCnt = cnt;
198
+ while (curCnt-- && !ret) {
199
+ ret = sem_post(m_sem);
200
+ }
201
+
202
+ return 0 == ret;
203
+#endif //_APPLE__
204
+ }
205
+
206
+ bool take(const uint32_t time_out = TIMEOUT_INFINITE)
207
+ {
208
+ if (!m_sem)
209
+ {
210
+ return false;
211
+ }
212
+
213
+#ifdef __APPLE__
214
+
215
+ if (pthread_mutex_lock(&m_sem->mutex))
216
+ {
217
+ return false;
218
+ }
219
+
220
+ bool ret = true;
221
+ if (TIMEOUT_INFINITE == time_out)
222
+ {
223
+ if (!m_sem->curCnt)
224
+ {
225
+ if (pthread_cond_wait(&m_sem->cond, &m_sem->mutex))
226
+ {
227
+ ret = false;
228
+ }
229
+ }
230
+
231
+ if (m_sem->curCnt && ret)
232
+ {
233
+ m_sem->curCnt--;
234
+ }
235
+ }
236
+ else
237
+ {
238
+ if (0 == time_out)
239
+ {
240
+ if (m_sem->curCnt)
241
+ {
242
+ m_sem->curCnt--;
243
+ }
244
+ else
245
+ {
246
+ ret = false;
247
+ }
248
+ }
249
+ else
250
+ {
251
+ if (!m_sem->curCnt)
252
+ {
253
+ struct timespec ts;
254
+ ts.tv_sec = time_out / 1000L;
255
+ ts.tv_nsec = (time_out * 1000000L) - ts.tv_sec * 1000 * 1000 * 1000;
256
+
257
+ if (pthread_cond_timedwait(&m_sem->cond, &m_sem->mutex, &ts))
258
+ {
259
+ ret = false;
260
+ }
261
+ }
262
+
263
+ if (m_sem->curCnt && ret)
264
+ {
265
+ m_sem->curCnt--;
266
+ }
267
+ }
268
+ }
269
+
270
+ if (pthread_mutex_unlock(&m_sem->mutex))
271
+ {
272
+ return false;
273
+ }
274
+
275
+ return ret;
276
+#else //__APPLE__
277
+ if (TIMEOUT_INFINITE == time_out)
278
+ {
279
+ return 0 == sem_wait(m_sem);
280
+ }
281
+ else
282
+ {
283
+ if (0 == time_out)
284
+ {
285
+ return 0 == sem_trywait(m_sem);
286
+ }
287
+ else
288
+ {
289
+ struct timespec ts;
290
+ ts.tv_sec = time_out / 1000L;
291
+ ts.tv_nsec = (time_out * 1000000L) - ts.tv_sec * 1000 * 1000 * 1000;
292
+ return 0 == sem_timedwait(m_sem, &ts);
293
+ }
294
+ }
295
+#endif //_APPLE__
296
+ }
297
+
298
+ void release()
299
+ {
300
+ if (m_sem)
301
+ {
302
+#ifdef __APPLE__
303
+ pthread_condattr_destroy(&m_sem->condAttr);
304
+ pthread_mutexattr_destroy(&m_sem->mutexAttr);
305
+ pthread_mutex_destroy(&m_sem->mutex);
306
+ pthread_cond_destroy(&m_sem->cond);
307
+ free(m_sem);
308
+ m_sem = NULL;
309
+#else //__APPLE__
310
+ sem_close(m_sem);
311
+ sem_unlink(m_name);
312
+ m_sem = NULL;
313
+ free(m_name);
314
+ m_name = NULL;
315
+#endif //__APPLE__
316
+ }
317
+ }
318
+
319
+private:
320
+#ifdef __APPLE__
321
+ typedef struct
322
+ {
323
+ pthread_mutex_t mutex;
324
+ pthread_cond_t cond;
325
+ pthread_mutexattr_t mutexAttr;
326
+ pthread_condattr_t condAttr;
327
+ uint32_t curCnt;
328
+ uint32_t maxCnt;
329
+ }mac_sem_t;
330
+ mac_sem_t *m_sem;
331
+#else // __APPLE__
332
+ sem_t *m_sem;
333
+ char *m_name;
334
+#endif // __APPLE_
335
+};
336
+
337
#endif // ifdef _WIN32
338
339
class ScopedLock
340
x265_3.5.tar.gz/source/common/threadpool.cpp -> x265_3.6.tar.gz/source/common/threadpool.cpp
Changed
10
1
2
/* limit threads based on param->numaPools
3
* For windows because threads can't be allocated to live across sockets
4
* changing the default behavior to be per-socket pools -- FIXME */
5
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
7
if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
8
{
9
char poolString50 = "";
10
x265_3.5.tar.gz/source/common/version.cpp -> x265_3.6.tar.gz/source/common/version.cpp
Changed
10
1
2
#define ONOS "Unk-OS"
3
#endif
4
5
-#if X86_64
6
+#if defined(_LP64) || defined(_WIN64)
7
#define BITS "64 bit"
8
#else
9
#define BITS "32 bit"
10
x265_3.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/x86/asm-primitives.cpp
Changed
85
1
2
3
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
4
p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
5
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
6
// TODO: the planecopy_sp is really planecopy_SC now, must be fix it
7
//p.planecopy_sp = PFX(downShift_16_sse2);
8
p.planecopy_sp_shl = PFX(upShift_16_sse2);
9
10
{
11
ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
12
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
13
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
14
15
// p.puLUMA_4x4.satd = p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
16
ALL_LUMA_PU(satd, pixel_satd, ssse3);
17
18
p.puLUMA_64x48.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
19
p.puLUMA_64x64.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
20
p.propagateCost = PFX(mbtree_propagate_cost_avx);
21
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
22
}
23
if (cpuMask & X265_CPU_XOP)
24
{
25
26
LUMA_VAR(xop);
27
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
28
p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
29
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
30
}
31
if (cpuMask & X265_CPU_AVX2)
32
{
33
34
35
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
36
p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
37
+
38
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
39
+
40
p.propagateCost = PFX(mbtree_propagate_cost_avx2);
41
p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
42
p.fix8Pack = PFX(cutree_fix8_pack_avx2);
43
44
//p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
45
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
46
p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
47
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
48
49
ALL_LUMA_TU(blockfill_sNONALIGNED, blockfill_s, sse2);
50
ALL_LUMA_TU(blockfill_sALIGNED, blockfill_s, sse2);
51
52
ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
53
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
54
55
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
56
+
57
ASSIGN2(p.puLUMA_8x4.convert_p2s, filterPixelToShort_8x4_ssse3);
58
ASSIGN2(p.puLUMA_8x8.convert_p2s, filterPixelToShort_8x8_ssse3);
59
ASSIGN2(p.puLUMA_8x16.convert_p2s, filterPixelToShort_8x16_ssse3);
60
61
p.frameInitLowres = PFX(frame_init_lowres_core_avx);
62
p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
63
p.propagateCost = PFX(mbtree_propagate_cost_avx);
64
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
65
}
66
if (cpuMask & X265_CPU_XOP)
67
{
68
69
p.cuBLOCK_16x16.sse_pp = PFX(pixel_ssd_16x16_xop);
70
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
71
p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
72
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
73
74
}
75
#if X86_64
76
77
p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
78
p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
79
80
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
81
+
82
if (cpuMask & X265_CPU_BMI2)
83
{
84
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
85
x265_3.5.tar.gz/source/common/x86/const-a.asm -> x265_3.6.tar.gz/source/common/x86/const-a.asm
Changed
10
1
2
const pw_2000, times 16 dw 0x2000
3
const pw_8000, times 8 dw 0x8000
4
const pw_3fff, times 16 dw 0x3fff
5
-const pw_32_0, times 4 dw 32,
6
+const pw_32_0, times 4 dw 32
7
times 4 dw 0
8
const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1)
9
10
x265_3.5.tar.gz/source/common/x86/h-ipfilter8.asm -> x265_3.6.tar.gz/source/common/x86/h-ipfilter8.asm
Changed
20
1
2
ALIGN 32
3
interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
4
5
+ALIGN 32
6
+const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
7
+
8
SECTION .text
9
10
cextern pw_1
11
12
13
RET
14
15
-ALIGN 32
16
-const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
17
18
%macro FILTER_H4_w6 3
19
movu %1, srcq - 1
20
x265_3.5.tar.gz/source/common/x86/mc-a2.asm -> x265_3.6.tar.gz/source/common/x86/mc-a2.asm
Changed
264
1
2
FRAME_INIT_LOWRES
3
%endif
4
5
+%macro SUBSAMPLEFILT8x4 7
6
+ mova %3, r0+%7
7
+ mova %4, r0+r2+%7
8
+ pavgb %3, %4
9
+ pavgb %4, r0+r2*2+%7
10
+ PALIGNR %1, %3, 1, m6
11
+ PALIGNR %2, %4, 1, m6
12
+%if cpuflag(xop)
13
+ pavgb %1, %3
14
+ pavgb %2, %4
15
+%else
16
+ pavgb %1, %3
17
+ pavgb %2, %4
18
+ psrlw %5, %1, 8
19
+ psrlw %6, %2, 8
20
+ pand %1, m7
21
+ pand %2, m7
22
+%endif
23
+%endmacro
24
+
25
+%macro SUBSAMPLEFILT32x4U 1
26
+ movu m1, r0+r2
27
+ pavgb m0, m1, r0
28
+ movu m3, r0+r2+1
29
+ pavgb m2, m3, r0+1
30
+ pavgb m1, r0+r2*2
31
+ pavgb m3, r0+r2*2+1
32
+ pavgb m0, m2
33
+ pavgb m1, m3
34
+
35
+ movu m3, r0+r2+mmsize
36
+ pavgb m2, m3, r0+mmsize
37
+ movu m5, r0+r2+1+mmsize
38
+ pavgb m4, m5, r0+1+mmsize
39
+ pavgb m2, m4
40
+
41
+ pshufb m0, m7
42
+ pshufb m2, m7
43
+ punpcklqdq m0, m0, m2
44
+ vpermq m0, m0, q3120
45
+ movu %1, m0
46
+%endmacro
47
+
48
+%macro SUBSAMPLEFILT16x2 3
49
+ mova m3, r0+%3+mmsize
50
+ mova m2, r0+%3
51
+ pavgb m3, r0+%3+r2+mmsize
52
+ pavgb m2, r0+%3+r2
53
+ PALIGNR %1, m3, 1, m6
54
+ pavgb %1, m3
55
+ PALIGNR m3, m2, 1, m6
56
+ pavgb m3, m2
57
+%if cpuflag(xop)
58
+ vpperm m3, m3, %1, m6
59
+%else
60
+ pand m3, m7
61
+ pand %1, m7
62
+ packuswb m3, %1
63
+%endif
64
+ mova %2, m3
65
+ mova %1, m2
66
+%endmacro
67
+
68
+%macro SUBSAMPLEFILT8x2U 2
69
+ mova m2, r0+%2
70
+ pavgb m2, r0+%2+r2
71
+ mova m0, r0+%2+1
72
+ pavgb m0, r0+%2+r2+1
73
+ pavgb m1, m3
74
+ pavgb m0, m2
75
+ pand m1, m7
76
+ pand m0, m7
77
+ packuswb m0, m1
78
+ mova %1, m0
79
+%endmacro
80
+
81
+%macro SUBSAMPLEFILT8xU 2
82
+ mova m3, r0+%2+8
83
+ mova m2, r0+%2
84
+ pavgw m3, r0+%2+r2+8
85
+ pavgw m2, r0+%2+r2
86
+ movu m1, r0+%2+10
87
+ movu m0, r0+%2+2
88
+ pavgw m1, r0+%2+r2+10
89
+ pavgw m0, r0+%2+r2+2
90
+ pavgw m1, m3
91
+ pavgw m0, m2
92
+ psrld m3, m1, 16
93
+ pand m1, m7
94
+ pand m0, m7
95
+ packssdw m0, m1
96
+ movu %1, m0
97
+%endmacro
98
+
99
+%macro SUBSAMPLEFILT8xA 3
100
+ movu m3, r0+%3+mmsize
101
+ movu m2, r0+%3
102
+ pavgw m3, r0+%3+r2+mmsize
103
+ pavgw m2, r0+%3+r2
104
+ PALIGNR %1, m3, 2, m6
105
+ pavgw %1, m3
106
+ PALIGNR m3, m2, 2, m6
107
+ pavgw m3, m2
108
+%if cpuflag(xop)
109
+ vpperm m3, m3, %1, m6
110
+%else
111
+ pand m3, m7
112
+ pand %1, m7
113
+ packssdw m3, %1
114
+%endif
115
+%if cpuflag(avx2)
116
+ vpermq m3, m3, q3120
117
+%endif
118
+ movu %2, m3
119
+ movu %1, m2
120
+%endmacro
121
+
122
+;-----------------------------------------------------------------------------
123
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
124
+; intptr_t src_stride, intptr_t dst_stride, int width, int height )
125
+;-----------------------------------------------------------------------------
126
+
127
+%macro FRAME_SUBSAMPLE_LUMA 0
128
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
129
+%if HIGH_BIT_DEPTH
130
+ shl dword r3m, 1
131
+ FIX_STRIDES r2
132
+ shl dword r4m, 1
133
+%endif
134
+%if mmsize >= 16
135
+ add dword r4m, mmsize-1
136
+ and dword r4m, ~(mmsize-1)
137
+%endif
138
+ ; src += 2*(height-1)*stride + 2*width
139
+ mov r6d, r5m
140
+ dec r6d
141
+ imul r6d, r2d
142
+ add r6d, r4m
143
+ lea r0, r0+r6*2
144
+ ; dst += (height-1)*stride + width
145
+ mov r6d, r5m
146
+ dec r6d
147
+ imul r6d, r3m
148
+ add r6d, r4m
149
+ add r1, r6
150
+ ; gap = stride - width
151
+ mov r6d, r3m
152
+ sub r6d, r4m
153
+ PUSH r6
154
+ %define dst_gap rsp+gprsize
155
+ mov r6d, r2d
156
+ sub r6d, r4m
157
+ shl r6d, 1
158
+ PUSH r6
159
+ %define src_gap rsp
160
+%if HIGH_BIT_DEPTH
161
+%if cpuflag(xop)
162
+ mova m6, deinterleave_shuf32a
163
+ mova m7, deinterleave_shuf32b
164
+%else
165
+ pcmpeqw m7, m7
166
+ psrld m7, 16
167
+%endif
168
+.vloop:
169
+ mov r6d, r4m
170
+%ifnidn cpuname, mmx2
171
+ movu m0, r0
172
+ movu m1, r0+r2
173
+ pavgw m0, m1
174
+ pavgw m1, r0+r2*2
175
+%endif
176
+.hloop:
177
+ sub r0, mmsize*2
178
+ sub r1, mmsize
179
+%ifidn cpuname, mmx2
180
+ SUBSAMPLEFILT8xU r1, 0
181
+%else
182
+ SUBSAMPLEFILT8xA m0, r1, 0
183
+%endif
184
+ sub r6d, mmsize
185
+ jg .hloop
186
+%else ; !HIGH_BIT_DEPTH
187
+%if cpuflag(avx2)
188
+ mova m7, deinterleave_shuf
189
+%elif cpuflag(xop)
190
+ mova m6, deinterleave_shuf32a
191
+ mova m7, deinterleave_shuf32b
192
+%else
193
+ pcmpeqb m7, m7
194
+ psrlw m7, 8
195
+%endif
196
+.vloop:
197
+ mov r6d, r4m
198
+%ifnidn cpuname, mmx2
199
+%if mmsize <= 16
200
+ mova m0, r0
201
+ mova m1, r0+r2
202
+ pavgb m0, m1
203
+ pavgb m1, r0+r2*2
204
+%endif
205
+%endif
206
+.hloop:
207
+ sub r0, mmsize*2
208
+ sub r1, mmsize
209
+%if mmsize==32
210
+ SUBSAMPLEFILT32x4U r1
211
+%elifdef m8
212
+ SUBSAMPLEFILT8x4 m0, m1, m2, m3, m10, m11, mmsize
213
+ mova m8, m0
214
+ mova m9, m1
215
+ SUBSAMPLEFILT8x4 m2, m3, m0, m1, m4, m5, 0
216
+%if cpuflag(xop)
217
+ vpperm m4, m2, m8, m7
218
+ vpperm m2, m2, m8, m6
219
+%else
220
+ packuswb m2, m8
221
+%endif
222
+ mova r1, m2
223
+%elifidn cpuname, mmx2
224
+ SUBSAMPLEFILT8x2U r1, 0
225
+%else
226
+ SUBSAMPLEFILT16x2 m0, r1, 0
227
+%endif
228
+ sub r6d, mmsize
229
+ jg .hloop
230
+%endif ; HIGH_BIT_DEPTH
231
+.skip:
232
+ mov r3, dst_gap
233
+ sub r0, src_gap
234
+ sub r1, r3
235
+ dec dword r5m
236
+ jg .vloop
237
+ ADD rsp, 2*gprsize
238
+ emms
239
+ RET
240
+%endmacro ; FRAME_SUBSAMPLE_LUMA
241
+
242
+INIT_MMX mmx2
243
+FRAME_SUBSAMPLE_LUMA
244
+%if ARCH_X86_64 == 0
245
+INIT_MMX cache32, mmx2
246
+FRAME_SUBSAMPLE_LUMA
247
+%endif
248
+INIT_XMM sse2
249
+FRAME_SUBSAMPLE_LUMA
250
+INIT_XMM ssse3
251
+FRAME_SUBSAMPLE_LUMA
252
+INIT_XMM avx
253
+FRAME_SUBSAMPLE_LUMA
254
+INIT_XMM xop
255
+FRAME_SUBSAMPLE_LUMA
256
+%if ARCH_X86_64 == 1
257
+INIT_YMM avx2
258
+FRAME_SUBSAMPLE_LUMA
259
+%endif
260
+
261
;-----------------------------------------------------------------------------
262
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
263
; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
264
x265_3.5.tar.gz/source/common/x86/mc.h -> x265_3.6.tar.gz/source/common/x86/mc.h
Changed
19
1
2
3
#undef LOWRES
4
5
+#define SUBSAMPLELUMA(cpu) \
6
+ void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
+SUBSAMPLELUMA(mmx2)
8
+SUBSAMPLELUMA(sse2)
9
+SUBSAMPLELUMA(ssse3)
10
+SUBSAMPLELUMA(avx)
11
+SUBSAMPLELUMA(avx2)
12
+SUBSAMPLELUMA(xop)
13
+
14
+#undef SUBSAMPLELUMA
15
+
16
#define PROPAGATE_COST(cpu) \
17
void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
18
const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
19
x265_3.5.tar.gz/source/common/x86/x86inc.asm -> x265_3.6.tar.gz/source/common/x86/x86inc.asm
Changed
96
1
2
%endif
3
%endmacro
4
5
-%macro DEFINE_ARGS_INTERNAL 3+
6
- %ifnum %2
7
- DEFINE_ARGS %3
8
- %elif %1 == 4
9
- DEFINE_ARGS %2
10
- %elif %1 > 4
11
- DEFINE_ARGS %2, %3
12
- %endif
13
-%endmacro
14
-
15
%if WIN64 ; Windows x64 ;=================================================
16
17
DECLARE_REG 0, rcx
18
19
DECLARE_REG 13, R12, 112
20
DECLARE_REG 14, R13, 120
21
22
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
23
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
24
%assign num_args %1
25
%assign regs_used %2
26
ASSERT regs_used >= num_args
27
28
WIN64_SPILL_XMM %3
29
%endif
30
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
31
- DEFINE_ARGS_INTERNAL %0, %4, %5
32
+ %if %0 > 4
33
+ %ifnum %4
34
+ DEFINE_ARGS %5
35
+ %else
36
+ DEFINE_ARGS %4, %5
37
+ %endif
38
+ %elifnnum %4
39
+ DEFINE_ARGS %4
40
+ %endif
41
%endmacro
42
43
%macro WIN64_PUSH_XMM 0
44
45
DECLARE_REG 13, R12, 64
46
DECLARE_REG 14, R13, 72
47
48
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
49
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
50
%assign num_args %1
51
%assign regs_used %2
52
%assign xmm_regs_used %3
53
54
PUSH_IF_USED 9, 10, 11, 12, 13, 14
55
ALLOC_STACK %4
56
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
57
- DEFINE_ARGS_INTERNAL %0, %4, %5
58
+ %if %0 > 4
59
+ %ifnum %4
60
+ DEFINE_ARGS %5
61
+ %else
62
+ DEFINE_ARGS %4, %5
63
+ %endif
64
+ %elifnnum %4
65
+ DEFINE_ARGS %4
66
+ %endif
67
%endmacro
68
69
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
70
71
72
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
73
74
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
75
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
76
%assign num_args %1
77
%assign regs_used %2
78
ASSERT regs_used >= num_args
79
80
PUSH_IF_USED 3, 4, 5, 6
81
ALLOC_STACK %4
82
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
83
- DEFINE_ARGS_INTERNAL %0, %4, %5
84
+ %if %0 > 4
85
+ %ifnum %4
86
+ DEFINE_ARGS %5
87
+ %else
88
+ DEFINE_ARGS %4, %5
89
+ %endif
90
+ %elifnnum %4
91
+ DEFINE_ARGS %4
92
+ %endif
93
%endmacro
94
95
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
96
x265_3.5.tar.gz/source/common/x86/x86util.asm -> x265_3.6.tar.gz/source/common/x86/x86util.asm
Changed
13
1
2
%elif %1==2
3
%if mmsize==8
4
SBUTTERFLY dq, %3, %4, %5
5
- %else
6
+ %elif %0==6
7
TRANS q, ORDER, %3, %4, %5, %6
8
+ %else
9
+ TRANS q, ORDER, %3, %4, %5
10
%endif
11
%elif %1==4
12
SBUTTERFLY qdq, %3, %4, %5
13
x265_3.5.tar.gz/source/encoder/analysis.cpp -> x265_3.6.tar.gz/source/encoder/analysis.cpp
Changed
10
1
2
qp += distortionData->offsetctu.m_cuAddr;
3
}
4
5
- if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
6
+ if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
7
{
8
int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
9
if (ctu.m_slice->m_sliceType == I_SLICE)
10
x265_3.5.tar.gz/source/encoder/api.cpp -> x265_3.6.tar.gz/source/encoder/api.cpp
Changed
50
1
2
memcpy(zoneParam, param, sizeof(x265_param));
3
for (int i = 0; i < param->rc.zonefileCount; i++)
4
{
5
- param->rc.zonesi.startFrame = -1;
6
encoder->configureZone(zoneParam, param->rc.zonesi.zoneParam);
7
}
8
9
10
if (numEncoded < 0)
11
encoder->m_aborted = true;
12
13
+ if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
14
+ {
15
+ Bitstream bs;
16
+ encoder->getEndNalUnits(encoder->m_nalList, bs);
17
+ *pp_nal = &encoder->m_nalList.m_nal0;
18
+ if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
19
+ }
20
+
21
return numEncoded;
22
}
23
24
25
&PARAM_NS::x265_param_free,
26
&PARAM_NS::x265_param_default,
27
&PARAM_NS::x265_param_parse,
28
+ &PARAM_NS::x265_scenecut_aware_qp_param_parse,
29
&PARAM_NS::x265_param_apply_profile,
30
&PARAM_NS::x265_param_default_preset,
31
&x265_picture_alloc,
32
33
if (param->csvLogLevel)
34
{
35
fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
36
+ if (!!param->bEnableTemporalSubLayers)
37
+ fprintf(csvfp, "Temporal Sub Layer ID, ");
38
if (param->csvLogLevel >= 2)
39
fprintf(csvfp, "I/P cost ratio, ");
40
if (param->rc.rateControlMode == X265_RC_CRF)
41
42
const x265_frame_stats* frameStats = &pic->frameData;
43
fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
44
frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
45
+ if (!!param->bEnableTemporalSubLayers)
46
+ fprintf(param->csvfpt, "%d,", frameStats->tLayer);
47
if (param->csvLogLevel >= 2)
48
fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
49
if (param->rc.rateControlMode == X265_RC_CRF)
50
x265_3.5.tar.gz/source/encoder/dpb.cpp -> x265_3.6.tar.gz/source/encoder/dpb.cpp
Changed
258
1
2
{
3
Frame *curFrame = iterFrame;
4
iterFrame = iterFrame->m_next;
5
- if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
6
+ bool isMCSTFReferenced = false;
7
+
8
+ if (curFrame->m_param->bEnableTemporalFilter)
9
+ isMCSTFReferenced =!!(curFrame->m_refPicCnt1);
10
+
11
+ if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
12
{
13
curFrame->m_bChromaExtended = false;
14
15
+ if (curFrame->m_param->bEnableTemporalFilter)
16
+ *curFrame->m_isSubSampled = false;
17
+
18
// Reset column counter
19
X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
20
X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
21
22
{
23
newFrame->m_encData->m_bHasReferences = false;
24
25
+ newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
26
// Adjust NAL type for unreferenced B frames (change from _R "referenced"
27
// to _N "non-referenced" NAL unit type)
28
switch (slice->m_nalUnitType)
29
{
30
case NAL_UNIT_CODED_SLICE_TRAIL_R:
31
- slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
32
+ slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
33
break;
34
case NAL_UNIT_CODED_SLICE_RADL_R:
35
slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
36
37
38
m_picList.pushFront(*newFrame);
39
40
+ if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
41
+ {
42
+ switch (slice->m_nalUnitType)
43
+ {
44
+ case NAL_UNIT_CODED_SLICE_TRAIL_R:
45
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TRAIL_N;
46
+ break;
47
+ case NAL_UNIT_CODED_SLICE_RADL_R:
48
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
49
+ break;
50
+ case NAL_UNIT_CODED_SLICE_RASL_R:
51
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
52
+ break;
53
+ default:
54
+ break;
55
+ }
56
+ }
57
// Do decoding refresh marking if any
58
decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
59
60
- computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
61
-
62
+ computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer);
63
+ bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
64
// Mark pictures in m_piclist as unreferenced if they are not included in RPS
65
- applyReferencePictureSet(&slice->m_rps, pocCurr);
66
+ applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
67
+
68
+
69
+ if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
70
+ && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N // Check if not a leading picture
71
+ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
72
+ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
73
+ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
74
+ )
75
+ {
76
+ if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
77
+ {
78
+ if (getTemporalLayerNonReferenceFlag())
79
+ {
80
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
81
+ }
82
+ else
83
+ {
84
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
85
+ }
86
+ }
87
+ else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
88
+ {
89
+ bool isSTSA = true;
90
+ int id = newFrame->m_gopOffset % x265_gop_ra_lengthnewFrame->m_gopId;
91
+ for (int ii = id; (ii < x265_gop_ra_lengthnewFrame->m_gopId && isSTSA == true); ii++)
92
+ {
93
+ int tempIdRef = x265_gop_ranewFrame->m_gopIdii.layer;
94
+ if (tempIdRef == newFrame->m_tempLayer)
95
+ {
96
+ for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
97
+ {
98
+ if (slice->m_rps.bUsedjj)
99
+ {
100
+ int refPoc = x265_gop_ranewFrame->m_gopIdii.poc_offset + slice->m_rps.deltaPOCjj;
101
+ int kk = 0;
102
+ for (kk = 0; kk < x265_gop_ra_lengthnewFrame->m_gopId; kk++)
103
+ {
104
+ if (x265_gop_ranewFrame->m_gopIdkk.poc_offset == refPoc)
105
+ {
106
+ break;
107
+ }
108
+ }
109
+ if (x265_gop_ranewFrame->m_gopIdkk.layer >= newFrame->m_tempLayer)
110
+ {
111
+ isSTSA = false;
112
+ break;
113
+ }
114
+ }
115
+ }
116
+ }
117
+ }
118
+ if (isSTSA == true)
119
+ {
120
+ if (getTemporalLayerNonReferenceFlag())
121
+ {
122
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
123
+ }
124
+ else
125
+ {
126
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
127
+ }
128
+ }
129
+ }
130
+ }
131
132
if (slice->m_sliceType != I_SLICE)
133
slice->m_numRefIdx0 = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
134
135
}
136
}
137
138
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
139
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
140
{
141
unsigned int poci = 0, numNeg = 0, numPos = 0;
142
143
144
{
145
if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
146
{
147
- if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
148
+ if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
149
{
150
rps->pocpoci = iterPic->m_poc;
151
rps->deltaPOCpoci = rps->pocpoci - curPoc;
152
153
rps->sortDeltaPOC();
154
}
155
156
+bool DPB::getTemporalLayerNonReferenceFlag()
157
+{
158
+ Frame* curFrame = m_picList.first();
159
+ if (curFrame->m_encData->m_bHasReferences)
160
+ {
161
+ curFrame->m_sameLayerRefPic = true;
162
+ return false;
163
+ }
164
+ else
165
+ return true;
166
+}
167
+
168
/* Marking reference pictures when an IDR/CRA is encountered. */
169
void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
170
{
171
172
}
173
174
/** Function for applying picture marking based on the Reference Picture Set */
175
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
176
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
177
{
178
// loop through all pictures in the reference picture buffer
179
Frame* iterFrame = m_picList.first();
180
181
}
182
if (!referenced)
183
iterFrame->m_encData->m_bHasReferences = false;
184
+
185
+ if (m_bTemporalSublayer)
186
+ {
187
+ //check that pictures of higher temporal layers are not used
188
+ assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
189
+
190
+ //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
191
+ if (isTSAPicture)
192
+ {
193
+ assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
194
+ }
195
+ //check that pictures marked as temporal layer non-reference pictures are not used for reference
196
+ if (iterFrame->m_tempLayer == tempId)
197
+ {
198
+ assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
199
+ }
200
+ }
201
+ }
202
+ iterFrame = iterFrame->m_next;
203
+ }
204
+}
205
+
206
+bool DPB::isTemporalLayerSwitchingPoint(int curPoc, int tempId)
207
+{
208
+ // loop through all pictures in the reference picture buffer
209
+ Frame* iterFrame = m_picList.first();
210
+ while (iterFrame)
211
+ {
212
+ if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences)
213
+ {
214
+ if (iterFrame->m_tempLayer >= tempId)
215
+ {
216
+ return false;
217
+ }
218
+ }
219
+ iterFrame = iterFrame->m_next;
220
+ }
221
+ return true;
222
+}
223
+
224
+bool DPB::isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId)
225
+{
226
+ // loop through all pictures in the reference picture buffer
227
+ Frame* iterFrame = m_picList.first();
228
+ while (iterFrame)
229
+ {
230
+ if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences)
231
+ {
232
+ for (int i = 0; i < rps->numberOfPositivePictures + rps->numberOfNegativePictures; i++)
233
+ {
234
+ if ((iterFrame->m_poc == curPoc + rps->deltaPOCi) && rps->bUsedi)
235
+ {
236
+ if (iterFrame->m_tempLayer >= tempId)
237
+ {
238
+ return false;
239
+ }
240
+ }
241
+ }
242
}
243
iterFrame = iterFrame->m_next;
244
}
245
+ return true;
246
}
247
248
/* deciding the nal_unit_type */
249
250
if (!curPOC)
251
return NAL_UNIT_CODED_SLICE_IDR_N_LP;
252
if (bIsKeyFrame)
253
- return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
254
+ return (m_bOpenGOP || m_craNal) ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
255
if (m_pocCRA && curPOC < m_pocCRA)
256
// All leading pictures are being marked as TFD pictures here since
257
// current encoder uses all reference pictures while encoding leading
258
x265_3.5.tar.gz/source/encoder/dpb.h -> x265_3.6.tar.gz/source/encoder/dpb.h
Changed
35
1
2
int m_lastIDR;
3
int m_pocCRA;
4
int m_bOpenGOP;
5
+ int m_craNal;
6
int m_bhasLeadingPicture;
7
bool m_bRefreshPending;
8
bool m_bTemporalSublayer;
9
10
m_bRefreshPending = false;
11
m_frameDataFreeList = NULL;
12
m_bOpenGOP = param->bOpenGOP;
13
- m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
14
+ m_craNal = param->craNal;
15
+ m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
16
}
17
18
~DPB();
19
20
21
protected:
22
23
- void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
24
+ void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
25
26
- void applyReferencePictureSet(RPS *rps, int curPoc);
27
+ void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
28
+ bool getTemporalLayerNonReferenceFlag();
29
void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
30
+ bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
31
+ bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
32
33
NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
34
};
35
x265_3.5.tar.gz/source/encoder/encoder.cpp -> x265_3.6.tar.gz/source/encoder/encoder.cpp
Changed
1237
1
2
{
3
{ 1, 1, 1, 1, 1, 5, 1, 2, 2, 2, 50 },
4
{ 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
5
- { 1, 1, 1, 1, 1, 5, 0, 1, 1, 1, 82 }
6
+ { 1, 1, 1, 1, 1, 5, 0, 1, 1, 1, 82 },
7
+ { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
8
+};
9
+
10
+typedef struct
11
+{
12
+ int bEnableVideoSignalTypePresentFlag;
13
+ int bEnableColorDescriptionPresentFlag;
14
+ int bEnableChromaLocInfoPresentFlag;
15
+ int colorPrimaries;
16
+ int transferCharacteristics;
17
+ int matrixCoeffs;
18
+ int bEnableVideoFullRangeFlag;
19
+ int chromaSampleLocTypeTopField;
20
+ int chromaSampleLocTypeBottomField;
21
+ const char* systemId;
22
+}VideoSignalTypePresets;
23
+
24
+VideoSignalTypePresets vstPresets =
25
+{
26
+ {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
27
+ {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
28
+ {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
29
+ {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
30
+ {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
31
+ {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
32
+ {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
33
+ {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
34
+ {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
35
+ {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
36
+ {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
37
+ {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
38
+ {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
39
+ {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
40
};
41
}
42
43
44
m_threadPool = NULL;
45
m_analysisFileIn = NULL;
46
m_analysisFileOut = NULL;
47
+ m_filmGrainIn = NULL;
48
m_naluFile = NULL;
49
m_offsetEmergency = NULL;
50
m_iFrameNum = 0;
51
52
m_prevTonemapPayload.payload = NULL;
53
m_startPoint = 0;
54
m_saveCTUSize = 0;
55
- m_edgePic = NULL;
56
- m_edgeHistThreshold = 0;
57
- m_chromaHistThreshold = 0.0;
58
- m_scaledEdgeThreshold = 0.0;
59
- m_scaledChromaThreshold = 0.0;
60
m_zoneIndex = 0;
61
+ m_origPicBuffer = 0;
62
}
63
64
inline char *strcatFilename(const char *input, const char *suffix)
65
66
}
67
}
68
69
- if (m_param->bHistBasedSceneCut)
70
- {
71
- m_planeSizes0 = (m_param->sourceWidth >> x265_cli_cspsp->internalCsp.width0) * (m_param->sourceHeight >> x265_cli_cspsm_param->internalCsp.height0);
72
- uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
73
- m_edgePic = X265_MALLOC(pixel, m_planeSizes0 * pixelbytes);
74
- m_edgeHistThreshold = m_param->edgeTransitionThreshold;
75
- m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
76
- m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
77
- m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
78
- if (m_param->sourceBitDepth != m_param->internalBitDepth)
79
- {
80
- int size = m_param->sourceWidth * m_param->sourceHeight;
81
- int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
82
- int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
83
- int widthC = m_param->sourceWidth >> hshift;
84
- int heightC = m_param->sourceHeight >> vshift;
85
-
86
- m_inputPic0 = X265_MALLOC(pixel, size);
87
- if (m_param->internalCsp != X265_CSP_I400)
88
- {
89
- for (int j = 1; j < 3; j++)
90
- {
91
- m_inputPicj = X265_MALLOC(pixel, widthC * heightC);
92
- }
93
- }
94
- }
95
- }
96
-
97
// Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
98
if (rows == 1 || cols < 3)
99
{
100
101
lookAheadThreadPooli.start();
102
m_lookahead->m_numPools = pools;
103
m_dpb = new DPB(m_param);
104
+
105
+ if (m_param->bEnableTemporalFilter)
106
+ m_origPicBuffer = new OrigPicBuffer();
107
+
108
m_rateControl = new RateControl(*m_param, this);
109
if (!m_param->bResetZoneConfig)
110
{
111
112
}
113
}
114
}
115
+ if (m_param->filmGrain)
116
+ {
117
+ m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
118
+ if (!m_filmGrainIn)
119
+ {
120
+ x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
121
+ }
122
+ }
123
+
124
m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
125
m_aborted |= parseLambdaFile(m_param);
126
127
128
}
129
}
130
131
- if (m_param->bHistBasedSceneCut)
132
- {
133
- if (m_edgePic != NULL)
134
- {
135
- X265_FREE_ZERO(m_edgePic);
136
- }
137
-
138
- if (m_param->sourceBitDepth != m_param->internalBitDepth)
139
- {
140
- X265_FREE_ZERO(m_inputPic0);
141
- if (m_param->internalCsp != X265_CSP_I400)
142
- {
143
- for (int i = 1; i < 3; i++)
144
- {
145
- X265_FREE_ZERO(m_inputPici);
146
- }
147
- }
148
- }
149
- }
150
-
151
for (int i = 0; i < m_param->frameNumThreads; i++)
152
{
153
if (m_frameEncoderi)
154
155
delete zoneReadCount;
156
delete zoneWriteCount;
157
}
158
+
159
+ if (m_param->bEnableTemporalFilter)
160
+ delete m_origPicBuffer;
161
+
162
if (m_rateControl)
163
{
164
m_rateControl->destroy();
165
166
}
167
if (m_naluFile)
168
fclose(m_naluFile);
169
+ if (m_filmGrainIn)
170
+ x265_fclose(m_filmGrainIn);
171
172
#ifdef SVT_HEVC
173
X265_FREE(m_svtAppData);
174
175
/* release string arguments that were strdup'd */
176
free((char*)m_param->rc.lambdaFileName);
177
free((char*)m_param->rc.statFileName);
178
+ free((char*)m_param->rc.sharedMemName);
179
free((char*)m_param->analysisReuseFileName);
180
free((char*)m_param->scalingLists);
181
free((char*)m_param->csvfn);
182
183
free((char*)m_param->toneMapFile);
184
free((char*)m_param->analysisSave);
185
free((char*)m_param->analysisLoad);
186
+ free((char*)m_param->videoSignalTypePreset);
187
PARAM_NS::x265_param_free(m_param);
188
}
189
}
190
191
dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
192
}
193
194
-bool Encoder::computeHistograms(x265_picture *pic)
195
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
196
{
197
- pixel *src = NULL, *planeV = NULL, *planeU = NULL;
198
- uint32_t widthC, heightC;
199
- int hshift, vshift;
200
-
201
- hshift = CHROMA_H_SHIFT(pic->colorSpace);
202
- vshift = CHROMA_V_SHIFT(pic->colorSpace);
203
- widthC = pic->width >> hshift;
204
- heightC = pic->height >> vshift;
205
-
206
- if (pic->bitDepth == X265_DEPTH)
207
+ uint8_t newSliceType = 0;
208
+ switch (curSliceType)
209
{
210
- src = (pixel*)pic->planes0;
211
- if (m_param->internalCsp != X265_CSP_I400)
212
- {
213
- planeU = (pixel*)pic->planes1;
214
- planeV = (pixel*)pic->planes2;
215
- }
216
- }
217
- else if (pic->bitDepth == 8 && X265_DEPTH > 8)
218
- {
219
- int shift = (X265_DEPTH - 8);
220
- uint8_t *yChar, *uChar, *vChar;
221
-
222
- yChar = (uint8_t*)pic->planes0;
223
- primitives.planecopy_cp(yChar, pic->stride0 / sizeof(*yChar), m_inputPic0, pic->stride0 / sizeof(*yChar), pic->width, pic->height, shift);
224
- src = m_inputPic0;
225
- if (m_param->internalCsp != X265_CSP_I400)
226
- {
227
- uChar = (uint8_t*)pic->planes1;
228
- vChar = (uint8_t*)pic->planes2;
229
- primitives.planecopy_cp(uChar, pic->stride1 / sizeof(*uChar), m_inputPic1, pic->stride1 / sizeof(*uChar), widthC, heightC, shift);
230
- primitives.planecopy_cp(vChar, pic->stride2 / sizeof(*vChar), m_inputPic2, pic->stride2 / sizeof(*vChar), widthC, heightC, shift);
231
- planeU = m_inputPic1;
232
- planeV = m_inputPic2;
233
- }
234
- }
235
- else
236
- {
237
- uint16_t *yShort, *uShort, *vShort;
238
- /* mask off bits that are supposed to be zero */
239
- uint16_t mask = (1 << X265_DEPTH) - 1;
240
- int shift = abs(pic->bitDepth - X265_DEPTH);
241
-
242
- yShort = (uint16_t*)pic->planes0;
243
- uShort = (uint16_t*)pic->planes1;
244
- vShort = (uint16_t*)pic->planes2;
245
-
246
- if (pic->bitDepth > X265_DEPTH)
247
- {
248
- /* shift right and mask pixels to final size */
249
- primitives.planecopy_sp(yShort, pic->stride0 / sizeof(*yShort), m_inputPic0, pic->stride0 / sizeof(*yShort), pic->width, pic->height, shift, mask);
250
- if (m_param->internalCsp != X265_CSP_I400)
251
- {
252
- primitives.planecopy_sp(uShort, pic->stride1 / sizeof(*uShort), m_inputPic1, pic->stride1 / sizeof(*uShort), widthC, heightC, shift, mask);
253
- primitives.planecopy_sp(vShort, pic->stride2 / sizeof(*vShort), m_inputPic2, pic->stride2 / sizeof(*vShort), widthC, heightC, shift, mask);
254
- }
255
- }
256
- else /* Case for (pic.bitDepth < X265_DEPTH) */
257
- {
258
- /* shift left and mask pixels to final size */
259
- primitives.planecopy_sp_shl(yShort, pic->stride0 / sizeof(*yShort), m_inputPic0, pic->stride0 / sizeof(*yShort), pic->width, pic->height, shift, mask);
260
- if (m_param->internalCsp != X265_CSP_I400)
261
- {
262
- primitives.planecopy_sp_shl(uShort, pic->stride1 / sizeof(*uShort), m_inputPic1, pic->stride1 / sizeof(*uShort), widthC, heightC, shift, mask);
263
- primitives.planecopy_sp_shl(vShort, pic->stride2 / sizeof(*vShort), m_inputPic2, pic->stride2 / sizeof(*vShort), widthC, heightC, shift, mask);
264
- }
265
- }
266
-
267
- src = m_inputPic0;
268
- planeU = m_inputPic1;
269
- planeV = m_inputPic2;
270
- }
271
-
272
- size_t bufSize = sizeof(pixel) * m_planeSizes0;
273
- memset(m_edgePic, 0, bufSize);
274
-
275
- if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false, 1))
276
- {
277
- x265_log(m_param, X265_LOG_ERROR, "Failed to compute edge!");
278
- return false;
279
- }
280
-
281
- pixel pixelVal;
282
- int32_t *edgeHist = m_curEdgeHist;
283
- memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t));
284
- for (uint32_t i = 0; i < m_planeSizes0; i++)
285
- {
286
- if (m_edgePici)
287
- edgeHist1++;
288
- else
289
- edgeHist0++;
290
- }
291
-
292
- /* Y Histogram Calculation */
293
- int32_t *yHist = m_curYUVHist0;
294
- memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t));
295
- for (uint32_t i = 0; i < m_planeSizes0; i++)
296
- {
297
- pixelVal = srci;
298
- yHistpixelVal++;
299
+ case 1: newSliceType |= 1 << 0;
300
+ break;
301
+ case 2: newSliceType |= 1 << 0;
302
+ break;
303
+ case 3: newSliceType |= 1 << 1;
304
+ break;
305
+ case 4: newSliceType |= 1 << 2;
306
+ break;
307
+ case 5: newSliceType |= 1 << 3;
308
+ break;
309
+ default: return 0;
310
}
311
+ return ((sliceTypeConfig & newSliceType) != 0);
312
+}
313
314
- if (pic->colorSpace != X265_CSP_I400)
315
- {
316
- /* U Histogram Calculation */
317
- int32_t *uHist = m_curYUVHist1;
318
- memset(uHist, 0, sizeof(m_curYUVHist1));
319
- for (uint32_t i = 0; i < m_planeSizes1; i++)
320
- {
321
- pixelVal = planeUi;
322
- uHistpixelVal++;
323
- }
324
+inline int enqueueRefFrame(FrameEncoder* curframeEncoder, Frame* iterFrame, Frame* curFrame, bool isPreFiltered, int16_t i)
325
+{
326
+ TemporalFilterRefPicInfo* dest = &curframeEncoder->m_mcstfRefListcurFrame->m_mcstf->m_numRef;
327
+ dest->picBuffer = iterFrame->m_fencPic;
328
+ dest->picBufferSubSampled2 = iterFrame->m_fencPicSubsampled2;
329
+ dest->picBufferSubSampled4 = iterFrame->m_fencPicSubsampled4;
330
+ dest->isFilteredFrame = isPreFiltered;
331
+ dest->isSubsampled = iterFrame->m_isSubSampled;
332
+ dest->origOffset = i;
333
+ curFrame->m_mcstf->m_numRef++;
334
335
- /* V Histogram Calculation */
336
- pixelVal = 0;
337
- int32_t *vHist = m_curYUVHist2;
338
- memset(vHist, 0, sizeof(m_curYUVHist2));
339
- for (uint32_t i = 0; i < m_planeSizes2; i++)
340
- {
341
- pixelVal = planeVi;
342
- vHistpixelVal++;
343
- }
344
- }
345
- return true;
346
+ return 1;
347
}
348
349
-void Encoder::computeHistogramSAD(double *normalizedMaxUVSad, double *normalizedEdgeSad, int curPoc)
350
+bool Encoder::generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder)
351
{
352
+ frameEnc->m_mcstf->m_numRef = 0;
353
354
- if (curPoc == 0)
355
- { /* first frame is scenecut by default no sad computation for the same. */
356
- *normalizedMaxUVSad = 0.0;
357
- *normalizedEdgeSad = 0.0;
358
- }
359
- else
360
+ for (int iterPOC = (frameEnc->m_poc - frameEnc->m_mcstf->m_range);
361
+ iterPOC <= (frameEnc->m_poc + frameEnc->m_mcstf->m_range); iterPOC++)
362
{
363
- /* compute sum of absolute differences of histogram bins of chroma and luma edge response between the current and prev pictures. */
364
- int32_t edgeHistSad = 0;
365
- int32_t uHistSad = 0;
366
- int32_t vHistSad = 0;
367
- double normalizedUSad = 0.0;
368
- double normalizedVSad = 0.0;
369
-
370
- for (int j = 0; j < HISTOGRAM_BINS; j++)
371
+ bool isFound = false;
372
+ if (iterPOC != frameEnc->m_poc)
373
{
374
- if (j < 2)
375
+ //search for the reference frame in the Original Picture Buffer
376
+ if (!isFound)
377
{
378
- edgeHistSad += abs(m_curEdgeHistj - m_prevEdgeHistj);
379
- }
380
- uHistSad += abs(m_curYUVHist1j - m_prevYUVHist1j);
381
- vHistSad += abs(m_curYUVHist2j - m_prevYUVHist2j);
382
- }
383
- *normalizedEdgeSad = normalizeRange(edgeHistSad, 0, 2 * m_planeSizes0, 0.0, 1.0);
384
- normalizedUSad = normalizeRange(uHistSad, 0, 2 * m_planeSizes1, 0.0, 1.0);
385
- normalizedVSad = normalizeRange(vHistSad, 0, 2 * m_planeSizes2, 0.0, 1.0);
386
- *normalizedMaxUVSad = x265_max(normalizedUSad, normalizedVSad);
387
- }
388
-
389
- /* store histograms of previous frame for reference */
390
- memcpy(m_prevEdgeHist, m_curEdgeHist, sizeof(m_curEdgeHist));
391
- memcpy(m_prevYUVHist, m_curYUVHist, sizeof(m_curYUVHist));
392
-}
393
+ for (int j = 0; j < (2 * frameEnc->m_mcstf->m_range); j++)
394
+ {
395
+ if (iterPOC < 0)
396
+ continue;
397
+ if (iterPOC >= m_pocLast)
398
+ {
399
400
-double Encoder::normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd)
401
-{
402
- return (double)(value - minValue) * (rangeEnd - rangeStart) / (maxValue - minValue) + rangeStart;
403
-}
404
+ TemporalFilter* mcstf = frameEnc->m_mcstf;
405
+ while (mcstf->m_numRef)
406
+ {
407
+ memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs0, 0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
408
+ memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs1, 0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
409
+ memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs2, 0, sizeof(MV) * ((mcstf->m_sourceWidth / 16) * (mcstf->m_sourceHeight / 16)));
410
+ memset(currEncoder->m_mcstfRefListmcstf->m_numRef.mvs, 0, sizeof(MV) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
411
+ memset(currEncoder->m_mcstfRefListmcstf->m_numRef.noise, 0, sizeof(int) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
412
+ memset(currEncoder->m_mcstfRefListmcstf->m_numRef.error, 0, sizeof(int) * ((mcstf->m_sourceWidth / 4) * (mcstf->m_sourceHeight / 4)));
413
414
-void Encoder::findSceneCuts(x265_picture *pic, bool& bDup, double maxUVSad, double edgeSad, bool& isMaxThres, bool& isHardSC)
415
-{
416
- double minEdgeT = m_edgeHistThreshold * MIN_EDGE_FACTOR;
417
- double minChromaT = minEdgeT * SCENECUT_CHROMA_FACTOR;
418
- double maxEdgeT = m_edgeHistThreshold * MAX_EDGE_FACTOR;
419
- double maxChromaT = maxEdgeT * SCENECUT_CHROMA_FACTOR;
420
- pic->frameData.bScenecut = false;
421
+ mcstf->m_numRef--;
422
+ }
423
424
- if (pic->poc == 0)
425
- {
426
- /* for first frame */
427
- pic->frameData.bScenecut = false;
428
- bDup = false;
429
- }
430
- else
431
- {
432
- if (edgeSad == 0.0 && maxUVSad == 0.0)
433
- {
434
- bDup = true;
435
- }
436
- else if (edgeSad < minEdgeT && maxUVSad < minChromaT)
437
- {
438
- pic->frameData.bScenecut = false;
439
- }
440
- else if (edgeSad > maxEdgeT && maxUVSad > maxChromaT)
441
- {
442
- pic->frameData.bScenecut = true;
443
- isMaxThres = true;
444
- isHardSC = true;
445
- }
446
- else if (edgeSad > m_scaledEdgeThreshold || maxUVSad >= m_scaledChromaThreshold
447
- || (edgeSad > m_edgeHistThreshold && maxUVSad >= m_chromaHistThreshold))
448
- {
449
- pic->frameData.bScenecut = true;
450
- bDup = false;
451
- if (edgeSad > m_scaledEdgeThreshold || maxUVSad >= m_scaledChromaThreshold)
452
- isHardSC = true;
453
+ break;
454
+ }
455
+ Frame* iterFrame = frameEnc->m_encData->m_slice->m_mcstfRefFrameList1j;
456
+ if (iterFrame->m_poc == iterPOC)
457
+ {
458
+ if (!enqueueRefFrame(currEncoder, iterFrame, frameEnc, false, (int16_t)(iterPOC - frameEnc->m_poc)))
459
+ {
460
+ return false;
461
+ };
462
+ break;
463
+ }
464
+ }
465
+ }
466
}
467
}
468
+
469
+ return true;
470
}
471
472
/**
473
474
const x265_picture* inputPic = NULL;
475
static int written = 0, read = 0;
476
bool dontRead = false;
477
- bool bdropFrame = false;
478
bool dropflag = false;
479
- bool isMaxThres = false;
480
- bool isHardSC = false;
481
482
if (m_exportedPic)
483
{
484
if (!m_param->bUseAnalysisFile && m_param->analysisSave)
485
x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
486
+
487
ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
488
+
489
m_exportedPic = NULL;
490
m_dpb->recycleUnreferenced();
491
+
492
+ if (m_param->bEnableTemporalFilter)
493
+ m_origPicBuffer->recycleOrigPicList();
494
}
495
+
496
if ((pic_in && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd))) || (m_param->bEnableFrameDuplication && !pic_in && (read < written)))
497
{
498
- if (m_param->bHistBasedSceneCut && pic_in)
499
- {
500
- x265_picture *pic = (x265_picture *) pic_in;
501
-
502
- if (pic->poc == 0)
503
- {
504
- /* for entire encode compute the chroma plane sizes only once */
505
- for (int i = 1; i < x265_cli_cspsm_param->internalCsp.planes; i++)
506
- m_planeSizesi = (pic->width >> x265_cli_cspsm_param->internalCsp.widthi) * (pic->height >> x265_cli_cspsm_param->internalCsp.heighti);
507
- }
508
-
509
- if (computeHistograms(pic))
510
- {
511
- double maxUVSad = 0.0, edgeSad = 0.0;
512
- computeHistogramSAD(&maxUVSad, &edgeSad, pic_in->poc);
513
- findSceneCuts(pic, bdropFrame, maxUVSad, edgeSad, isMaxThres, isHardSC);
514
- }
515
- }
516
-
517
if ((m_param->bEnableFrameDuplication && !pic_in && (read < written)))
518
dontRead = true;
519
else
520
521
written++;
522
}
523
524
- if (m_param->bEnableFrameDuplication && m_param->bHistBasedSceneCut)
525
- {
526
- if (!bdropFrame && m_dupBuffer1->dupPic->frameData.bScenecut == false)
527
- {
528
- psnrWeight = ComputePSNR(m_dupBuffer0->dupPic, m_dupBuffer1->dupPic, m_param);
529
- if (psnrWeight >= m_param->dupThreshold)
530
- dropflag = true;
531
- }
532
- else
533
- {
534
- dropflag = true;
535
- }
536
- }
537
- else if (m_param->bEnableFrameDuplication)
538
+ if (m_param->bEnableFrameDuplication)
539
{
540
psnrWeight = ComputePSNR(m_dupBuffer0->dupPic, m_dupBuffer1->dupPic, m_param);
541
if (psnrWeight >= m_param->dupThreshold)
542
543
}
544
}
545
}
546
- if (m_param->recursionSkipMode == EDGE_BASED_RSKIP && m_param->bHistBasedSceneCut)
547
- {
548
- pixel* src = m_edgePic;
549
- primitives.planecopy_pp_shr(src, inFrame->m_fencPic->m_picWidth, inFrame->m_edgeBitPic, inFrame->m_fencPic->m_stride,
550
- inFrame->m_fencPic->m_picWidth, inFrame->m_fencPic->m_picHeight, 0);
551
- }
552
}
553
else
554
{
555
556
inFrame->m_lowres.satdCost = (int64_t)-1;
557
inFrame->m_lowresInit = false;
558
inFrame->m_isInsideWindow = 0;
559
+ inFrame->m_tempLayer = 0;
560
+ inFrame->m_sameLayerRefPic = 0;
561
}
562
563
/* Copy input picture into a Frame and PicYuv, send to lookahead */
564
565
inFrame->m_poc = ++m_pocLast;
566
inFrame->m_userData = inputPic->userData;
567
inFrame->m_pts = inputPic->pts;
568
- if (m_param->bHistBasedSceneCut)
569
- {
570
- inFrame->m_lowres.bScenecut = (inputPic->frameData.bScenecut == 1) ? true : false;
571
- inFrame->m_lowres.m_bIsMaxThres = isMaxThres;
572
- if (m_param->radl && m_param->keyframeMax != m_param->keyframeMin)
573
- inFrame->m_lowres.m_bIsHardScenecut = isHardSC;
574
- }
575
576
if ((m_param->bEnableSceneCutAwareQp & BACKWARD) && m_param->rc.bStatRead)
577
{
578
579
rcEntry = &(m_rateControl->m_rce2PassinFrame->m_poc);
580
if(rcEntry->scenecut)
581
{
582
- int backwardWindow = X265_MIN(int((m_param->bwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom)), p->lookaheadDepth);
583
+ int backwardWindow = X265_MIN(int((m_param->bwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom)), p->lookaheadDepth);
584
for (int i = 1; i <= backwardWindow; i++)
585
{
586
int frameNum = inFrame->m_poc - i;
587
588
}
589
}
590
}
591
- if (m_param->bHistBasedSceneCut && m_param->analysisSave)
592
- {
593
- memcpy(inFrame->m_analysisData.edgeHist, m_curEdgeHist, EDGE_BINS * sizeof(int32_t));
594
- memcpy(inFrame->m_analysisData.yuvHist0, m_curYUVHist0, HISTOGRAM_BINS *sizeof(int32_t));
595
- if (inputPic->colorSpace != X265_CSP_I400)
596
- {
597
- memcpy(inFrame->m_analysisData.yuvHist1, m_curYUVHist1, HISTOGRAM_BINS * sizeof(int32_t));
598
- memcpy(inFrame->m_analysisData.yuvHist2, m_curYUVHist2, HISTOGRAM_BINS * sizeof(int32_t));
599
- }
600
- }
601
+
602
inFrame->m_forceqp = inputPic->forceqp;
603
inFrame->m_param = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
604
inFrame->m_picStruct = inputPic->picStruct;
605
606
}
607
608
/* Use the frame types from the first pass, if available */
609
- int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : inputPic->sliceType;
610
+ int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : X265_TYPE_AUTO;
611
+ inFrame->m_lowres.sliceTypeReq = inputPic->sliceType;
612
613
/* In analysisSave mode, x265_analysis_data is allocated in inputPic and inFrame points to this */
614
/* Load analysis data before lookahead->addPicture, since sliceType has been decided */
615
616
if (m_reconfigureRc)
617
inFrame->m_reconfigureRc = true;
618
619
+ if (m_param->bEnableTemporalFilter)
620
+ {
621
+ if (!m_pocLast)
622
+ {
623
+ /*One shot allocation of frames in OriginalPictureBuffer*/
624
+ int numFramesinOPB = X265_MAX(m_param->bframes, (inFrame->m_mcstf->m_range << 1)) + 1;
625
+ for (int i = 0; i < numFramesinOPB; i++)
626
+ {
627
+ Frame* dupFrame = new Frame;
628
+ if (!(dupFrame->create(m_param, pic_in->quantOffsets)))
629
+ {
630
+ m_aborted = true;
631
+ x265_log(m_param, X265_LOG_ERROR, "Memory allocation failure, aborting encode\n");
632
+ fflush(stderr);
633
+ dupFrame->destroy();
634
+ delete dupFrame;
635
+ return -1;
636
+ }
637
+ else
638
+ {
639
+ if (m_sps.cuOffsetY)
640
+ {
641
+ dupFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
642
+ dupFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
643
+ dupFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
644
+ dupFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
645
+ if (m_param->internalCsp != X265_CSP_I400)
646
+ {
647
+ dupFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
648
+ dupFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
649
+ }
650
+ m_origPicBuffer->addEncPicture(dupFrame);
651
+ }
652
+ }
653
+ }
654
+ }
655
+
656
+ inFrame->m_refPicCnt1 = 2 * inFrame->m_mcstf->m_range + 1;
657
+ if (inFrame->m_poc < inFrame->m_mcstf->m_range)
658
+ inFrame->m_refPicCnt1 -= (uint8_t)(inFrame->m_mcstf->m_range - inFrame->m_poc);
659
+ if (m_param->totalFrames && (inFrame->m_poc >= (m_param->totalFrames - inFrame->m_mcstf->m_range)))
660
+ inFrame->m_refPicCnt1 -= (uint8_t)(inFrame->m_poc + inFrame->m_mcstf->m_range - m_param->totalFrames + 1);
661
+
662
+ //Extend full-res original picture border
663
+ PicYuv *orig = inFrame->m_fencPic;
664
+ extendPicBorder(orig->m_picOrg0, orig->m_stride, orig->m_picWidth, orig->m_picHeight, orig->m_lumaMarginX, orig->m_lumaMarginY);
665
+ extendPicBorder(orig->m_picOrg1, orig->m_strideC, orig->m_picWidth >> orig->m_hChromaShift, orig->m_picHeight >> orig->m_vChromaShift, orig->m_chromaMarginX, orig->m_chromaMarginY);
666
+ extendPicBorder(orig->m_picOrg2, orig->m_strideC, orig->m_picWidth >> orig->m_hChromaShift, orig->m_picHeight >> orig->m_vChromaShift, orig->m_chromaMarginX, orig->m_chromaMarginY);
667
+
668
+ //TODO: Add subsampling here if required
669
+ m_origPicBuffer->addPicture(inFrame);
670
+ }
671
+
672
m_lookahead->addPicture(*inFrame, sliceType);
673
m_numDelayedPic++;
674
}
675
676
pic_out->bitDepth = X265_DEPTH;
677
pic_out->userData = outFrame->m_userData;
678
pic_out->colorSpace = m_param->internalCsp;
679
+ pic_out->frameData.tLayer = outFrame->m_tempLayer;
680
frameData = &(pic_out->frameData);
681
682
pic_out->pts = outFrame->m_pts;
683
684
pic_out->analysisData.poc = pic_out->poc;
685
pic_out->analysisData.sliceType = pic_out->sliceType;
686
pic_out->analysisData.bScenecut = outFrame->m_lowres.bScenecut;
687
- if (m_param->bHistBasedSceneCut)
688
- {
689
- memcpy(pic_out->analysisData.edgeHist, outFrame->m_analysisData.edgeHist, EDGE_BINS * sizeof(int32_t));
690
- memcpy(pic_out->analysisData.yuvHist0, outFrame->m_analysisData.yuvHist0, HISTOGRAM_BINS * sizeof(int32_t));
691
- if (pic_out->colorSpace != X265_CSP_I400)
692
- {
693
- memcpy(pic_out->analysisData.yuvHist1, outFrame->m_analysisData.yuvHist1, HISTOGRAM_BINS * sizeof(int32_t));
694
- memcpy(pic_out->analysisData.yuvHist2, outFrame->m_analysisData.yuvHist2, HISTOGRAM_BINS * sizeof(int32_t));
695
- }
696
- }
697
pic_out->analysisData.satdCost = outFrame->m_lowres.satdCost;
698
pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
699
pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
700
701
if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
702
m_aborted = true;
703
if (pic_out)
704
- {
705
+ {
706
/* m_rcData is allocated for every frame */
707
pic_out->rcData = outFrame->m_rcData;
708
outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
709
710
outFrame->m_rcData->iCuCount = outFrame->m_encData->m_frameStats.percent8x8Intra * m_rateControl->m_ncu;
711
outFrame->m_rcData->pCuCount = outFrame->m_encData->m_frameStats.percent8x8Inter * m_rateControl->m_ncu;
712
outFrame->m_rcData->skipCuCount = outFrame->m_encData->m_frameStats.percent8x8Skip * m_rateControl->m_ncu;
713
+ outFrame->m_rcData->currentSatd = curEncoder->m_rce.coeffBits;
714
+ }
715
+
716
+ if (m_param->bEnableTemporalFilter)
717
+ {
718
+ Frame *curFrame = m_origPicBuffer->m_mcstfPicList.getPOCMCSTF(outFrame->m_poc);
719
+ X265_CHECK(curFrame, "Outframe not found in DPB's mcstfPicList");
720
+ curFrame->m_refPicCnt0--;
721
+ curFrame->m_refPicCnt1--;
722
+ curFrame = m_origPicBuffer->m_mcstfOrigPicList.getPOCMCSTF(outFrame->m_poc);
723
+ X265_CHECK(curFrame, "Outframe not found in OPB's mcstfOrigPicList");
724
+ curFrame->m_refPicCnt1--;
725
}
726
727
/* Allow this frame to be recycled if no frame encoders are using it for reference */
728
729
{
730
ATOMIC_DEC(&outFrame->m_countRefEncoders);
731
m_dpb->recycleUnreferenced();
732
+ if (m_param->bEnableTemporalFilter)
733
+ m_origPicBuffer->recycleOrigPicList();
734
}
735
else
736
m_exportedPic = outFrame;
737
738
m_rateControl->m_lastScenecut = frameEnc->m_poc;
739
else
740
{
741
- int maxWindowSize = int((m_param->fwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
742
+ int maxWindowSize = int((m_param->fwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
743
if (frameEnc->m_poc > (m_rateControl->m_lastScenecut + maxWindowSize))
744
m_rateControl->m_lastScenecut = frameEnc->m_poc;
745
}
746
747
analysis->numPartitions = m_param->num4x4Partitions;
748
x265_alloc_analysis_data(m_param, analysis);
749
}
750
+ if (m_param->bEnableTemporalSubLayers > 2)
751
+ {
752
+ //Re-assign temporalid if the current frame is at the end of encode or when I slice is encountered
753
+ if ((frameEnc->m_poc == (m_param->totalFrames - 1)) || (frameEnc->m_lowres.sliceType == X265_TYPE_I) || (frameEnc->m_lowres.sliceType == X265_TYPE_IDR))
754
+ {
755
+ frameEnc->m_tempLayer = (int8_t)0;
756
+ }
757
+ }
758
/* determine references, setup RPS, etc */
759
m_dpb->prepareEncode(frameEnc);
760
+
761
+ if (m_param->bEnableTemporalFilter)
762
+ {
763
+ X265_CHECK(!m_origPicBuffer->m_mcstfOrigPicFreeList.empty(), "Frames not available in Encoded OPB");
764
+
765
+ Frame *dupFrame = m_origPicBuffer->m_mcstfOrigPicFreeList.popBackMCSTF();
766
+ dupFrame->m_fencPic->copyFromFrame(frameEnc->m_fencPic);
767
+ dupFrame->m_poc = frameEnc->m_poc;
768
+ dupFrame->m_encodeOrder = frameEnc->m_encodeOrder;
769
+ dupFrame->m_refPicCnt1 = 2 * dupFrame->m_mcstf->m_range + 1;
770
+
771
+ if (dupFrame->m_poc < dupFrame->m_mcstf->m_range)
772
+ dupFrame->m_refPicCnt1 -= (uint8_t)(dupFrame->m_mcstf->m_range - dupFrame->m_poc);
773
+ if (m_param->totalFrames && (dupFrame->m_poc >= (m_param->totalFrames - dupFrame->m_mcstf->m_range)))
774
+ dupFrame->m_refPicCnt1 -= (uint8_t)(dupFrame->m_poc + dupFrame->m_mcstf->m_range - m_param->totalFrames + 1);
775
+
776
+ m_origPicBuffer->addEncPictureToPicList(dupFrame);
777
+ m_origPicBuffer->setOrigPicList(frameEnc, m_pocLast);
778
+ }
779
+
780
if (!!m_param->selectiveSAO)
781
{
782
Slice* slice = frameEnc->m_encData->m_slice;
783
784
785
if (m_param->rc.rateControlMode != X265_RC_CQP)
786
m_lookahead->getEstimatedPictureCost(frameEnc);
787
+
788
if (m_param->bIntraRefresh)
789
calcRefreshInterval(frameEnc);
790
791
+ // Generate MCSTF References and perform HME
792
+ if (m_param->bEnableTemporalFilter && isFilterThisframe(frameEnc->m_mcstf->m_sliceTypeConfig, frameEnc->m_lowres.sliceType))
793
+ {
794
+
795
+ if (!generateMcstfRef(frameEnc, curEncoder))
796
+ {
797
+ m_aborted = true;
798
+ x265_log(m_param, X265_LOG_ERROR, "Failed to initialize MCSTFReferencePicInfo at POC %d\n", frameEnc->m_poc);
799
+ fflush(stderr);
800
+ return -1;
801
+ }
802
+
803
+
804
+ if (!*frameEnc->m_isSubSampled)
805
+ {
806
+ primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPic->m_picOrg0,frameEnc->m_fencPicSubsampled2->m_picOrg0, frameEnc->m_fencPic->m_stride, frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight);
807
+ extendPicBorder(frameEnc->m_fencPicSubsampled2->m_picOrg0, frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight, frameEnc->m_fencPicSubsampled2->m_lumaMarginX, frameEnc->m_fencPicSubsampled2->m_lumaMarginY);
808
+ primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPicSubsampled2->m_picOrg0,frameEnc->m_fencPicSubsampled4->m_picOrg0, frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight);
809
+ extendPicBorder(frameEnc->m_fencPicSubsampled4->m_picOrg0, frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight, frameEnc->m_fencPicSubsampled4->m_lumaMarginX, frameEnc->m_fencPicSubsampled4->m_lumaMarginY);
810
+ *frameEnc->m_isSubSampled = true;
811
+ }
812
+
813
+ for (uint8_t i = 1; i <= frameEnc->m_mcstf->m_numRef; i++)
814
+ {
815
+ TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefListi - 1;
816
+ if (!*ref->isSubsampled)
817
+ {
818
+ primitives.frameSubSampleLuma((const pixel *)ref->picBuffer->m_picOrg0, ref->picBufferSubSampled2->m_picOrg0, ref->picBuffer->m_stride, ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight);
819
+ extendPicBorder(ref->picBufferSubSampled2->m_picOrg0, ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight, ref->picBufferSubSampled2->m_lumaMarginX, ref->picBufferSubSampled2->m_lumaMarginY);
820
+ primitives.frameSubSampleLuma((const pixel *)ref->picBufferSubSampled2->m_picOrg0,ref->picBufferSubSampled4->m_picOrg0, ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight);
821
+ extendPicBorder(ref->picBufferSubSampled4->m_picOrg0, ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight, ref->picBufferSubSampled4->m_lumaMarginX, ref->picBufferSubSampled4->m_lumaMarginY);
822
+ *ref->isSubsampled = true;
823
+ }
824
+ }
825
+
826
+ for (uint8_t i = 1; i <= frameEnc->m_mcstf->m_numRef; i++)
827
+ {
828
+ TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefListi - 1;
829
+
830
+ curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs0, ref->mvsStride0, frameEnc->m_fencPicSubsampled4, ref->picBufferSubSampled4, 16);
831
+ curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs1, ref->mvsStride1, frameEnc->m_fencPicSubsampled2, ref->picBufferSubSampled2, 16, ref->mvs0, ref->mvsStride0, 2);
832
+ curEncoder->m_frameEncTF->motionEstimationLuma(ref->mvs2, ref->mvsStride2, frameEnc->m_fencPic, ref->picBuffer, 16, ref->mvs1, ref->mvsStride1, 2);
833
+ curEncoder->m_frameEncTF->motionEstimationLumaDoubleRes(ref->mvs, ref->mvsStride, frameEnc->m_fencPic, ref->picBuffer, 8, ref->mvs2, ref->mvsStride2, 1, ref->error);
834
+ }
835
+
836
+ for (int i = 0; i < frameEnc->m_mcstf->m_numRef; i++)
837
+ {
838
+ TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefListi;
839
+ ref->slicetype = m_lookahead->findSliceType(frameEnc->m_poc + ref->origOffset);
840
+ Frame* dpbframePtr = m_dpb->m_picList.getPOC(frameEnc->m_poc + ref->origOffset);
841
+ if (dpbframePtr != NULL)
842
+ {
843
+ if (dpbframePtr->m_encData->m_slice->m_sliceType == B_SLICE)
844
+ ref->slicetype = X265_TYPE_B;
845
+ else if (dpbframePtr->m_encData->m_slice->m_sliceType == P_SLICE)
846
+ ref->slicetype = X265_TYPE_P;
847
+ else
848
+ ref->slicetype = X265_TYPE_I;
849
+ }
850
+ }
851
+ }
852
+
853
/* Allow FrameEncoder::compressFrame() to start in the frame encoder thread */
854
if (!curEncoder->startCompressFrame(frameEnc))
855
m_aborted = true;
856
857
encParam->dynamicRd = param->dynamicRd;
858
encParam->bEnableTransformSkip = param->bEnableTransformSkip;
859
encParam->bEnableAMP = param->bEnableAMP;
860
-
861
+ if (param->confWinBottomOffset == 0 && param->confWinRightOffset == 0)
862
+ {
863
+ encParam->confWinBottomOffset = param->confWinBottomOffset;
864
+ encParam->confWinRightOffset = param->confWinRightOffset;
865
+ }
866
/* Resignal changes in params in Parameter Sets */
867
m_sps.maxAMPDepth = (m_sps.bUseAMP = param->bEnableAMP && param->bEnableAMP) ? param->maxCUDepth : 0;
868
m_pps.bTransformSkipEnabled = param->bEnableTransformSkip ? 1 : 0;
869
870
(float)100.0 * m_numLumaWPBiFrames / m_analyzeB.m_numPics,
871
(float)100.0 * m_numChromaWPBiFrames / m_analyzeB.m_numPics);
872
}
873
- int pWithB = 0;
874
- for (int i = 0; i <= m_param->bframes; i++)
875
- pWithB += m_lookahead->m_histogrami;
876
877
- if (pWithB)
878
- {
879
- int p = 0;
880
- for (int i = 0; i <= m_param->bframes; i++)
881
- p += sprintf(buffer + p, "%.1f%% ", 100. * m_lookahead->m_histogrami / pWithB);
882
-
883
- x265_log(m_param, X265_LOG_INFO, "consecutive B-frames: %s\n", buffer);
884
- }
885
if (m_param->bLossless)
886
{
887
float frameSize = (float)(m_param->sourceWidth - m_sps.conformanceWindow.rightOffset) *
888
889
}
890
}
891
892
+void Encoder::getEndNalUnits(NALList& list, Bitstream& bs)
893
+{
894
+ NALList nalList;
895
+ bs.resetBits();
896
+
897
+ if (m_param->bEnableEndOfSequence)
898
+ nalList.serialize(NAL_UNIT_EOS, bs);
899
+ if (m_param->bEnableEndOfBitstream)
900
+ nalList.serialize(NAL_UNIT_EOB, bs);
901
+
902
+ list.takeContents(nalList);
903
+}
904
+
905
void Encoder::initVPS(VPS *vps)
906
{
907
/* Note that much of the VPS is initialized by determineLevel() */
908
909
sps->bUseAMP = m_param->bEnableAMP;
910
sps->maxAMPDepth = m_param->bEnableAMP ? m_param->maxCUDepth : 0;
911
912
- sps->maxTempSubLayers = m_param->bEnableTemporalSubLayers ? 2 : 1;
913
- sps->maxDecPicBuffering = m_vps.maxDecPicBuffering;
914
- sps->numReorderPics = m_vps.numReorderPics;
915
- sps->maxLatencyIncrease = m_vps.maxLatencyIncrease = m_param->bframes;
916
+ sps->maxTempSubLayers = m_vps.maxTempSubLayers;// Getting the value from the user
917
+
918
+ for(uint8_t i = 0; i < sps->maxTempSubLayers; i++)
919
+ {
920
+ sps->maxDecPicBufferingi = m_vps.maxDecPicBufferingi;
921
+ sps->numReorderPicsi = m_vps.numReorderPicsi;
922
+ sps->maxLatencyIncreasei = m_vps.maxLatencyIncreasei = m_param->bframes;
923
+ }
924
925
sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing;
926
sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp;
927
928
p->rc.aqMode = X265_AQ_NONE;
929
p->rc.hevcAq = 0;
930
}
931
+ if (p->rc.aqMode == 0 && p->rc.cuTree)
932
+ {
933
+ p->rc.aqMode = X265_AQ_VARIANCE;
934
+ p->rc.aqStrength = 0;
935
+ }
936
p->radl = zone->radl;
937
}
938
memcpy(zone, p, sizeof(x265_param));
939
940
p->crQpOffset = 3;
941
}
942
943
+void Encoder::configureVideoSignalTypePreset(x265_param* p)
944
+{
945
+ char systemId20 = {};
946
+ char colorVolume20 = {};
947
+ sscanf(p->videoSignalTypePreset, "%^::%s", systemId, colorVolume);
948
+ uint32_t sysId = 0;
949
+ while (strcmp(vstPresetssysId.systemId, systemId))
950
+ {
951
+ if (sysId + 1 == sizeof(vstPresets) / sizeof(vstPresets0))
952
+ {
953
+ x265_log(NULL, X265_LOG_ERROR, "Incorrect system-id, aborting\n");
954
+ m_aborted = true;
955
+ break;
956
+ }
957
+ sysId++;
958
+ }
959
+
960
+ p->vui.bEnableVideoSignalTypePresentFlag = vstPresetssysId.bEnableVideoSignalTypePresentFlag;
961
+ p->vui.bEnableColorDescriptionPresentFlag = vstPresetssysId.bEnableColorDescriptionPresentFlag;
962
+ p->vui.bEnableChromaLocInfoPresentFlag = vstPresetssysId.bEnableChromaLocInfoPresentFlag;
963
+ p->vui.colorPrimaries = vstPresetssysId.colorPrimaries;
964
+ p->vui.transferCharacteristics = vstPresetssysId.transferCharacteristics;
965
+ p->vui.matrixCoeffs = vstPresetssysId.matrixCoeffs;
966
+ p->vui.bEnableVideoFullRangeFlag = vstPresetssysId.bEnableVideoFullRangeFlag;
967
+ p->vui.chromaSampleLocTypeTopField = vstPresetssysId.chromaSampleLocTypeTopField;
968
+ p->vui.chromaSampleLocTypeBottomField = vstPresetssysId.chromaSampleLocTypeBottomField;
969
+
970
+ if (colorVolume0 != '\0')
971
+ {
972
+ if (!strcmp(systemId, "BT2100_PQ_YCC") || !strcmp(systemId, "BT2100_PQ_ICTCP") || !strcmp(systemId, "BT2100_PQ_RGB"))
973
+ {
974
+ p->bEmitHDR10SEI = 1;
975
+ if (!strcmp(colorVolume, "P3D65x1000n0005"))
976
+ {
977
+ p->masteringDisplayColorVolume = strdup("G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)");
978
+ }
979
+ else if (!strcmp(colorVolume, "P3D65x4000n005"))
980
+ {
981
+ p->masteringDisplayColorVolume = strdup("G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)");
982
+ }
983
+ else if (!strcmp(colorVolume, "BT2100x108n0005"))
984
+ {
985
+ p->masteringDisplayColorVolume = strdup("G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)");
986
+ }
987
+ else
988
+ {
989
+ x265_log(NULL, X265_LOG_ERROR, "Incorrect color-volume, aborting\n");
990
+ m_aborted = true;
991
+ }
992
+ }
993
+ else
994
+ {
995
+ x265_log(NULL, X265_LOG_ERROR, "Color-volume is not supported with the given system-id, aborting\n");
996
+ m_aborted = true;
997
+ }
998
+ }
999
+
1000
+}
1001
+
1002
void Encoder::configure(x265_param *p)
1003
{
1004
this->m_param = p;
1005
1006
if (!p->rdoqLevel)
1007
p->psyRdoq = 0;
1008
1009
+ if (p->craNal && p->keyframeMax > 1)
1010
+ {
1011
+ x265_log_file(NULL, X265_LOG_ERROR, " --cra-nal works only with keyint 1, but given keyint = %s\n", p->keyframeMax);
1012
+ m_aborted = true;
1013
+ }
1014
+
1015
/* Disable features which are not supported by the current RD level */
1016
if (p->rdLevel < 3)
1017
{
1018
1019
p->limitReferences = 0;
1020
}
1021
1022
- if (p->bEnableTemporalSubLayers && !p->bframes)
1023
+ if ((p->bEnableTemporalSubLayers > 2) && !p->bframes)
1024
{
1025
x265_log(p, X265_LOG_WARNING, "B frames not enabled, temporal sublayer disabled\n");
1026
p->bEnableTemporalSubLayers = 0;
1027
}
1028
1029
+ if (!!p->bEnableTemporalSubLayers && p->bEnableTemporalSubLayers < 2)
1030
+ {
1031
+ p->bEnableTemporalSubLayers = 0;
1032
+ x265_log(p, X265_LOG_WARNING, "No support for temporal sublayers less than 2; Disabling temporal layers\n");
1033
+ }
1034
+
1035
+ if (p->bEnableTemporalSubLayers > 5)
1036
+ {
1037
+ p->bEnableTemporalSubLayers = 5;
1038
+ x265_log(p, X265_LOG_WARNING, "No support for temporal sublayers more than 5; Reducing the temporal sublayers to 5\n");
1039
+ }
1040
+
1041
+ // Assign number of B frames for temporal layers
1042
+ if (p->bEnableTemporalSubLayers > 2)
1043
+ p->bframes = x265_temporal_layer_bframesp->bEnableTemporalSubLayers - 1;
1044
+
1045
+ if (p->bEnableTemporalSubLayers > 2)
1046
+ {
1047
+ if (p->bFrameAdaptive)
1048
+ {
1049
+ x265_log(p, X265_LOG_WARNING, "Disabling adaptive B-frame placement to support temporal sub-layers\n");
1050
+ p->bFrameAdaptive = 0;
1051
+ }
1052
+ }
1053
+
1054
m_bframeDelay = p->bframes ? (p->bBPyramid ? 2 : 1) : 0;
1055
1056
p->bFrameBias = X265_MIN(X265_MAX(-90, p->bFrameBias), 100);
1057
1058
p->rc.bStatRead = 0;
1059
}
1060
1061
+ if ((p->rc.bStatWrite || p->rc.bStatRead) && p->rc.dataShareMode != X265_SHARE_MODE_FILE && p->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM)
1062
+ {
1063
+ p->rc.dataShareMode = X265_SHARE_MODE_FILE;
1064
+ }
1065
+
1066
+ if (!p->rc.bStatRead || p->rc.rateControlMode != X265_RC_CRF)
1067
+ {
1068
+ p->rc.bEncFocusedFramesOnly = 0;
1069
+ }
1070
+
1071
/* some options make no sense if others are disabled */
1072
p->bSaoNonDeblocked &= p->bEnableSAO;
1073
p->bEnableTSkipFast &= p->bEnableTransformSkip;
1074
1075
}
1076
}
1077
1078
+ if (p->videoSignalTypePreset) // Default disabled.
1079
+ configureVideoSignalTypePreset(p);
1080
+
1081
if (m_param->toneMapFile || p->bHDR10Opt || p->bEmitHDR10SEI)
1082
{
1083
if (!p->bRepeatHeaders)
1084
1085
m_param->searchRange = m_param->hmeRange2;
1086
}
1087
1088
- if (p->bHistBasedSceneCut && !p->edgeTransitionThreshold)
1089
- {
1090
- p->edgeTransitionThreshold = 0.03;
1091
- x265_log(p, X265_LOG_WARNING, "using default threshold %.2lf for scene cut detection\n", p->edgeTransitionThreshold);
1092
- }
1093
+ if (p->bEnableSBRC && (p->rc.rateControlMode != X265_RC_CRF || (p->rc.vbvBufferSize == 0 || p->rc.vbvMaxBitrate == 0)))
1094
+ {
1095
+ x265_log(p, X265_LOG_WARNING, "SBRC can be enabled only with CRF+VBV mode. Disabling SBRC\n");
1096
+ p->bEnableSBRC = 0;
1097
+ }
1098
1099
+ if (p->bEnableSBRC)
1100
+ {
1101
+ p->rc.ipFactor = p->rc.ipFactor * X265_IPRATIO_STRENGTH;
1102
+ if (p->bOpenGOP)
1103
+ {
1104
+ x265_log(p, X265_LOG_WARNING, "Segment based RateControl requires closed gop structure. Enabling closed GOP.\n");
1105
+ p->bOpenGOP = 0;
1106
+ }
1107
+ if (p->keyframeMax != p->keyframeMin)
1108
+ {
1109
+ x265_log(p, X265_LOG_WARNING, "Segment based RateControl requires fixed gop length. Force set min-keyint equal to keyint.\n");
1110
+ p->keyframeMin = p->keyframeMax;
1111
+ }
1112
+ }
1113
}
1114
1115
void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes)
1116
1117
analysis->frameRecordSize = frameRecordSize;
1118
X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1119
X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1120
- if (m_param->bHistBasedSceneCut)
1121
- {
1122
- X265_FREAD(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileIn, &m_curEdgeHist);
1123
- X265_FREAD(&analysis->yuvHist0, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist0);
1124
- if (m_param->internalCsp != X265_CSP_I400)
1125
- {
1126
- X265_FREAD(&analysis->yuvHist1, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist1);
1127
- X265_FREAD(&analysis->yuvHist2, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist2);
1128
- }
1129
- }
1130
X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1131
X265_FREAD(&numCUsLoad, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1132
X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1133
1134
analysis->frameRecordSize = frameRecordSize;
1135
X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1136
X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1137
- if (m_param->bHistBasedSceneCut)
1138
- {
1139
- X265_FREAD(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileIn, &m_curEdgeHist);
1140
- X265_FREAD(&analysis->yuvHist0, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist0);
1141
- if (m_param->internalCsp != X265_CSP_I400)
1142
- {
1143
- X265_FREAD(&analysis->yuvHist1, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist1);
1144
- X265_FREAD(&analysis->yuvHist2, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileIn, &m_curYUVHist2);
1145
- }
1146
- }
1147
X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1148
X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1149
X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1150
1151
1152
if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1153
{
1154
- if (m_param->analysisLoadReuseLevel < 2)
1155
- return;
1156
+ if (m_param->analysisLoadReuseLevel < 2)
1157
+ {
1158
+ /* Restore to the current encode's numPartitions and numCUsInFrame */
1159
+ analysis->numPartitions = m_param->num4x4Partitions;
1160
+ analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1161
+ analysis->numCuInHeight = cuLoc.heightInCU;
1162
+ return;
1163
+ }
1164
1165
uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL;
1166
int8_t *cuQPBuf = NULL;
1167
1168
uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
1169
uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
1170
X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFileIn, (picIn->analysisData.wt));
1171
- if (m_param->analysisLoadReuseLevel < 2)
1172
- return;
1173
+ if (m_param->analysisLoadReuseLevel < 2)
1174
+ {
1175
+ /* Restore to the current encode's numPartitions and numCUsInFrame */
1176
+ analysis->numPartitions = m_param->num4x4Partitions;
1177
+ analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1178
+ analysis->numCuInHeight = cuLoc.heightInCU;
1179
+ return;
1180
+ }
1181
1182
uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL;
1183
uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx2;
1184
1185
1186
int bcutree;
1187
X265_FREAD(&bcutree, sizeof(int), 1, m_analysisFileIn, &(saveParam->cuTree));
1188
- if (loadLevel == 10 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
1189
+ if (loadLevel >= 2 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
1190
{
1191
x265_log(NULL, X265_LOG_ERROR, "Error reading cu-tree info. Disabling cutree offsets. \n");
1192
m_param->rc.cuTree = 0;
1193
1194
distortionData->highDistortionCtuCount++;
1195
}
1196
}
1197
+
1198
void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, int sliceType)
1199
{
1200
1201
1202
/* calculate frameRecordSize */
1203
analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
1204
sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
1205
- if (m_param->bHistBasedSceneCut)
1206
- {
1207
- analysis->frameRecordSize += sizeof(analysis->edgeHist);
1208
- analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1209
- if (m_param->internalCsp != X265_CSP_I400)
1210
- {
1211
- analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1212
- analysis->frameRecordSize += sizeof(int32_t) * HISTOGRAM_BINS;
1213
- }
1214
- }
1215
-
1216
if (analysis->sliceType > X265_TYPE_I)
1217
{
1218
numDir = (analysis->sliceType == X265_TYPE_P) ? 1 : 2;
1219
1220
X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFileOut);
1221
X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFileOut);
1222
X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFileOut);
1223
- if (m_param->bHistBasedSceneCut)
1224
- {
1225
- X265_FWRITE(&analysis->edgeHist, sizeof(int32_t), EDGE_BINS, m_analysisFileOut);
1226
- X265_FWRITE(&analysis->yuvHist0, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1227
- if (m_param->internalCsp != X265_CSP_I400)
1228
- {
1229
- X265_FWRITE(&analysis->yuvHist1, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1230
- X265_FWRITE(&analysis->yuvHist2, sizeof(int32_t), HISTOGRAM_BINS, m_analysisFileOut);
1231
- }
1232
- }
1233
-
1234
X265_FWRITE(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileOut);
1235
X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileOut);
1236
X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFileOut);
1237
x265_3.5.tar.gz/source/encoder/encoder.h -> x265_3.6.tar.gz/source/encoder/encoder.h
Changed
72
1
2
#include "nal.h"
3
#include "framedata.h"
4
#include "svt.h"
5
+#include "temporalfilter.h"
6
#ifdef ENABLE_HDR10_PLUS
7
#include "dynamicHDR10/hdr10plus.h"
8
#endif
9
10
int m_bToneMap; // Enables tone-mapping
11
int m_enableNal;
12
13
- /* For histogram based scene-cut detection */
14
- pixel* m_edgePic;
15
- pixel* m_inputPic3;
16
- int32_t m_curYUVHist3HISTOGRAM_BINS;
17
- int32_t m_prevYUVHist3HISTOGRAM_BINS;
18
- int32_t m_curEdgeHist2;
19
- int32_t m_prevEdgeHist2;
20
- uint32_t m_planeSizes3;
21
- double m_edgeHistThreshold;
22
- double m_chromaHistThreshold;
23
- double m_scaledEdgeThreshold;
24
- double m_scaledChromaThreshold;
25
-
26
#ifdef ENABLE_HDR10_PLUS
27
const hdr10plus_api *m_hdr10plus_api;
28
uint8_t **m_cim;
29
30
31
ThreadSafeInteger* zoneReadCount;
32
ThreadSafeInteger* zoneWriteCount;
33
+ /* Film grain model file */
34
+ FILE* m_filmGrainIn;
35
+ OrigPicBuffer* m_origPicBuffer;
36
37
Encoder();
38
~Encoder()
39
40
41
void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
42
43
+ void getEndNalUnits(NALList& list, Bitstream& bs);
44
+
45
void fetchStats(x265_stats* stats, size_t statsSizeBytes);
46
47
void printSummary();
48
49
50
void copyPicture(x265_picture *dest, const x265_picture *src);
51
52
- bool computeHistograms(x265_picture *pic);
53
- void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
54
- double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
55
- void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
56
-
57
void initRefIdx();
58
void analyseRefIdx(int *numRefIdx);
59
void updateRefIdx();
60
61
62
void configureDolbyVisionParams(x265_param* p);
63
64
+ void configureVideoSignalTypePreset(x265_param* p);
65
+
66
+ bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
67
+ bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
68
+
69
protected:
70
71
void initVPS(VPS *vps);
72
x265_3.5.tar.gz/source/encoder/entropy.cpp -> x265_3.6.tar.gz/source/encoder/entropy.cpp
Changed
41
1
2
3
for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
4
{
5
- WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1i");
6
- WRITE_UVLC(vps.numReorderPics, "vps_num_reorder_picsi");
7
- WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1i");
8
+ WRITE_UVLC(vps.maxDecPicBufferingi - 1, "vps_max_dec_pic_buffering_minus1i");
9
+ WRITE_UVLC(vps.numReorderPicsi, "vps_num_reorder_picsi");
10
+ WRITE_UVLC(vps.maxLatencyIncreasei + 1, "vps_max_latency_increase_plus1i");
11
}
12
13
WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
14
15
16
for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
17
{
18
- WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1i");
19
- WRITE_UVLC(sps.numReorderPics, "sps_num_reorder_picsi");
20
- WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1i");
21
+ WRITE_UVLC(sps.maxDecPicBufferingi - 1, "sps_max_dec_pic_buffering_minus1i");
22
+ WRITE_UVLC(sps.numReorderPicsi, "sps_num_reorder_picsi");
23
+ WRITE_UVLC(sps.maxLatencyIncreasei + 1, "sps_max_latency_increase_plus1i");
24
}
25
26
WRITE_UVLC(sps.log2MinCodingBlockSize - 3, "log2_min_coding_block_size_minus3");
27
28
29
if (maxTempSubLayers > 1)
30
{
31
- WRITE_FLAG(0, "sub_layer_profile_present_flagi");
32
- WRITE_FLAG(0, "sub_layer_level_present_flagi");
33
+ for(int i = 0; i < maxTempSubLayers - 1; i++)
34
+ {
35
+ WRITE_FLAG(0, "sub_layer_profile_present_flagi");
36
+ WRITE_FLAG(0, "sub_layer_level_present_flagi");
37
+ }
38
for (int i = maxTempSubLayers - 1; i < 8 ; i++)
39
WRITE_CODE(0, 2, "reserved_zero_2bits");
40
}
41
x265_3.5.tar.gz/source/encoder/frameencoder.cpp -> x265_3.6.tar.gz/source/encoder/frameencoder.cpp
Changed
200
1
2
#include "common.h"
3
#include "slicetype.h"
4
#include "nal.h"
5
+#include "temporalfilter.h"
6
7
namespace X265_NS {
8
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
9
10
delete m_rce.picTimingSEI;
11
delete m_rce.hrdTiming;
12
}
13
+
14
+ if (m_param->bEnableTemporalFilter)
15
+ {
16
+ delete m_frameEncTF->m_metld;
17
+
18
+ for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
19
+ m_frameEncTF->destroyRefPicInfo(&m_mcstfRefListi);
20
+
21
+ delete m_frameEncTF;
22
+ }
23
}
24
25
bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
26
27
m_sliceAddrBits = (uint16_t)(tmp + 1);
28
}
29
30
+ if (m_param->bEnableTemporalFilter)
31
+ {
32
+ m_frameEncTF = new TemporalFilter();
33
+ if (m_frameEncTF)
34
+ m_frameEncTF->init(m_param);
35
+
36
+ for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
37
+ ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefListi, m_param);
38
+ }
39
+
40
return ok;
41
}
42
43
44
m_ssimCnt = 0;
45
memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
46
47
- if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
48
+ if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
49
{
50
int height = m_frame->m_fencPic->m_picHeight;
51
int width = m_frame->m_fencPic->m_picWidth;
52
53
* unit) */
54
Slice* slice = m_frame->m_encData->m_slice;
55
56
+ if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
57
+ {
58
+ m_bs.resetBits();
59
+ m_nalList.serialize(NAL_UNIT_EOS, m_bs);
60
+ }
61
+
62
if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
63
{
64
m_bs.resetBits();
65
66
int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
67
m_rce.newQp = qp;
68
69
+ if (m_param->bEnableTemporalFilter)
70
+ {
71
+ m_frameEncTF->m_QP = qp;
72
+ m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
73
+ }
74
+
75
if (m_nr)
76
{
77
if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
78
79
// wait after removal of the access unit with the most recent
80
// buffering period SEI message
81
sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
82
- sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
83
+ sei->m_picDpbOutputDelay = slice->m_sps->numReorderPicsm_frame->m_tempLayer + poc - m_rce.encodeOrder;
84
}
85
86
sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
87
88
m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
89
m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
90
}
91
-
92
+ /* Write Film grain characteristics if present */
93
+ if (this->m_top->m_filmGrainIn)
94
+ {
95
+ FilmGrainCharacteristics m_filmGrain;
96
+ /* Read the Film grain model file */
97
+ readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
98
+ m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
99
+ }
100
/* Write user SEI */
101
for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
102
{
103
104
if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
105
collectDynDataFrame();
106
107
+ if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
108
+ {
109
+ //Reset the MCSTF context in Frame Encoder and Frame
110
+ for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
111
+ {
112
+ memset(m_mcstfRefListi.mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
113
+ memset(m_mcstfRefListi.mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
114
+ memset(m_mcstfRefListi.mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
115
+ memset(m_mcstfRefListi.mvs, 0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
116
+ memset(m_mcstfRefListi.noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
117
+ memset(m_mcstfRefListi.error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
118
+
119
+ m_frame->m_mcstf->m_numRef = 0;
120
+ }
121
+ }
122
+
123
+
124
if (m_param->rc.bStatWrite)
125
{
126
int totalI = 0, totalP = 0, totalSkip = 0;
127
128
129
m_bs.writeByteAlignment();
130
131
- m_nalList.serialize(slice->m_nalUnitType, m_bs);
132
+ m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
133
}
134
}
135
else
136
137
m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
138
m_bs.writeByteAlignment();
139
140
- m_nalList.serialize(slice->m_nalUnitType, m_bs);
141
+ m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
142
}
143
144
if (m_param->decodedPictureHashSEI)
145
146
m_nr->nrOffsetDenoisecat0 = 0;
147
}
148
}
149
+
150
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
151
+{
152
+ char const* errorMessage = "Error reading FilmGrain characteristics\n";
153
+ FilmGrain m_fg;
154
+ x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
155
+ m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
156
+ m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
157
+ m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
158
+ m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
159
+ if (m_filmGrain->m_separateColourDescriptionPresentFlag)
160
+ {
161
+ ColourDescription m_clr;
162
+ x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
163
+ m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
164
+ m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
165
+ m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
166
+ m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
167
+ m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
168
+ m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
169
+ }
170
+ FGPresent m_present;
171
+ x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
172
+ m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
173
+ m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
174
+ m_filmGrain->m_compModel0.bPresentFlag = m_present.m_presentFlag0;
175
+ m_filmGrain->m_compModel1.bPresentFlag = m_present.m_presentFlag1;
176
+ m_filmGrain->m_compModel2.bPresentFlag = m_present.m_presentFlag2;
177
+ for (int i = 0; i < MAX_NUM_COMPONENT; i++)
178
+ {
179
+ if (m_filmGrain->m_compModeli.bPresentFlag)
180
+ {
181
+ x265_fread((char* )(&m_filmGrain->m_compModeli.m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
182
+ x265_fread((char* )(&m_filmGrain->m_compModeli.numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
183
+ m_filmGrain->m_compModeli.intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModeli.m_filmGrainNumIntensityIntervalMinus1+1)) ;
184
+ for (int j = 0; j <= m_filmGrain->m_compModeli.m_filmGrainNumIntensityIntervalMinus1; j++)
185
+ {
186
+ x265_fread((char* )(&m_filmGrain->m_compModeli.intensityValuesj.intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
187
+ x265_fread((char* )(&m_filmGrain->m_compModeli.intensityValuesj.intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
188
+ m_filmGrain->m_compModeli.intensityValuesj.compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModeli.numModelValues));
189
+ for (int k = 0; k < m_filmGrain->m_compModeli.numModelValues; k++)
190
+ {
191
+ x265_fread((char* )(&m_filmGrain->m_compModeli.intensityValuesj.compModelValuek), sizeof(int), 1, filmgrain, errorMessage);
192
+ }
193
+ }
194
+ }
195
+ }
196
+}
197
#if ENABLE_LIBVMAF
198
void FrameEncoder::vmafFrameLevelScore()
199
{
200
x265_3.5.tar.gz/source/encoder/frameencoder.h -> x265_3.6.tar.gz/source/encoder/frameencoder.h
Changed
63
1
2
#include "ratecontrol.h"
3
#include "reference.h"
4
#include "nal.h"
5
+#include "temporalfilter.h"
6
7
namespace X265_NS {
8
// private x265 namespace
9
10
}
11
};
12
13
+/*Film grain characteristics*/
14
+struct FilmGrain
15
+{
16
+ bool m_filmGrainCharacteristicsCancelFlag;
17
+ bool m_filmGrainCharacteristicsPersistenceFlag;
18
+ bool m_separateColourDescriptionPresentFlag;
19
+ uint8_t m_filmGrainModelId;
20
+ uint8_t m_blendingModeId;
21
+ uint8_t m_log2ScaleFactor;
22
+};
23
+
24
+struct ColourDescription
25
+{
26
+ bool m_filmGrainFullRangeFlag;
27
+ uint8_t m_filmGrainBitDepthLumaMinus8;
28
+ uint8_t m_filmGrainBitDepthChromaMinus8;
29
+ uint8_t m_filmGrainColourPrimaries;
30
+ uint8_t m_filmGrainTransferCharacteristics;
31
+ uint8_t m_filmGrainMatrixCoeffs;
32
+};
33
+
34
+struct FGPresent
35
+{
36
+ uint8_t m_blendingModeId;
37
+ uint8_t m_log2ScaleFactor;
38
+ bool m_presentFlag3;
39
+};
40
+
41
// Manages the wave-front processing of a single encoding frame
42
class FrameEncoder : public WaveFront, public Thread
43
{
44
45
FrameFilter m_frameFilter;
46
NALList m_nalList;
47
48
+ // initialization for mcstf
49
+ TemporalFilter* m_frameEncTF;
50
+ TemporalFilterRefPicInfo m_mcstfRefListMAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
51
+
52
class WeightAnalysis : public BondedTaskGroup
53
{
54
public:
55
56
void collectDynDataFrame();
57
void computeAvgTrainingData();
58
void collectDynDataRow(CUData& ctu, FrameStats* rowStats);
59
+ void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
60
};
61
}
62
63
x265_3.5.tar.gz/source/encoder/level.cpp -> x265_3.6.tar.gz/source/encoder/level.cpp
Changed
86
1
2
* for intra-only profiles (vps.ptl.intraConstraintFlag) */
3
vps.ptl.lowerBitRateConstraintFlag = true;
4
5
- vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
6
+ vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
7
8
if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
9
{
10
11
12
/* The value of sps_max_dec_pic_buffering_minus1 HighestTid + 1 shall be less than
13
* or equal to MaxDpbSize */
14
- if (vps.maxDecPicBuffering > maxDpbSize)
15
+ if (vps.maxDecPicBufferingvps.maxTempSubLayers - 1 > maxDpbSize)
16
continue;
17
18
/* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
19
20
}
21
22
/* The value of NumPocTotalCurr shall be less than or equal to 8 */
23
- int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
24
- if (numPocTotalCurr > 8)
25
+ int numPocTotalCurr = param.maxNumReferences + vps.numReorderPicsvps.maxTempSubLayers - 1;
26
+ if (numPocTotalCurr > 10)
27
{
28
x265_log(¶m, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levelsi.name);
29
vps.ptl.profileIdc = Profile::NONE;
30
31
* circumstances it will be quite noisy */
32
bool enforceLevel(x265_param& param, VPS& vps)
33
{
34
- vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
35
- vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
36
+ vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
37
+ for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
38
+ {
39
+ vps.numReorderPicsi = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
40
+ vps.maxDecPicBufferingi = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPicsi + 2, (uint32_t)param.maxNumReferences) + 1);
41
+ }
42
43
+ if (!!param.bEnableTemporalSubLayers)
44
+ {
45
+ for (int i = 0; i < MAX_T_LAYERS - 1; i++)
46
+ {
47
+ // a lower layer can not have higher value of numReorderPics than a higher layer
48
+ if (vps.numReorderPicsi + 1 < vps.numReorderPicsi)
49
+ {
50
+ vps.numReorderPicsi + 1 = vps.numReorderPicsi;
51
+ }
52
+ // the value of numReorderPicsi shall be in the range of 0 to maxDecPicBufferingi - 1, inclusive
53
+ if (vps.numReorderPicsi > vps.maxDecPicBufferingi - 1)
54
+ {
55
+ vps.maxDecPicBufferingi = vps.numReorderPicsi + 1;
56
+ }
57
+ // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
58
+ if (vps.maxDecPicBufferingi + 1 < vps.maxDecPicBufferingi)
59
+ {
60
+ vps.maxDecPicBufferingi + 1 = vps.maxDecPicBufferingi;
61
+ }
62
+ }
63
+
64
+ // the value of numReorderPicsi shall be in the range of 0 to maxDecPicBuffering i - 1, inclusive
65
+ if (vps.numReorderPicsMAX_T_LAYERS - 1 > vps.maxDecPicBufferingMAX_T_LAYERS - 1 - 1)
66
+ {
67
+ vps.maxDecPicBufferingMAX_T_LAYERS - 1 = vps.numReorderPicsMAX_T_LAYERS - 1 + 1;
68
+ }
69
+ }
70
/* no level specified by user, just auto-detect from the configuration */
71
if (param.levelIdc <= 0)
72
return true;
73
74
}
75
76
int savedRefCount = param.maxNumReferences;
77
- while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
78
+ while (vps.maxDecPicBufferingvps.maxTempSubLayers - 1 > maxDpbSize && param.maxNumReferences > 1)
79
{
80
param.maxNumReferences--;
81
- vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
82
+ vps.maxDecPicBufferingvps.maxTempSubLayers - 1 = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPicsvps.maxTempSubLayers - 1 + 1, (uint32_t)param.maxNumReferences) + 1);
83
}
84
if (param.maxNumReferences != savedRefCount)
85
x265_log(¶m, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
86
x265_3.5.tar.gz/source/encoder/motion.cpp -> x265_3.6.tar.gz/source/encoder/motion.cpp
Changed
33
1
2
X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
3
}
4
5
+/* Called by lookahead, luma only, no use of PicYuv */
6
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
7
+{
8
+ partEnum = partitionFromSizes(pwidth, pheight);
9
+ X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
10
+ sad = primitives.pupartEnum.sad;
11
+ ads = primitives.pupartEnum.ads;
12
+ satd = primitives.pupartEnum.satd;
13
+ sad_x3 = primitives.pupartEnum.sad_x3;
14
+ sad_x4 = primitives.pupartEnum.sad_x4;
15
+
16
+
17
+ blockwidth = pwidth;
18
+ blockOffset = offset;
19
+ absPartIdx = ctuAddr = -1;
20
+
21
+ /* Search params */
22
+ searchMethod = method;
23
+ subpelRefine = refine;
24
+
25
+ /* copy PU block into cache */
26
+ primitives.pupartEnum.copy_pp(fencPUYuv.m_buf0, FENC_STRIDE, fencY + offset, stride);
27
+ X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
28
+}
29
+
30
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
31
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
32
{
33
x265_3.5.tar.gz/source/encoder/motion.h -> x265_3.6.tar.gz/source/encoder/motion.h
Changed
10
1
2
void init(int csp);
3
4
/* Methods called at slice setup */
5
-
6
+ void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
7
void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
8
void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
9
10
x265_3.5.tar.gz/source/encoder/nal.cpp -> x265_3.6.tar.gz/source/encoder/nal.cpp
Changed
19
1
2
other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
}
4
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
7
{
8
static const char startCodePrefix = { 0, 0, 0, 1 };
9
10
11
* nuh_reserved_zero_6bits 6-bits
12
* nuh_temporal_id_plus1 3-bits */
13
outbytes++ = (uint8_t)nalUnitType << 1;
14
- outbytes++ = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
15
+ outbytes++ = temporalID;
16
17
/* 7.4.1 ...
18
* Within the NAL unit, the following three-byte sequences shall not occur at
19
x265_3.5.tar.gz/source/encoder/nal.h -> x265_3.6.tar.gz/source/encoder/nal.h
Changed
10
1
2
3
void takeContents(NALList& other);
4
5
- void serialize(NalUnitType nalUnitType, const Bitstream& bs);
6
+ void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
7
8
uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
9
};
10
x265_3.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.6.tar.gz/source/encoder/ratecontrol.cpp
Changed
1457
1
2
#define BR_SHIFT 6
3
#define CPB_SHIFT 4
4
5
+#define SHARED_DATA_ALIGNMENT 4 ///< 4btye, 32bit
6
+#define CUTREE_SHARED_MEM_NAME "cutree"
7
+#define GOP_CNT_CU_TREE 3
8
+
9
using namespace X265_NS;
10
11
/* Amortize the partial cost of I frames over the next N frames */
12
13
return output;
14
}
15
16
+typedef struct CUTreeSharedDataItem
17
+{
18
+ uint8_t *type;
19
+ uint16_t *stats;
20
+}CUTreeSharedDataItem;
21
+
22
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
23
+{
24
+ CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
25
+ uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
26
+ *statsDst->type = *typeSrc;
27
+
28
+ ///< for memory alignment, the type will take 32bit in the shared memory
29
+ int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
30
+ uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
31
+ memcpy(statsDst->stats, statsSrc, size - offset);
32
+}
33
+
34
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
35
+{
36
+ CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
37
+ uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
38
+ *typeDst = *statsSrc->type;
39
+
40
+ ///< for memory alignment, the type will take 32bit in the shared memory
41
+ int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
42
+ uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
43
+ memcpy(statsDst, statsSrc->stats, size - offset);
44
+}
45
+
46
+
47
inline double qScale2bits(RateControlEntry *rce, double qScale)
48
{
49
if (qScale < 0.1)
50
51
m_lastAbrResetPoc = -1;
52
m_statFileOut = NULL;
53
m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
54
+ m_cutreeShrMem = NULL;
55
m_rce2Pass = NULL;
56
m_encOrder = NULL;
57
m_lastBsliceSatdCost = 0;
58
59
m_initVbv = false;
60
m_singleFrameVbv = 0;
61
m_rateTolerance = 1.0;
62
+ m_encodedSegmentBits = 0;
63
+ m_segDur = 0;
64
65
if (m_param->rc.vbvBufferSize)
66
{
67
68
m_cuTreeStats.qpBufferi = NULL;
69
}
70
71
-bool RateControl::init(const SPS& sps)
72
+bool RateControl::initCUTreeSharedMem()
73
{
74
- if (m_isVbv && !m_initVbv)
75
- {
76
- /* We don't support changing the ABR bitrate right now,
77
- * so if the stream starts as CBR, keep it CBR. */
78
- if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
79
+ if (!m_cutreeShrMem) {
80
+ m_cutreeShrMem = new RingMem();
81
+ if (!m_cutreeShrMem)
82
{
83
- m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
84
- x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
85
- m_param->rc.vbvBufferSize);
86
+ return false;
87
}
88
- int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
89
- int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
90
91
- if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
92
+ ///< now cutree data form at most 3 gops would be stored in the shared memory at the same time
93
+ int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
94
+ if (m_param->rc.qgSize == 8)
95
{
96
- const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
97
- vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
98
- vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
99
+ itemSize += sizeof(uint16_t) * m_ncu * 4;
100
}
101
- m_bufferRate = vbvMaxBitrate / m_fps;
102
- m_vbvMaxRate = vbvMaxBitrate;
103
- m_bufferSize = vbvBufferSize;
104
- m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
105
+ else
106
+ {
107
+ itemSize += sizeof(uint16_t) * m_ncu;
108
+ }
109
+
110
+ int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
111
+ itemCnt *= GOP_CNT_CU_TREE;
112
113
- if (m_param->rc.vbvBufferInit > 1.)
114
- m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
115
- if (m_param->vbvBufferEnd > 1.)
116
- m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
117
- if (m_param->vbvEndFrameAdjust > 1.)
118
- m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
119
- m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
120
- m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
121
- m_bufferFillActual = m_bufferFillFinal;
122
- m_bufferExcess = 0;
123
- m_minBufferFill = m_param->minVbvFullness / 100;
124
- m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
125
- m_initVbv = true;
126
+ char shrnameMAX_SHR_NAME_LEN = { 0 };
127
+ strcpy(shrname, m_param->rc.sharedMemName);
128
+ strcat(shrname, CUTREE_SHARED_MEM_NAME);
129
+
130
+ if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
131
+ {
132
+ return false;
133
+ }
134
}
135
136
+ return true;
137
+}
138
+
139
+void RateControl::initVBV(const SPS& sps)
140
+{
141
+ /* We don't support changing the ABR bitrate right now,
142
+ * so if the stream starts as CBR, keep it CBR. */
143
+ if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
144
+ {
145
+ m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
146
+ x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
147
+ m_param->rc.vbvBufferSize);
148
+ }
149
+ int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
150
+ int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
151
+
152
+ if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
153
+ {
154
+ const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
155
+ vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
156
+ vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
157
+ }
158
+ m_bufferRate = vbvMaxBitrate / m_fps;
159
+ m_vbvMaxRate = vbvMaxBitrate;
160
+ m_bufferSize = vbvBufferSize;
161
+ m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
162
+
163
+ if (m_param->rc.vbvBufferInit > 1.)
164
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
165
+ if (m_param->vbvBufferEnd > 1.)
166
+ m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
167
+ if (m_param->vbvEndFrameAdjust > 1.)
168
+ m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
169
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
170
+ m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
171
+ m_bufferFillActual = m_bufferFillFinal;
172
+ m_bufferExcess = 0;
173
+ m_minBufferFill = m_param->minVbvFullness / 100;
174
+ m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
175
+ m_initVbv = true;
176
+}
177
+
178
+bool RateControl::init(const SPS& sps)
179
+{
180
+ if (m_isVbv && !m_initVbv)
181
+ initVBV(sps);
182
+
183
if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
184
{
185
m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
186
187
188
m_totalBits = 0;
189
m_encodedBits = 0;
190
+ m_encodedSegmentBits = 0;
191
m_framesDone = 0;
192
+ m_segDur = 0;
193
m_residualCost = 0;
194
m_partialResidualCost = 0;
195
m_amortizeFraction = 0.85;
196
197
/* Load stat file and init 2pass algo */
198
if (m_param->rc.bStatRead)
199
{
200
- m_expectedBitsSum = 0;
201
- char *p, *statsIn, *statsBuf;
202
- /* read 1st pass stats */
203
- statsIn = statsBuf = x265_slurp_file(fileName);
204
- if (!statsBuf)
205
- return false;
206
- if (m_param->rc.cuTree)
207
+ if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
208
{
209
- char *tmpFile = strcatFilename(fileName, ".cutree");
210
- if (!tmpFile)
211
+ m_expectedBitsSum = 0;
212
+ char *p, *statsIn, *statsBuf;
213
+ /* read 1st pass stats */
214
+ statsIn = statsBuf = x265_slurp_file(fileName);
215
+ if (!statsBuf)
216
return false;
217
- m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
218
- X265_FREE(tmpFile);
219
- if (!m_cutreeStatFileIn)
220
+ if (m_param->rc.cuTree)
221
{
222
- x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
223
- return false;
224
+ char *tmpFile = strcatFilename(fileName, ".cutree");
225
+ if (!tmpFile)
226
+ return false;
227
+ m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
228
+ X265_FREE(tmpFile);
229
+ if (!m_cutreeStatFileIn)
230
+ {
231
+ x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
232
+ return false;
233
+ }
234
}
235
- }
236
237
- /* check whether 1st pass options were compatible with current options */
238
- if (strncmp(statsBuf, "#options:", 9))
239
- {
240
- x265_log(m_param, X265_LOG_ERROR,"options list in stats file not valid\n");
241
- return false;
242
- }
243
- {
244
- int i, j, m;
245
- uint32_t k , l;
246
- bool bErr = false;
247
- char *opts = statsBuf;
248
- statsIn = strchr(statsBuf, '\n');
249
- if (!statsIn)
250
- {
251
- x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n");
252
- return false;
253
- }
254
- *statsIn = '\0';
255
- statsIn++;
256
- if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
257
- {
258
- x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
259
- return false;
260
- }
261
- if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
262
- {
263
- x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
264
- return false;
265
- }
266
- if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
267
- {
268
- x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
269
- return false;
270
- }
271
- if (k != m_param->fpsNum || l != m_param->fpsDenom)
272
+ /* check whether 1st pass options were compatible with current options */
273
+ if (strncmp(statsBuf, "#options:", 9))
274
{
275
- x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
276
- m_param->fpsNum, m_param->fpsDenom, k, l);
277
+ x265_log(m_param, X265_LOG_ERROR, "options list in stats file not valid\n");
278
return false;
279
}
280
- if (m_param->analysisMultiPassRefine)
281
{
282
- p = strstr(opts, "ref=");
283
- sscanf(p, "ref=%d", &i);
284
- if (i > m_param->maxNumReferences)
285
+ int i, j, m;
286
+ uint32_t k, l;
287
+ bool bErr = false;
288
+ char *opts = statsBuf;
289
+ statsIn = strchr(statsBuf, '\n');
290
+ if (!statsIn)
291
{
292
- x265_log(m_param, X265_LOG_ERROR, "maxNumReferences cannot be less than 1st pass (%d vs %d)\n",
293
- i, m_param->maxNumReferences);
294
+ x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n");
295
return false;
296
}
297
- }
298
- if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
299
- {
300
- p = strstr(opts, "ctu=");
301
- sscanf(p, "ctu=%u", &k);
302
- if (k != m_param->maxCUSize)
303
+ *statsIn = '\0';
304
+ statsIn++;
305
+ if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
306
{
307
- x265_log(m_param, X265_LOG_ERROR, "maxCUSize mismatch with 1st pass (%u vs %u)\n",
308
- k, m_param->maxCUSize);
309
+ x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
310
return false;
311
}
312
+ if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2)
313
+ {
314
+ x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
315
+ return false;
316
+ }
317
+ if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
318
+ {
319
+ x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
320
+ return false;
321
+ }
322
+ if (k != m_param->fpsNum || l != m_param->fpsDenom)
323
+ {
324
+ x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
325
+ m_param->fpsNum, m_param->fpsDenom, k, l);
326
+ return false;
327
+ }
328
+ if (m_param->analysisMultiPassRefine)
329
+ {
330
+ p = strstr(opts, "ref=");
331
+ sscanf(p, "ref=%d", &i);
332
+ if (i > m_param->maxNumReferences)
333
+ {
334
+ x265_log(m_param, X265_LOG_ERROR, "maxNumReferences cannot be less than 1st pass (%d vs %d)\n",
335
+ i, m_param->maxNumReferences);
336
+ return false;
337
+ }
338
+ }
339
+ if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
340
+ {
341
+ p = strstr(opts, "ctu=");
342
+ sscanf(p, "ctu=%u", &k);
343
+ if (k != m_param->maxCUSize)
344
+ {
345
+ x265_log(m_param, X265_LOG_ERROR, "maxCUSize mismatch with 1st pass (%u vs %u)\n",
346
+ k, m_param->maxCUSize);
347
+ return false;
348
+ }
349
+ }
350
+ CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth);
351
+ CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred);
352
+ CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
353
+ CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
354
+ CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
355
+ CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
356
+ CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
357
+ CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
358
+ CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
359
+ if (m_param->bMultiPassOptRPS)
360
+ {
361
+ CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
362
+ CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
363
+ CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
364
+ }
365
+
366
+ if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
367
+ {
368
+ m_param->bFrameAdaptive = i;
369
+ }
370
+ else if (m_param->bframes)
371
+ {
372
+ x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n");
373
+ return false;
374
+ }
375
+
376
+ if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i))
377
+ m_param->lookaheadDepth = i;
378
}
379
- CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth);
380
- CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred);
381
- CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
382
- CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
383
- CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
384
- CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
385
- CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
386
- CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
387
- CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
388
- if (m_param->bMultiPassOptRPS)
389
+ /* find number of pics */
390
+ p = statsIn;
391
+ int numEntries;
392
+ for (numEntries = -1; p; numEntries++)
393
+ p = strchr(p + 1, ';');
394
+ if (!numEntries)
395
{
396
- CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
397
- CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
398
- CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
399
+ x265_log(m_param, X265_LOG_ERROR, "empty stats file\n");
400
+ return false;
401
}
402
+ m_numEntries = numEntries;
403
404
- if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
405
+ if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0)
406
{
407
- m_param->bFrameAdaptive = i;
408
+ x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
409
+ m_param->totalFrames, m_numEntries);
410
}
411
- else if (m_param->bframes)
412
+ if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
413
{
414
- x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n");
415
+ x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
416
+ m_param->totalFrames, m_numEntries);
417
return false;
418
}
419
420
- if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i))
421
- m_param->lookaheadDepth = i;
422
- }
423
- /* find number of pics */
424
- p = statsIn;
425
- int numEntries;
426
- for (numEntries = -1; p; numEntries++)
427
- p = strchr(p + 1, ';');
428
- if (!numEntries)
429
- {
430
- x265_log(m_param, X265_LOG_ERROR, "empty stats file\n");
431
- return false;
432
- }
433
- m_numEntries = numEntries;
434
-
435
- if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0)
436
- {
437
- x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
438
- m_param->totalFrames, m_numEntries);
439
- }
440
- if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
441
- {
442
- x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
443
- m_param->totalFrames, m_numEntries);
444
- return false;
445
- }
446
-
447
- m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries);
448
- if (!m_rce2Pass)
449
- {
450
- x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
451
- return false;
452
- }
453
- m_encOrder = X265_MALLOC(int, m_numEntries);
454
- if (!m_encOrder)
455
- {
456
- x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
457
- return false;
458
- }
459
- /* init all to skipped p frames */
460
- for (int i = 0; i < m_numEntries; i++)
461
- {
462
- RateControlEntry *rce = &m_rce2Passi;
463
- rce->sliceType = P_SLICE;
464
- rce->qScale = rce->newQScale = x265_qp2qScale(20);
465
- rce->miscBits = m_ncu + 10;
466
- rce->newQp = 0;
467
- }
468
- /* read stats */
469
- p = statsIn;
470
- double totalQpAq = 0;
471
- for (int i = 0; i < m_numEntries; i++)
472
- {
473
- RateControlEntry *rce, *rcePocOrder;
474
- int frameNumber;
475
- int encodeOrder;
476
- char picType;
477
- int e;
478
- char *next;
479
- double qpRc, qpAq, qNoVbv, qRceq;
480
- next = strstr(p, ";");
481
- if (next)
482
- *next++ = 0;
483
- e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
484
- if (frameNumber < 0 || frameNumber >= m_numEntries)
485
+ m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries);
486
+ if (!m_rce2Pass)
487
{
488
- x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
489
+ x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
490
return false;
491
}
492
- rce = &m_rce2PassencodeOrder;
493
- rcePocOrder = &m_rce2PassframeNumber;
494
- m_encOrderframeNumber = encodeOrder;
495
- if (!m_param->bMultiPassOptRPS)
496
- {
497
- int scenecut = 0;
498
- e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf sc:%d",
499
- &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
500
- &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
501
- &rce->skipCuCount, &scenecut);
502
- rcePocOrder->scenecut = scenecut != 0;
503
+ m_encOrder = X265_MALLOC(int, m_numEntries);
504
+ if (!m_encOrder)
505
+ {
506
+ x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
507
+ return false;
508
}
509
- else
510
+ /* init all to skipped p frames */
511
+ for (int i = 0; i < m_numEntries; i++)
512
{
513
- char deltaPOC128;
514
- char bUsed40;
515
- memset(deltaPOC, 0, sizeof(deltaPOC));
516
- memset(bUsed, 0, sizeof(bUsed));
517
- e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
518
- &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
519
- &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
520
- &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
521
- splitdeltaPOC(deltaPOC, rce);
522
- splitbUsed(bUsed, rce);
523
- rce->rpsIdx = -1;
524
- }
525
- rce->keptAsRef = true;
526
- rce->isIdr = false;
527
- if (picType == 'b' || picType == 'p')
528
- rce->keptAsRef = false;
529
- if (picType == 'I')
530
- rce->isIdr = true;
531
- if (picType == 'I' || picType == 'i')
532
- rce->sliceType = I_SLICE;
533
- else if (picType == 'P' || picType == 'p')
534
+ RateControlEntry *rce = &m_rce2Passi;
535
rce->sliceType = P_SLICE;
536
- else if (picType == 'B' || picType == 'b')
537
- rce->sliceType = B_SLICE;
538
- else
539
- e = -1;
540
- if (e < 10)
541
+ rce->qScale = rce->newQScale = x265_qp2qScale(20);
542
+ rce->miscBits = m_ncu + 10;
543
+ rce->newQp = 0;
544
+ }
545
+ /* read stats */
546
+ p = statsIn;
547
+ double totalQpAq = 0;
548
+ for (int i = 0; i < m_numEntries; i++)
549
+ {
550
+ RateControlEntry *rce, *rcePocOrder;
551
+ int frameNumber;
552
+ int encodeOrder;
553
+ char picType;
554
+ int e;
555
+ char *next;
556
+ double qpRc, qpAq, qNoVbv, qRceq;
557
+ next = strstr(p, ";");
558
+ if (next)
559
+ *next++ = 0;
560
+ e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
561
+ if (frameNumber < 0 || frameNumber >= m_numEntries)
562
+ {
563
+ x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
564
+ return false;
565
+ }
566
+ rce = &m_rce2PassencodeOrder;
567
+ rcePocOrder = &m_rce2PassframeNumber;
568
+ m_encOrderframeNumber = encodeOrder;
569
+ if (!m_param->bMultiPassOptRPS)
570
+ {
571
+ int scenecut = 0;
572
+ e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf sc:%d",
573
+ &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
574
+ &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
575
+ &rce->skipCuCount, &scenecut);
576
+ rcePocOrder->scenecut = scenecut != 0;
577
+ }
578
+ else
579
+ {
580
+ char deltaPOC128;
581
+ char bUsed40;
582
+ memset(deltaPOC, 0, sizeof(deltaPOC));
583
+ memset(bUsed, 0, sizeof(bUsed));
584
+ e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
585
+ &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
586
+ &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
587
+ &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
588
+ splitdeltaPOC(deltaPOC, rce);
589
+ splitbUsed(bUsed, rce);
590
+ rce->rpsIdx = -1;
591
+ }
592
+ rce->keptAsRef = true;
593
+ rce->isIdr = false;
594
+ if (picType == 'b' || picType == 'p')
595
+ rce->keptAsRef = false;
596
+ if (picType == 'I')
597
+ rce->isIdr = true;
598
+ if (picType == 'I' || picType == 'i')
599
+ rce->sliceType = I_SLICE;
600
+ else if (picType == 'P' || picType == 'p')
601
+ rce->sliceType = P_SLICE;
602
+ else if (picType == 'B' || picType == 'b')
603
+ rce->sliceType = B_SLICE;
604
+ else
605
+ e = -1;
606
+ if (e < 10)
607
+ {
608
+ x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
609
+ return false;
610
+ }
611
+ rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
612
+ totalQpAq += qpAq;
613
+ rce->qpNoVbv = qNoVbv;
614
+ rce->qpaRc = qpRc;
615
+ rce->qpAq = qpAq;
616
+ rce->qRceq = qRceq;
617
+ p = next;
618
+ }
619
+ X265_FREE(statsBuf);
620
+ if (m_param->rc.rateControlMode != X265_RC_CQP)
621
+ {
622
+ m_start = 0;
623
+ m_isQpModified = true;
624
+ if (!initPass2())
625
+ return false;
626
+ } /* else we're using constant quant, so no need to run the bitrate allocation */
627
+ }
628
+ else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
629
+ {
630
+ if (m_param->rc.cuTree)
631
{
632
- x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
633
- return false;
634
+ if (!initCUTreeSharedMem())
635
+ {
636
+ return false;
637
+ }
638
}
639
- rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
640
- totalQpAq += qpAq;
641
- rce->qpNoVbv = qNoVbv;
642
- rce->qpaRc = qpRc;
643
- rce->qpAq = qpAq;
644
- rce->qRceq = qRceq;
645
- p = next;
646
- }
647
- X265_FREE(statsBuf);
648
- if (m_param->rc.rateControlMode != X265_RC_CQP)
649
- {
650
- m_start = 0;
651
- m_isQpModified = true;
652
- if (!initPass2())
653
- return false;
654
- } /* else we're using constant quant, so no need to run the bitrate allocation */
655
+ }
656
}
657
/* Open output file */
658
/* If input and output files are the same, output to a temp file
659
660
X265_FREE(p);
661
if (m_param->rc.cuTree && !m_param->rc.bStatRead)
662
{
663
- statFileTmpname = strcatFilename(fileName, ".cutree.temp");
664
- if (!statFileTmpname)
665
- return false;
666
- m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
667
- X265_FREE(statFileTmpname);
668
- if (!m_cutreeStatFileOut)
669
+ if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
670
{
671
- x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
672
- return false;
673
+ statFileTmpname = strcatFilename(fileName, ".cutree.temp");
674
+ if (!statFileTmpname)
675
+ return false;
676
+ m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
677
+ X265_FREE(statFileTmpname);
678
+ if (!m_cutreeStatFileOut)
679
+ {
680
+ x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
681
+ return false;
682
+ }
683
+ }
684
+ else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
685
+ {
686
+ if (!initCUTreeSharedMem())
687
+ {
688
+ return false;
689
+ }
690
}
691
}
692
}
693
- if (m_param->rc.cuTree)
694
+ if (m_param->rc.cuTree && !m_cuTreeStats.qpBuffer0)
695
{
696
if (m_param->rc.qgSize == 8)
697
{
698
699
return true;
700
}
701
702
+void RateControl::skipCUTreeSharedMemRead(int32_t cnt)
703
+{
704
+ m_cutreeShrMem->skipRead(cnt);
705
+}
706
void RateControl::reconfigureRC()
707
{
708
if (m_isVbv)
709
710
711
TimingInfo *time = &sps.vuiParameters.timingInfo;
712
int maxCpbOutputDelay = (int)(X265_MIN(m_param->keyframeMax * MAX_DURATION * time->timeScale / time->numUnitsInTick, INT_MAX));
713
- int maxDpbOutputDelay = (int)(sps.maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick);
714
+ int maxDpbOutputDelay = (int)(sps.maxDecPicBufferingsps.maxTempSubLayers - 1 * MAX_DURATION * time->timeScale / time->numUnitsInTick);
715
int maxDelay = (int)(90000.0 * cpbSizeUnscale / bitRateUnscale + 0.5);
716
717
hrd->initialCpbRemovalDelayLength = 2 + x265_clip3(4, 22, 32 - calcLength(maxDelay));
718
719
{
720
uint64_t allConstBits = 0, allCodedBits = 0;
721
uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
722
- int startIndex, framesCount, endIndex;
723
+ int startIndex, endIndex;
724
int fps = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
725
- startIndex = endIndex = framesCount = 0;
726
- int diffQp = 0;
727
+ int distance = fps << 1;
728
+ distance = distance > m_param->keyframeMax ? (m_param->keyframeMax << 1) : m_param->keyframeMax;
729
+ startIndex = endIndex = 0;
730
double targetBits = 0;
731
double expectedBits = 0;
732
- for (startIndex = m_start, endIndex = m_start; endIndex < m_numEntries; endIndex++)
733
+ double targetBits2 = 0;
734
+ double expectedBits2 = 0;
735
+ double cpxSum = 0;
736
+ double cpxSum2 = 0;
737
+
738
+ if (m_param->rc.rateControlMode == X265_RC_ABR)
739
{
740
- allConstBits += m_rce2PassendIndex.miscBits;
741
- allCodedBits += m_rce2PassendIndex.coeffBits + m_rce2PassendIndex.mvBits;
742
- if (m_param->rc.rateControlMode == X265_RC_CRF)
743
+ for (endIndex = m_start; endIndex < m_numEntries; endIndex++)
744
{
745
- framesCount = endIndex - startIndex + 1;
746
- diffQp += int (m_rce2PassendIndex.qpaRc - m_rce2PassendIndex.qpNoVbv);
747
- if (framesCount > fps)
748
- diffQp -= int (m_rce2PassendIndex - fps.qpaRc - m_rce2PassendIndex - fps.qpNoVbv);
749
- if (framesCount >= fps)
750
- {
751
- if (diffQp >= 1)
752
- {
753
- if (!m_isQpModified && endIndex > fps)
754
- {
755
- double factor = 2;
756
- double step = 0;
757
- if (endIndex + fps >= m_numEntries)
758
- {
759
- m_start = endIndex - (endIndex % fps);
760
- return true;
761
- }
762
- for (int start = endIndex + 1; start <= endIndex + fps && start < m_numEntries; start++)
763
- {
764
- RateControlEntry *rce = &m_rce2Passstart;
765
- targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
766
- expectedBits += qScale2bits(rce, rce->qScale);
767
- }
768
- if (expectedBits < 0.95 * targetBits)
769
- {
770
- m_isQpModified = true;
771
- m_isGopReEncoded = true;
772
- while (endIndex + fps < m_numEntries)
773
- {
774
- step = pow(2, factor / 6.0);
775
- expectedBits = 0;
776
- for (int start = endIndex + 1; start <= endIndex + fps; start++)
777
- {
778
- RateControlEntry *rce = &m_rce2Passstart;
779
- rce->newQScale = rce->qScale / step;
780
- X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
781
- expectedBits += qScale2bits(rce, rce->newQScale);
782
- rce->newQp = x265_qScale2qp(rce->newQScale);
783
- }
784
- if (expectedBits >= targetBits && step > 1)
785
- factor *= 0.90;
786
- else
787
- break;
788
- }
789
-
790
- if (m_isVbv && endIndex + fps < m_numEntries)
791
- if (!vbv2Pass((uint64_t)targetBits, endIndex + fps, endIndex + 1))
792
- return false;
793
-
794
- targetBits = 0;
795
- expectedBits = 0;
796
-
797
- for (int start = endIndex - fps + 1; start <= endIndex; start++)
798
- {
799
- RateControlEntry *rce = &m_rce2Passstart;
800
- targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
801
- }
802
- while (1)
803
- {
804
- step = pow(2, factor / 6.0);
805
- expectedBits = 0;
806
- for (int start = endIndex - fps + 1; start <= endIndex; start++)
807
- {
808
- RateControlEntry *rce = &m_rce2Passstart;
809
- rce->newQScale = rce->qScale * step;
810
- X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
811
- expectedBits += qScale2bits(rce, rce->newQScale);
812
- rce->newQp = x265_qScale2qp(rce->newQScale);
813
- }
814
- if (expectedBits > targetBits && step > 1)
815
- factor *= 1.1;
816
- else
817
- break;
818
- }
819
- if (m_isVbv)
820
- if (!vbv2Pass((uint64_t)targetBits, endIndex, endIndex - fps + 1))
821
- return false;
822
- diffQp = 0;
823
- m_reencode = endIndex - fps + 1;
824
- endIndex = endIndex + fps;
825
- startIndex = endIndex + 1;
826
- m_start = startIndex;
827
- targetBits = expectedBits = 0;
828
- }
829
- else
830
- targetBits = expectedBits = 0;
831
- }
832
- }
833
- else
834
- m_isQpModified = false;
835
- }
836
+ allConstBits += m_rce2PassendIndex.miscBits;
837
+ allCodedBits += m_rce2PassendIndex.coeffBits + m_rce2PassendIndex.mvBits;
838
}
839
- }
840
841
- if (m_param->rc.rateControlMode == X265_RC_ABR)
842
- {
843
if (allAvailableBits < allConstBits)
844
{
845
x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
846
- (int)(allConstBits * m_fps / framesCount * 1000.));
847
+ (int)(allConstBits * m_fps / (m_numEntries - m_start) * 1000.));
848
return false;
849
}
850
if (!analyseABR2Pass(allAvailableBits))
851
return false;
852
+
853
+ return true;
854
+ }
855
+
856
+ if (m_isQpModified)
857
+ {
858
+ return true;
859
+ }
860
+
861
+ if (m_start + (fps << 1) > m_numEntries)
862
+ {
863
+ return true;
864
+ }
865
+
866
+ for (startIndex = m_start, endIndex = m_numEntries - 1; startIndex < endIndex; startIndex++, endIndex--)
867
+ {
868
+ cpxSum += m_rce2PassstartIndex.qScale / m_rce2PassstartIndex.coeffBits;
869
+ cpxSum2 += m_rce2PassendIndex.qScale / m_rce2PassendIndex.coeffBits;
870
+
871
+ RateControlEntry *rce = &m_rce2PassstartIndex;
872
+ targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
873
+ expectedBits += qScale2bits(rce, rce->qScale);
874
+
875
+ rce = &m_rce2PassendIndex;
876
+ targetBits2 += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
877
+ expectedBits2 += qScale2bits(rce, rce->qScale);
878
}
879
880
- m_start = X265_MAX(m_start, endIndex - fps);
881
+ if (expectedBits < 0.95 * targetBits || expectedBits2 < 0.95 * targetBits2)
882
+ {
883
+ if (cpxSum / cpxSum2 < 0.95 || cpxSum2 / cpxSum < 0.95)
884
+ {
885
+ m_isQpModified = true;
886
+ m_isGopReEncoded = true;
887
+
888
+ m_shortTermCplxSum = 0;
889
+ m_shortTermCplxCount = 0;
890
+ m_framesDone = m_start;
891
+
892
+ for (startIndex = m_start; startIndex < m_numEntries; startIndex++)
893
+ {
894
+ m_shortTermCplxSum *= 0.5;
895
+ m_shortTermCplxCount *= 0.5;
896
+ m_shortTermCplxSum += m_rce2PassstartIndex.currentSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION);
897
+ m_shortTermCplxCount++;
898
+ }
899
+
900
+ m_bufferFill = m_rce2Passm_start - 1.bufferFill;
901
+ m_bufferFillFinal = m_rce2Passm_start - 1.bufferFillFinal;
902
+ m_bufferFillActual = m_rce2Passm_start - 1.bufferFillActual;
903
+
904
+ m_reencode = m_start;
905
+ m_start = m_numEntries;
906
+ }
907
+ else
908
+ {
909
+
910
+ m_isQpModified = false;
911
+ m_isGopReEncoded = false;
912
+ }
913
+ }
914
+ else
915
+ {
916
+
917
+ m_isQpModified = false;
918
+ m_isGopReEncoded = false;
919
+ }
920
+
921
+ m_start = X265_MAX(m_start, m_numEntries - distance + m_param->keyframeMax);
922
923
return true;
924
}
925
926
m_predType = getPredictorType(curFrame->m_lowres.sliceType, m_sliceType);
927
rce->poc = m_curSlice->m_poc;
928
929
+ if (m_param->bEnableSBRC)
930
+ {
931
+ if (rce->poc == 0 || (m_framesDone % m_param->keyframeMax == 0))
932
+ {
933
+ //Reset SBRC buffer
934
+ m_encodedSegmentBits = 0;
935
+ m_segDur = 0;
936
+ }
937
+ }
938
+
939
if (!m_param->bResetZoneConfig && (rce->encodeOrder % m_param->reconfigWindowSize == 0))
940
{
941
int index = m_zoneBufferIdx % m_param->rc.zonefileCount;
942
943
{
944
m_param = m_param->rc.zonesi.zoneParam;
945
reconfigureRC();
946
- init(*m_curSlice->m_sps);
947
+ if (!m_param->bNoResetZoneConfig)
948
+ init(*m_curSlice->m_sps);
949
}
950
}
951
}
952
953
rce->frameSizeMaximum *= m_param->maxAUSizeFactor;
954
}
955
}
956
+
957
+ ///< regenerate the qp
958
if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
959
{
960
- rce->qpPrev = x265_qScale2qp(rce->qScale);
961
- rce->qScale = rce->newQScale;
962
- rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
963
- m_qp = int(rce->qpaRc + 0.5);
964
- rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
965
- m_framesDone++;
966
- return m_qp;
967
+ if (!m_param->rc.bEncFocusedFramesOnly)
968
+ {
969
+ rce->qpPrev = x265_qScale2qp(rce->qScale);
970
+ if (m_param->bEnableSceneCutAwareQp)
971
+ {
972
+ double lqmin = m_lminm_sliceType;
973
+ double lqmax = m_lmaxm_sliceType;
974
+ if (m_param->bEnableSceneCutAwareQp & FORWARD)
975
+ rce->newQScale = forwardMasking(curFrame, rce->newQScale);
976
+ if (m_param->bEnableSceneCutAwareQp & BACKWARD)
977
+ rce->newQScale = backwardMasking(curFrame, rce->newQScale);
978
+ rce->newQScale = x265_clip3(lqmin, lqmax, rce->newQScale);
979
+ }
980
+ rce->qScale = rce->newQScale;
981
+ rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
982
+ m_qp = int(rce->qpaRc + 0.5);
983
+ rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
984
+ m_framesDone++;
985
+ return m_qp;
986
+ }
987
+ else
988
+ {
989
+ int index = m_encOrderrce->poc;
990
+ index++;
991
+ double totalDuration = m_frameDuration;
992
+ for (int j = 0; totalDuration < 1.0 && index < m_numEntries; j++)
993
+ {
994
+ switch (m_rce2Passindex.sliceType)
995
+ {
996
+ case B_SLICE:
997
+ curFrame->m_lowres.plannedTypej = m_rce2Passindex.keptAsRef ? X265_TYPE_BREF : X265_TYPE_B;
998
+ break;
999
+ case P_SLICE:
1000
+ curFrame->m_lowres.plannedTypej = X265_TYPE_P;
1001
+ break;
1002
+ case I_SLICE:
1003
+ curFrame->m_lowres.plannedTypej = m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
1004
+ break;
1005
+ default:
1006
+ break;
1007
+ }
1008
+
1009
+ curFrame->m_lowres.plannedSatdj = m_rce2Passindex.currentSatd;
1010
+ totalDuration += m_frameDuration;
1011
+ index++;
1012
+ }
1013
+ }
1014
}
1015
1016
if (m_isAbr || m_2pass) // ABR,CRF
1017
1018
{
1019
m_cuTreeStats.qpBufPos++;
1020
1021
- if (!fread(&type, 1, 1, m_cutreeStatFileIn))
1022
- goto fail;
1023
- if (fread(m_cuTreeStats.qpBufferm_cuTreeStats.qpBufPos, sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
1024
- goto fail;
1025
+ if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
1026
+ {
1027
+ if (!fread(&type, 1, 1, m_cutreeStatFileIn))
1028
+ goto fail;
1029
+ if (fread(m_cuTreeStats.qpBufferm_cuTreeStats.qpBufPos, sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
1030
+ goto fail;
1031
+ }
1032
+ else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
1033
+ {
1034
+ if (!m_cutreeShrMem)
1035
+ {
1036
+ goto fail;
1037
+ }
1038
+
1039
+ CUTreeSharedDataItem shrItem;
1040
+ shrItem.type = &type;
1041
+ shrItem.stats = m_cuTreeStats.qpBufferm_cuTreeStats.qpBufPos;
1042
+ m_cutreeShrMem->readNext(&shrItem, ReadSharedCUTreeData);
1043
+ }
1044
1045
if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1)
1046
{
1047
1048
m_sliderPos++;
1049
}
1050
1051
- if (m_sliceType == B_SLICE)
1052
+ if((!m_param->bEnableSBRC && m_sliceType == B_SLICE) || (m_param->bEnableSBRC && !IS_REFERENCED(curFrame)))
1053
{
1054
/* B-frames don't have independent rate control, but rather get the
1055
* average QP of the two adjacent P-frames + an offset */
1056
1057
double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN);
1058
m_lastQScaleForP_SLICE = X265_MAX(minScenecutQscale, m_lastQScaleForP_SLICE);
1059
}
1060
+
1061
double qScale = x265_qp2qScale(q);
1062
rce->qpNoVbv = q;
1063
+
1064
+ if (m_param->bEnableSBRC)
1065
+ {
1066
+ qScale = tuneQscaleForSBRC(curFrame, qScale);
1067
+ rce->qpNoVbv = x265_qScale2qp(qScale);
1068
+ }
1069
+
1070
double lmin = 0, lmax = 0;
1071
if (m_isGrainEnabled && m_isFirstMiniGop)
1072
{
1073
1074
qScale = x265_clip3(lqmin, lqmax, qScale);
1075
}
1076
1077
- if (!m_2pass || m_param->bliveVBV2pass)
1078
+ if (!m_2pass || m_param->bliveVBV2pass || (m_2pass && m_param->rc.rateControlMode == X265_RC_CRF && m_param->rc.bEncFocusedFramesOnly))
1079
{
1080
/* clip qp to permissible range after vbv-lookahead estimation to avoid possible
1081
* mispredictions by initial frame size predictors */
1082
1083
else
1084
{
1085
double abrBuffer = 2 * m_rateTolerance * m_bitrate;
1086
- if (m_2pass)
1087
+ if (m_2pass && (m_param->rc.rateControlMode != X265_RC_CRF || !m_param->rc.bEncFocusedFramesOnly))
1088
{
1089
double lmin = m_lminm_sliceType;
1090
double lmax = m_lmaxm_sliceType;
1091
1092
1093
if (m_param->rc.rateControlMode == X265_RC_CRF)
1094
{
1095
+ if (m_param->bEnableSBRC)
1096
+ {
1097
+ double rfConstant = m_param->rc.rfConstant;
1098
+ if (m_currentSatd < rce->movingAvgSum)
1099
+ rfConstant += 2;
1100
+ double ipOffset = (curFrame->m_lowres.bScenecut ? m_ipOffset : m_ipOffset / 2.0);
1101
+ rfConstant = (rce->sliceType == I_SLICE ? rfConstant - ipOffset :
1102
+ (rce->sliceType == B_SLICE ? rfConstant + m_pbOffset : rfConstant));
1103
+ double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0;
1104
+ double qComp = (m_param->rc.cuTree && !m_param->rc.hevcAq) ? 0.99 : m_param->rc.qCompress;
1105
+ m_rateFactorConstant = pow(m_currentSatd, 1.0 - qComp) /
1106
+ x265_qp2qScale(rfConstant + mbtree_offset);
1107
+ }
1108
q = getQScale(rce, m_rateFactorConstant);
1109
x265_zone* zone = getZone();
1110
if (zone)
1111
1112
}
1113
double tunedQScale = tuneAbrQScaleFromFeedback(initialQScale);
1114
overflow = tunedQScale / initialQScale;
1115
- q = !m_partialResidualFrames? tunedQScale : initialQScale;
1116
+ q = !m_partialResidualFrames ? tunedQScale : initialQScale;
1117
bool isEncodeEnd = (m_param->totalFrames &&
1118
m_framesDone > 0.75 * m_param->totalFrames) ? 1 : 0;
1119
bool isEncodeBeg = m_framesDone < (int)(m_fps + 0.5);
1120
1121
q = X265_MAX(minScenecutQscale, q);
1122
m_lastQScaleForP_SLICE = X265_MAX(minScenecutQscale, m_lastQScaleForP_SLICE);
1123
}
1124
+ if (m_param->bEnableSBRC)
1125
+ q = tuneQscaleForSBRC(curFrame, q);
1126
+
1127
rce->qpNoVbv = x265_qScale2qp(q);
1128
if (m_sliceType == P_SLICE)
1129
{
1130
1131
return (p->coeff * var + p->offset) / (q * p->count);
1132
}
1133
1134
+double RateControl::tuneQscaleForSBRC(Frame* curFrame, double q)
1135
+{
1136
+ int depth = 0;
1137
+ int framesDoneInSeg = m_framesDone % m_param->keyframeMax;
1138
+ if (framesDoneInSeg + m_param->lookaheadDepth <= m_param->keyframeMax)
1139
+ depth = m_param->lookaheadDepth;
1140
+ else
1141
+ depth = m_param->keyframeMax - framesDoneInSeg;
1142
+ for (int iterations = 0; iterations < 1000; iterations++)
1143
+ {
1144
+ double totalDuration = m_segDur;
1145
+ double frameBitsTotal = m_encodedSegmentBits + predictSize(&m_predm_predType, q, (double)m_currentSatd);
1146
+ for (int i = 0; i < depth; i++)
1147
+ {
1148
+ int type = curFrame->m_lowres.plannedTypei;
1149
+ if (type == X265_TYPE_AUTO)
1150
+ break;
1151
+ int64_t satd = curFrame->m_lowres.plannedSatdi >> (X265_DEPTH - 8);
1152
+ type = IS_X265_TYPE_I(curFrame->m_lowres.plannedTypei) ? I_SLICE : IS_X265_TYPE_B(curFrame->m_lowres.plannedTypei) ? B_SLICE : P_SLICE;
1153
+ int predType = getPredictorType(curFrame->m_lowres.plannedTypei, type);
1154
+ double curBits = predictSize(&m_predpredType, q, (double)satd);
1155
+ frameBitsTotal += curBits;
1156
+ totalDuration += m_frameDuration;
1157
+ }
1158
+ //Check for segment buffer overflow and adjust QP accordingly
1159
+ double segDur = m_param->keyframeMax / m_fps;
1160
+ double allowedSize = m_vbvMaxRate * segDur;
1161
+ double remDur = segDur - totalDuration;
1162
+ double remainingBits = frameBitsTotal / totalDuration * remDur;
1163
+ if (frameBitsTotal + remainingBits > 0.9 * allowedSize)
1164
+ q = q * 1.01;
1165
+ else
1166
+ break;
1167
+ }
1168
+ return q;
1169
+}
1170
+
1171
double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q)
1172
{
1173
// B-frames are not directly subject to VBV,
1174
1175
{
1176
finalDur = x265_clip3(0.4, 1.0, totalDuration);
1177
}
1178
- targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (1 - m_minBufferFill * finalDur));
1179
+ targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (m_minBufferFill * finalDur));
1180
if (bufferFillCur < targetFill)
1181
{
1182
q *= 1.01;
1183
1184
1185
if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion || bEnableDistOffset)
1186
{
1187
- if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
1188
+ if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF && !m_param->rc.bEncFocusedFramesOnly))
1189
{
1190
double avgQpRc = 0;
1191
/* determine avg QP decided by VBV rate control */
1192
1193
if (m_param->rc.rateControlMode == X265_RC_CRF)
1194
{
1195
double crfVal, qpRef = curEncData.m_avgQpRc;
1196
+
1197
bool is2passCrfChange = false;
1198
- if (m_2pass)
1199
+ if (m_2pass && !m_param->rc.bEncFocusedFramesOnly)
1200
{
1201
if (fabs(curEncData.m_avgQpRc - rce->qpPrev) > 0.1)
1202
{
1203
1204
m_wantedBitsWindow += m_frameDuration * m_bitrate;
1205
m_totalBits += bits - rce->rowTotalBits;
1206
m_encodedBits += actualBits;
1207
+ m_encodedSegmentBits += actualBits;
1208
+ m_segDur += m_frameDuration;
1209
int pos = m_sliderPos - m_param->frameNumThreads;
1210
if (pos >= 0)
1211
m_encodedBitsWindowpos % s_slidingWindowFrames = actualBits;
1212
1213
{
1214
uint8_t sliceType = (uint8_t)rce->sliceType;
1215
primitives.fix8Pack(m_cuTreeStats.qpBuffer0, curFrame->m_lowres.qpCuTreeOffset, ncu);
1216
- if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
1217
- goto writeFailure;
1218
- if (fwrite(m_cuTreeStats.qpBuffer0, sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
1219
- goto writeFailure;
1220
+
1221
+ if (X265_SHARE_MODE_FILE == m_param->rc.dataShareMode)
1222
+ {
1223
+ if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
1224
+ goto writeFailure;
1225
+ if (fwrite(m_cuTreeStats.qpBuffer0, sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
1226
+ goto writeFailure;
1227
+ }
1228
+ else // X265_SHARE_MODE_SHAREDMEM == m_param->rc.dataShareMode
1229
+ {
1230
+ if (!m_cutreeShrMem)
1231
+ {
1232
+ goto writeFailure;
1233
+ }
1234
+
1235
+ CUTreeSharedDataItem shrItem;
1236
+ shrItem.type = &sliceType;
1237
+ shrItem.stats = m_cuTreeStats.qpBuffer0;
1238
+ m_cutreeShrMem->writeData(&shrItem, WriteSharedCUTreeData);
1239
+ }
1240
}
1241
return 0;
1242
1243
1244
if (m_cutreeStatFileIn)
1245
fclose(m_cutreeStatFileIn);
1246
1247
+ if (m_cutreeShrMem)
1248
+ {
1249
+ m_cutreeShrMem->release();
1250
+ delete m_cutreeShrMem;
1251
+ m_cutreeShrMem = NULL;
1252
+ }
1253
+
1254
X265_FREE(m_rce2Pass);
1255
X265_FREE(m_encOrder);
1256
for (int i = 0; i < 2; i++)
1257
1258
double RateControl::forwardMasking(Frame* curFrame, double q)
1259
{
1260
double qp = x265_qScale2qp(q);
1261
- uint32_t maxWindowSize = uint32_t((m_param->fwdScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
1262
- uint32_t windowSize = maxWindowSize / 3;
1263
+ uint32_t maxWindowSize = uint32_t((m_param->fwdMaxScenecutWindow / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5);
1264
+ uint32_t windowSize6, prevWindow = 0;
1265
int lastScenecut = m_top->m_rateControl->m_lastScenecut;
1266
- int lastIFrame = m_top->m_rateControl->m_lastScenecutAwareIFrame;
1267
- double fwdRefQpDelta = double(m_param->fwdRefQpDelta);
1268
- double fwdNonRefQpDelta = double(m_param->fwdNonRefQpDelta);
1269
- double sliceTypeDelta = SLICE_TYPE_DELTA * fwdRefQpDelta;
1270
+
1271
+ double fwdRefQpDelta6, fwdNonRefQpDelta6, sliceTypeDelta6;
1272
+ for (int i = 0; i < 6; i++)
1273
+ {
1274
+ windowSizei = prevWindow + (uint32_t((m_param->fwdScenecutWindowi / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5));
1275
+ fwdRefQpDeltai = double(m_param->fwdRefQpDeltai);
1276
+ fwdNonRefQpDeltai = double(m_param->fwdNonRefQpDeltai);
1277
+ sliceTypeDeltai = SLICE_TYPE_DELTA * fwdRefQpDeltai;
1278
+ prevWindow = windowSizei;
1279
+ }
1280
+
1281
1282
//Check whether the current frame is within the forward window
1283
if (curFrame->m_poc > lastScenecut && curFrame->m_poc <= (lastScenecut + int(maxWindowSize)))
1284
1285
}
1286
else if (curFrame->m_lowres.sliceType == X265_TYPE_P)
1287
{
1288
- if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1289
- && curFrame->m_poc >= lastIFrame))
1290
- {
1291
- //Add offsets corresponding to the window in which the P-frame occurs
1292
- if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1293
- qp += WINDOW1_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1294
- else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1295
- qp += WINDOW2_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1296
- else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1297
- qp += WINDOW3_DELTA * (fwdRefQpDelta - sliceTypeDelta);
1298
- }
1299
+ //Add offsets corresponding to the window in which the P-frame occurs
1300
+ if (curFrame->m_poc <= (lastScenecut + int(windowSize0)))
1301
+ qp += fwdRefQpDelta0 - sliceTypeDelta0;
1302
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize0))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize1))))
1303
+ qp += fwdRefQpDelta1 - sliceTypeDelta1;
1304
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize1))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize2))))
1305
+ qp += fwdRefQpDelta2 - sliceTypeDelta2;
1306
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize2))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize3))))
1307
+ qp += fwdRefQpDelta3 - sliceTypeDelta3;
1308
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize3))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize4))))
1309
+ qp += fwdRefQpDelta4 - sliceTypeDelta4;
1310
+ else if (curFrame->m_poc > lastScenecut + int(windowSize4))
1311
+ qp += fwdRefQpDelta5 - sliceTypeDelta5;
1312
}
1313
else if (curFrame->m_lowres.sliceType == X265_TYPE_BREF)
1314
{
1315
- if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1316
- && curFrame->m_poc >= lastIFrame))
1317
- {
1318
- //Add offsets corresponding to the window in which the B-frame occurs
1319
- if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1320
- qp += WINDOW1_DELTA * fwdRefQpDelta;
1321
- else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1322
- qp += WINDOW2_DELTA * fwdRefQpDelta;
1323
- else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1324
- qp += WINDOW3_DELTA * fwdRefQpDelta;
1325
- }
1326
+ //Add offsets corresponding to the window in which the B-frame occurs
1327
+ if (curFrame->m_poc <= (lastScenecut + int(windowSize0)))
1328
+ qp += fwdRefQpDelta0;
1329
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize0))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize1))))
1330
+ qp += fwdRefQpDelta1;
1331
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize1))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize2))))
1332
+ qp += fwdRefQpDelta2;
1333
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize2))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize3))))
1334
+ qp += fwdRefQpDelta3;
1335
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize3))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize4))))
1336
+ qp += fwdRefQpDelta4;
1337
+ else if (curFrame->m_poc > lastScenecut + int(windowSize4))
1338
+ qp += fwdRefQpDelta5;
1339
}
1340
else if (curFrame->m_lowres.sliceType == X265_TYPE_B)
1341
{
1342
- if (!(lastIFrame > lastScenecut && lastIFrame <= (lastScenecut + int(maxWindowSize))
1343
- && curFrame->m_poc >= lastIFrame))
1344
- {
1345
- //Add offsets corresponding to the window in which the b-frame occurs
1346
- if (curFrame->m_poc <= (lastScenecut + int(windowSize)))
1347
- qp += WINDOW1_DELTA * fwdNonRefQpDelta;
1348
- else if (((curFrame->m_poc) > (lastScenecut + int(windowSize))) && ((curFrame->m_poc) <= (lastScenecut + 2 * int(windowSize))))
1349
- qp += WINDOW2_DELTA * fwdNonRefQpDelta;
1350
- else if (curFrame->m_poc > lastScenecut + 2 * int(windowSize))
1351
- qp += WINDOW3_DELTA * fwdNonRefQpDelta;
1352
- }
1353
+ //Add offsets corresponding to the window in which the b-frame occurs
1354
+ if (curFrame->m_poc <= (lastScenecut + int(windowSize0)))
1355
+ qp += fwdNonRefQpDelta0;
1356
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize0))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize1))))
1357
+ qp += fwdNonRefQpDelta1;
1358
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize1))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize2))))
1359
+ qp += fwdNonRefQpDelta2;
1360
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize2))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize3))))
1361
+ qp += fwdNonRefQpDelta3;
1362
+ else if (((curFrame->m_poc) > (lastScenecut + int(windowSize3))) && ((curFrame->m_poc) <= (lastScenecut + int(windowSize4))))
1363
+ qp += fwdNonRefQpDelta4;
1364
+ else if (curFrame->m_poc > lastScenecut + int(windowSize4))
1365
+ qp += fwdNonRefQpDelta5;
1366
}
1367
}
1368
1369
1370
double RateControl::backwardMasking(Frame* curFrame, double q)
1371
{
1372
double qp = x265_qScale2qp(q);
1373
- double fwdRefQpDelta = double(m_param->fwdRefQpDelta);
1374
- double bwdRefQpDelta = double(m_param->bwdRefQpDelta);
1375
- double bwdNonRefQpDelta = double(m_param->bwdNonRefQpDelta);
1376
+ uint32_t windowSize6, prevWindow = 0;
1377
+ int lastScenecut = m_top->m_rateControl->m_lastScenecut;
1378
1379
- if (curFrame->m_isInsideWindow == BACKWARD_WINDOW)
1380
+ double bwdRefQpDelta6, bwdNonRefQpDelta6, sliceTypeDelta6;
1381
+ for (int i = 0; i < 6; i++)
1382
{
1383
- if (bwdRefQpDelta < 0)
1384
- bwdRefQpDelta = WINDOW3_DELTA * fwdRefQpDelta;
1385
- double sliceTypeDelta = SLICE_TYPE_DELTA * bwdRefQpDelta;
1386
- if (bwdNonRefQpDelta < 0)
1387
- bwdNonRefQpDelta = bwdRefQpDelta + sliceTypeDelta;
1388
+ windowSizei = prevWindow + (uint32_t((m_param->bwdScenecutWindowi / 1000.0) * (m_param->fpsNum / m_param->fpsDenom) + 0.5));
1389
+ prevWindow = windowSizei;
1390
+ bwdRefQpDeltai = double(m_param->bwdRefQpDeltai);
1391
+ bwdNonRefQpDeltai = double(m_param->bwdNonRefQpDeltai);
1392
+
1393
+ if (bwdRefQpDeltai < 0)
1394
+ bwdRefQpDeltai = BWD_WINDOW_DELTA * m_param->fwdRefQpDeltai;
1395
+ sliceTypeDeltai = SLICE_TYPE_DELTA * bwdRefQpDeltai;
1396
+
1397
+ if (bwdNonRefQpDeltai < 0)
1398
+ bwdNonRefQpDeltai = bwdRefQpDeltai + sliceTypeDeltai;
1399
+ }
1400
1401
+ if (curFrame->m_isInsideWindow == BACKWARD_WINDOW)
1402
+ {
1403
if (curFrame->m_lowres.sliceType == X265_TYPE_P)
1404
- qp += bwdRefQpDelta - sliceTypeDelta;
1405
+ {
1406
+ //Add offsets corresponding to the window in which the P-frame occurs
1407
+ if (curFrame->m_poc >= (lastScenecut - int(windowSize0)))
1408
+ qp += bwdRefQpDelta0 - sliceTypeDelta0;
1409
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize0))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize1))))
1410
+ qp += bwdRefQpDelta1 - sliceTypeDelta1;
1411
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize1))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize2))))
1412
+ qp += bwdRefQpDelta2 - sliceTypeDelta2;
1413
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize2))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize3))))
1414
+ qp += bwdRefQpDelta3 - sliceTypeDelta3;
1415
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize3))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize4))))
1416
+ qp += bwdRefQpDelta4 - sliceTypeDelta4;
1417
+ else if (curFrame->m_poc < lastScenecut - int(windowSize4))
1418
+ qp += bwdRefQpDelta5 - sliceTypeDelta5;
1419
+ }
1420
else if (curFrame->m_lowres.sliceType == X265_TYPE_BREF)
1421
- qp += bwdRefQpDelta;
1422
+ {
1423
+ //Add offsets corresponding to the window in which the B-frame occurs
1424
+ if (curFrame->m_poc >= (lastScenecut - int(windowSize0)))
1425
+ qp += bwdRefQpDelta0;
1426
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize0))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize1))))
1427
+ qp += bwdRefQpDelta1;
1428
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize1))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize2))))
1429
+ qp += bwdRefQpDelta2;
1430
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize2))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize3))))
1431
+ qp += bwdRefQpDelta3;
1432
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize3))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize4))))
1433
+ qp += bwdRefQpDelta4;
1434
+ else if (curFrame->m_poc < lastScenecut - int(windowSize4))
1435
+ qp += bwdRefQpDelta5;
1436
+ }
1437
else if (curFrame->m_lowres.sliceType == X265_TYPE_B)
1438
- qp += bwdNonRefQpDelta;
1439
+ {
1440
+ //Add offsets corresponding to the window in which the b-frame occurs
1441
+ if (curFrame->m_poc >= (lastScenecut - int(windowSize0)))
1442
+ qp += bwdNonRefQpDelta0;
1443
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize0))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize1))))
1444
+ qp += bwdNonRefQpDelta1;
1445
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize1))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize2))))
1446
+ qp += bwdNonRefQpDelta2;
1447
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize2))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize3))))
1448
+ qp += bwdNonRefQpDelta3;
1449
+ else if (((curFrame->m_poc) < (lastScenecut - int(windowSize3))) && ((curFrame->m_poc) >= (lastScenecut - int(windowSize4))))
1450
+ qp += bwdNonRefQpDelta4;
1451
+ else if (curFrame->m_poc < lastScenecut - int(windowSize4))
1452
+ qp += bwdNonRefQpDelta5;
1453
+ }
1454
}
1455
1456
return x265_qp2qScale(qp);
1457
x265_3.5.tar.gz/source/encoder/ratecontrol.h -> x265_3.6.tar.gz/source/encoder/ratecontrol.h
Changed
90
1
2
3
#include "common.h"
4
#include "sei.h"
5
+#include "ringmem.h"
6
7
namespace X265_NS {
8
// encoder namespace
9
10
#define MIN_AMORTIZE_FRACTION 0.2
11
#define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
12
13
-/*Scenecut Aware QP*/
14
-#define WINDOW1_DELTA 1.0 /* The offset for the frames coming in the window-1*/
15
-#define WINDOW2_DELTA 0.7 /* The offset for the frames coming in the window-2*/
16
-#define WINDOW3_DELTA 0.4 /* The offset for the frames coming in the window-3*/
17
-
18
struct Predictor
19
{
20
double coeffMin;
21
22
Predictor rowPreds32;
23
Predictor* rowPred2;
24
25
+ int64_t currentSatd;
26
int64_t lastSatd; /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
27
int64_t leadingNoBSatd;
28
int64_t rowTotalBits; /* update cplxrsum and totalbits at the end of 2 rows */
29
30
double rowCplxrSum;
31
double qpNoVbv;
32
double bufferFill;
33
+ double bufferFillFinal;
34
+ double bufferFillActual;
35
double targetFill;
36
bool vbvEndAdj;
37
double frameDuration;
38
39
double m_qCompress;
40
int64_t m_totalBits; /* total bits used for already encoded frames (after ammortization) */
41
int64_t m_encodedBits; /* bits used for encoded frames (without ammortization) */
42
+ int64_t m_encodedSegmentBits; /* bits used for encoded frames in a segment*/
43
+ double m_segDur;
44
double m_fps;
45
int64_t m_satdCostWindow50;
46
int64_t m_encodedBitsWindow50;
47
48
FILE* m_statFileOut;
49
FILE* m_cutreeStatFileOut;
50
FILE* m_cutreeStatFileIn;
51
+ ///< store the cutree data in memory instead of file
52
+ RingMem *m_cutreeShrMem;
53
double m_lastAccumPNorm;
54
double m_expectedBitsSum; /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
55
int64_t m_predictedBits;
56
57
RateControl(x265_param& p, Encoder *enc);
58
bool init(const SPS& sps);
59
void initHRD(SPS& sps);
60
+ void initVBV(const SPS& sps);
61
void reconfigureRC();
62
63
void setFinalFrameCount(int count);
64
65
int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
66
bool initPass2();
67
68
+ bool initCUTreeSharedMem();
69
+ void skipCUTreeSharedMemRead(int32_t cnt);
70
+
71
double forwardMasking(Frame* curFrame, double q);
72
double backwardMasking(Frame* curFrame, double q);
73
74
75
double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
76
double tuneAbrQScaleFromFeedback(double qScale);
77
double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
78
+ double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
79
void accumPQpUpdate();
80
81
int getPredictorType(int lowresSliceType, int sliceType);
82
83
double tuneQScaleForGrain(double rcOverflow);
84
void splitdeltaPOC(char deltapoc, RateControlEntry *rce);
85
void splitbUsed(char deltapoc, RateControlEntry *rce);
86
+ void checkAndResetCRF(RateControlEntry* rce);
87
};
88
}
89
#endif // ifndef X265_RATECONTROL_H
90
x265_3.5.tar.gz/source/encoder/sei.cpp -> x265_3.6.tar.gz/source/encoder/sei.cpp
Changed
10
1
2
{
3
if (nalUnitType != NAL_UNIT_UNSPECIFIED)
4
bs.writeByteAlignment();
5
- list.serialize(nalUnitType, bs);
6
+ list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
7
}
8
}
9
10
x265_3.5.tar.gz/source/encoder/sei.h -> x265_3.6.tar.gz/source/encoder/sei.h
Changed
103
1
2
}
3
};
4
5
+/* Film grain characteristics */
6
+class FilmGrainCharacteristics : public SEI
7
+{
8
+ public:
9
+
10
+ FilmGrainCharacteristics()
11
+ {
12
+ m_payloadType = FILM_GRAIN_CHARACTERISTICS;
13
+ m_payloadSize = 0;
14
+ }
15
+
16
+ struct CompModelIntensityValues
17
+ {
18
+ uint8_t intensityIntervalLowerBound;
19
+ uint8_t intensityIntervalUpperBound;
20
+ int* compModelValue;
21
+ };
22
+
23
+ struct CompModel
24
+ {
25
+ bool bPresentFlag;
26
+ uint8_t numModelValues;
27
+ uint8_t m_filmGrainNumIntensityIntervalMinus1;
28
+ CompModelIntensityValues* intensityValues;
29
+ };
30
+
31
+ CompModel m_compModelMAX_NUM_COMPONENT;
32
+ bool m_filmGrainCharacteristicsPersistenceFlag;
33
+ bool m_filmGrainCharacteristicsCancelFlag;
34
+ bool m_separateColourDescriptionPresentFlag;
35
+ bool m_filmGrainFullRangeFlag;
36
+ uint8_t m_filmGrainModelId;
37
+ uint8_t m_blendingModeId;
38
+ uint8_t m_log2ScaleFactor;
39
+ uint8_t m_filmGrainBitDepthLumaMinus8;
40
+ uint8_t m_filmGrainBitDepthChromaMinus8;
41
+ uint8_t m_filmGrainColourPrimaries;
42
+ uint8_t m_filmGrainTransferCharacteristics;
43
+ uint8_t m_filmGrainMatrixCoeffs;
44
+
45
+ void writeSEI(const SPS&)
46
+ {
47
+ WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
48
+
49
+ if (!m_filmGrainCharacteristicsCancelFlag)
50
+ {
51
+ WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
52
+ WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
53
+ if (m_separateColourDescriptionPresentFlag)
54
+ {
55
+ WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
56
+ WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
57
+ WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
58
+ WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
59
+ WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
60
+ WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
61
+ }
62
+ WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
63
+ WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
64
+ for (uint8_t c = 0; c < 3; c++)
65
+ {
66
+ WRITE_FLAG(m_compModelc.bPresentFlag && m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModelc.numModelValues > 0, "comp_model_present_flagc");
67
+ }
68
+ for (uint8_t c = 0; c < 3; c++)
69
+ {
70
+ if (m_compModelc.bPresentFlag && m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModelc.numModelValues > 0)
71
+ {
72
+ assert(m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
73
+ assert(m_compModelc.numModelValues <= X265_BYTE);
74
+ WRITE_CODE(m_compModelc.m_filmGrainNumIntensityIntervalMinus1 , X265_BYTE, "num_intensity_intervals_minus1c");
75
+ WRITE_CODE(m_compModelc.numModelValues - 1, 3, "num_model_values_minus1c");
76
+ for (uint8_t interval = 0; interval < m_compModelc.m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
77
+ {
78
+ WRITE_CODE(m_compModelc.intensityValuesinterval.intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_boundci");
79
+ WRITE_CODE(m_compModelc.intensityValuesinterval.intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_boundci");
80
+ for (uint8_t j = 0; j < m_compModelc.numModelValues; j++)
81
+ {
82
+ WRITE_SVLC(m_compModelc.intensityValuesinterval.compModelValuej,"comp_model_valueci");
83
+ }
84
+ }
85
+ }
86
+ }
87
+ WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
88
+ }
89
+ if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
90
+ {
91
+ WRITE_FLAG(1, "payload_bit_equal_to_one");
92
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
93
+ {
94
+ WRITE_FLAG(0, "payload_bit_equal_to_zero");
95
+ }
96
+ }
97
+ }
98
+};
99
+
100
static const uint32_t ISO_IEC_11578_LEN = 16;
101
102
class SEIuserDataUnregistered : public SEI
103
x265_3.5.tar.gz/source/encoder/slicetype.cpp -> x265_3.6.tar.gz/source/encoder/slicetype.cpp
Changed
1444
1
2
3
namespace X265_NS {
4
5
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)
6
+{
7
+ uint32_t sum = (uint32_t)sum_ssd;
8
+ uint32_t ssd = (uint32_t)(sum_ssd >> 32);
9
+
10
+ return ssd - ((uint64_t)sum * sum >> shift);
11
+}
12
+
13
bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
14
{
15
intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
16
17
{
18
for (int colNum = 0; colNum < width; colNum++)
19
{
20
- if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
21
+ if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
22
{
23
/* 5x5 Gaussian filter
24
2 4 5 4 2
25
26
if (param->rc.aqMode == X265_AQ_EDGE)
27
edgeFilter(curFrame, param);
28
29
- if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
30
+ if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
31
{
32
pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
33
primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
34
35
m_countPreLookahead = 0;
36
#endif
37
38
- memset(m_histogram, 0, sizeof(m_histogram));
39
+ m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
40
+ m_accHistDiffRunningAvgCb0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
41
+ memset(m_accHistDiffRunningAvgCb0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
42
+ for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
43
+ m_accHistDiffRunningAvgCbw = m_accHistDiffRunningAvgCb0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
44
+ }
45
+
46
+ m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
47
+ m_accHistDiffRunningAvgCr0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
48
+ memset(m_accHistDiffRunningAvgCr0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
49
+ for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
50
+ m_accHistDiffRunningAvgCrw = m_accHistDiffRunningAvgCr0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
51
+ }
52
+
53
+ m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
54
+ m_accHistDiffRunningAvg0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
55
+ memset(m_accHistDiffRunningAvg0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
56
+ for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
57
+ m_accHistDiffRunningAvgw = m_accHistDiffRunningAvg0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
58
+ }
59
+
60
+ m_resetRunningAvg = true;
61
+
62
+ m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
63
+
64
+ if (m_param->bEnableTemporalSubLayers > 2)
65
+ {
66
+ switch (m_param->bEnableTemporalSubLayers)
67
+ {
68
+ case 3:
69
+ m_gopId = 0;
70
+ break;
71
+ case 4:
72
+ m_gopId = 1;
73
+ break;
74
+ case 5:
75
+ m_gopId = 2;
76
+ break;
77
+ default:
78
+ break;
79
+ }
80
+ }
81
}
82
83
#if DETAILED_CU_STATS
84
85
m_pooli.stopWorkers();
86
}
87
}
88
+
89
void Lookahead::destroy()
90
{
91
// these two queues will be empty unless the encode was aborted
92
93
default:
94
return;
95
}
96
- if (!m_param->analysisLoad || !m_param->bDisableLookahead)
97
+ if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
98
{
99
X265_CHECK(curFrame->m_lowres.costEstb - p0p1 - b > 0, "Slice cost not estimated\n")
100
101
- if (m_param->rc.cuTree && !m_param->rc.bStatRead)
102
+ if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
103
/* update row satds based on cutree offsets */
104
curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
105
- else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
106
+ else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
107
{
108
- if (m_param->rc.aqMode)
109
+ if (curFrame->m_param->rc.aqMode)
110
curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAqb - p0p1 - b;
111
else
112
curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstb - p0p1 - b;
113
}
114
- if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
115
+ if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
116
{
117
/* aggregate lowres row satds to CTU resolution */
118
curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCostsb - p0p1 - b;
119
uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
120
- uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
121
- uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
122
+ uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
123
+ uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
124
uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
125
double *qp_offset = 0;
126
/* Factor in qpoffsets based on Aq/Cutree in CU costs */
127
- if (m_param->rc.aqMode || m_param->bAQMotion)
128
- qp_offset = (framesb->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
129
+ if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
130
+ qp_offset = (framesb->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
131
132
for (uint32_t row = 0; row < numCuInHeight; row++)
133
{
134
135
if (qp_offset)
136
{
137
double qpOffset;
138
- if (m_param->rc.qgSize == 8)
139
+ if (curFrame->m_param->rc.qgSize == 8)
140
qpOffset = (qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 +
141
qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1 +
142
qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes +
143
144
int32_t intraCuCost = curFrame->m_lowres.intraCostlowresCuIdx;
145
curFrame->m_lowres.intraCostlowresCuIdx = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
146
}
147
- if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
148
+ if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
149
for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
150
diff += curFrame->m_lowres.intraCostlowresCuIdx - lowresCuCost;
151
curFrame->m_lowres.lowresCostForRclowresCuIdx = lowresCuCost;
152
153
}
154
}
155
156
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)
157
+{
158
+ pixel* src = inpSrc + blockOffset;
159
+
160
+ uint32_t var;
161
+ if (!plane)
162
+ var = acEnergyVarHist(primitives.cuBLOCK_8x8.var(src, stride), 6);
163
+ else
164
+ var = acEnergyVarHist(primitives.cuBLOCK_4x4.var(src, stride), 4);
165
+
166
+ x265_emms();
167
+ return var;
168
+}
169
+
170
+/*
171
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
172
+*/
173
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
174
+{
175
+ int maxCol = curFrame->m_fencPic->m_picWidth;
176
+ int maxRow = curFrame->m_fencPic->m_picHeight;
177
+ intptr_t inpStride = curFrame->m_fencPic->m_stride;
178
+
179
+ // Variance
180
+ uint64_t picTotVariance = 0;
181
+ uint32_t variance;
182
+
183
+ uint64_t blockXY = 0;
184
+ pixel* src = curFrame->m_fencPic->m_picOrg0;
185
+
186
+ for (int blockY = 0; blockY < maxRow; blockY += 8)
187
+ {
188
+ uint64_t rowVariance = 0;
189
+ for (int blockX = 0; blockX < maxCol; blockX += 8)
190
+ {
191
+ intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
192
+
193
+ variance = calcVariance(
194
+ src,
195
+ inpStride,
196
+ blockOffsetLuma, 0);
197
+
198
+ rowVariance += variance;
199
+ blockXY++;
200
+ }
201
+ picTotVariance += (uint16_t)(rowVariance / maxCol);
202
+ }
203
+
204
+ curFrame->m_lowres.picAvgVariance = (uint16_t)(picTotVariance / maxRow);
205
+
206
+ // Collect chroma variance
207
+ int hShift = curFrame->m_fencPic->m_hChromaShift;
208
+ int vShift = curFrame->m_fencPic->m_vChromaShift;
209
+
210
+ int maxColChroma = curFrame->m_fencPic->m_picWidth >> hShift;
211
+ int maxRowChroma = curFrame->m_fencPic->m_picHeight >> vShift;
212
+ intptr_t cStride = curFrame->m_fencPic->m_strideC;
213
+
214
+ pixel* srcCb = curFrame->m_fencPic->m_picOrg1;
215
+
216
+ picTotVariance = 0;
217
+ for (int blockY = 0; blockY < maxRowChroma; blockY += 4)
218
+ {
219
+ uint64_t rowVariance = 0;
220
+ for (int blockX = 0; blockX < maxColChroma; blockX += 4)
221
+ {
222
+ intptr_t blockOffsetChroma = blockX + blockY * cStride;
223
+
224
+ variance = calcVariance(
225
+ srcCb,
226
+ cStride,
227
+ blockOffsetChroma, 1);
228
+
229
+ rowVariance += variance;
230
+ blockXY++;
231
+ }
232
+ picTotVariance += (uint16_t)(rowVariance / maxColChroma);
233
+ }
234
+
235
+ curFrame->m_lowres.picAvgVarianceCb = (uint16_t)(picTotVariance / maxRowChroma);
236
+
237
+
238
+ pixel* srcCr = curFrame->m_fencPic->m_picOrg2;
239
+
240
+ picTotVariance = 0;
241
+ for (int blockY = 0; blockY < maxRowChroma; blockY += 4)
242
+ {
243
+ uint64_t rowVariance = 0;
244
+ for (int blockX = 0; blockX < maxColChroma; blockX += 4)
245
+ {
246
+ intptr_t blockOffsetChroma = blockX + blockY * cStride;
247
+
248
+ variance = calcVariance(
249
+ srcCr,
250
+ cStride,
251
+ blockOffsetChroma, 2);
252
+
253
+ rowVariance += variance;
254
+ blockXY++;
255
+ }
256
+ picTotVariance += (uint16_t)(rowVariance / maxColChroma);
257
+ }
258
+
259
+ curFrame->m_lowres.picAvgVarianceCr = (uint16_t)(picTotVariance / maxRowChroma);
260
+}
261
+
262
+/*
263
+* Compute histogram of n-bins for the input
264
+*/
265
+void LookaheadTLD::calculateHistogram(
266
+ pixel *inputSrc,
267
+ uint32_t inputWidth,
268
+ uint32_t inputHeight,
269
+ intptr_t stride,
270
+ uint8_t dsFactor,
271
+ uint32_t *histogram,
272
+ uint64_t *sum)
273
+
274
+{
275
+ *sum = 0;
276
+
277
+ for (uint32_t verticalIdx = 0; verticalIdx < inputHeight; verticalIdx += dsFactor)
278
+ {
279
+ for (uint32_t horizontalIdx = 0; horizontalIdx < inputWidth; horizontalIdx += dsFactor)
280
+ {
281
+ ++(histograminputSrchorizontalIdx);
282
+ *sum += inputSrchorizontalIdx;
283
+ }
284
+ inputSrc += (stride << (dsFactor >> 1));
285
+ }
286
+
287
+ return;
288
+}
289
+
290
+/*
291
+* Compute histogram bins and chroma pixel intensity *
292
+*/
293
+void LookaheadTLD::computeIntensityHistogramBinsChroma(
294
+ Frame *curFrame,
295
+ uint64_t *sumAverageIntensityCb,
296
+ uint64_t *sumAverageIntensityCr)
297
+{
298
+ uint64_t sum;
299
+ uint8_t dsFactor = 4;
300
+
301
+ uint32_t segmentWidth = curFrame->m_lowres.widthFullRes / NUMBER_OF_SEGMENTS_IN_WIDTH;
302
+ uint32_t segmentHeight = curFrame->m_lowres.heightFullRes / NUMBER_OF_SEGMENTS_IN_HEIGHT;
303
+
304
+ for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
305
+ {
306
+ for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
307
+ {
308
+ // Initialize bins to 1
309
+ for (uint32_t cuIndex = 0; cuIndex < 256; cuIndex++) {
310
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1cuIndex = 1;
311
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2cuIndex = 1;
312
+ }
313
+
314
+ uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
315
+ curFrame->m_lowres.widthFullRes - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
316
+
317
+ uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
318
+ curFrame->m_lowres.heightFullRes - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
319
+
320
+
321
+ // U Histogram
322
+ calculateHistogram(
323
+ curFrame->m_fencPic->m_picOrg1 + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
324
+ (segmentWidth + segmentWidthOffset) >> 1,
325
+ (segmentHeight + segmentHeightOffset) >> 1,
326
+ curFrame->m_fencPic->m_strideC,
327
+ dsFactor,
328
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1,
329
+ &sum);
330
+
331
+ sum = (sum << dsFactor);
332
+ *sumAverageIntensityCb += sum;
333
+ curFrame->m_lowres.averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex1 =
334
+ (uint8_t)((sum + (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 3)) / (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 2));
335
+
336
+ for (uint16_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++) {
337
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1histogramBin =
338
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1histogramBin << dsFactor;
339
+ }
340
+
341
+ // V Histogram
342
+ calculateHistogram(
343
+ curFrame->m_fencPic->m_picOrg2 + ((segmentInFrameWidthIndex * segmentWidth) >> 1) + (((segmentInFrameHeightIndex * segmentHeight) >> 1) * curFrame->m_fencPic->m_strideC),
344
+ (segmentWidth + segmentWidthOffset) >> 1,
345
+ (segmentHeight + segmentHeightOffset) >> 1,
346
+ curFrame->m_fencPic->m_strideC,
347
+ dsFactor,
348
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2,
349
+ &sum);
350
+
351
+ sum = (sum << dsFactor);
352
+ *sumAverageIntensityCr += sum;
353
+ curFrame->m_lowres.averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex2 =
354
+ (uint8_t)((sum + (((segmentWidth + segmentWidthOffset) * (segmentHeight + segmentHeightOffset)) >> 3)) / (((segmentWidth + segmentHeightOffset) * (segmentHeight + segmentHeightOffset)) >> 2));
355
+
356
+ for (uint16_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++) {
357
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2histogramBin =
358
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2histogramBin << dsFactor;
359
+ }
360
+ }
361
+ }
362
+ return;
363
+
364
+}
365
+
366
+/*
367
+* Compute histogram bins and luma pixel intensity *
368
+*/
369
+void LookaheadTLD::computeIntensityHistogramBinsLuma(
370
+ Frame *curFrame,
371
+ uint64_t *sumAvgIntensityTotalSegmentsLuma)
372
+{
373
+ uint64_t sum;
374
+
375
+ uint32_t segmentWidth = curFrame->m_lowres.quarterSampleLowResWidth / NUMBER_OF_SEGMENTS_IN_WIDTH;
376
+ uint32_t segmentHeight = curFrame->m_lowres.quarterSampleLowResHeight / NUMBER_OF_SEGMENTS_IN_HEIGHT;
377
+
378
+ for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
379
+ {
380
+ for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
381
+ {
382
+ // Initialize bins to 1
383
+ for (uint32_t cuIndex = 0; cuIndex < 256; cuIndex++) {
384
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0cuIndex = 1;
385
+ }
386
+
387
+ uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
388
+ curFrame->m_lowres.quarterSampleLowResWidth - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
389
+
390
+ uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
391
+ curFrame->m_lowres.quarterSampleLowResHeight - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
392
+
393
+ // Y Histogram
394
+ calculateHistogram(
395
+ curFrame->m_lowres.quarterSampleLowResBuffer + (curFrame->m_lowres.quarterSampleLowResOriginX + segmentInFrameWidthIndex * segmentWidth) + ((curFrame->m_lowres.quarterSampleLowResOriginY + segmentInFrameHeightIndex * segmentHeight) * curFrame->m_lowres.quarterSampleLowResStrideY),
396
+ segmentWidth + segmentWidthOffset,
397
+ segmentHeight + segmentHeightOffset,
398
+ curFrame->m_lowres.quarterSampleLowResStrideY,
399
+ 1,
400
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0,
401
+ &sum);
402
+
403
+ curFrame->m_lowres.averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 = (uint8_t)((sum + (((segmentWidth + segmentWidthOffset)*(segmentWidth + segmentHeightOffset)) >> 1)) / ((segmentWidth + segmentWidthOffset)*(segmentHeight + segmentHeightOffset)));
404
+ (*sumAvgIntensityTotalSegmentsLuma) += (sum << 4);
405
+ for (uint32_t histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++)
406
+ {
407
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0histogramBin =
408
+ curFrame->m_lowres.picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0histogramBin << 4;
409
+ }
410
+ }
411
+ }
412
+}
413
+
414
+void LookaheadTLD::collectPictureStatistics(Frame *curFrame)
415
+{
416
+
417
+ uint64_t sumAverageIntensityCb = 0;
418
+ uint64_t sumAverageIntensityCr = 0;
419
+ uint64_t sumAverageIntensity = 0;
420
+
421
+ // Histogram bins for Luma
422
+ computeIntensityHistogramBinsLuma(
423
+ curFrame,
424
+ &sumAverageIntensity);
425
+
426
+ // Histogram bins for Chroma
427
+ computeIntensityHistogramBinsChroma(
428
+ curFrame,
429
+ &sumAverageIntensityCb,
430
+ &sumAverageIntensityCr);
431
+
432
+ curFrame->m_lowres.averageIntensity0 = (uint8_t)((sumAverageIntensity + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 1)) / (curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes));
433
+ curFrame->m_lowres.averageIntensity1 = (uint8_t)((sumAverageIntensityCb + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 3)) / ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 2));
434
+ curFrame->m_lowres.averageIntensity2 = (uint8_t)((sumAverageIntensityCr + ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 3)) / ((curFrame->m_lowres.widthFullRes * curFrame->m_lowres.heightFullRes) >> 2));
435
+
436
+ computePictureStatistics(curFrame);
437
+
438
+ curFrame->m_lowres.bHistScenecutAnalyzed = false;
439
+}
440
+
441
void PreLookaheadGroup::processTasks(int workerThreadID)
442
{
443
if (workerThreadID < 0)
444
445
preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc);
446
if (m_lookahead.m_bAdaptiveQuant)
447
tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param);
448
+
449
+ if (m_lookahead.m_param->bHistBasedSceneCut)
450
+ tld.collectPictureStatistics(preFrame);
451
+
452
tld.lowresIntraEstimate(preFrame->m_lowres, m_lookahead.m_param->rc.qgSize);
453
preFrame->m_lowresInit = true;
454
455
456
m_lock.release();
457
}
458
459
+
460
+void Lookahead::placeBref(Frame** frames, int start, int end, int num, int *brefs)
461
+{
462
+ int avg = (start + end) / 2;
463
+ if (m_param->bEnableTemporalSubLayers < 2)
464
+ {
465
+ (*framesavg).m_lowres.sliceType = X265_TYPE_BREF;
466
+ (*brefs)++;
467
+ return;
468
+ }
469
+ else
470
+ {
471
+ if (num <= 2)
472
+ return;
473
+ else
474
+ {
475
+ (*framesavg).m_lowres.sliceType = X265_TYPE_BREF;
476
+ (*brefs)++;
477
+ placeBref(frames, start, avg, avg - start, brefs);
478
+ placeBref(frames, avg + 1, end, end - avg, brefs);
479
+ return;
480
+ }
481
+ }
482
+}
483
+
484
+
485
+void Lookahead::compCostBref(Lowres **frames, int start, int end, int num)
486
+{
487
+ CostEstimateGroup estGroup(*this, frames);
488
+ int avg = (start + end) / 2;
489
+ if (num <= 2)
490
+ {
491
+ for (int i = start; i < end; i++)
492
+ {
493
+ estGroup.singleCost(start, end + 1, i + 1);
494
+ }
495
+ return;
496
+ }
497
+ else
498
+ {
499
+ estGroup.singleCost(start, end + 1, avg + 1);
500
+ compCostBref(frames, start, avg, avg - start);
501
+ compCostBref(frames, avg + 1, end, end - avg);
502
+ return;
503
+ }
504
+}
505
+
506
/* called by API thread or worker thread with inputQueueLock acquired */
507
void Lookahead::slicetypeDecide()
508
{
509
510
ScopedLock lock(m_inputLock);
511
512
Frame *curFrame = m_inputQueue.first();
513
+ if (m_param->bResetZoneConfig)
514
+ {
515
+ for (int i = 0; i < m_param->rc.zonefileCount; i++)
516
+ {
517
+ if (m_param->rc.zonesi.startFrame == curFrame->m_poc)
518
+ m_param = m_param->rc.zonesi.zoneParam;
519
+ int nextZoneStart = m_param->rc.zonesi.startFrame;
520
+ nextZoneStart += nextZoneStart ? m_param->rc.zonesi.zoneParam->radl : 0;
521
+ if (nextZoneStart < curFrame->m_poc + maxSearch && curFrame->m_poc < nextZoneStart)
522
+ maxSearch = nextZoneStart - curFrame->m_poc;
523
+ }
524
+ }
525
int j;
526
for (j = 0; j < m_param->bframes + 2; j++)
527
{
528
529
m_param->rc.cuTree || m_param->scenecutThreshold || m_param->bHistBasedSceneCut ||
530
(m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
531
{
532
- if(!m_param->rc.bStatRead)
533
+ if (!m_param->rc.bStatRead)
534
slicetypeAnalyse(frames, false);
535
bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
536
if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
537
538
{
539
Lowres& frm = listbframes->m_lowres;
540
541
+ if (frm.sliceTypeReq != X265_TYPE_AUTO && frm.sliceTypeReq != frm.sliceType)
542
+ frm.sliceType = frm.sliceTypeReq;
543
if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
544
{
545
frm.sliceType = X265_TYPE_B;
546
547
}
548
if (frm.sliceType == X265_TYPE_IDR && frm.bScenecut && isClosedGopRadl)
549
{
550
- if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frm.m_bIsHardScenecut))
551
- {
552
- for (int i = bframes; i < bframes + m_param->radl; i++)
553
- listi->m_lowres.sliceType = X265_TYPE_B;
554
- list(bframes + m_param->radl)->m_lowres.sliceType = X265_TYPE_IDR;
555
- }
556
+ for (int i = bframes; i < bframes + m_param->radl; i++)
557
+ listi->m_lowres.sliceType = X265_TYPE_B;
558
+ list(bframes + m_param->radl)->m_lowres.sliceType = X265_TYPE_IDR;
559
}
560
if (frm.sliceType == X265_TYPE_IDR)
561
{
562
563
break;
564
}
565
}
566
- if (bframes)
567
- listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
568
- listbframes->m_lowres.leadingBframes = bframes;
569
- m_lastNonB = &listbframes->m_lowres;
570
- m_histogrambframes++;
571
-
572
- /* insert a bref into the sequence */
573
- if (m_param->bBPyramid && bframes > 1 && !brefs)
574
- {
575
- listbframes / 2->m_lowres.sliceType = X265_TYPE_BREF;
576
- brefs++;
577
- }
578
- /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
579
- if (m_param->rc.rateControlMode != X265_RC_CQP)
580
- {
581
- int p0, p1, b;
582
- /* For zero latency tuning, calculate frame cost to be used later in RC */
583
- if (!maxSearch)
584
+
585
+ if (m_param->bEnableTemporalSubLayers > 2)
586
+ {
587
+ //Split the partial mini GOP into sub mini GOPs when temporal sub layers are enabled
588
+ if (bframes < m_param->bframes)
589
{
590
- for (int i = 0; i <= bframes; i++)
591
- framesi + 1 = &listi->m_lowres;
592
- }
593
+ int leftOver = bframes + 1;
594
+ int8_t gopId = m_gopId - 1;
595
+ int gopLen = x265_gop_ra_lengthgopId;
596
+ int listReset = 0;
597
598
- /* estimate new non-B cost */
599
- p1 = b = bframes + 1;
600
- p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
601
+ m_outputLock.acquire();
602
603
- CostEstimateGroup estGroup(*this, frames);
604
+ while ((gopId >= 0) && (leftOver > 3))
605
+ {
606
+ if (leftOver < gopLen)
607
+ {
608
+ gopId = gopId - 1;
609
+ gopLen = x265_gop_ra_lengthgopId;
610
+ continue;
611
+ }
612
+ else
613
+ {
614
+ int newbFrames = listReset + gopLen - 1;
615
+ //Re-assign GOP
616
+ listnewbFrames->m_lowres.sliceType = IS_X265_TYPE_I(listnewbFrames->m_lowres.sliceType) ? listnewbFrames->m_lowres.sliceType : X265_TYPE_P;
617
+ if (newbFrames)
618
+ listnewbFrames - 1->m_lowres.bLastMiniGopBFrame = true;
619
+ listnewbFrames->m_lowres.leadingBframes = newbFrames;
620
+ m_lastNonB = &listnewbFrames->m_lowres;
621
+
622
+ /* insert a bref into the sequence */
623
+ if (m_param->bBPyramid && newbFrames)
624
+ {
625
+ placeBref(list, listReset, newbFrames, newbFrames + 1, &brefs);
626
+ }
627
+ if (m_param->rc.rateControlMode != X265_RC_CQP)
628
+ {
629
+ int p0, p1, b;
630
+ /* For zero latency tuning, calculate frame cost to be used later in RC */
631
+ if (!maxSearch)
632
+ {
633
+ for (int i = listReset; i <= newbFrames; i++)
634
+ framesi + 1 = &listlistReset + i->m_lowres;
635
+ }
636
637
- estGroup.singleCost(p0, p1, b);
638
+ /* estimate new non-B cost */
639
+ p1 = b = newbFrames + 1;
640
+ p0 = (IS_X265_TYPE_I(framesnewbFrames + 1->sliceType)) ? b : listReset;
641
642
- if (bframes)
643
+ CostEstimateGroup estGroup(*this, frames);
644
+
645
+ estGroup.singleCost(p0, p1, b);
646
+
647
+ if (newbFrames)
648
+ compCostBref(frames, listReset, newbFrames, newbFrames + 1);
649
+ }
650
+
651
+ m_inputLock.acquire();
652
+ /* dequeue all frames from inputQueue that are about to be enqueued
653
+ * in the output queue. The order is important because Frame can
654
+ * only be in one list at a time */
655
+ int64_t ptsX265_BFRAME_MAX + 1;
656
+ for (int i = 0; i < gopLen; i++)
657
+ {
658
+ Frame *curFrame;
659
+ curFrame = m_inputQueue.popFront();
660
+ ptsi = curFrame->m_pts;
661
+ maxSearch--;
662
+ }
663
+ m_inputLock.release();
664
+
665
+ int idx = 0;
666
+ /* add non-B to output queue */
667
+ listnewbFrames->m_reorderedPts = ptsidx++;
668
+ listnewbFrames->m_gopOffset = 0;
669
+ listnewbFrames->m_gopId = gopId;
670
+ listnewbFrames->m_tempLayer = x265_gop_ragopId0.layer;
671
+ m_outputQueue.pushBack(*listnewbFrames);
672
+
673
+ /* add B frames to output queue */
674
+ int i = 1, j = 1;
675
+ while (i < gopLen)
676
+ {
677
+ int offset = listReset + (x265_gop_ragopIdj.poc_offset - 1);
678
+ if (!listoffset || offset == newbFrames)
679
+ continue;
680
+
681
+ // Assign gop offset and temporal layer of frames
682
+ listoffset->m_gopOffset = j;
683
+ listbframes->m_gopId = gopId;
684
+ listoffset->m_tempLayer = x265_gop_ragopIdj++.layer;
685
+
686
+ listoffset->m_reorderedPts = ptsidx++;
687
+ m_outputQueue.pushBack(*listoffset);
688
+ i++;
689
+ }
690
+
691
+ listReset += gopLen;
692
+ leftOver = leftOver - gopLen;
693
+ gopId -= 1;
694
+ gopLen = (gopId >= 0) ? x265_gop_ra_lengthgopId : 0;
695
+ }
696
+ }
697
+
698
+ if (leftOver > 0 && leftOver < 4)
699
+ {
700
+ int64_t ptsX265_BFRAME_MAX + 1;
701
+ int idx = 0;
702
+
703
+ int newbFrames = listReset + leftOver - 1;
704
+ listnewbFrames->m_lowres.sliceType = IS_X265_TYPE_I(listnewbFrames->m_lowres.sliceType) ? listnewbFrames->m_lowres.sliceType : X265_TYPE_P;
705
+ if (newbFrames)
706
+ listnewbFrames - 1->m_lowres.bLastMiniGopBFrame = true;
707
+ listnewbFrames->m_lowres.leadingBframes = newbFrames;
708
+ m_lastNonB = &listnewbFrames->m_lowres;
709
+
710
+ /* insert a bref into the sequence */
711
+ if (m_param->bBPyramid && (newbFrames- listReset) > 1)
712
+ placeBref(list, listReset, newbFrames, newbFrames + 1, &brefs);
713
+
714
+ if (m_param->rc.rateControlMode != X265_RC_CQP)
715
+ {
716
+ int p0, p1, b;
717
+ /* For zero latency tuning, calculate frame cost to be used later in RC */
718
+ if (!maxSearch)
719
+ {
720
+ for (int i = listReset; i <= newbFrames; i++)
721
+ framesi + 1 = &listlistReset + i->m_lowres;
722
+ }
723
+
724
+ /* estimate new non-B cost */
725
+ p1 = b = newbFrames + 1;
726
+ p0 = (IS_X265_TYPE_I(framesnewbFrames + 1->sliceType)) ? b : listReset;
727
+
728
+ CostEstimateGroup estGroup(*this, frames);
729
+
730
+ estGroup.singleCost(p0, p1, b);
731
+
732
+ if (newbFrames)
733
+ compCostBref(frames, listReset, newbFrames, newbFrames + 1);
734
+ }
735
+
736
+ m_inputLock.acquire();
737
+ /* dequeue all frames from inputQueue that are about to be enqueued
738
+ * in the output queue. The order is important because Frame can
739
+ * only be in one list at a time */
740
+ for (int i = 0; i < leftOver; i++)
741
+ {
742
+ Frame *curFrame;
743
+ curFrame = m_inputQueue.popFront();
744
+ ptsi = curFrame->m_pts;
745
+ maxSearch--;
746
+ }
747
+ m_inputLock.release();
748
+
749
+ m_lastNonB = &listnewbFrames->m_lowres;
750
+ listnewbFrames->m_reorderedPts = ptsidx++;
751
+ listnewbFrames->m_gopOffset = 0;
752
+ listnewbFrames->m_gopId = -1;
753
+ listnewbFrames->m_tempLayer = 0;
754
+ m_outputQueue.pushBack(*listnewbFrames);
755
+ if (brefs)
756
+ {
757
+ for (int i = listReset; i < newbFrames; i++)
758
+ {
759
+ if (listi->m_lowres.sliceType == X265_TYPE_BREF)
760
+ {
761
+ listi->m_reorderedPts = ptsidx++;
762
+ listi->m_gopOffset = 0;
763
+ listi->m_gopId = -1;
764
+ listi->m_tempLayer = 0;
765
+ m_outputQueue.pushBack(*listi);
766
+ }
767
+ }
768
+ }
769
+
770
+ /* add B frames to output queue */
771
+ for (int i = listReset; i < newbFrames; i++)
772
+ {
773
+ /* push all the B frames into output queue except B-ref, which already pushed into output queue */
774
+ if (listi->m_lowres.sliceType != X265_TYPE_BREF)
775
+ {
776
+ listi->m_reorderedPts = ptsidx++;
777
+ listi->m_gopOffset = 0;
778
+ listi->m_gopId = -1;
779
+ listi->m_tempLayer = 1;
780
+ m_outputQueue.pushBack(*listi);
781
+ }
782
+ }
783
+ }
784
+ }
785
+ else
786
+ // Fill the complete mini GOP when temporal sub layers are enabled
787
{
788
- p0 = 0; // last nonb
789
- bool isp0available = framesbframes + 1->sliceType == X265_TYPE_IDR ? false : true;
790
791
- for (b = 1; b <= bframes; b++)
792
+ listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
793
+ listbframes->m_lowres.leadingBframes = bframes;
794
+ m_lastNonB = &listbframes->m_lowres;
795
+
796
+ /* insert a bref into the sequence */
797
+ if (m_param->bBPyramid && !brefs)
798
{
799
- if (!isp0available)
800
- p0 = b;
801
+ placeBref(list, 0, bframes, bframes + 1, &brefs);
802
+ }
803
804
- if (framesb->sliceType == X265_TYPE_B)
805
- for (p1 = b; framesp1->sliceType == X265_TYPE_B; p1++)
806
- ; // find new nonb or bref
807
- else
808
- p1 = bframes + 1;
809
+ /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
810
+ if (m_param->rc.rateControlMode != X265_RC_CQP)
811
+ {
812
+ int p0, p1, b;
813
+ /* For zero latency tuning, calculate frame cost to be used later in RC */
814
+ if (!maxSearch)
815
+ {
816
+ for (int i = 0; i <= bframes; i++)
817
+ framesi + 1 = &listi->m_lowres;
818
+ }
819
820
+ /* estimate new non-B cost */
821
+ p1 = b = bframes + 1;
822
+ p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
823
+
824
+ CostEstimateGroup estGroup(*this, frames);
825
estGroup.singleCost(p0, p1, b);
826
827
- if (framesb->sliceType == X265_TYPE_BREF)
828
+ compCostBref(frames, 0, bframes, bframes + 1);
829
+ }
830
+
831
+ m_inputLock.acquire();
832
+ /* dequeue all frames from inputQueue that are about to be enqueued
833
+ * in the output queue. The order is important because Frame can
834
+ * only be in one list at a time */
835
+ int64_t ptsX265_BFRAME_MAX + 1;
836
+ for (int i = 0; i <= bframes; i++)
837
+ {
838
+ Frame *curFrame;
839
+ curFrame = m_inputQueue.popFront();
840
+ ptsi = curFrame->m_pts;
841
+ maxSearch--;
842
+ }
843
+ m_inputLock.release();
844
+
845
+ m_outputLock.acquire();
846
+
847
+ int idx = 0;
848
+ /* add non-B to output queue */
849
+ listbframes->m_reorderedPts = ptsidx++;
850
+ listbframes->m_gopOffset = 0;
851
+ listbframes->m_gopId = m_gopId;
852
+ listbframes->m_tempLayer = x265_gop_ram_gopId0.layer;
853
+ m_outputQueue.pushBack(*listbframes);
854
+
855
+ int i = 1, j = 1;
856
+ while (i <= bframes)
857
+ {
858
+ int offset = x265_gop_ram_gopIdj.poc_offset - 1;
859
+ if (!listoffset || offset == bframes)
860
+ continue;
861
+
862
+ // Assign gop offset and temporal layer of frames
863
+ listoffset->m_gopOffset = j;
864
+ listoffset->m_gopId = m_gopId;
865
+ listoffset->m_tempLayer = x265_gop_ram_gopIdj++.layer;
866
+
867
+ /* add B frames to output queue */
868
+ listoffset->m_reorderedPts = ptsidx++;
869
+ m_outputQueue.pushBack(*listoffset);
870
+ i++;
871
+ }
872
+ }
873
+
874
+ bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
875
+ if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
876
+ {
877
+ m_inputLock.acquire();
878
+ Frame *curFrame = m_inputQueue.first();
879
+ frames0 = m_lastNonB;
880
+ int j;
881
+ for (j = 0; j < maxSearch; j++)
882
+ {
883
+ framesj + 1 = &curFrame->m_lowres;
884
+ curFrame = curFrame->m_next;
885
+ }
886
+ m_inputLock.release();
887
+
888
+ framesj + 1 = NULL;
889
+ if (!m_param->rc.bStatRead)
890
+ slicetypeAnalyse(frames, true);
891
+ bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
892
+ if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
893
+ {
894
+ int numFrames;
895
+ for (numFrames = 0; numFrames < maxSearch; numFrames++)
896
{
897
- p0 = b;
898
- isp0available = true;
899
+ Lowres *fenc = framesnumFrames + 1;
900
+ if (!fenc)
901
+ break;
902
}
903
+ vbvLookahead(frames, numFrames, true);
904
}
905
}
906
- }
907
908
- m_inputLock.acquire();
909
- /* dequeue all frames from inputQueue that are about to be enqueued
910
- * in the output queue. The order is important because Frame can
911
- * only be in one list at a time */
912
- int64_t ptsX265_BFRAME_MAX + 1;
913
- for (int i = 0; i <= bframes; i++)
914
- {
915
- Frame *curFrame;
916
- curFrame = m_inputQueue.popFront();
917
- ptsi = curFrame->m_pts;
918
- maxSearch--;
919
- }
920
- m_inputLock.release();
921
922
- m_outputLock.acquire();
923
- /* add non-B to output queue */
924
- int idx = 0;
925
- listbframes->m_reorderedPts = ptsidx++;
926
- m_outputQueue.pushBack(*listbframes);
927
- /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
928
- if (brefs)
929
+ m_outputLock.release();
930
+ }
931
+ else
932
{
933
- for (int i = 0; i < bframes; i++)
934
+
935
+ if (bframes)
936
+ listbframes - 1->m_lowres.bLastMiniGopBFrame = true;
937
+ listbframes->m_lowres.leadingBframes = bframes;
938
+ m_lastNonB = &listbframes->m_lowres;
939
+
940
+ /* insert a bref into the sequence */
941
+ if (m_param->bBPyramid && bframes > 1 && !brefs)
942
{
943
- if (listi->m_lowres.sliceType == X265_TYPE_BREF)
944
+ placeBref(list, 0, bframes, bframes + 1, &brefs);
945
+ }
946
+ /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
947
+ if (m_param->rc.rateControlMode != X265_RC_CQP)
948
+ {
949
+ int p0, p1, b;
950
+ /* For zero latency tuning, calculate frame cost to be used later in RC */
951
+ if (!maxSearch)
952
{
953
- listi->m_reorderedPts = ptsidx++;
954
- m_outputQueue.pushBack(*listi);
955
+ for (int i = 0; i <= bframes; i++)
956
+ framesi + 1 = &listi->m_lowres;
957
+ }
958
+
959
+ /* estimate new non-B cost */
960
+ p1 = b = bframes + 1;
961
+ p0 = (IS_X265_TYPE_I(framesbframes + 1->sliceType)) ? b : 0;
962
+
963
+ CostEstimateGroup estGroup(*this, frames);
964
+ estGroup.singleCost(p0, p1, b);
965
+
966
+ if (m_param->bEnableTemporalSubLayers > 1 && bframes)
967
+ {
968
+ compCostBref(frames, 0, bframes, bframes + 1);
969
+ }
970
+ else
971
+ {
972
+ if (bframes)
973
+ {
974
+ p0 = 0; // last nonb
975
+ bool isp0available = framesbframes + 1->sliceType == X265_TYPE_IDR ? false : true;
976
+
977
+ for (b = 1; b <= bframes; b++)
978
+ {
979
+ if (!isp0available)
980
+ p0 = b;
981
+
982
+ if (framesb->sliceType == X265_TYPE_B)
983
+ for (p1 = b; framesp1->sliceType == X265_TYPE_B; p1++)
984
+ ; // find new nonb or bref
985
+ else
986
+ p1 = bframes + 1;
987
+
988
+ estGroup.singleCost(p0, p1, b);
989
+
990
+ if (framesb->sliceType == X265_TYPE_BREF)
991
+ {
992
+ p0 = b;
993
+ isp0available = true;
994
+ }
995
+ }
996
+ }
997
}
998
}
999
- }
1000
1001
- /* add B frames to output queue */
1002
- for (int i = 0; i < bframes; i++)
1003
- {
1004
- /* push all the B frames into output queue except B-ref, which already pushed into output queue */
1005
- if (listi->m_lowres.sliceType != X265_TYPE_BREF)
1006
+ m_inputLock.acquire();
1007
+ /* dequeue all frames from inputQueue that are about to be enqueued
1008
+ * in the output queue. The order is important because Frame can
1009
+ * only be in one list at a time */
1010
+ int64_t ptsX265_BFRAME_MAX + 1;
1011
+ for (int i = 0; i <= bframes; i++)
1012
+ {
1013
+ Frame *curFrame;
1014
+ curFrame = m_inputQueue.popFront();
1015
+ ptsi = curFrame->m_pts;
1016
+ maxSearch--;
1017
+ }
1018
+ m_inputLock.release();
1019
+
1020
+ m_outputLock.acquire();
1021
+
1022
+ /* add non-B to output queue */
1023
+ int idx = 0;
1024
+ listbframes->m_reorderedPts = ptsidx++;
1025
+ m_outputQueue.pushBack(*listbframes);
1026
+
1027
+ /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
1028
+ if (brefs)
1029
{
1030
- listi->m_reorderedPts = ptsidx++;
1031
- m_outputQueue.pushBack(*listi);
1032
+ for (int i = 0; i < bframes; i++)
1033
+ {
1034
+ if (listi->m_lowres.sliceType == X265_TYPE_BREF)
1035
+ {
1036
+ listi->m_reorderedPts = ptsidx++;
1037
+ m_outputQueue.pushBack(*listi);
1038
+ }
1039
+ }
1040
}
1041
- }
1042
1043
- bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
1044
- if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
1045
- {
1046
- m_inputLock.acquire();
1047
- Frame *curFrame = m_inputQueue.first();
1048
- frames0 = m_lastNonB;
1049
- int j;
1050
- for (j = 0; j < maxSearch; j++)
1051
+ /* add B frames to output queue */
1052
+ for (int i = 0; i < bframes; i++)
1053
{
1054
- framesj + 1 = &curFrame->m_lowres;
1055
- curFrame = curFrame->m_next;
1056
+ /* push all the B frames into output queue except B-ref, which already pushed into output queue */
1057
+ if (listi->m_lowres.sliceType != X265_TYPE_BREF)
1058
+ {
1059
+ listi->m_reorderedPts = ptsidx++;
1060
+ m_outputQueue.pushBack(*listi);
1061
+ }
1062
}
1063
- m_inputLock.release();
1064
1065
- framesj + 1 = NULL;
1066
- if (!m_param->rc.bStatRead)
1067
- slicetypeAnalyse(frames, true);
1068
- bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1069
- if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
1070
+
1071
+ bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth));
1072
+ if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
1073
{
1074
- int numFrames;
1075
- for (numFrames = 0; numFrames < maxSearch; numFrames++)
1076
+ m_inputLock.acquire();
1077
+ Frame *curFrame = m_inputQueue.first();
1078
+ frames0 = m_lastNonB;
1079
+ int j;
1080
+ for (j = 0; j < maxSearch; j++)
1081
+ {
1082
+ framesj + 1 = &curFrame->m_lowres;
1083
+ curFrame = curFrame->m_next;
1084
+ }
1085
+ m_inputLock.release();
1086
+
1087
+ framesj + 1 = NULL;
1088
+ if (!m_param->rc.bStatRead)
1089
+ slicetypeAnalyse(frames, true);
1090
+ bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1091
+ if ((m_param->analysisLoad && m_param->scaleFactor && bIsVbv) || m_param->bliveVBV2pass)
1092
{
1093
- Lowres *fenc = framesnumFrames + 1;
1094
- if (!fenc)
1095
- break;
1096
+ int numFrames;
1097
+ for (numFrames = 0; numFrames < maxSearch; numFrames++)
1098
+ {
1099
+ Lowres *fenc = framesnumFrames + 1;
1100
+ if (!fenc)
1101
+ break;
1102
+ }
1103
+ vbvLookahead(frames, numFrames, true);
1104
}
1105
- vbvLookahead(frames, numFrames, true);
1106
}
1107
+
1108
+ m_outputLock.release();
1109
}
1110
- m_outputLock.release();
1111
}
1112
1113
void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
1114
1115
nextZoneStart += (i + 1 < m_param->rc.zonefileCount) ? m_param->rc.zonesi + 1.startFrame + m_param->rc.zonesi + 1.zoneParam->radl : m_param->totalFrames;
1116
if (curZoneStart <= frames0->frameNum && nextZoneStart > frames0->frameNum)
1117
m_param->keyframeMax = nextZoneStart - curZoneStart;
1118
+ if (m_param->rc.zonesm_param->rc.zonefileCount - 1.startFrame <= frames0->frameNum && nextZoneStart == 0)
1119
+ m_param->keyframeMax = m_param->rc.zones0.keyframeMax;
1120
}
1121
}
1122
int keylimit = m_param->keyframeMax;
1123
1124
int numAnalyzed = numFrames;
1125
bool isScenecut = false;
1126
1127
- /* Temporal computations for scenecut detection */
1128
if (m_param->bHistBasedSceneCut)
1129
- {
1130
- for (int i = numFrames - 1; i > 0; i--)
1131
- {
1132
- if (framesi->interPCostPercDiff > 0.0)
1133
- continue;
1134
- int64_t interCost = framesi->costEst10;
1135
- int64_t intraCost = framesi->costEst00;
1136
- if (interCost < 0 || intraCost < 0)
1137
- continue;
1138
- int times = 0;
1139
- double averagePcost = 0.0, averageIcost = 0.0;
1140
- for (int j = i - 1; j >= 0 && times < 5; j--, times++)
1141
- {
1142
- if (framesj->costEst00 > 0 && framesj->costEst10 > 0)
1143
- {
1144
- averageIcost += framesj->costEst00;
1145
- averagePcost += framesj->costEst10;
1146
- }
1147
- else
1148
- times--;
1149
- }
1150
- if (times)
1151
- {
1152
- averageIcost = averageIcost / times;
1153
- averagePcost = averagePcost / times;
1154
- framesi->interPCostPercDiff = abs(interCost - averagePcost) / X265_MIN(interCost, averagePcost) * 100;
1155
- framesi->intraCostPercDiff = abs(intraCost - averageIcost) / X265_MIN(intraCost, averageIcost) * 100;
1156
- }
1157
- }
1158
- }
1159
-
1160
- /* When scenecut threshold is set, use scenecut detection for I frame placements */
1161
- if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && frames1->bScenecut))
1162
+ isScenecut = histBasedScenecut(frames, 0, 1, origNumFrames);
1163
+ else
1164
isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
1165
1166
- if (isScenecut && (m_param->bHistBasedSceneCut || m_param->scenecutThreshold))
1167
+ /* When scenecut threshold is set, use scenecut detection for I frame placements */
1168
+ if (m_param->scenecutThreshold && isScenecut)
1169
{
1170
frames1->sliceType = X265_TYPE_I;
1171
return;
1172
1173
m_extendGopBoundary = false;
1174
for (int i = m_param->bframes + 1; i < origNumFrames; i += m_param->bframes + 1)
1175
{
1176
- if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && framesi + 1->bScenecut))
1177
- scenecut(frames, i, i + 1, true, origNumFrames);
1178
+ scenecut(frames, i, i + 1, true, origNumFrames);
1179
1180
for (int j = i + 1; j <= X265_MIN(i + m_param->bframes + 1, origNumFrames); j++)
1181
{
1182
1183
{
1184
for (int j = 1; j < numBFrames + 1; j++)
1185
{
1186
- bool isNextScenecut = false;
1187
- if (!m_param->bHistBasedSceneCut || (m_param->bHistBasedSceneCut && framesj + 1->bScenecut))
1188
- isNextScenecut = scenecut(frames, j, j + 1, false, origNumFrames);
1189
- if (isNextScenecut || (bForceRADL && framesj->frameNum == preRADL))
1190
+ if (scenecut(frames, j, j + 1, false, origNumFrames) ||
1191
+ (bForceRADL && (framesj->frameNum == preRADL)))
1192
{
1193
framesj->sliceType = X265_TYPE_P;
1194
numAnalyzed = j;
1195
1196
/* Where A and B are scenes: AAAAAABBBAAAAAA
1197
* If BBB is shorter than (maxp1-p0), it is detected as a flash
1198
* and not considered a scenecut. */
1199
+
1200
for (int cp1 = p1; cp1 <= maxp1; cp1++)
1201
{
1202
- if (!scenecutInternal(frames, p0, cp1, false) && !m_param->bHistBasedSceneCut)
1203
+ if (!scenecutInternal(frames, p0, cp1, false))
1204
{
1205
/* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
1206
for (int i = cp1; i > p0; i--)
1207
1208
noScenecuts = false;
1209
}
1210
}
1211
- else if ((m_param->bHistBasedSceneCut && framescp1->m_bIsMaxThres) || scenecutInternal(frames, cp1 - 1, cp1, false))
1212
+ else if (scenecutInternal(frames, cp1 - 1, cp1, false))
1213
{
1214
/* If current frame is a Scenecut from p0 frame as well as Scenecut from
1215
* preceeding frame, mark it as a Scenecut */
1216
1217
1218
if (!framesp1->bScenecut)
1219
return false;
1220
- /* Check only scene transitions if max threshold */
1221
- if (m_param->bHistBasedSceneCut && framesp1->m_bIsMaxThres)
1222
- return framesp1->bScenecut;
1223
1224
return scenecutInternal(frames, p0, p1, bRealScenecut);
1225
}
1226
1227
/* magic numbers pulled out of thin air */
1228
float threshMin = (float)(threshMax * 0.25);
1229
double bias = m_param->scenecutBias;
1230
- if (m_param->bHistBasedSceneCut)
1231
- {
1232
- double minT = TEMPORAL_SCENECUT_THRESHOLD * (1 + m_param->edgeTransitionThreshold);
1233
- if (frame->interPCostPercDiff > minT || frame->intraCostPercDiff > minT)
1234
- {
1235
- if (bRealScenecut && frame->bScenecut)
1236
- x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d \n", frame->frameNum);
1237
- return frame->bScenecut;
1238
- }
1239
- else
1240
- return false;
1241
- }
1242
- else if (bRealScenecut)
1243
+
1244
+ if (bRealScenecut)
1245
{
1246
if (m_param->keyframeMin == m_param->keyframeMax)
1247
threshMin = threshMax;
1248
1249
return res;
1250
}
1251
1252
+bool Lookahead::detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2)
1253
+{
1254
+ bool isAbruptChange;
1255
+ bool isSceneChange;
1256
+
1257
+ Lowres *previousFrame = framesp0;
1258
+ Lowres *currentFrame = framesp1;
1259
+ Lowres *futureFrame = framesp2;
1260
+
1261
+ currentFrame->bHistScenecutAnalyzed = true;
1262
+
1263
+ uint32_t **accHistDiffRunningAvgCb = m_accHistDiffRunningAvgCb;
1264
+ uint32_t **accHistDiffRunningAvgCr = m_accHistDiffRunningAvgCr;
1265
+ uint32_t **accHistDiffRunningAvg = m_accHistDiffRunningAvg;
1266
+
1267
+ uint8_t absIntDiffFuturePast = 0;
1268
+ uint8_t absIntDiffFuturePresent = 0;
1269
+ uint8_t absIntDiffPresentPast = 0;
1270
+
1271
+ uint32_t abruptChangeCount = 0;
1272
+ uint32_t sceneChangeCount = 0;
1273
+
1274
+ uint32_t segmentWidth = frames1->widthFullRes / NUMBER_OF_SEGMENTS_IN_WIDTH;
1275
+ uint32_t segmentHeight = frames1->heightFullRes / NUMBER_OF_SEGMENTS_IN_HEIGHT;
1276
+
1277
+ for (uint32_t segmentInFrameWidthIndex = 0; segmentInFrameWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIndex++)
1278
+ {
1279
+ for (uint32_t segmentInFrameHeightIndex = 0; segmentInFrameHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIndex++)
1280
+ {
1281
+ isAbruptChange = false;
1282
+ isSceneChange = false;
1283
+
1284
+ // accumulative absolute histogram differences between the past and current frame
1285
+ uint32_t accHistDiff = 0;
1286
+ uint32_t accHistDiffCb = 0;
1287
+ uint32_t accHistDiffCr = 0;
1288
+
1289
+ uint32_t segmentWidthOffset = (segmentInFrameWidthIndex == NUMBER_OF_SEGMENTS_IN_WIDTH - 1) ?
1290
+ frames1->widthFullRes - (NUMBER_OF_SEGMENTS_IN_WIDTH * segmentWidth) : 0;
1291
+
1292
+ uint32_t segmentHeightOffset = (segmentInFrameHeightIndex == NUMBER_OF_SEGMENTS_IN_HEIGHT - 1) ?
1293
+ frames1->heightFullRes - (NUMBER_OF_SEGMENTS_IN_HEIGHT * segmentHeight) : 0;
1294
+
1295
+ segmentWidth += segmentWidthOffset;
1296
+ segmentHeight += segmentHeightOffset;
1297
+
1298
+ uint32_t segmentThreshHold = (
1299
+ ((X265_ABS((int64_t)currentFrame->picAvgVariance - (int64_t)previousFrame->picAvgVariance)) > PICTURE_DIFF_VARIANCE_TH) &&
1300
+ (currentFrame->picAvgVariance > PICTURE_VARIANCE_TH || previousFrame->picAvgVariance > PICTURE_VARIANCE_TH)) ?
1301
+ HIGH_VAR_SCENE_CHANGE_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
1302
+
1303
+ uint32_t segmentThreshHoldCb = (
1304
+ ((X265_ABS((int64_t)currentFrame->picAvgVarianceCb - (int64_t)previousFrame->picAvgVarianceCb)) > PICTURE_DIFF_VARIANCE_CHROMA_TH) &&
1305
+ (currentFrame->picAvgVarianceCb > PICTURE_VARIANCE_CHROMA_TH || previousFrame->picAvgVarianceCb > PICTURE_VARIANCE_CHROMA_TH)) ?
1306
+ HIGH_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
1307
+
1308
+ uint32_t segmentThreshHoldCr = (
1309
+ ((X265_ABS((int64_t)currentFrame->picAvgVarianceCr - (int64_t)previousFrame->picAvgVarianceCr)) > PICTURE_DIFF_VARIANCE_CHROMA_TH) &&
1310
+ (currentFrame->picAvgVarianceCr > PICTURE_VARIANCE_CHROMA_TH || previousFrame->picAvgVarianceCr > PICTURE_VARIANCE_CHROMA_TH)) ?
1311
+ HIGH_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight) : LOW_VAR_SCENE_CHANGE_CHROMA_TH * NUM64x64INPIC(segmentWidth, segmentHeight);
1312
+
1313
+ for (uint32_t bin = 0; bin < HISTOGRAM_NUMBER_OF_BINS; ++bin) {
1314
+ accHistDiff += X265_ABS((int32_t)currentFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0bin - (int32_t)previousFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex0bin);
1315
+ accHistDiffCb += X265_ABS((int32_t)currentFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1bin - (int32_t)previousFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex1bin);
1316
+ accHistDiffCr += X265_ABS((int32_t)currentFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2bin - (int32_t)previousFrame->picHistogramsegmentInFrameWidthIndexsegmentInFrameHeightIndex2bin);
1317
+ }
1318
+
1319
+ if (m_resetRunningAvg) {
1320
+ accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex = accHistDiff;
1321
+ accHistDiffRunningAvgCbsegmentInFrameWidthIndexsegmentInFrameHeightIndex = accHistDiffCb;
1322
+ accHistDiffRunningAvgCrsegmentInFrameWidthIndexsegmentInFrameHeightIndex = accHistDiffCr;
1323
+ }
1324
+
1325
+ // difference between accumulative absolute histogram differences and the running average at the current frame.
1326
+ uint32_t accHistDiffError = X265_ABS((int32_t)accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex - (int32_t)accHistDiff);
1327
+ uint32_t accHistDiffErrorCb = X265_ABS((int32_t)accHistDiffRunningAvgCbsegmentInFrameWidthIndexsegmentInFrameHeightIndex - (int32_t)accHistDiffCb);
1328
+ uint32_t accHistDiffErrorCr = X265_ABS((int32_t)accHistDiffRunningAvgCrsegmentInFrameWidthIndexsegmentInFrameHeightIndex - (int32_t)accHistDiffCr);
1329
+
1330
+ if ((accHistDiffError > segmentThreshHold && accHistDiff >= accHistDiffError) ||
1331
+ (accHistDiffErrorCb > segmentThreshHoldCb && accHistDiffCb >= accHistDiffErrorCb) ||
1332
+ (accHistDiffErrorCr > segmentThreshHoldCr && accHistDiffCr >= accHistDiffErrorCr)) {
1333
+
1334
+ isAbruptChange = true;
1335
+ }
1336
+
1337
+ if (isAbruptChange)
1338
+ {
1339
+ absIntDiffFuturePast = (uint8_t)X265_ABS((int16_t)futureFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 - (int16_t)previousFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0);
1340
+ absIntDiffFuturePresent = (uint8_t)X265_ABS((int16_t)futureFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 - (int16_t)currentFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0);
1341
+ absIntDiffPresentPast = (uint8_t)X265_ABS((int16_t)currentFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0 - (int16_t)previousFrame->averageIntensityPerSegmentsegmentInFrameWidthIndexsegmentInFrameHeightIndex0);
1342
+
1343
+ if (absIntDiffFuturePresent >= FLASH_TH * absIntDiffFuturePast && absIntDiffPresentPast >= FLASH_TH * absIntDiffFuturePast) {
1344
+ x265_log(m_param, X265_LOG_DEBUG, "Flash in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1345
+ }
1346
+ else if (absIntDiffFuturePresent < FADE_TH && absIntDiffPresentPast < FADE_TH) {
1347
+ x265_log(m_param, X265_LOG_DEBUG, "Fade in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1348
+ }
1349
+ else if (X265_ABS(absIntDiffFuturePresent - absIntDiffPresentPast) < INTENSITY_CHANGE_TH && absIntDiffFuturePresent + absIntDiffPresentPast >= absIntDiffFuturePast) {
1350
+ x265_log(m_param, X265_LOG_DEBUG, "Intensity Change in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1351
+ }
1352
+ else {
1353
+ isSceneChange = true;
1354
+ x265_log(m_param, X265_LOG_DEBUG, "Scene change in frame# %i , %i, %i, %i\n", currentFrame->frameNum, absIntDiffFuturePast, absIntDiffFuturePresent, absIntDiffPresentPast);
1355
+ }
1356
+
1357
+ }
1358
+ else {
1359
+ accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex = (3 * accHistDiffRunningAvgsegmentInFrameWidthIndexsegmentInFrameHeightIndex + accHistDiff) / 4;
1360
+ }
1361
+
1362
+ abruptChangeCount += isAbruptChange;
1363
+ sceneChangeCount += isSceneChange;
1364
+ }
1365
+ }
1366
+
1367
+ if (abruptChangeCount >= m_segmentCountThreshold) {
1368
+ m_resetRunningAvg = true;
1369
+ }
1370
+ else {
1371
+ m_resetRunningAvg = false;
1372
+ }
1373
+
1374
+ if ((sceneChangeCount >= m_segmentCountThreshold)) {
1375
+ x265_log(m_param, X265_LOG_DEBUG, "Scene Change in Pic Number# %i\n", currentFrame->frameNum);
1376
+
1377
+ return true;
1378
+ }
1379
+ else {
1380
+ return false;
1381
+ }
1382
+
1383
+}
1384
+
1385
+bool Lookahead::histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames)
1386
+{
1387
+ /* Only do analysis during a normal scenecut check. */
1388
+ if (m_param->bframes)
1389
+ {
1390
+ int origmaxp1 = p0 + 1;
1391
+ /* Look ahead to avoid coding short flashes as scenecuts. */
1392
+ origmaxp1 += m_param->bframes;
1393
+ int maxp1 = X265_MIN(origmaxp1, numFrames);
1394
+
1395
+ for (int cp1 = p0; cp1 < maxp1; cp1++)
1396
+ {
1397
+ if (framescp1 + 1->bHistScenecutAnalyzed == true)
1398
+ continue;
1399
+
1400
+ if (framescp1 + 2 != NULL && detectHistBasedSceneChange(frames, cp1, cp1 + 1, cp1 + 2))
1401
+ {
1402
+ /* If current frame is a Scenecut from p0 frame as well as Scenecut from
1403
+ * preceeding frame, mark it as a Scenecut */
1404
+ framescp1+1->bScenecut = true;
1405
+ }
1406
+ }
1407
+
1408
+ }
1409
+
1410
+ return framesp1->bScenecut;
1411
+}
1412
+
1413
void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)X265_LOOKAHEAD_MAX + 1)
1414
{
1415
char paths2X265_LOOKAHEAD_MAX + 1;
1416
1417
memcpy(best_pathslength % (X265_BFRAME_MAX + 1), pathsidx ^ 1, length);
1418
}
1419
1420
+// Find slicetype of the frame with poc # in lookahead buffer
1421
+int Lookahead::findSliceType(int poc)
1422
+{
1423
+ int out_slicetype = X265_TYPE_AUTO;
1424
+ if (m_filled)
1425
+ {
1426
+ m_outputLock.acquire();
1427
+ Frame* out = m_outputQueue.first();
1428
+ while (out != NULL) {
1429
+ if (poc == out->m_poc)
1430
+ {
1431
+ out_slicetype = out->m_lowres.sliceType;
1432
+ break;
1433
+ }
1434
+ out = out->m_next;
1435
+ }
1436
+ m_outputLock.release();
1437
+ }
1438
+ return out_slicetype;
1439
+}
1440
+
1441
int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold)
1442
{
1443
int64_t cost = 0;
1444
x265_3.5.tar.gz/source/encoder/slicetype.h -> x265_3.6.tar.gz/source/encoder/slicetype.h
Changed
110
1
2
#define EDGE_INCLINATION 45
3
#define TEMPORAL_SCENECUT_THRESHOLD 50
4
5
+#define X265_ABS(a) (((a) < 0) ? (-(a)) : (a))
6
+
7
+#define PICTURE_DIFF_VARIANCE_TH 390
8
+#define PICTURE_VARIANCE_TH 1500
9
+#define LOW_VAR_SCENE_CHANGE_TH 2250
10
+#define HIGH_VAR_SCENE_CHANGE_TH 3500
11
+
12
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH 10
13
+#define PICTURE_VARIANCE_CHROMA_TH 20
14
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH 2250/4
15
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH 3500/4
16
+
17
+#define FLASH_TH 1.5
18
+#define FADE_TH 4
19
+#define INTENSITY_CHANGE_TH 4
20
+
21
+#define NUM64x64INPIC(w,h) ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
22
+
23
#if HIGH_BIT_DEPTH
24
#define EDGE_THRESHOLD 1023.0
25
#else
26
27
28
~LookaheadTLD() { X265_FREE(wbuffer0); }
29
30
+ void collectPictureStatistics(Frame *curFrame);
31
+ void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
32
+
33
+ void computeIntensityHistogramBinsChroma(
34
+ Frame *curFrame,
35
+ uint64_t *sumAverageIntensityCb,
36
+ uint64_t *sumAverageIntensityCr);
37
+
38
+ void calculateHistogram(
39
+ pixel *inputSrc,
40
+ uint32_t inputWidth,
41
+ uint32_t inputHeight,
42
+ intptr_t stride,
43
+ uint8_t dsFactor,
44
+ uint32_t *histogram,
45
+ uint64_t *sum);
46
+
47
+ void computePictureStatistics(Frame *curFrame);
48
+
49
+ uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
50
+
51
void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
52
+ void calcFrameSegment(Frame *curFrame);
53
void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
54
55
void weightsAnalyse(Lowres& fenc, Lowres& ref);
56
57
58
/* pre-lookahead */
59
int m_fullQueueSize;
60
- int m_histogramX265_BFRAME_MAX + 1;
61
int m_lastKeyframe;
62
int m_8x8Width;
63
int m_8x8Height;
64
65
bool m_isFadeIn;
66
uint64_t m_fadeCount;
67
int m_fadeStart;
68
+
69
+ uint32_t **m_accHistDiffRunningAvgCb;
70
+ uint32_t **m_accHistDiffRunningAvgCr;
71
+ uint32_t **m_accHistDiffRunningAvg;
72
+
73
+ bool m_resetRunningAvg;
74
+ uint32_t m_segmentCountThreshold;
75
+
76
+ int8_t m_gopId;
77
+
78
Lookahead(x265_param *param, ThreadPool *pool);
79
#if DETAILED_CU_STATS
80
int64_t m_slicetypeDecideElapsedTime;
81
82
83
void getEstimatedPictureCost(Frame *pic);
84
void setLookaheadQueue();
85
+ int findSliceType(int poc);
86
87
protected:
88
89
90
/* called by slicetypeAnalyse() to make slice decisions */
91
bool scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
92
bool scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
93
+
94
+ bool histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
95
+ bool detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
96
+
97
void slicetypePath(Lowres **frames, int length, char(*best_paths)X265_LOOKAHEAD_MAX + 1);
98
int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
99
int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
100
101
102
/* called by getEstimatedPictureCost() to finalize cuTree costs */
103
int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
104
+ /*Compute index for positioning B-Ref frames*/
105
+ void placeBref(Frame** frames, int start, int end, int num, int *brefs);
106
+ void compCostBref(Lowres **frame, int start, int end, int num);
107
};
108
109
class PreLookaheadGroup : public BondedTaskGroup
110
x265_3.5.tar.gz/source/output/output.cpp -> x265_3.6.tar.gz/source/output/output.cpp
Changed
19
1
2
3
using namespace X265_NS;
4
5
-ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
6
+ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth)
7
{
8
const char * s = strrchr(fname, '.');
9
10
if (s && !strcmp(s, ".y4m"))
11
- return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp);
12
+ return new Y4MOutput(fname, width, height, bitdepth, fpsNum, fpsDenom, csp, sourceBitDepth);
13
else
14
- return new YUVOutput(fname, width, height, bitdepth, csp);
15
+ return new YUVOutput(fname, width, height, bitdepth, csp, sourceBitDepth);
16
}
17
18
OutputFile* OutputFile::open(const char *fname, InputFileInfo& inputInfo)
19
x265_3.5.tar.gz/source/output/output.h -> x265_3.6.tar.gz/source/output/output.h
Changed
10
1
2
ReconFile() {}
3
4
static ReconFile* open(const char *fname, int width, int height, uint32_t bitdepth,
5
- uint32_t fpsNum, uint32_t fpsDenom, int csp);
6
+ uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth);
7
8
virtual bool isFail() const = 0;
9
10
x265_3.5.tar.gz/source/output/y4m.cpp -> x265_3.6.tar.gz/source/output/y4m.cpp
Changed
145
1
2
using namespace X265_NS;
3
using namespace std;
4
5
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
6
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
7
: width(w)
8
, height(h)
9
+ , bitDepth(bitdepth)
10
, colorSpace(csp)
11
, frameSize(0)
12
+ , inputDepth(inputdepth)
13
{
14
ofs.open(filename, ios::binary | ios::out);
15
buf = new charwidth;
16
17
18
if (ofs)
19
{
20
- ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
21
+ if (bitDepth == 10)
22
+ ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
23
+ else if (bitDepth == 12)
24
+ ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
25
+ else
26
+ ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
27
+
28
header = ofs.tellp();
29
}
30
31
32
bool Y4MOutput::writePicture(const x265_picture& pic)
33
{
34
std::ofstream::pos_type outPicPos = header;
35
- outPicPos += (uint64_t)pic.poc * (6 + frameSize);
36
+ if (pic.bitDepth > 8)
37
+ outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
38
+ else
39
+ outPicPos += (uint64_t)pic.poc * (6 + frameSize);
40
ofs.seekp(outPicPos);
41
ofs << "FRAME\n";
42
43
-#if HIGH_BIT_DEPTH
44
- if (pic.bitDepth > 8 && pic.poc == 0)
45
- x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
46
-#else
47
- if (pic.bitDepth > 8 && pic.poc == 0)
48
- x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
49
-#endif
50
+ if (inputDepth > 8)
51
+ {
52
+ if (pic.bitDepth == 8 && pic.poc == 0)
53
+ x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
54
+ }
55
56
X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
57
58
-#if HIGH_BIT_DEPTH
59
-
60
- // encoder gave us short pixels, downshift, then write
61
- X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
62
- int shift = pic.bitDepth - 8;
63
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
64
+ if (inputDepth > 8)//if HIGH_BIT_DEPTH
65
{
66
- uint16_t *src = (uint16_t*)pic.planesi;
67
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
68
+ if (pic.bitDepth == 8)
69
{
70
- for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
71
- bufw = (char)(srcw >> shift);
72
-
73
- ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
74
- src += pic.stridei / sizeof(*src);
75
+ // encoder gave us short pixels, downshift, then write
76
+ X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
77
+ int shift = pic.bitDepth - 8;
78
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
79
+ {
80
+ char *src = (char*)pic.planesi;
81
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
82
+ {
83
+ for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
84
+ bufw = (char)(srcw >> shift);
85
+
86
+ ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
87
+ src += pic.stridei / sizeof(*src);
88
+ }
89
+ }
90
+ }
91
+ else
92
+ {
93
+ X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
94
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
95
+ {
96
+ uint16_t *src = (uint16_t*)pic.planesi;
97
+ for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
98
+ {
99
+ ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
100
+ src += pic.stridei / sizeof(*src);
101
+ }
102
+ }
103
}
104
}
105
-
106
-#else // if HIGH_BIT_DEPTH
107
-
108
- X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
109
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
110
+ else if (inputDepth == 8 && pic.bitDepth > 8)
111
{
112
- char *src = (char*)pic.planesi;
113
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
114
+ X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
115
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
116
{
117
- ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
118
- src += pic.stridei / sizeof(*src);
119
+ uint16_t* src = (uint16_t*)pic.planesi;
120
+ for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
121
+ {
122
+ ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
123
+ src += pic.stridei / sizeof(*src);
124
+ }
125
+ }
126
+ }
127
+ else
128
+ {
129
+ X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
130
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
131
+ {
132
+ char *src = (char*)pic.planesi;
133
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
134
+ {
135
+ ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
136
+ src += pic.stridei / sizeof(*src);
137
+ }
138
}
139
}
140
-
141
-#endif // if HIGH_BIT_DEPTH
142
143
return true;
144
}
145
x265_3.5.tar.gz/source/output/y4m.h -> x265_3.6.tar.gz/source/output/y4m.h
Changed
25
1
2
3
int height;
4
5
+ uint32_t bitDepth;
6
+
7
int colorSpace;
8
9
uint32_t frameSize;
10
11
+ int inputDepth;
12
+
13
std::ofstream ofs;
14
15
std::ofstream::pos_type header;
16
17
18
public:
19
20
- Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp);
21
+ Y4MOutput(const char *filename, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputDepth);
22
23
virtual ~Y4MOutput();
24
25
x265_3.5.tar.gz/source/output/yuv.cpp -> x265_3.6.tar.gz/source/output/yuv.cpp
Changed
107
1
2
using namespace X265_NS;
3
using namespace std;
4
5
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
6
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
7
: width(w)
8
, height(h)
9
, depth(d)
10
, colorSpace(csp)
11
, frameSize(0)
12
+ , inputDepth(inputdepth)
13
{
14
ofs.open(filename, ios::binary | ios::out);
15
buf = new charwidth;
16
17
X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
18
X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
19
20
-#if HIGH_BIT_DEPTH
21
- if (depth == 8)
22
+ if (inputDepth > 8)
23
{
24
- int shift = pic.bitDepth - 8;
25
- ofs.seekp((std::streamoff)fileOffset);
26
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
27
- {
28
- uint16_t *src = (uint16_t*)pic.planesi;
29
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
30
- {
31
- for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
32
- bufw = (char)(srcw >> shift);
33
+ if (depth == 8)
34
+ {
35
+ int shift = pic.bitDepth - 8;
36
+ ofs.seekp((std::streamoff)fileOffset);
37
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
38
+ {
39
+ uint16_t *src = (uint16_t*)pic.planesi;
40
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
41
+ {
42
+ for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
43
+ bufw = (char)(srcw >> shift);
44
45
- ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
46
- src += pic.stridei / sizeof(*src);
47
- }
48
- }
49
+ ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
50
+ src += pic.stridei / sizeof(*src);
51
+ }
52
+ }
53
+ }
54
+ else
55
+ {
56
+ ofs.seekp((std::streamoff)(fileOffset * 2));
57
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
58
+ {
59
+ uint16_t *src = (uint16_t*)pic.planesi;
60
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
61
+ {
62
+ ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
63
+ src += pic.stridei / sizeof(*src);
64
+ }
65
+ }
66
+ }
67
}
68
else
69
{
70
- ofs.seekp((std::streamoff)(fileOffset * 2));
71
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
72
- {
73
- uint16_t *src = (uint16_t*)pic.planesi;
74
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
75
- {
76
- ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
77
- src += pic.stridei / sizeof(*src);
78
- }
79
- }
80
+ ofs.seekp((std::streamoff)fileOffset);
81
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
82
+ {
83
+ char *src = (char*)pic.planesi;
84
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
85
+ {
86
+ ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
87
+ src += pic.stridei / sizeof(*src);
88
+ }
89
+ }
90
}
91
-#else // if HIGH_BIT_DEPTH
92
- ofs.seekp((std::streamoff)fileOffset);
93
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
94
- {
95
- char *src = (char*)pic.planesi;
96
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
97
- {
98
- ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
99
- src += pic.stridei / sizeof(*src);
100
- }
101
- }
102
-
103
-#endif // if HIGH_BIT_DEPTH
104
105
return true;
106
}
107
x265_3.5.tar.gz/source/output/yuv.h -> x265_3.6.tar.gz/source/output/yuv.h
Changed
18
1
2
3
uint32_t frameSize;
4
5
+ int inputDepth;
6
+
7
char *buf;
8
9
std::ofstream ofs;
10
11
public:
12
13
- YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp);
14
+ YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp, int inputDepth);
15
16
virtual ~YUVOutput();
17
18
x265_3.5.tar.gz/source/test/CMakeLists.txt -> x265_3.6.tar.gz/source/test/CMakeLists.txt
Changed
24
1
2
3
# add ARM assembly files
4
if(ARM OR CROSS_COMPILE_ARM)
5
- if(NOT ARM64)
6
- enable_language(ASM)
7
- set(NASM_SRC checkasm-arm.S)
8
- add_custom_command(
9
- OUTPUT checkasm-arm.obj
10
- COMMAND ${CMAKE_CXX_COMPILER}
11
- ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
12
- DEPENDS checkasm-arm.S)
13
- endif()
14
+ enable_language(ASM)
15
+ set(NASM_SRC checkasm-arm.S)
16
+ add_custom_command(
17
+ OUTPUT checkasm-arm.obj
18
+ COMMAND ${CMAKE_CXX_COMPILER}
19
+ ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
20
+ DEPENDS checkasm-arm.S)
21
endif(ARM OR CROSS_COMPILE_ARM)
22
23
# add PowerPC assembly files
24
x265_3.5.tar.gz/source/test/pixelharness.cpp -> x265_3.6.tar.gz/source/test/pixelharness.cpp
Changed
63
1
2
return true;
3
}
4
5
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
6
+{
7
+ ALIGN_VAR_16(pixel, ref_destf32 * 32);
8
+ ALIGN_VAR_16(pixel, opt_destf32 * 32);
9
+
10
+ intptr_t src_stride = 64;
11
+ intptr_t dst_stride = 32;
12
+ int bx = 32;
13
+ int by = 32;
14
+ int j = 0;
15
+ for (int i = 0; i < ITERS; i++)
16
+ {
17
+ int index = i % TEST_CASES;
18
+ ref(pixel_test_buffindex + j, ref_destf, src_stride, dst_stride, bx, by);
19
+ checked(opt, pixel_test_buffindex + j, opt_destf, src_stride, dst_stride, bx, by);
20
+
21
+ if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
22
+ return false;
23
+
24
+ reportfail();
25
+ j += INCR;
26
+ }
27
+
28
+ return true;
29
+}
30
+
31
bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
32
{
33
ALIGN_VAR_16(int16_t, ref_dest64 * 64);
34
35
}
36
}
37
38
+ if (opt.frameSubSampleLuma)
39
+ {
40
+ if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
41
+ {
42
+ printf("SubSample Luma failed!\n");
43
+ return false;
44
+ }
45
+ }
46
+
47
if (opt.scale1D_128to64NONALIGNED)
48
{
49
if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
50
51
REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
52
}
53
54
+ if (opt.frameSubSampleLuma)
55
+ {
56
+ HEADER0("downscaleluma");
57
+ REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
58
+ }
59
+
60
if (opt.scale1D_128to64NONALIGNED)
61
{
62
HEADER0("scale1D_128to64");
63
x265_3.5.tar.gz/source/test/pixelharness.h -> x265_3.6.tar.gz/source/test/pixelharness.h
Changed
9
1
2
bool check_integral_inith(integralh_t ref, integralh_t opt);
3
bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
4
bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
5
+ bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
6
7
public:
8
9
x265_3.5.tar.gz/source/test/rate-control-tests.txt -> x265_3.6.tar.gz/source/test/rate-control-tests.txt
Changed
10
1
2
112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
3
Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
4
Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
5
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
6
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
7
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
8
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000 --tune grain
9
big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
10
x265_3.5.tar.gz/source/test/regression-tests.txt -> x265_3.6.tar.gz/source/test/regression-tests.txt
Changed
91
1
2
BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
3
BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
4
BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
5
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
6
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
7
BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
8
BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
9
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
10
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
11
BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
12
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
13
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
14
BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
15
Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
16
Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
17
18
Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
19
CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
20
CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
21
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
22
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
23
CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
24
CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
25
CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
26
27
CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
28
CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
29
CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
30
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
31
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
32
CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
33
CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
34
CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
35
36
CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
37
DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
38
DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
39
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
40
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
41
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
42
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
43
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
44
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
45
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
46
FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
47
FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
48
FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
49
50
ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
51
Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
52
Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
53
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
54
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
55
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
56
crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
57
crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
58
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
59
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
60
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
61
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
62
63
# Main12 intraCost overflow bug test
64
720p50_parkrun_ter.y4m,--preset medium
65
66
67
#scaled save/load test
68
crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
69
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
70
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
71
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
72
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
73
crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
74
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
75
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
76
ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat 
--refine-intra 4 --refine-inter 2
77
#save/load with ctu distortion refinement
78
CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
79
#segment encoding
80
BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
81
82
+#Test FG SEI message addition
83
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
84
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
85
+
86
+#Temporal layers tests
87
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
88
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
89
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
90
# vim: tw=200
91
x265_3.5.tar.gz/source/test/save-load-tests.txt -> x265_3.6.tar.gz/source/test/save-load-tests.txt
Changed
16
1
2
# not auto-detected.
3
crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
4
crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
5
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
6
-crowd_run_1080p50.y4m, --preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
7
-crowd_run_1080p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
8
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
9
+crowd_run_1080p50.y4m, --preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
10
+crowd_run_1080p50.y4m, --preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
11
RaceHorses_416x240_30.y4m, --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m, --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
12
-crowd_run_540p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m, --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
13
+crowd_run_540p50.y4m, --preset veryslow --analysis-save x265_analysis_540.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m, --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m, --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m, --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m, --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
14
crowd_run_540p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_540.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
15
News-4k.y4m, --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
16
x265_3.5.tar.gz/source/test/smoke-tests.txt -> x265_3.6.tar.gz/source/test/smoke-tests.txt
Changed
9
1
2
# Main12 intraCost overflow bug test
3
720p50_parkrun_ter.y4m,--preset medium
4
720p50_parkrun_ter.y4m,--preset=fast --hevc-aq --no-cutree
5
+# Test FG SEI message addition
6
+# CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
7
+# DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
8
+# NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
9
x265_3.5.tar.gz/source/test/testbench.cpp -> x265_3.6.tar.gz/source/test/testbench.cpp
Changed
43
1
2
{ "AVX512", X265_CPU_AVX512 },
3
{ "ARMv6", X265_CPU_ARMV6 },
4
{ "NEON", X265_CPU_NEON },
5
+ { "SVE2", X265_CPU_SVE2 },
6
+ { "SVE", X265_CPU_SVE },
7
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
{ "", 0 },
9
};
10
11
12
EncoderPrimitives asmprim;
13
memset(&asmprim, 0, sizeof(asmprim));
14
- setupAssemblyPrimitives(asmprim, test_archi.flag);
15
-
16
-#if X265_ARCH_ARM64
17
- /* Temporary workaround because luma_vsp assembly primitive has not been completed
18
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
19
- * Otherwise, segment fault occurs. */
20
- setupAliasCPrimitives(cprim, asmprim, test_archi.flag);
21
-#endif
22
23
+ setupAssemblyPrimitives(asmprim, test_archi.flag);
24
setupAliasPrimitives(asmprim);
25
memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
26
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
27
28
#if X265_ARCH_X86
29
setupInstrinsicPrimitives(optprim, cpuid);
30
#endif
31
- setupAssemblyPrimitives(optprim, cpuid);
32
33
-#if X265_ARCH_ARM64
34
- /* Temporary workaround because luma_vsp assembly primitive has not been completed
35
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
36
- * Otherwise, segment fault occurs. */
37
- setupAliasCPrimitives(cprim, optprim, cpuid);
38
-#endif
39
+ setupAssemblyPrimitives(optprim, cpuid);
40
41
/* Note that we do not setup aliases for performance tests, that would be
42
* redundant. The testbench only verifies they are correctly aliased */
43
x265_3.5.tar.gz/source/test/testharness.h -> x265_3.6.tar.gz/source/test/testharness.h
Changed
48
1
2
#include <x86intrin.h>
3
#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
#include <arm_neon.h>
5
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
6
+#else
7
/* fallback for older GCC/MinGW */
8
static inline uint32_t __rdtsc(void)
9
{
10
11
#if X265_ARCH_X86
12
asm volatile("rdtsc" : "=a" (a) ::"edx");
13
#elif X265_ARCH_ARM
14
-#if X265_ARCH_ARM64
15
- asm volatile("mrs %0, cntvct_el0" : "=r"(a));
16
-#else
17
// TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
18
// asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
19
20
// TO-DO: replace clock() function with appropriate ARM cpu instructions
21
a = clock();
22
-#endif
23
+#elif X265_ARCH_ARM64
24
+ asm volatile("mrs %0, cntvct_el0" : "=r"(a));
25
#endif
26
return a;
27
}
28
29
x265_emms(); \
30
float optperf = (10.0f * cycles / runs) / 4; \
31
float refperf = (10.0f * refcycles / refruns) / 4; \
32
- printf("\t%3.2fx ", refperf / optperf); \
33
- printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
34
+ printf(" | \t%3.2fx | ", refperf / optperf); \
35
+ printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
36
}
37
38
extern "C" {
39
40
* needs an explicit asm check because it only sometimes crashes in normal use. */
41
intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
42
float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
43
-#elif X265_ARCH_ARM == 0
44
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
45
#define PFX(stack_pagealign)(func, align) func()
46
#endif
47
48
x265_3.5.tar.gz/source/x265.cpp -> x265_3.6.tar.gz/source/x265.cpp
Changed
18
1
2
3
int ret = 0;
4
5
+ if (cliopt0.scenecutAwareQpConfig)
6
+ {
7
+ if (!cliopt0.parseScenecutAwareQpConfig())
8
+ {
9
+ x265_log(NULL, X265_LOG_ERROR, "Unable to parse scenecut aware qp config file \n");
10
+ fclose(cliopt0.scenecutAwareQpConfig);
11
+ cliopt0.scenecutAwareQpConfig = NULL;
12
+ }
13
+ }
14
+
15
AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
16
int threadsActive = abrEnc->m_numActiveEncodes.get();
17
while (threadsActive)
18
x265_3.5.tar.gz/source/x265.h -> x265_3.6.tar.gz/source/x265.h
Changed
470
1
2
#define X265_H
3
#include <stdint.h>
4
#include <stdio.h>
5
+#include <sys/stat.h>
6
#include "x265_config.h"
7
#ifdef __cplusplus
8
extern "C" {
9
10
NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
11
NAL_UNIT_CODED_SLICE_TRAIL_R,
12
NAL_UNIT_CODED_SLICE_TSA_N,
13
- NAL_UNIT_CODED_SLICE_TLA_R,
14
+ NAL_UNIT_CODED_SLICE_TSA_R,
15
NAL_UNIT_CODED_SLICE_STSA_N,
16
NAL_UNIT_CODED_SLICE_STSA_R,
17
NAL_UNIT_CODED_SLICE_RADL_N,
18
19
double vmafFrameScore;
20
double bufferFillFinal;
21
double unclippedBufferFillFinal;
22
+ uint8_t tLayer;
23
} x265_frame_stats;
24
25
typedef struct x265_ctu_info_t
26
27
/* ARM */
28
#define X265_CPU_ARMV6 0x0000001
29
#define X265_CPU_NEON 0x0000002 /* ARM NEON */
30
+#define X265_CPU_SVE2 0x0000008 /* ARM SVE2 */
31
+#define X265_CPU_SVE 0x0000010 /* ARM SVE2 */
32
#define X265_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
33
34
/* IBM Power8 */
35
36
#define SLICE_TYPE_DELTA 0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
37
#define BACKWARD_WINDOW 1 /* Scenecut window before a scenecut */
38
#define FORWARD_WINDOW 2 /* Scenecut window after a scenecut */
39
+#define BWD_WINDOW_DELTA 0.4
40
+
41
+#define X265_MAX_GOP_CONFIG 3
42
+#define X265_MAX_GOP_LENGTH 16
43
+#define MAX_T_LAYERS 7
44
+
45
+#define X265_IPRATIO_STRENGTH 1.43
46
47
typedef struct x265_cli_csp
48
{
49
50
typedef struct x265_zone
51
{
52
int startFrame, endFrame; /* range of frame numbers */
53
+ int keyframeMax; /* it store the default/user defined keyframeMax value*/
54
int bForceQp; /* whether to use qp vs bitrate factor */
55
int qp;
56
float bitrateFactor;
57
58
59
static const x265_vmaf_commondata vcd = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
60
61
+typedef struct x265_temporal_layer {
62
+ int poc_offset; /* POC offset */
63
+ int8_t layer; /* Current layer */
64
+ int8_t qp_offset; /* QP offset */
65
+} x265_temporal_layer;
66
+
67
+static const int8_t x265_temporal_layer_bframesMAX_T_LAYERS = {-1, -1, 3, 7, 15, -1, -1};
68
+
69
+static const int8_t x265_gop_ra_lengthX265_MAX_GOP_CONFIG = { 4, 8, 16};
70
+static const x265_temporal_layer x265_gop_raX265_MAX_GOP_CONFIGX265_MAX_GOP_LENGTH = {
71
+ {
72
+ {
73
+ 4,
74
+ 0,
75
+ 1,
76
+ },
77
+ {
78
+ 2,
79
+ 1,
80
+ 5,
81
+ },
82
+ {
83
+ 1,
84
+ 2,
85
+ 3,
86
+ },
87
+ {
88
+ 3,
89
+ 2,
90
+ 5,
91
+ },
92
+ {
93
+ -1,
94
+ -1,
95
+ -1,
96
+ },
97
+ {
98
+ -1,
99
+ -1,
100
+ -1,
101
+ },
102
+ {
103
+ -1,
104
+ -1,
105
+ -1,
106
+ },
107
+ {
108
+ -1,
109
+ -1,
110
+ -1,
111
+ },
112
+ {
113
+ -1,
114
+ -1,
115
+ -1,
116
+ },
117
+ {
118
+ -1,
119
+ -1,
120
+ -1,
121
+ },
122
+ {
123
+ -1,
124
+ -1,
125
+ -1,
126
+ },
127
+ {
128
+ -1,
129
+ -1,
130
+ -1,
131
+ },
132
+ {
133
+ -1,
134
+ -1,
135
+ -1,
136
+ },
137
+ {
138
+ -1,
139
+ -1,
140
+ -1,
141
+ },
142
+ {
143
+ -1,
144
+ -1,
145
+ -1,
146
+ },
147
+ {
148
+ -1,
149
+ -1,
150
+ -1,
151
+ }
152
+ },
153
+
154
+ {
155
+ {
156
+ 8,
157
+ 0,
158
+ 1,
159
+ },
160
+ {
161
+ 4,
162
+ 1,
163
+ 5,
164
+ },
165
+ {
166
+ 2,
167
+ 2,
168
+ 4,
169
+ },
170
+ {
171
+ 1,
172
+ 3,
173
+ 5,
174
+ },
175
+ {
176
+ 3,
177
+ 3,
178
+ 2,
179
+ },
180
+ {
181
+ 6,
182
+ 2,
183
+ 5,
184
+ },
185
+ {
186
+ 5,
187
+ 3,
188
+ 4,
189
+ },
190
+ {
191
+ 7,
192
+ 3,
193
+ 5,
194
+ },
195
+ {
196
+ -1,
197
+ -1,
198
+ -1,
199
+ },
200
+ {
201
+ -1,
202
+ -1,
203
+ -1,
204
+ },
205
+ {
206
+ -1,
207
+ -1,
208
+ -1,
209
+ },
210
+ {
211
+ -1,
212
+ -1,
213
+ -1,
214
+ },
215
+ {
216
+ -1,
217
+ -1,
218
+ -1,
219
+ },
220
+ {
221
+ -1,
222
+ -1,
223
+ -1,
224
+ },
225
+ {
226
+ -1,
227
+ -1,
228
+ -1,
229
+ },
230
+ {
231
+ -1,
232
+ -1,
233
+ -1,
234
+ },
235
+ },
236
+ {
237
+ {
238
+ 16,
239
+ 0,
240
+ 1,
241
+ },
242
+ {
243
+ 8,
244
+ 1,
245
+ 6,
246
+ },
247
+ {
248
+ 4,
249
+ 2,
250
+ 5,
251
+ },
252
+ {
253
+ 2,
254
+ 3,
255
+ 6,
256
+ },
257
+ {
258
+ 1,
259
+ 4,
260
+ 4,
261
+ },
262
+ {
263
+ 3,
264
+ 4,
265
+ 6,
266
+ },
267
+ {
268
+ 6,
269
+ 3,
270
+ 5,
271
+ },
272
+ {
273
+ 5,
274
+ 4,
275
+ 6,
276
+ },
277
+ {
278
+ 7,
279
+ 4,
280
+ 1,
281
+ },
282
+ {
283
+ 12,
284
+ 2,
285
+ 6,
286
+ },
287
+ {
288
+ 10,
289
+ 3,
290
+ 5,
291
+ },
292
+ {
293
+ 9,
294
+ 4,
295
+ 6,
296
+ },
297
+ {
298
+ 11,
299
+ 4,
300
+ 4,
301
+ },
302
+ {
303
+ 14,
304
+ 3,
305
+ 6,
306
+ },
307
+ {
308
+ 13,
309
+ 4,
310
+ 5,
311
+ },
312
+ {
313
+ 15,
314
+ 4,
315
+ 6,
316
+ }
317
+ }
318
+};
319
+
320
+typedef enum
321
+{
322
+ X265_SHARE_MODE_FILE = 0,
323
+ X265_SHARE_MODE_SHAREDMEM
324
+}X265_DATA_SHARE_MODES;
325
+
326
/* x265 input parameters
327
*
328
* For version safety you may use x265_param_alloc/free() to manage the
329
330
* performance impact, but the use case may preclude it. Default true */
331
int bOpenGOP;
332
333
+ /*Force nal type to CRA to all frames expect first frame. Default disabled*/
334
+ int craNal;
335
+
336
/* Scene cuts closer together than this are coded as I, not IDR. */
337
int keyframeMin;
338
339
340
double rfConstantMin;
341
342
/* Multi-pass encoding */
343
- /* Enable writing the stats in a multi-pass encode to the stat output file */
344
+ /* Enable writing the stats in a multi-pass encode to the stat output file/memory */
345
int bStatWrite;
346
347
- /* Enable loading data from the stat input file in a multi pass encode */
348
+ /* Enable loading data from the stat input file/memory in a multi pass encode */
349
int bStatRead;
350
351
/* Filename of the 2pass output/input stats file, if unspecified the
352
353
/* internally enable if tune grain is set */
354
int bEnableConstVbv;
355
356
+ /* if only the focused frames would be re-encode or not */
357
+ int bEncFocusedFramesOnly;
358
+
359
+ /* Share the data with stats file or shared memory.
360
+ It must be one of the X265_DATA_SHARE_MODES enum values
361
+ Available if the bStatWrite or bStatRead is true.
362
+ Use stats file by default.
363
+ The stats file mode would be used among the encoders running in sequence.
364
+ The shared memory mode could only be used among the encoders running in parallel.
365
+ Now only the cutree data could be shared among shared memory. More data would be support in the future.*/
366
+ int dataShareMode;
367
+
368
+ /* Unique shared memory name. Required if the shared memory mode enabled. NULL by default */
369
+ const char* sharedMemName;
370
+
371
} rc;
372
373
/*== Video Usability Information ==*/
374
375
Default 1 (Enabled). API only. */
376
int bResetZoneConfig;
377
378
+ /*Flag to indicate rate-control history has not to be reset during zone reconfiguration.
379
+ Default 0 (Disabled) */
380
+ int bNoResetZoneConfig;
381
+
382
/* It reduces the bits spent on the inter-frames within the scenecutWindow before and / or after a scenecut
383
* by increasing their QP in ratecontrol pass2 algorithm without any deterioration in visual quality.
384
* 0 - Disabled (default).
385
386
387
/* The duration(in milliseconds) for which there is a reduction in the bits spent on the inter-frames after a scenecut
388
* by increasing their QP, when bEnableSceneCutAwareQp is 1 or 3. Default is 500ms.*/
389
- int fwdScenecutWindow;
390
+ int fwdMaxScenecutWindow;
391
+ int fwdScenecutWindow6;
392
393
/* The offset by which QP is incremented for inter-frames after a scenecut when bEnableSceneCutAwareQp is 1 or 3.
394
* Default is +5. */
395
- double fwdRefQpDelta;
396
+ double fwdRefQpDelta6;
397
398
/* The offset by which QP is incremented for non-referenced inter-frames after a scenecut when bEnableSceneCutAwareQp is 1 or 3. */
399
- double fwdNonRefQpDelta;
400
-
401
- /* A genuine threshold used for histogram based scene cut detection.
402
- * This threshold determines whether a frame is a scenecut or not
403
- * when compared against the edge and chroma histogram sad values.
404
- * Default 0.03. Range: Real number in the interval (0,1). */
405
- double edgeTransitionThreshold;
406
+ double fwdNonRefQpDelta6;
407
408
/* Enables histogram based scenecut detection algorithm to detect scenecuts. Default disabled */
409
int bHistBasedSceneCut;
410
411
412
/* The duration(in milliseconds) for which there is a reduction in the bits spent on the inter-frames before a scenecut
413
* by increasing their QP, when bEnableSceneCutAwareQp is 2 or 3. Default is 100ms.*/
414
- int bwdScenecutWindow;
415
+ int bwdMaxScenecutWindow;
416
+ int bwdScenecutWindow6;
417
418
/* The offset by which QP is incremented for inter-frames before a scenecut when bEnableSceneCutAwareQp is 2 or 3. */
419
- double bwdRefQpDelta;
420
+ double bwdRefQpDelta6;
421
422
/* The offset by which QP is incremented for non-referenced inter-frames before a scenecut when bEnableSceneCutAwareQp is 2 or 3. */
423
- double bwdNonRefQpDelta;
424
+ double bwdNonRefQpDelta6;
425
+
426
+ /* Specify combinations of color primaries, transfer characteristics, color matrix,
427
+ * range of luma and chroma signals, and chroma sample location. This has higher
428
+ * precedence than individual VUI parameters. If any individual VUI option is specified
429
+ * together with this, which changes the values set corresponding to the system-id
430
+ * or color-volume, it will be discarded. */
431
+ const char* videoSignalTypePreset;
432
+
433
+ /* Flag indicating whether the encoder should emit an End of Bitstream
434
+ * NAL at the end of bitstream. Default false */
435
+ int bEnableEndOfBitstream;
436
+
437
+ /* Flag indicating whether the encoder should emit an End of Sequence
438
+ * NAL at the end of every Coded Video Sequence. Default false */
439
+ int bEnableEndOfSequence;
440
+
441
+ /* Film Grain Characteristic file */
442
+ char* filmGrain;
443
+
444
+ /*Motion compensated temporal filter*/
445
+ int bEnableTemporalFilter;
446
+ double temporalFilterStrength;
447
+
448
+ /*SBRC*/
449
+ int bEnableSBRC;
450
} x265_param;
451
452
/* x265_param_alloc:
453
454
455
int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
456
457
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
458
+
459
static const char * const x265_profile_names = {
460
/* HEVC v1 */
461
"main", "main10", "mainstillpicture", /* alias */ "msp",
462
463
void (*param_free)(x265_param*);
464
void (*param_default)(x265_param*);
465
int (*param_parse)(x265_param*, const char*, const char*);
466
+ int (*scenecut_aware_qp_param_parse)(x265_param*, const char*, const char*);
467
int (*param_apply_profile)(x265_param*, const char*);
468
int (*param_default_preset)(x265_param*, const char*, const char *);
469
x265_picture* (*picture_alloc)(void);
470
x265_3.5.tar.gz/source/x265cli.cpp -> x265_3.6.tar.gz/source/x265cli.cpp
Changed
393
1
2
#include "x265cli.h"
3
#include "svt.h"
4
5
-#define START_CODE 0x00000001
6
-#define START_CODE_BYTES 4
7
+#define START_CODE 0x00000001
8
+#define START_CODE_BYTES 4
9
10
#ifdef __cplusplus
11
namespace X265_NS {
12
13
H0(" --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
14
H0("\nSlice decision options:\n");
15
H0(" --no-open-gop Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
16
+ H0(" --cra-nal Force nal type to CRA to all frames expect first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
17
H0("-I/--keyint <integer> Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
18
H0("-i/--min-keyint <integer> Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
19
H0(" --gop-lookahead <integer> Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
20
21
H1(" --scenecut-bias <0..100.0> Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
22
H0(" --hist-scenecut Enables histogram based scene-cut detection using histogram based algorithm.\n");
23
H0(" --no-hist-scenecut Disables histogram based scene-cut detection using histogram based algorithm.\n");
24
- H1(" --hist-threshold <0.0..1.0> Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
25
H0(" --no-fades Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
26
H1(" --scenecut-aware-qp <0..3> Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
27
H1(" 0 - Disabled\n");
28
29
H1(" 2 - Backward masking\n");
30
H1(" 3 - Bidirectional masking\n");
31
H1(" --masking-strength <string> Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
32
+ H1(" --scenecut-qp-config <file> File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
33
H0(" --radl <integer> Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
34
H0(" --intra-refresh Use Periodic Intra Refresh instead of IDR frames\n");
35
H0(" --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
36
37
H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
38
H0(" --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
39
H0(" --no-aq-motion Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
40
+ H1(" --no-sbrc Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
41
H0(" --qg-size <int> Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
42
H0(" --no-cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
43
H0(" --no-rc-grain Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
44
45
H1(" q=<integer> (force QP)\n");
46
H1(" or b=<float> (bitrate multiplier)\n");
47
H0(" --zonefile <filename> Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
48
+ H0(" --no-zonefile-rc-init This allow to use rate-control history across zones in zonefile.\n");
49
H1(" --lambda-file <string> Specify a file containing replacement values for the lambda tables\n");
50
H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
51
H1(" Blank lines and lines starting with hash(#) are ignored\n");
52
53
H0(" --master-display <string> SMPTE ST 2086 master display color volume info SEI (HDR)\n");
54
H0(" format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
55
H0(" --max-cll <string> Specify content light level info SEI as \"cll,fall\" (HDR).\n");
56
+ H0(" --video-signal-type-preset <string> Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
57
+ H0(" format: <system-id>:<color-volume>\n");
58
+ H0(" This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
59
+ H0(" which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
60
+ H0(" The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
61
+ H0(" system-id options and their corresponding values:\n");
62
+ H0(" BT601_525: --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
63
+ H0(" BT601_626: --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
64
+ H0(" BT709_YCC: --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
65
+ H0(" BT709_RGB: --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
66
+ H0(" BT2020_YCC_NCL: --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
67
+ H0(" BT2020_RGB: --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
68
+ H0(" BT2100_PQ_YCC: --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
69
+ H0(" BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
70
+ H0(" BT2100_PQ_RGB: --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
71
+ H0(" BT2100_HLG_YCC: --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
72
+ H0(" BT2100_HLG_RGB: --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
73
+ H0(" FR709_RGB: --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
74
+ H0(" FR2020_RGB: --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
75
+ H0(" FRP3D65_YCC: --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
76
+ H0(" color-volume options and their corresponding values:\n");
77
+ H0(" P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
78
+ H0(" P3D65x4000n005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
79
+ H0(" BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
80
H0(" --no-cll Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
81
H0(" --no-hdr10 Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
82
H0(" --no-hdr-opt Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
83
84
H0(" --no-repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
85
H0(" --no-info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
86
H0(" --no-hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
87
- H0(" --no-idr-recovery-sei Emit recovery point infor SEI at each IDR frame \n");
88
- H0(" --no-temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
89
+ H0(" --no-idr-recovery-sei Emit recovery point infor SEI at each IDR frame \n");
90
+ H0(" --temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
91
H0(" --no-aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
92
+ H0(" --no-eob Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
93
+ H0(" --no-eos Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
94
H1(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
95
H0(" --atc-sei <integer> Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
96
H0(" --pic-struct <integer> Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
97
98
H0(" --lowpass-dct Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
99
H0(" --no-frame-dup Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
100
H0(" --dup-threshold <integer> PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
101
+ H0(" --no-mcstf Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
102
#ifdef SVT_HEVC
103
H0(" --nosvt Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
104
H0(" --no-svt-hme Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
105
106
H1(" 2 - unable to open encoder\n");
107
H1(" 3 - unable to generate stream headers\n");
108
H1(" 4 - encoder abort\n");
109
+ H0("\nSEI Message Options\n");
110
+ H0(" --film-grain <filename> File containing Film Grain Characteristics to be written as a SEI Message\n");
111
+
112
#undef OPT
113
#undef H0
114
#undef H1
115
116
117
memcpy(globalParam->rc.zoneszonefileCount.zoneParam, globalParam, sizeof(x265_param));
118
119
+ if (zonefileCount == 0)
120
+ globalParam->rc.zoneszonefileCount.keyframeMax = globalParam->keyframeMax;
121
+
122
for (optind = 0;;)
123
{
124
int long_options_index = -1;
125
126
return true;
127
}
128
}
129
+ OPT("scenecut-qp-config")
130
+ {
131
+ this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
132
+ if (!this->scenecutAwareQpConfig)
133
+ x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
134
+ }
135
OPT("zonefile")
136
{
137
this->zoneFile = x265_fopen(optarg, "rb");
138
if (!this->zoneFile)
139
x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
140
}
141
+ OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
142
OPT("fullhelp")
143
{
144
param->logLevel = X265_LOG_FULL;
145
146
if (reconFileBitDepth == 0)
147
reconFileBitDepth = param->internalBitDepth;
148
this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
149
- param->fpsNum, param->fpsDenom, param->internalCsp);
150
+ param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
151
if (this->recon->isFail())
152
{
153
x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
154
155
param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
156
for (int i = 0; i < param->rc.zonefileCount; i++)
157
{
158
+ param->rc.zonesi.startFrame = -1;
159
while (fgets(line, sizeof(line), zoneFile))
160
{
161
if (*line == '#' || (strcmp(line, "\r\n") == 0))
162
163
return 1;
164
}
165
166
- /* Parse the RPU file and extract the RPU corresponding to the current picture
167
- * and fill the rpu field of the input picture */
168
- int CLIOptions::rpuParser(x265_picture * pic)
169
- {
170
- uint8_t byteVal;
171
- uint32_t code = 0;
172
- int bytesRead = 0;
173
- pic->rpu.payloadSize = 0;
174
-
175
- if (!pic->pts)
176
- {
177
- while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
178
- code = (code << 8) | byteVal;
179
-
180
- if (code != START_CODE)
181
- {
182
- x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
183
- return 1;
184
- }
185
- }
186
-
187
- bytesRead = 0;
188
- while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
189
- {
190
- code = (code << 8) | byteVal;
191
- if (bytesRead++ < 3)
192
- continue;
193
- if (bytesRead >= 1024)
194
- {
195
- x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
196
- return 1;
197
- }
198
-
199
- if (code != START_CODE)
200
- pic->rpu.payloadpic->rpu.payloadSize++ = (code >> (3 * 8)) & 0xFF;
201
- else
202
- return 0;
203
- }
204
-
205
- int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
206
- int bytesLeft = bytesRead - pic->rpu.payloadSize;
207
- code = (code << ShiftBytes * 8);
208
- for (int i = 0; i < bytesLeft; i++)
209
- {
210
- pic->rpu.payloadpic->rpu.payloadSize++ = (code >> (3 * 8)) & 0xFF;
211
- code = (code << 8);
212
- }
213
- if (!pic->rpu.payloadSize)
214
- x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
215
- return 0;
216
- }
217
+ /* Parse the RPU file and extract the RPU corresponding to the current picture
218
+ * and fill the rpu field of the input picture */
219
+ int CLIOptions::rpuParser(x265_picture * pic)
220
+ {
221
+ uint8_t byteVal;
222
+ uint32_t code = 0;
223
+ int bytesRead = 0;
224
+ pic->rpu.payloadSize = 0;
225
+
226
+ if (!pic->pts)
227
+ {
228
+ while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
229
+ code = (code << 8) | byteVal;
230
+
231
+ if (code != START_CODE)
232
+ {
233
+ x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
234
+ return 1;
235
+ }
236
+ }
237
+
238
+ bytesRead = 0;
239
+ while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
240
+ {
241
+ code = (code << 8) | byteVal;
242
+ if (bytesRead++ < 3)
243
+ continue;
244
+ if (bytesRead >= 1024)
245
+ {
246
+ x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
247
+ return 1;
248
+ }
249
+
250
+ if (code != START_CODE)
251
+ pic->rpu.payloadpic->rpu.payloadSize++ = (code >> (3 * 8)) & 0xFF;
252
+ else
253
+ return 0;
254
+ }
255
+
256
+ int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
257
+ int bytesLeft = bytesRead - pic->rpu.payloadSize;
258
+ code = (code << ShiftBytes * 8);
259
+ for (int i = 0; i < bytesLeft; i++)
260
+ {
261
+ pic->rpu.payloadpic->rpu.payloadSize++ = (code >> (3 * 8)) & 0xFF;
262
+ code = (code << 8);
263
+ }
264
+ if (!pic->rpu.payloadSize)
265
+ x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
266
+ return 0;
267
+ }
268
+
269
+ bool CLIOptions::parseScenecutAwareQpConfig()
270
+ {
271
+ char line256;
272
+ char* argLine;
273
+ rewind(scenecutAwareQpConfig);
274
+ while (fgets(line, sizeof(line), scenecutAwareQpConfig))
275
+ {
276
+ if (*line == '#' || (strcmp(line, "\r\n") == 0))
277
+ continue;
278
+ int index = (int)strcspn(line, "\r\n");
279
+ lineindex = '\0';
280
+ argLine = line;
281
+ while (isspace((unsigned char)*argLine)) argLine++;
282
+ char* start = strchr(argLine, '-');
283
+ int argCount = 0;
284
+ char **args = (char**)malloc(256 * sizeof(char *));
285
+ //Adding a dummy string to avoid file parsing error
286
+ argsargCount++ = (char *)"x265";
287
+ char* token = strtok(start, " ");
288
+ while (token)
289
+ {
290
+ argsargCount++ = token;
291
+ token = strtok(NULL, " ");
292
+ }
293
+ argsargCount = NULL;
294
+ CLIOptions cliopt;
295
+ if (cliopt.parseScenecutAwareQpParam(argCount, args, param))
296
+ {
297
+ cliopt.destroy();
298
+ if (cliopt.api)
299
+ cliopt.api->param_free(cliopt.param);
300
+ exit(1);
301
+ }
302
+ break;
303
+ }
304
+ return 1;
305
+ }
306
+ bool CLIOptions::parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam)
307
+ {
308
+ bool bError = false;
309
+ int bShowHelp = false;
310
+ int outputBitDepth = 0;
311
+ const char *profile = NULL;
312
+ /* Presets are applied before all other options. */
313
+ for (optind = 0;;)
314
+ {
315
+ int c = getopt_long(argc, argv, short_options, long_options, NULL);
316
+ if (c == -1)
317
+ break;
318
+ else if (c == 'D')
319
+ outputBitDepth = atoi(optarg);
320
+ else if (c == 'P')
321
+ profile = optarg;
322
+ else if (c == '?')
323
+ bShowHelp = true;
324
+ }
325
+ if (!outputBitDepth && profile)
326
+ {
327
+ /*try to derive the output bit depth from the requested profile*/
328
+ if (strstr(profile, "10"))
329
+ outputBitDepth = 10;
330
+ else if (strstr(profile, "12"))
331
+ outputBitDepth = 12;
332
+ else
333
+ outputBitDepth = 8;
334
+ }
335
+ api = x265_api_get(outputBitDepth);
336
+ if (!api)
337
+ {
338
+ x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
339
+ api = x265_api_get(0);
340
+ }
341
+ if (bShowHelp)
342
+ {
343
+ printVersion(globalParam, api);
344
+ showHelp(globalParam);
345
+ }
346
+ for (optind = 0;;)
347
+ {
348
+ int long_options_index = -1;
349
+ int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
350
+ if (c == -1)
351
+ break;
352
+ if (long_options_index < 0 && c > 0)
353
+ {
354
+ for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options0); i++)
355
+ {
356
+ if (long_optionsi.val == c)
357
+ {
358
+ long_options_index = (int)i;
359
+ break;
360
+ }
361
+ }
362
+ if (long_options_index < 0)
363
+ {
364
+ /* getopt_long might have already printed an error message */
365
+ if (c != 63)
366
+ x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
367
+ return true;
368
+ }
369
+ }
370
+ if (long_options_index < 0)
371
+ {
372
+ x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
373
+ return true;
374
+ }
375
+ bError |= !!api->scenecut_aware_qp_param_parse(globalParam, long_optionslong_options_index.name, optarg);
376
+ if (bError)
377
+ {
378
+ const char *name = long_options_index > 0 ? long_optionslong_options_index.name : argvoptind - 2;
379
+ x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
380
+ return true;
381
+ }
382
+ }
383
+ if (optind < argc)
384
+ {
385
+ x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argvoptind);
386
+ return true;
387
+ }
388
+ return false;
389
+ }
390
391
#ifdef __cplusplus
392
}
393
x265_3.5.tar.gz/source/x265cli.h -> x265_3.6.tar.gz/source/x265cli.h
Changed
104
1
2
{ "no-fast-intra", no_argument, NULL, 0 },
3
{ "no-open-gop", no_argument, NULL, 0 },
4
{ "open-gop", no_argument, NULL, 0 },
5
+ { "cra-nal", no_argument, NULL, 0 },
6
{ "keyint", required_argument, NULL, 'I' },
7
{ "min-keyint", required_argument, NULL, 'i' },
8
{ "gop-lookahead", required_argument, NULL, 0 },
9
10
{ "scenecut-bias", required_argument, NULL, 0 },
11
{ "hist-scenecut", no_argument, NULL, 0},
12
{ "no-hist-scenecut", no_argument, NULL, 0},
13
- { "hist-threshold", required_argument, NULL, 0},
14
{ "fades", no_argument, NULL, 0 },
15
{ "no-fades", no_argument, NULL, 0 },
16
{ "scenecut-aware-qp", required_argument, NULL, 0 },
17
18
{ "qp", required_argument, NULL, 'q' },
19
{ "aq-mode", required_argument, NULL, 0 },
20
{ "aq-strength", required_argument, NULL, 0 },
21
+ { "sbrc", no_argument, NULL, 0 },
22
+ { "no-sbrc", no_argument, NULL, 0 },
23
{ "rc-grain", no_argument, NULL, 0 },
24
{ "no-rc-grain", no_argument, NULL, 0 },
25
{ "ipratio", required_argument, NULL, 0 },
26
27
{ "crop-rect", required_argument, NULL, 0 }, /* DEPRECATED */
28
{ "master-display", required_argument, NULL, 0 },
29
{ "max-cll", required_argument, NULL, 0 },
30
+ {"video-signal-type-preset", required_argument, NULL, 0 },
31
{ "min-luma", required_argument, NULL, 0 },
32
{ "max-luma", required_argument, NULL, 0 },
33
{ "log2-max-poc-lsb", required_argument, NULL, 8 },
34
35
{ "repeat-headers", no_argument, NULL, 0 },
36
{ "aud", no_argument, NULL, 0 },
37
{ "no-aud", no_argument, NULL, 0 },
38
+ { "eob", no_argument, NULL, 0 },
39
+ { "no-eob", no_argument, NULL, 0 },
40
+ { "eos", no_argument, NULL, 0 },
41
+ { "no-eos", no_argument, NULL, 0 },
42
{ "info", no_argument, NULL, 0 },
43
{ "no-info", no_argument, NULL, 0 },
44
{ "zones", required_argument, NULL, 0 },
45
{ "qpfile", required_argument, NULL, 0 },
46
{ "zonefile", required_argument, NULL, 0 },
47
+ { "no-zonefile-rc-init", no_argument, NULL, 0 },
48
{ "lambda-file", required_argument, NULL, 0 },
49
{ "b-intra", no_argument, NULL, 0 },
50
{ "no-b-intra", no_argument, NULL, 0 },
51
52
{ "dynamic-refine", no_argument, NULL, 0 },
53
{ "no-dynamic-refine", no_argument, NULL, 0 },
54
{ "strict-cbr", no_argument, NULL, 0 },
55
- { "temporal-layers", no_argument, NULL, 0 },
56
- { "no-temporal-layers", no_argument, NULL, 0 },
57
+ { "temporal-layers", required_argument, NULL, 0 },
58
{ "qg-size", required_argument, NULL, 0 },
59
{ "recon-y4m-exec", required_argument, NULL, 0 },
60
{ "analyze-src-pics", no_argument, NULL, 0 },
61
62
{ "frame-dup", no_argument, NULL, 0 },
63
{ "no-frame-dup", no_argument, NULL, 0 },
64
{ "dup-threshold", required_argument, NULL, 0 },
65
+ { "mcstf", no_argument, NULL, 0 },
66
+ { "no-mcstf", no_argument, NULL, 0 },
67
#ifdef SVT_HEVC
68
{ "svt", no_argument, NULL, 0 },
69
{ "no-svt", no_argument, NULL, 0 },
70
71
{ "abr-ladder", required_argument, NULL, 0 },
72
{ "min-vbv-fullness", required_argument, NULL, 0 },
73
{ "max-vbv-fullness", required_argument, NULL, 0 },
74
+ { "scenecut-qp-config", required_argument, NULL, 0 },
75
+ { "film-grain", required_argument, NULL, 0 },
76
{ 0, 0, 0, 0 },
77
{ 0, 0, 0, 0 },
78
{ 0, 0, 0, 0 },
79
80
FILE* qpfile;
81
FILE* zoneFile;
82
FILE* dolbyVisionRpu; /* File containing Dolby Vision BL RPU metadata */
83
+ FILE* scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
84
const char* reconPlayCmd;
85
const x265_api* api;
86
x265_param* param;
87
88
qpfile = NULL;
89
zoneFile = NULL;
90
dolbyVisionRpu = NULL;
91
+ scenecutAwareQpConfig = NULL;
92
reconPlayCmd = NULL;
93
api = NULL;
94
param = NULL;
95
96
bool parseQPFile(x265_picture &pic_org);
97
bool parseZoneFile();
98
int rpuParser(x265_picture * pic);
99
+ bool parseScenecutAwareQpConfig();
100
+ bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
101
};
102
#ifdef __cplusplus
103
}
104
x265_3.5.tar.gz/x265Version.txt -> x265_3.6.tar.gz/x265Version.txt
Changed
8
1
2
#Attribute: Values
3
-repositorychangeset: f0c1022b6
4
+repositorychangeset: aa7f602f7
5
releasetagdistance: 1
6
-releasetag: 3.5
7
+releasetag: 3.6
8
Refresh
x265
x86_64
aarch64
x86_64
aarch64
armv7l
Refresh
No rpmlint log
Login required, please
login
or
signup
in order to comment