We truncated the diff of some files because they were too big.
Changes of Revision 20
x265.changes
Changed
x
1
2
-------------------------------------------------------------------
3
+Mon Sep 30 12:34:56 UTC 2024 - olaf@aepfle.de
4
+
5
+- Update to version 4.0
6
+ New features:
7
+ * Alpha Channel feature.
8
+ * Screen Content Coding (SCC).
9
+ * MV-HEVC feature.
10
+ Enhancements to existing features:
11
+ * Added support for the VMAF v3.x.
12
+ API changes
13
+ * Add command line parameter for Alpha Channel feature :option:`--alpha`.
14
+ * Add command line parameter for SCC feature :option:`--scc 1`.
15
+ * Add command line parameters for the MV-HEVC feature
16
+ :option:`--multiview-config "multiview_config.txt"`.
17
+ Optimizations
18
+ * Arm SIMD optimizations: Several time-consuming scalar C
19
+ functions now have SIMD implementations on Arm platforms.
20
+ Existing Arm SIMD implementations have also been optimized.
21
+ These optimizations result in up to 57% faster encoding
22
+ compared to release 3.6.
23
+ * Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6
24
+ I8MM, and Armv9 SVE2 instruction set extensions. The following
25
+ algorithms now have optimized SIMD implementations: SAD, SSE,
26
+ DCT, SAO, convolution, quantization, intra_planar,
27
+ intraFilter, intrapred DC and IDCT16x16.
28
+ Bug fixes
29
+ * Fix for y4m pipe input broken.
30
+ * Fix SCC crash on multipass encode.
31
+ * Fix mcstf when :option:`--bframes` value was less than 5.
32
+ * Fix lowpass DCT for high bit depth.
33
+ * Fix issue in default code flow and memory leak.
35
+
36
+-------------------------------------------------------------------
37
Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
38
39
- Update to version 3.6
40
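
The options named in the new changelog entry above can be exercised roughly as follows; a minimal sketch with placeholder file names, assuming a build in which the corresponding features are enabled:

    # Hypothetical x265 4.0 invocations for the features listed in the changelog.
    # File names are placeholders; --alpha, --scc and --multiview-config are only
    # accepted by builds that enable the respective features.
    x265 --input clip_with_alpha.y4m --alpha -o alpha.hevc
    x265 --input screen_capture.y4m --scc 1 -o scc.hevc
    # MV-HEVC: per-view inputs are described in the config file
    # (see the x265 documentation for its format).
    x265 --multiview-config "multiview_config.txt" -o multiview.hevc
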
x265.spec
Changed
73
1
2
#
3
4
5
-%define sover 209
6
+%define sover 212
7
%define libname lib%{name}
8
%define libsoname %{libname}-%{sover}
9
-%define uver 3_6
10
+%define uver 4_0
11
Name: x265
12
-Version: 3.6
13
+Version: 4.0
14
Release: 0
15
Summary: A free h265/HEVC encoder - encoder binary
16
License: GPL-2.0-or-later
17
18
URL: https://bitbucket.org/multicoreware/x265_git
19
Source0: https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
20
Patch1: x265.pkgconfig.patch
21
-Patch2: x265-fix_enable512.patch
22
Patch3: 0001-Fix-arm-flags.patch
23
Patch4: 0004-Do-not-build-with-assembly-support-on-arm.patch
24
-BuildRequires: cmake >= 2.8.8
25
+BuildRequires: cmake
26
+%if 0%{?suse_version} > 1500
27
BuildRequires: gcc-c++
28
+%else
29
+%if 0%{?sle_version} > 150500
30
+BuildRequires: gcc13
31
+BuildRequires: gcc13-c++
32
+%else
33
+BuildRequires: gcc10
34
+BuildRequires: gcc10-c++
35
+%endif
36
+%endif
37
BuildRequires: nasm >= 2.13
38
BuildRequires: pkgconfig
39
%ifarch x86_64
40
41
streams.
42
43
%prep
44
-%setup -q -n %{name}_%{version}
45
-%autopatch -p1
46
+%autosetup -p1 -n %{name}_%{version}
47
48
+%build
49
+test -x "$(type -p gcc)" && CC="$_"
50
+test -x "$(type -p g++)" && CXX="$_"
51
+test -x "$(type -p gcc-10)" && CC="$_"
52
+test -x "$(type -p g++-10)" && CXX="$_"
53
+test -x "$(type -p gcc-13)" && CC="$_"
54
+test -x "$(type -p g++-13)" && CXX="$_"
55
+export CC="$(readlink -f ${CC})"
56
+export CXX="$(readlink -f ${CXX})"
57
+CFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
58
+CXXFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
59
# set the version by hand
60
-sed -i "/^include(Version)/d" source/CMakeLists.txt
61
+sed -i~ "/^include(Version)/d" source/CMakeLists.txt
62
+diff -u "$_"~ "$_" && exit 1
63
# force version number in the soname
64
-sed -i 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
65
+sed -i~ 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
66
source/CMakeLists.txt
67
+diff -u "$_"~ "$_" && exit 1
68
69
-%build
70
SOURCE_DIR="$PWD"/source
71
COMMON_FLAGS="-DENABLE_TESTS=OFF -DENABLE_PIC=ON -Wno-dev"
72
HIGH_BIT_DEPTH_FLAGS="-DENABLE_CLI=OFF -DENABLE_SHARED=OFF -DEXPORT_C_API=OFF -DHIGH_BIT_DEPTH=ON"
73
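
Two shell idioms in the new %build section above deserve a note: "$_" expands to the last argument of the previous command (here, the file sed just edited), and the pair "sed -i~ ...; diff -u file~ file && exit 1" fails the build when sed changed nothing, because diff exits 0 only if the backup and the edited file are identical. A standalone sketch of that verification pattern, with an illustrative file name and pattern:

    # Sketch of the sed-with-verification idiom used in %build above.
    # "example.txt" and the pattern are illustrative only.
    printf 'include(Version)\nother line\n' > example.txt

    # -i~ edits in place and keeps the original as example.txt~
    sed -i~ '/^include(Version)/d' example.txt

    # diff exits 0 when backup and edited file are identical, i.e. the sed
    # expression matched nothing; in that case fail loudly instead of silently
    # building with an unpatched file.
    diff -u example.txt~ example.txt && exit 1

    echo "sed made the expected change"
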
0001-Fix-arm-flags.patch
Changed
74
1
2
source/CMakeLists.txt | 7 ++-----
3
1 file changed, 2 insertions(+), 5 deletions(-)
4
5
-diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
6
-index ab5ddfe..eb9b19b 100755
7
--- a/source/CMakeLists.txt
8
+++ b/source/CMakeLists.txt
9
-@@ -253,10 +253,7 @@ if(GCC)
10
+@@ -257,10 +257,7 @@
11
elseif(ARM)
12
find_package(Neon)
13
if(CPU_HAS_NEON)
14
15
- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
16
endif()
17
endif()
18
- if(ARM64 OR CROSS_COMPILE_ARM64)
19
-@@ -265,13 +262,13 @@ if(GCC)
20
- find_package(SVE2)
21
- if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
22
- message(STATUS "Found SVE2")
23
-- set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
24
-+ set(ARM_ARGS -fPIC -flax-vector-conversions)
25
- add_definitions(-DHAVE_SVE2)
26
- add_definitions(-DHAVE_SVE)
27
- add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
28
- elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
29
- message(STATUS "Found SVE")
30
-- set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
31
-+ set(ARM_ARGS -fPIC -flax-vector-conversions)
32
- add_definitions(-DHAVE_SVE)
33
- add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
34
- elseif(CPU_HAS_NEON)
35
+ if(ARM64)
36
+--- a/source/cmake/FindNEON_DOTPROD.cmake
37
++++ b/source/cmake/FindNEON_DOTPROD.cmake
38
+@@ -17,5 +17,5 @@
39
+ endif()
40
+
41
+ if(has_dot_product)
42
+- set(CPU_HAS_NEON_DOTPROD 1)
43
++ set(CPU_HAS_NEON_DOTPROD 0)
44
+ endif()
45
+--- a/source/cmake/FindNEON_I8MM.cmake
46
++++ b/source/cmake/FindNEON_I8MM.cmake
47
+@@ -17,5 +17,5 @@
48
+ endif()
49
+
50
+ if(has_i8mm)
51
+- set(CPU_HAS_NEON_I8MM 1)
52
++ set(CPU_HAS_NEON_I8MM 0)
53
+ endif()
54
+--- a/source/cmake/FindSVE.cmake
55
++++ b/source/cmake/FindSVE.cmake
56
+@@ -17,5 +17,5 @@
57
+ endif()
58
+
59
+ if(sve_version)
60
+- set(CPU_HAS_SVE 1)
61
++ set(CPU_HAS_SVE 0)
62
+ endif()
63
+--- a/source/cmake/FindSVE2.cmake
64
++++ b/source/cmake/FindSVE2.cmake
65
+@@ -17,6 +17,6 @@
66
+ endif()
67
+
68
+ if(sve2_version)
69
+- set(CPU_HAS_SVE 1)
70
+- set(CPU_HAS_SVE2 1)
71
++ set(CPU_HAS_SVE 0)
72
++ set(CPU_HAS_SVE2 0)
73
+ endif()
74
0004-Do-not-build-with-assembly-support-on-arm.patch
Changed
22
1
2
source/CMakeLists.txt | 9 ---------
3
1 file changed, 9 deletions(-)
4
5
-diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
6
-index 672cc2d..f112330 100755
7
--- a/source/CMakeLists.txt
8
+++ b/source/CMakeLists.txt
9
-@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
10
+@@ -72,15 +72,6 @@
11
add_definitions(-DPPC64=1)
12
message(STATUS "Detected POWER PPC64 target processor")
13
endif()
14
15
- set(ARM 1)
16
- add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
17
elseif(ARM64MATCH GREATER "-1")
18
- #if(CROSS_COMPILE_ARM64)
19
- #message(STATUS "Cross compiling for ARM64 arch")
20
+ message(STATUS "Detected ARM64 target processor")
21
+ set(ARM64 1)
22
x265-fix_enable512.patch
Deleted
28
1
2
---- a/source/common/cpu.cpp
3
-+++ b/source/common/cpu.cpp
4
-@@ -110,6 +110,11 @@ const cpu_name_t cpu_names =
5
- { "", 0 },
6
- };
7
-
8
-+bool detect512()
9
-+{
10
-+ return(enable512);
11
-+}
12
-+
13
- #if X265_ARCH_X86
14
-
15
- extern "C" {
16
-@@ -123,11 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr);
17
- #pragma warning(disable: 4309) // truncation of constant value
18
- #endif
19
-
20
--bool detect512()
21
--{
22
-- return(enable512);
23
--}
24
--
25
- uint32_t cpu_detect(bool benableavx512 )
26
- {
27
-
28
baselibs.conf
Changed
4
1
2
-libx265-209
3
+libx265-212
4
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S
Deleted
201
1
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// This file contains the macros written using NEON instruction set
26
-// that are also used by the SVE2 functions
27
-
28
-// Macros below follow these conventions:
29
-// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
-// - constants in registers: v24, v25, v26, v27, v31
31
-// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
-// - _32b macros output a result in v17.4s
33
-// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
-
35
-#include "asm.S"
36
-
37
-.arch armv8-a
38
-
39
-#ifdef __APPLE__
40
-.section __RODATA,__rodata
41
-#else
42
-.section .rodata
43
-#endif
44
-
45
-.align 4
46
-
47
-.macro vextin8 v
48
- ldp d6, d7, x11, #16
49
-.if \v == 0
50
- // qpel_filter_0 only uses values in v3
51
- ext v3.8b, v6.8b, v7.8b, #4
52
-.else
53
-.if \v != 3
54
- ext v0.8b, v6.8b, v7.8b, #1
55
-.endif
56
- ext v1.8b, v6.8b, v7.8b, #2
57
- ext v2.8b, v6.8b, v7.8b, #3
58
- ext v3.8b, v6.8b, v7.8b, #4
59
- ext v4.8b, v6.8b, v7.8b, #5
60
- ext v5.8b, v6.8b, v7.8b, #6
61
- ext v6.8b, v6.8b, v7.8b, #7
62
-.endif
63
-.endm
64
-
65
-.macro vextin8_64 v
66
- ldp q6, q7, x11, #32
67
-.if \v == 0
68
- // qpel_filter_0 only uses values in v3
69
- ext v3.16b, v6.16b, v7.16b, #4
70
-.else
71
-.if \v != 3
72
- // qpel_filter_3 does not use values in v0
73
- ext v0.16b, v6.16b, v7.16b, #1
74
-.endif
75
- ext v1.16b, v6.16b, v7.16b, #2
76
- ext v2.16b, v6.16b, v7.16b, #3
77
- ext v3.16b, v6.16b, v7.16b, #4
78
- ext v4.16b, v6.16b, v7.16b, #5
79
- ext v5.16b, v6.16b, v7.16b, #6
80
-.if \v == 1
81
- ext v6.16b, v6.16b, v7.16b, #7
82
- // qpel_filter_1 does not use v7
83
-.else
84
- ext v16.16b, v6.16b, v7.16b, #7
85
- ext v7.16b, v6.16b, v7.16b, #8
86
- mov v6.16b, v16.16b
87
-.endif
88
-.endif
89
-.endm
90
-
91
-.macro vextin8_chroma v
92
- ldp d6, d7, x11, #16
93
-.if \v == 0
94
- // qpel_filter_chroma_0 only uses values in v1
95
- ext v1.8b, v6.8b, v7.8b, #2
96
-.else
97
- ext v0.8b, v6.8b, v7.8b, #1
98
- ext v1.8b, v6.8b, v7.8b, #2
99
- ext v2.8b, v6.8b, v7.8b, #3
100
- ext v3.8b, v6.8b, v7.8b, #4
101
-.endif
102
-.endm
103
-
104
-.macro vextin8_chroma_64 v
105
- ldp q16, q17, x11, #32
106
-.if \v == 0
107
- // qpel_filter_chroma_0 only uses values in v1
108
- ext v1.16b, v16.16b, v17.16b, #2
109
-.else
110
- ext v0.16b, v16.16b, v17.16b, #1
111
- ext v1.16b, v16.16b, v17.16b, #2
112
- ext v2.16b, v16.16b, v17.16b, #3
113
- ext v3.16b, v16.16b, v17.16b, #4
114
-.endif
115
-.endm
116
-
117
-.macro qpel_load_32b v
118
-.if \v == 0
119
- add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
120
- ld1 {v3.8b}, x6, x1
121
-.elseif \v == 1 || \v == 2 || \v == 3
122
-.if \v != 3 // not used in qpel_filter_3
123
- ld1 {v0.8b}, x6, x1
124
-.else
125
- add x6, x6, x1
126
-.endif
127
- ld1 {v1.8b}, x6, x1
128
- ld1 {v2.8b}, x6, x1
129
- ld1 {v3.8b}, x6, x1
130
- ld1 {v4.8b}, x6, x1
131
- ld1 {v5.8b}, x6, x1
132
-.if \v != 1 // not used in qpel_filter_1
133
- ld1 {v6.8b}, x6, x1
134
- ld1 {v7.8b}, x6
135
-.else
136
- ld1 {v6.8b}, x6
137
-.endif
138
-.endif
139
-.endm
140
-
141
-.macro qpel_load_64b v
142
-.if \v == 0
143
- add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
144
- ld1 {v3.16b}, x6, x1
145
-.elseif \v == 1 || \v == 2 || \v == 3
146
-.if \v != 3 // not used in qpel_filter_3
147
- ld1 {v0.16b}, x6, x1
148
-.else
149
- add x6, x6, x1
150
-.endif
151
- ld1 {v1.16b}, x6, x1
152
- ld1 {v2.16b}, x6, x1
153
- ld1 {v3.16b}, x6, x1
154
- ld1 {v4.16b}, x6, x1
155
- ld1 {v5.16b}, x6, x1
156
-.if \v != 1 // not used in qpel_filter_1
157
- ld1 {v6.16b}, x6, x1
158
- ld1 {v7.16b}, x6
159
-.else
160
- ld1 {v6.16b}, x6
161
-.endif
162
-.endif
163
-.endm
164
-
165
-.macro qpel_chroma_load_32b v
166
-.if \v == 0
167
- // qpel_filter_chroma_0 only uses values in v1
168
- add x6, x6, x1
169
- ldr d1, x6
170
-.else
171
- ld1 {v0.8b}, x6, x1
172
- ld1 {v1.8b}, x6, x1
173
- ld1 {v2.8b}, x6, x1
174
- ld1 {v3.8b}, x6
175
-.endif
176
-.endm
177
-
178
-.macro qpel_chroma_load_64b v
179
-.if \v == 0
180
- // qpel_filter_chroma_0 only uses values in v1
181
- add x6, x6, x1
182
- ldr q1, x6
183
-.else
184
- ld1 {v0.16b}, x6, x1
185
- ld1 {v1.16b}, x6, x1
186
- ld1 {v2.16b}, x6, x1
187
- ld1 {v3.16b}, x6
188
-.endif
189
-.endm
190
-
191
-// a, b, c, d, e, f, g, h
192
-// .hword 0, 0, 0, 64, 0, 0, 0, 0
193
-.macro qpel_start_0
194
- movi v24.16b, #64
195
-.endm
196
-
197
-.macro qpel_filter_0_32b
198
- umull v17.8h, v3.8b, v24.8b // 64*d
199
-.endm
200
-
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S
Deleted
201
1
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// Functions in this file:
26
-// ***** luma_vpp *****
27
-// ***** luma_vps *****
28
-// ***** luma_vsp *****
29
-// ***** luma_vss *****
30
-// ***** luma_hpp *****
31
-// ***** luma_hps *****
32
-// ***** chroma_vpp *****
33
-// ***** chroma_vps *****
34
-// ***** chroma_vsp *****
35
-// ***** chroma_vss *****
36
-// ***** chroma_hpp *****
37
-// ***** chroma_hps *****
38
-
39
-#include "asm-sve.S"
40
-#include "ipfilter-common.S"
41
-
42
-.arch armv8-a+sve2
43
-
44
-#ifdef __APPLE__
45
-.section __RODATA,__rodata
46
-#else
47
-.section .rodata
48
-#endif
49
-
50
-.align 4
51
-
52
-.text
53
-
54
-.macro qpel_load_32b_sve2 v
55
-.if \v == 0
56
- add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
57
- ld1b {z3.h}, p0/z, x6
58
- add x6, x6, x1
59
-.elseif \v == 1 || \v == 2 || \v == 3
60
-.if \v != 3 // not used in qpel_filter_3
61
- ld1b {z0.h}, p0/z, x6
62
- add x6, x6, x1
63
-.else
64
- add x6, x6, x1
65
-.endif
66
- ld1b {z1.h}, p0/z, x6
67
- add x6, x6, x1
68
- ld1b {z2.h}, p0/z, x6
69
- add x6, x6, x1
70
- ld1b {z3.h}, p0/z, x6
71
- add x6, x6, x1
72
- ld1b {z4.h}, p0/z, x6
73
- add x6, x6, x1
74
- ld1b {z5.h}, p0/z, x6
75
- add x6, x6, x1
76
-.if \v != 1 // not used in qpel_filter_1
77
- ld1b {z6.h}, p0/z, x6
78
- add x6, x6, x1
79
- ld1b {z7.h}, p0/z, x6
80
-.else
81
- ld1b {z6.h}, p0/z, x6
82
-.endif
83
-.endif
84
-.endm
85
-
86
-.macro qpel_load_64b_sve2_gt_16 v
87
-.if \v == 0
88
- add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
89
- ld1b {z3.h}, p2/z, x6
90
- add x6, x6, x1
91
-.elseif \v == 1 || \v == 2 || \v == 3
92
-.if \v != 3 // not used in qpel_filter_3
93
- ld1b {z0.h}, p2/z, x6
94
- add x6, x6, x1
95
-.else
96
- add x6, x6, x1
97
-.endif
98
- ld1b {z1.h}, p2/z, x6
99
- add x6, x6, x1
100
- ld1b {z2.h}, p2/z, x6
101
- add x6, x6, x1
102
- ld1b {z3.h}, p2/z, x6
103
- add x6, x6, x1
104
- ld1b {z4.h}, p2/z, x6
105
- add x6, x6, x1
106
- ld1b {z5.h}, p2/z, x6
107
- add x6, x6, x1
108
-.if \v != 1 // not used in qpel_filter_1
109
- ld1b {z6.h}, p2/z, x6
110
- add x6, x6, x1
111
- ld1b {z7.h}, p2/z, x6
112
-.else
113
- ld1b {z6.h}, p2/z, x6
114
-.endif
115
-.endif
116
-.endm
117
-
118
-.macro qpel_chroma_load_32b_sve2 v
119
-.if \v == 0
120
- // qpel_filter_chroma_0 only uses values in v1
121
- add x6, x6, x1
122
- ld1b {z1.h}, p0/z, x6
123
-.else
124
- ld1b {z0.h}, p0/z, x6
125
- add x6, x6, x1
126
- ld1b {z1.h}, p0/z, x6
127
- add x6, x6, x1
128
- ld1b {z2.h}, p0/z, x6
129
- add x6, x6, x1
130
- ld1b {z3.h}, p0/z, x6
131
-.endif
132
-.endm
133
-
134
-.macro qpel_start_sve2_0
135
- mov z24.h, #64
136
-.endm
137
-
138
-.macro qpel_filter_sve2_0_32b
139
- mul z17.h, z3.h, z24.h // 64*d
140
-.endm
141
-
142
-.macro qpel_filter_sve2_0_64b
143
- qpel_filter_sve2_0_32b
144
- mul z18.h, z11.h, z24.h
145
-.endm
146
-
147
-.macro qpel_start_sve2_1
148
- mov z24.h, #58
149
- mov z25.h, #10
150
- mov z26.h, #17
151
- mov z27.h, #5
152
-.endm
153
-
154
-.macro qpel_filter_sve2_1_32b
155
- mul z19.h, z2.h, z25.h // c*10
156
- mul z17.h, z3.h, z24.h // d*58
157
- mul z21.h, z4.h, z26.h // e*17
158
- mul z23.h, z5.h, z27.h // f*5
159
- sub z17.h, z17.h, z19.h // d*58 - c*10
160
- lsl z18.h, z1.h, #2 // b*4
161
- add z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
- sub z21.h, z6.h, z0.h // g - a
163
- add z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
- sub z21.h, z21.h, z23.h // g - a - f*5
165
- add z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
-.endm
167
-
168
-.macro qpel_filter_sve2_1_64b
169
- qpel_filter_sve2_1_32b
170
- mul z20.h, z10.h, z25.h // c*10
171
- mul z18.h, z11.h, z24.h // d*58
172
- mul z21.h, z12.h, z26.h // e*17
173
- mul z23.h, z13.h, z27.h // f*5
174
- sub z18.h, z18.h, z20.h // d*58 - c*10
175
- lsl z28.h, z30.h, #2 // b*4
176
- add z18.h, z18.h, z21.h // d*58 - c*10 + e*17
177
- sub z21.h, z14.h, z29.h // g - a
178
- add z18.h, z18.h, z28.h // d*58 - c*10 + e*17 + b*4
179
- sub z21.h, z21.h, z23.h // g - a - f*5
180
- add z18.h, z18.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
-.endm
182
-
183
-.macro qpel_start_sve2_2
184
- mov z24.h, #11
185
- mov z25.h, #40
186
-.endm
187
-
188
-.macro qpel_filter_sve2_2_32b
189
- add z17.h, z3.h, z4.h // d + e
190
- add z19.h, z2.h, z5.h // c + f
191
- add z23.h, z1.h, z6.h // b + g
192
- add z21.h, z0.h, z7.h // a + h
193
- mul z17.h, z17.h, z25.h // 40 * (d + e)
194
- mul z19.h, z19.h, z24.h // 11 * (c + f)
195
- lsl z23.h, z23.h, #2 // (b + g) * 4
196
- add z19.h, z19.h, z21.h // 11 * (c + f) + a + h
197
- add z17.h, z17.h, z23.h // 40 * (d + e) + (b + g) * 4
198
- sub z17.h, z17.h, z19.h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
-.endm
200
-
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S
Deleted
201
1
2
-/*****************************************************************************
3
- * Copyright (C) 2021 MulticoreWare, Inc
4
- *
5
- * Authors: Sebastian Pop <spop@amazon.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// Functions in this file:
26
-// ***** luma_vpp *****
27
-// ***** luma_vps *****
28
-// ***** luma_vsp *****
29
-// ***** luma_vss *****
30
-// ***** luma_hpp *****
31
-// ***** luma_hps *****
32
-// ***** chroma_vpp *****
33
-// ***** chroma_vps *****
34
-// ***** chroma_vsp *****
35
-// ***** chroma_vss *****
36
-// ***** chroma_hpp *****
37
-// ***** chroma_hps *****
38
-
39
-#include "asm.S"
40
-#include "ipfilter-common.S"
41
-
42
-#ifdef __APPLE__
43
-.section __RODATA,__rodata
44
-#else
45
-.section .rodata
46
-#endif
47
-
48
-.align 4
49
-
50
-.text
51
-
52
-// ***** luma_vpp *****
53
-// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
-.macro LUMA_VPP_4xN h
55
-function x265_interp_8tap_vert_pp_4x\h\()_neon
56
- movrel x10, g_luma_s16
57
- sub x0, x0, x1
58
- sub x0, x0, x1, lsl #1 // src -= 3 * srcStride
59
- lsl x4, x4, #4
60
- ldr q0, x10, x4 // q0 = luma interpolate coeff
61
- dup v24.8h, v0.h0
62
- dup v25.8h, v0.h1
63
- trn1 v24.2d, v24.2d, v25.2d
64
- dup v26.8h, v0.h2
65
- dup v27.8h, v0.h3
66
- trn1 v26.2d, v26.2d, v27.2d
67
- dup v28.8h, v0.h4
68
- dup v29.8h, v0.h5
69
- trn1 v28.2d, v28.2d, v29.2d
70
- dup v30.8h, v0.h6
71
- dup v31.8h, v0.h7
72
- trn1 v30.2d, v30.2d, v31.2d
73
-
74
- // prepare to load 8 lines
75
- ld1 {v0.s}0, x0, x1
76
- ld1 {v0.s}1, x0, x1
77
- ushll v0.8h, v0.8b, #0
78
- ld1 {v1.s}0, x0, x1
79
- ld1 {v1.s}1, x0, x1
80
- ushll v1.8h, v1.8b, #0
81
- ld1 {v2.s}0, x0, x1
82
- ld1 {v2.s}1, x0, x1
83
- ushll v2.8h, v2.8b, #0
84
- ld1 {v3.s}0, x0, x1
85
- ld1 {v3.s}1, x0, x1
86
- ushll v3.8h, v3.8b, #0
87
-
88
- mov x9, #\h
89
-.loop_4x\h:
90
- ld1 {v4.s}0, x0, x1
91
- ld1 {v4.s}1, x0, x1
92
- ushll v4.8h, v4.8b, #0
93
-
94
- // row0-1
95
- mul v16.8h, v0.8h, v24.8h
96
- ext v21.16b, v0.16b, v1.16b, #8
97
- mul v17.8h, v21.8h, v24.8h
98
- mov v0.16b, v1.16b
99
-
100
- // row2-3
101
- mla v16.8h, v1.8h, v26.8h
102
- ext v21.16b, v1.16b, v2.16b, #8
103
- mla v17.8h, v21.8h, v26.8h
104
- mov v1.16b, v2.16b
105
-
106
- // row4-5
107
- mla v16.8h, v2.8h, v28.8h
108
- ext v21.16b, v2.16b, v3.16b, #8
109
- mla v17.8h, v21.8h, v28.8h
110
- mov v2.16b, v3.16b
111
-
112
- // row6-7
113
- mla v16.8h, v3.8h, v30.8h
114
- ext v21.16b, v3.16b, v4.16b, #8
115
- mla v17.8h, v21.8h, v30.8h
116
- mov v3.16b, v4.16b
117
-
118
- // sum row0-7
119
- trn1 v20.2d, v16.2d, v17.2d
120
- trn2 v21.2d, v16.2d, v17.2d
121
- add v16.8h, v20.8h, v21.8h
122
-
123
- sqrshrun v16.8b, v16.8h, #6
124
- st1 {v16.s}0, x2, x3
125
- st1 {v16.s}1, x2, x3
126
-
127
- sub x9, x9, #2
128
- cbnz x9, .loop_4x\h
129
- ret
130
-endfunc
131
-.endm
132
-
133
-LUMA_VPP_4xN 4
134
-LUMA_VPP_4xN 8
135
-LUMA_VPP_4xN 16
136
-
137
-// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
-.macro LUMA_VPP w, h
139
-function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
- cmp x4, #0
141
- b.eq 0f
142
- cmp x4, #1
143
- b.eq 1f
144
- cmp x4, #2
145
- b.eq 2f
146
- cmp x4, #3
147
- b.eq 3f
148
-0:
149
- FILTER_LUMA_VPP \w, \h, 0
150
-1:
151
- FILTER_LUMA_VPP \w, \h, 1
152
-2:
153
- FILTER_LUMA_VPP \w, \h, 2
154
-3:
155
- FILTER_LUMA_VPP \w, \h, 3
156
-endfunc
157
-.endm
158
-
159
-LUMA_VPP 8, 4
160
-LUMA_VPP 8, 8
161
-LUMA_VPP 8, 16
162
-LUMA_VPP 8, 32
163
-LUMA_VPP 12, 16
164
-LUMA_VPP 16, 4
165
-LUMA_VPP 16, 8
166
-LUMA_VPP 16, 16
167
-LUMA_VPP 16, 32
168
-LUMA_VPP 16, 64
169
-LUMA_VPP 16, 12
170
-LUMA_VPP 24, 32
171
-LUMA_VPP 32, 8
172
-LUMA_VPP 32, 16
173
-LUMA_VPP 32, 32
174
-LUMA_VPP 32, 64
175
-LUMA_VPP 32, 24
176
-LUMA_VPP 48, 64
177
-LUMA_VPP 64, 16
178
-LUMA_VPP 64, 32
179
-LUMA_VPP 64, 64
180
-LUMA_VPP 64, 48
181
-
182
-// ***** luma_vps *****
183
-// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
-.macro LUMA_VPS_4xN h
185
-function x265_interp_8tap_vert_ps_4x\h\()_neon
186
- lsl x3, x3, #1
187
- lsl x5, x4, #6
188
- lsl x4, x1, #2
189
- sub x4, x4, x1
190
- sub x0, x0, x4
191
-
192
- mov w6, #8192
193
- dup v28.4s, w6
194
- mov x4, #\h
195
- movrel x12, g_lumaFilter
196
- add x12, x12, x5
197
- ld1r {v16.2d}, x12, #8
198
- ld1r {v17.2d}, x12, #8
199
- ld1r {v18.2d}, x12, #8
200
- ld1r {v19.2d}, x12, #8
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S
Deleted
201
1
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// This file contains the macros written using NEON instruction set
26
-// that are also used by the SVE2 functions
27
-
28
-#include "asm.S"
29
-
30
-.arch armv8-a
31
-
32
-#ifdef __APPLE__
33
-.section __RODATA,__rodata
34
-#else
35
-.section .rodata
36
-#endif
37
-
38
-.align 4
39
-
40
-.macro SAD_START_4 f
41
- ld1 {v0.s}0, x0, x1
42
- ld1 {v0.s}1, x0, x1
43
- ld1 {v1.s}0, x2, x3
44
- ld1 {v1.s}1, x2, x3
45
- \f v16.8h, v0.8b, v1.8b
46
-.endm
47
-
48
-.macro SAD_4 h
49
-.rept \h / 2 - 1
50
- SAD_START_4 uabal
51
-.endr
52
-.endm
53
-
54
-.macro SAD_START_8 f
55
- ld1 {v0.8b}, x0, x1
56
- ld1 {v1.8b}, x2, x3
57
- ld1 {v2.8b}, x0, x1
58
- ld1 {v3.8b}, x2, x3
59
- \f v16.8h, v0.8b, v1.8b
60
- \f v17.8h, v2.8b, v3.8b
61
-.endm
62
-
63
-.macro SAD_8 h
64
-.rept \h / 2 - 1
65
- SAD_START_8 uabal
66
-.endr
67
-.endm
68
-
69
-.macro SAD_START_16 f
70
- ld1 {v0.16b}, x0, x1
71
- ld1 {v1.16b}, x2, x3
72
- ld1 {v2.16b}, x0, x1
73
- ld1 {v3.16b}, x2, x3
74
- \f v16.8h, v0.8b, v1.8b
75
- \f\()2 v17.8h, v0.16b, v1.16b
76
- uabal v16.8h, v2.8b, v3.8b
77
- uabal2 v17.8h, v2.16b, v3.16b
78
-.endm
79
-
80
-.macro SAD_16 h
81
-.rept \h / 2 - 1
82
- SAD_START_16 uabal
83
-.endr
84
-.endm
85
-
86
-.macro SAD_START_32
87
- movi v16.16b, #0
88
- movi v17.16b, #0
89
- movi v18.16b, #0
90
- movi v19.16b, #0
91
-.endm
92
-
93
-.macro SAD_32
94
- ld1 {v0.16b-v1.16b}, x0, x1
95
- ld1 {v2.16b-v3.16b}, x2, x3
96
- ld1 {v4.16b-v5.16b}, x0, x1
97
- ld1 {v6.16b-v7.16b}, x2, x3
98
- uabal v16.8h, v0.8b, v2.8b
99
- uabal2 v17.8h, v0.16b, v2.16b
100
- uabal v18.8h, v1.8b, v3.8b
101
- uabal2 v19.8h, v1.16b, v3.16b
102
- uabal v16.8h, v4.8b, v6.8b
103
- uabal2 v17.8h, v4.16b, v6.16b
104
- uabal v18.8h, v5.8b, v7.8b
105
- uabal2 v19.8h, v5.16b, v7.16b
106
-.endm
107
-
108
-.macro SAD_END_32
109
- add v16.8h, v16.8h, v17.8h
110
- add v17.8h, v18.8h, v19.8h
111
- add v16.8h, v16.8h, v17.8h
112
- uaddlv s0, v16.8h
113
- fmov w0, s0
114
- ret
115
-.endm
116
-
117
-.macro SAD_START_64
118
- movi v16.16b, #0
119
- movi v17.16b, #0
120
- movi v18.16b, #0
121
- movi v19.16b, #0
122
- movi v20.16b, #0
123
- movi v21.16b, #0
124
- movi v22.16b, #0
125
- movi v23.16b, #0
126
-.endm
127
-
128
-.macro SAD_64
129
- ld1 {v0.16b-v3.16b}, x0, x1
130
- ld1 {v4.16b-v7.16b}, x2, x3
131
- ld1 {v24.16b-v27.16b}, x0, x1
132
- ld1 {v28.16b-v31.16b}, x2, x3
133
- uabal v16.8h, v0.8b, v4.8b
134
- uabal2 v17.8h, v0.16b, v4.16b
135
- uabal v18.8h, v1.8b, v5.8b
136
- uabal2 v19.8h, v1.16b, v5.16b
137
- uabal v20.8h, v2.8b, v6.8b
138
- uabal2 v21.8h, v2.16b, v6.16b
139
- uabal v22.8h, v3.8b, v7.8b
140
- uabal2 v23.8h, v3.16b, v7.16b
141
-
142
- uabal v16.8h, v24.8b, v28.8b
143
- uabal2 v17.8h, v24.16b, v28.16b
144
- uabal v18.8h, v25.8b, v29.8b
145
- uabal2 v19.8h, v25.16b, v29.16b
146
- uabal v20.8h, v26.8b, v30.8b
147
- uabal2 v21.8h, v26.16b, v30.16b
148
- uabal v22.8h, v27.8b, v31.8b
149
- uabal2 v23.8h, v27.16b, v31.16b
150
-.endm
151
-
152
-.macro SAD_END_64
153
- add v16.8h, v16.8h, v17.8h
154
- add v17.8h, v18.8h, v19.8h
155
- add v16.8h, v16.8h, v17.8h
156
- uaddlp v16.4s, v16.8h
157
- add v18.8h, v20.8h, v21.8h
158
- add v19.8h, v22.8h, v23.8h
159
- add v17.8h, v18.8h, v19.8h
160
- uaddlp v17.4s, v17.8h
161
- add v16.4s, v16.4s, v17.4s
162
- uaddlv d0, v16.4s
163
- fmov x0, d0
164
- ret
165
-.endm
166
-
167
-.macro SAD_START_12
168
- movrel x12, sad12_mask
169
- ld1 {v31.16b}, x12
170
- movi v16.16b, #0
171
- movi v17.16b, #0
172
-.endm
173
-
174
-.macro SAD_12
175
- ld1 {v0.16b}, x0, x1
176
- and v0.16b, v0.16b, v31.16b
177
- ld1 {v1.16b}, x2, x3
178
- and v1.16b, v1.16b, v31.16b
179
- ld1 {v2.16b}, x0, x1
180
- and v2.16b, v2.16b, v31.16b
181
- ld1 {v3.16b}, x2, x3
182
- and v3.16b, v3.16b, v31.16b
183
- uabal v16.8h, v0.8b, v1.8b
184
- uabal2 v17.8h, v0.16b, v1.16b
185
- uabal v16.8h, v2.8b, v3.8b
186
- uabal2 v17.8h, v2.16b, v3.16b
187
-.endm
188
-
189
-.macro SAD_END_12
190
- add v16.8h, v16.8h, v17.8h
191
- uaddlv s0, v16.8h
192
- fmov w0, s0
193
- ret
194
-.endm
195
-
196
-.macro SAD_START_24
197
- movi v16.16b, #0
198
- movi v17.16b, #0
199
- movi v18.16b, #0
200
- sub x1, x1, #16
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S
Deleted
201
1
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm-sve.S"
26
-#include "sad-a-common.S"
27
-
28
-.arch armv8-a+sve2
29
-
30
-#ifdef __APPLE__
31
-.section __RODATA,__rodata
32
-#else
33
-.section .rodata
34
-#endif
35
-
36
-.align 4
37
-
38
-.text
39
-
40
-.macro SAD_SVE2_16 h
41
- mov z16.d, #0
42
- ptrue p0.h, vl16
43
-.rept \h
44
- ld1b {z0.h}, p0/z, x0
45
- ld1b {z2.h}, p0/z, x2
46
- add x0, x0, x1
47
- add x2, x2, x3
48
- uaba z16.h, z0.h, z2.h
49
-.endr
50
- uaddv d0, p0, z16.h
51
- fmov w0, s0
52
- ret
53
-.endm
54
-
55
-.macro SAD_SVE2_32 h
56
- ptrue p0.b, vl32
57
-.rept \h
58
- ld1b {z0.b}, p0/z, x0
59
- ld1b {z4.b}, p0/z, x2
60
- add x0, x0, x1
61
- add x2, x2, x3
62
- uabalb z16.h, z0.b, z4.b
63
- uabalt z16.h, z0.b, z4.b
64
-.endr
65
- uaddv d0, p0, z16.h
66
- fmov w0, s0
67
- ret
68
-.endm
69
-
70
-.macro SAD_SVE2_64 h
71
- cmp x9, #48
72
- bgt .vl_gt_48_pixel_sad_64x\h
73
- mov z16.d, #0
74
- mov z17.d, #0
75
- mov z18.d, #0
76
- mov z19.d, #0
77
- ptrue p0.b, vl32
78
-.rept \h
79
- ld1b {z0.b}, p0/z, x0
80
- ld1b {z1.b}, p0/z, x0, #1, mul vl
81
- ld1b {z4.b}, p0/z, x2
82
- ld1b {z5.b}, p0/z, x2, #1, mul vl
83
- add x0, x0, x1
84
- add x2, x2, x3
85
- uabalb z16.h, z0.b, z4.b
86
- uabalt z17.h, z0.b, z4.b
87
- uabalb z18.h, z1.b, z5.b
88
- uabalt z19.h, z1.b, z5.b
89
-.endr
90
- add z16.h, z16.h, z17.h
91
- add z17.h, z18.h, z19.h
92
- add z16.h, z16.h, z17.h
93
- uadalp z24.s, p0/m, z16.h
94
- uaddv d5, p0, z24.s
95
- fmov x0, d5
96
- ret
97
-.vl_gt_48_pixel_sad_64x\h\():
98
- mov z16.d, #0
99
- mov z17.d, #0
100
- mov z24.d, #0
101
- ptrue p0.b, vl64
102
-.rept \h
103
- ld1b {z0.b}, p0/z, x0
104
- ld1b {z4.b}, p0/z, x2
105
- add x0, x0, x1
106
- add x2, x2, x3
107
- uabalb z16.h, z0.b, z4.b
108
- uabalt z17.h, z0.b, z4.b
109
-.endr
110
- add z16.h, z16.h, z17.h
111
- uadalp z24.s, p0/m, z16.h
112
- uaddv d5, p0, z24.s
113
- fmov x0, d5
114
- ret
115
-.endm
116
-
117
-.macro SAD_SVE2_24 h
118
- mov z16.d, #0
119
- mov x10, #24
120
- mov x11, #0
121
- whilelt p0.b, x11, x10
122
-.rept \h
123
- ld1b {z0.b}, p0/z, x0
124
- ld1b {z8.b}, p0/z, x2
125
- add x0, x0, x1
126
- add x2, x2, x3
127
- uabalb z16.h, z0.b, z8.b
128
- uabalt z16.h, z0.b, z8.b
129
-.endr
130
- uaddv d5, p0, z16.h
131
- fmov w0, s5
132
- ret
133
-.endm
134
-
135
-.macro SAD_SVE2_48 h
136
- cmp x9, #48
137
- bgt .vl_gt_48_pixel_sad_48x\h
138
- mov z16.d, #0
139
- mov z17.d, #0
140
- mov z18.d, #0
141
- mov z19.d, #0
142
- ptrue p0.b, vl32
143
- ptrue p1.b, vl16
144
-.rept \h
145
- ld1b {z0.b}, p0/z, x0
146
- ld1b {z1.b}, p1/z, x0, #1, mul vl
147
- ld1b {z8.b}, p0/z, x2
148
- ld1b {z9.b}, p1/z, x2, #1, mul vl
149
- add x0, x0, x1
150
- add x2, x2, x3
151
- uabalb z16.h, z0.b, z8.b
152
- uabalt z17.h, z0.b, z8.b
153
- uabalb z18.h, z1.b, z9.b
154
- uabalt z19.h, z1.b, z9.b
155
-.endr
156
- add z16.h, z16.h, z17.h
157
- add z17.h, z18.h, z19.h
158
- add z16.h, z16.h, z17.h
159
- uaddv d5, p0, z16.h
160
- fmov w0, s5
161
- ret
162
-.vl_gt_48_pixel_sad_48x\h\():
163
- mov z16.d, #0
164
- mov z17.d, #0
165
- mov x10, #48
166
- mov x11, #0
167
- whilelt p0.b, x11, x10
168
-.rept \h
169
- ld1b {z0.b}, p0/z, x0
170
- ld1b {z8.b}, p0/z, x2
171
- add x0, x0, x1
172
- add x2, x2, x3
173
- uabalb z16.h, z0.b, z8.b
174
- uabalt z17.h, z0.b, z8.b
175
-.endr
176
- add z16.h, z16.h, z17.h
177
- uaddv d5, p0, z16.h
178
- fmov w0, s5
179
- ret
180
-.endm
181
-
182
-// Fully unrolled.
183
-.macro SAD_FUNC_SVE2 w, h
184
-function PFX(pixel_sad_\w\()x\h\()_sve2)
185
- rdvl x9, #1
186
- cmp x9, #16
187
- bgt .vl_gt_16_pixel_sad_\w\()x\h
188
- SAD_START_\w uabdl
189
- SAD_\w \h
190
-.if \w > 4
191
- add v16.8h, v16.8h, v17.8h
192
-.endif
193
- uaddlv s0, v16.8h
194
- fmov w0, s0
195
- ret
196
-.vl_gt_16_pixel_sad_\w\()x\h\():
197
-.if \w == 4 || \w == 8 || \w == 12
198
- SAD_START_\w uabdl
199
- SAD_\w \h
200
-.if \w > 4
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S
Deleted
80
1
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm-sve.S"
26
-
27
-.arch armv8-a+sve
28
-
29
-#ifdef __APPLE__
30
-.section __RODATA,__rodata
31
-#else
32
-.section .rodata
33
-#endif
34
-
35
-.align 4
36
-
37
-.text
38
-
39
-function PFX(pixel_sse_pp_4x4_sve)
40
- ptrue p0.s, vl4
41
- ld1b {z0.s}, p0/z, x0
42
- ld1b {z17.s}, p0/z, x2
43
- add x0, x0, x1
44
- add x2, x2, x3
45
- sub z0.s, p0/m, z0.s, z17.s
46
- mul z0.s, p0/m, z0.s, z0.s
47
-.rept 3
48
- ld1b {z16.s}, p0/z, x0
49
- ld1b {z17.s}, p0/z, x2
50
- add x0, x0, x1
51
- add x2, x2, x3
52
- sub z16.s, p0/m, z16.s, z17.s
53
- mla z0.s, p0/m, z16.s, z16.s
54
-.endr
55
- uaddv d0, p0, z0.s
56
- fmov w0, s0
57
- ret
58
-endfunc
59
-
60
-function PFX(pixel_sse_pp_4x8_sve)
61
- ptrue p0.s, vl4
62
- ld1b {z0.s}, p0/z, x0
63
- ld1b {z17.s}, p0/z, x2
64
- add x0, x0, x1
65
- add x2, x2, x3
66
- sub z0.s, p0/m, z0.s, z17.s
67
- mul z0.s, p0/m, z0.s, z0.s
68
-.rept 7
69
- ld1b {z16.s}, p0/z, x0
70
- ld1b {z17.s}, p0/z, x2
71
- add x0, x0, x1
72
- add x2, x2, x3
73
- sub z16.s, p0/m, z16.s, z17.s
74
- mla z0.s, p0/m, z16.s, z16.s
75
-.endr
76
- uaddv d0, p0, z0.s
77
- fmov w0, s0
78
- ret
79
-endfunc
80
x265_4.0.tar.gz/.readthedocs.yaml
Added
29
1
2
+# Read the Docs configuration file for Sphinx projects
3
+# .readthedocs.yaml
4
+
5
+# Project Information
6
+# Required
7
+version: 2
8
+
9
+build:
10
+ os: "ubuntu-20.04"
11
+ tools:
12
+ python: "3.10"
13
+
14
+# Use a requirements file for pip dependencies
15
+python:
16
+ install:
17
+ - requirements: doc/requirements.txt
18
+
19
+# Build documentation in the "docs/" directory with Sphinx
20
+sphinx:
21
+ builder: html
22
+ configuration: doc/reST/conf.py
23
+ fail_on_warning: false
24
+
25
+# Optionally build your docs in additional formats such as PDF and ePub
26
+# formats:
27
+# - pdf
28
+# - epub
29
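
The file above only drives builds on Read the Docs; for a local check, a roughly equivalent Sphinx run using the paths named in the YAML looks like this (the output directory name is arbitrary):

    # Build the documentation locally, mirroring .readthedocs.yaml:
    # pip dependencies from doc/requirements.txt, Sphinx config in doc/reST/conf.py.
    python3 -m venv .venv-docs
    . .venv-docs/bin/activate
    pip install -r doc/requirements.txt
    # -b html matches the builder named in the YAML; "html-out" is an arbitrary output dir.
    sphinx-build -b html doc/reST html-out
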
x265_3.6.tar.gz/build/README.txt -> x265_4.0.tar.gz/build/README.txt
Changed
58
1
2
3
= Build Instructions for cross-compilation for Arm AArch64 Targets=
4
5
-When the target platform is based on Arm AArch64 architecture, the x265 can be
6
-built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
7
-enviroment variables should be set to point to the cross compilers of the
8
-appropriate gcc. For example:
9
+Cross compilation of x265 for AArch64 targets is possible on x86 platforms by
10
+passing a toolchain file when running CMake to configure the project:
11
12
-1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
13
-2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
14
+* cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-toolchain-file>
15
16
-The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
17
-Then, the normal building process can be followed.
18
+Toolchain files for AArch64 cross-compilation exist in the /build directory.
19
+These specify a default cross-compiler to use; however this can be overridden
20
+by setting the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER CMake variables when
21
+running CMake to configure the project. For example:
22
23
-Moreover, if the target platform supports SVE or SVE2 instruction set, the
24
-CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
25
-to true, respectively. For example:
26
+* cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++
27
28
-1. export CROSS_COMPILE_SVE2=true
29
-2. export CROSS_COMPILE_SVE=true
30
+If target platform supports Armv8.4 Neon DotProd instructions, the
31
+CROSS_COMPILE_NEON_DOTPROD CMake option should be set to ON:
32
33
-Then, the normal building process can be followed.
34
+* cmake -DCROSS_COMPILE_NEON_DOTPROD=ON <other configuration options...>
35
+
36
+If target platform supports Armv8.6 Neon I8MM instructions, the
37
+CROSS_COMPILE_NEON_I8MM CMake option should be set to ON:
38
+
39
+* cmake -DCROSS_COMPILE_NEON_I8MM=ON <other configuration options...>
40
+
41
+If the target platform supports SVE or SVE2, CROSS_COMPILE_SVE or
42
+CROSS_COMPILE_SVE2 CMake options should be set to ON, respectively.
43
+For example, when running CMake to configure the project:
44
+
45
+1. cmake -DCROSS_COMPILE_SVE=ON <other configuration options...>
46
+2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
47
+
48
+Note: when the CROSS_COMPILE_SVE option is set to ON the build configuration will
49
+also compile for Neon DotProd and I8MM, as we impose the constraint that SVE implies
50
+both Neon DotProd and I8MM.
51
+
52
+Similarly when the CROSS_COMPILE_SVE2 option is set to ON the build configuration
53
+will also compile for Neon I8MM, as we impose the constraint that SVE2 implies Neon
54
+I8MM. SVE2 already implies that Neon DotProd is implemented since SVE2 is an Armv9.0
55
+feature which implies Armv8.5, and Neon DotProd is mandatory from Armv8.4.
56
+
57
+Then, the normal build process can be followed.
58
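
Putting the updated instructions together, one possible AArch64 cross-compile configuration is sketched below; it assumes an empty build directory created at the top of the x265 checkout, and it picks CROSS_COMPILE_SVE as just one of the options listed above:

    # Example AArch64 cross-compile configuration following the README above.
    # Adjust the relative paths if your build directory lives elsewhere.
    cmake ../source \
        -DCMAKE_TOOLCHAIN_FILE=$PWD/../build/aarch64-linux/crosscompile.cmake \
        -DCROSS_COMPILE_SVE=ON    # per the note above, SVE implies Neon DotProd and I8MM
    make -j"$(nproc)"
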
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-darwin/crosscompile.cmake
Changed
26
1
2
set(CMAKE_SYSTEM_NAME Darwin)
3
set(CMAKE_SYSTEM_PROCESSOR aarch64)
4
5
-# specify the cross compiler
6
-set(CMAKE_C_COMPILER gcc-12)
7
-set(CMAKE_CXX_COMPILER g++-12)
8
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
9
+if(NOT DEFINED CMAKE_C_COMPILER)
10
+ set(CMAKE_C_COMPILER gcc)
11
+endif()
12
+if(NOT DEFINED CMAKE_CXX_COMPILER)
13
+ set(CMAKE_CXX_COMPILER g++)
14
+endif()
15
16
# specify the target environment
17
SET(CMAKE_FIND_ROOT_PATH /opt/homebrew/bin/)
18
19
-# specify whether SVE/SVE2 is supported by the target platform
20
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
21
- set(CROSS_COMPILE_SVE2 1)
22
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
23
- set(CROSS_COMPILE_SVE 1)
24
-endif()
25
-
26
x265_4.0.tar.gz/build/aarch64-linux-clang
Added
2
1
+(directory)
2
x265_4.0.tar.gz/build/aarch64-linux-clang/crosscompile.cmake
Added
27
1
2
+# CMake toolchain file for cross compiling x265 for AArch64, using Clang.
3
+
4
+set(CROSS_COMPILE_ARM64 1)
5
+set(CMAKE_SYSTEM_NAME Linux)
6
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
7
+
8
+set(TARGET_TRIPLE aarch64-linux-gnu)
9
+
10
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
11
+if(NOT DEFINED CMAKE_C_COMPILER)
12
+ set(CMAKE_C_COMPILER clang)
13
+endif()
14
+if(NOT DEFINED CMAKE_CXX_COMPILER)
15
+ set(CMAKE_CXX_COMPILER clang++)
16
+endif()
17
+
18
+# specify compiler target
19
+set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
20
+set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
21
+
22
+# specify assembler target
23
+list(APPEND ASM_FLAGS "--target=${TARGET_TRIPLE}")
24
+
25
+# specify the target environment
26
+SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
27
x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-linux/crosscompile.cmake
Changed
30
1
2
set(CMAKE_SYSTEM_NAME Linux)
3
set(CMAKE_SYSTEM_PROCESSOR aarch64)
4
5
-# specify the cross compiler
6
-if(DEFINED ENV{CMAKE_C_COMPILER})
7
- set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
8
-else()
9
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
10
+if(NOT DEFINED CMAKE_C_COMPILER)
11
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
12
endif()
13
-if(DEFINED ENV{CMAKE_CXX_COMPILER})
14
- set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
15
-else()
16
+if(NOT DEFINED CMAKE_CXX_COMPILER)
17
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
18
endif()
19
20
# specify the target environment
21
SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
22
23
-# specify whether SVE/SVE2 is supported by the target platform
24
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
25
- set(CROSS_COMPILE_SVE2 1)
26
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
27
- set(CROSS_COMPILE_SVE 1)
28
-endif()
29
-
30
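
Because the toolchain files now set a compiler only when none has been supplied, choosing a different cross-compiler is done with CMake cache variables on the command line rather than the old environment variables; a sketch, with versioned compiler names as placeholders for whatever AArch64 toolchain is installed:

    # Override the toolchain file's default aarch64-linux-gnu-gcc/g++.
    cmake ../source \
        -DCMAKE_TOOLCHAIN_FILE=$PWD/../build/aarch64-linux/crosscompile.cmake \
        -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc-13 \
        -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++-13
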
x265_4.0.tar.gz/build/vc17-x86
Added
2
1
+(directory)
2
x265_4.0.tar.gz/build/vc17-x86/build-all.bat
Added
25
1
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+ if /i "%%i"=="productPath" (
7
+ set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+if "%VS170COMNTOOLS%" == "" (
13
+ msg "%username%" "Visual Studio 17 not detected"
14
+ exit 1
15
+)
16
+if not exist x265.sln (
17
+ call make-solutions.bat
18
+)
19
+if exist x265.sln (
20
+ call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
21
+ MSBuild /property:Configuration="Release" x265.sln
22
+ MSBuild /property:Configuration="Debug" x265.sln
23
+ MSBuild /property:Configuration="RelWithDebInfo" x265.sln
24
+)
25
x265_4.0.tar.gz/build/vc17-x86/make-solutions.bat
Added
8
1
2
+@echo off
3
+::
4
+:: run this batch file to create a Visual Studio solution file for this project.
5
+:: See the cmake documentation for other generator targets
6
+::
7
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
8
x265_4.0.tar.gz/build/vc17-x86_64
Added
2
1
+(directory)
2
x265_4.0.tar.gz/build/vc17-x86_64/build-all.bat
Added
25
1
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+ if /i "%%i"=="productPath" (
7
+ set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+if "%VS170COMNTOOLS%" == "" (
13
+ msg "%username%" "Visual Studio 17 not detected"
14
+ exit 1
15
+)
16
+if not exist x265.sln (
17
+ call make-solutions.bat
18
+)
19
+if exist x265.sln (
20
+ call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
21
+ MSBuild /property:Configuration="Release" x265.sln
22
+ MSBuild /property:Configuration="Debug" x265.sln
23
+ MSBuild /property:Configuration="RelWithDebInfo" x265.sln
24
+)
25
x265_4.0.tar.gz/build/vc17-x86_64/make-solutions.bat
Added
8
1
2
+@echo off
3
+::
4
+:: run this batch file to create a Visual Studio solution file for this project.
5
+:: See the cmake documentation for other generator targets
6
+::
7
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
8
x265_4.0.tar.gz/build/vc17-x86_64/multilib.bat
Added
50
1
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+ if /i "%%i"=="productPath" (
7
+ set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
13
+@mkdir 12bit
14
+@mkdir 10bit
15
+@mkdir 8bit
16
+
17
+@cd 12bit
18
+cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
19
+if exist x265.sln (
20
+ MSBuild /property:Configuration="Release" x265.sln
21
+ copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
22
+)
23
+
24
+@cd ..\10bit
25
+cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
26
+if exist x265.sln (
27
+ MSBuild /property:Configuration="Release" x265.sln
28
+ copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
29
+)
30
+
31
+@cd ..\8bit
32
+if not exist x265-static-main10.lib (
33
+ msg "%username%" "10bit build failed"
34
+ exit 1
35
+)
36
+if not exist x265-static-main12.lib (
37
+ msg "%username%" "12bit build failed"
38
+ exit 1
39
+)
40
+cmake -G "Visual Studio 17 2022" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
41
+if exist x265.sln (
42
+ MSBuild /property:Configuration="Release" x265.sln
43
+ :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
44
+ move Release\x265-static.lib x265-static-main.lib
45
+ LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
46
+)
47
+
48
+pause
49
\ No newline at end of file
50
x265_3.6.tar.gz/doc/reST/api.rst -> x265_4.0.tar.gz/doc/reST/api.rst
Changed
31
1
2
void x265_cleanup(void);
3
4
VMAF (Video Multi-Method Assessment Fusion)
5
-==========================================
6
+===========================================
7
8
If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame
9
and aggregate VMAF score for the given input and dump the scores in csv file.
10
-The user also need to specify the :option:`--recon` in command line to get the VMAF scores.
11
+The user also need to specify the :option:`--recon` in command line to get the VMAF scores.::
12
13
/* x265_calculate_vmafScore:
14
- * returns VMAF score for the input video.
15
- * This api must be called only after encoding was done. */
16
- double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
17
+ * returns VMAF score for the input video.
18
+ * This API must be called only after encoding was done. */
19
+ double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
20
21
/* x265_calculate_vmaf_framelevelscore:
22
- * returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
23
- double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
24
-
25
+ * returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
26
+ double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
27
+
28
.. Note::
29
30
When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
31
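
Concretely, the workflow described above amounts to configuring with the VMAF library enabled and then giving the encoder a reconstruction file to score against; a sketch with placeholder file names, assuming libvmaf is installed where CMake can find it:

    # Configure x265 with VMAF reporting enabled, then encode with --recon so
    # per-frame and aggregate VMAF scores are computed and dumped to the CSV file.
    cmake ../source -DENABLE_LIBVMAF=ON
    make -j"$(nproc)"
    ./x265 --input input.y4m --recon recon.yuv --csv scores.csv -o out.hevc
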
x265_3.6.tar.gz/doc/reST/cli.rst -> x265_4.0.tar.gz/doc/reST/cli.rst
Changed
201
1
2
metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
3
and/or :option:`--amp` are enabled, this feature will use motion cost
4
heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the
5
- best choice. This can significantly improve performance when :option:`rect`
6
+ best choice. This can significantly improve performance when :option:`--rect`
7
and/or :option:`--amp` are enabled at minimal compression efficiency loss.
8
9
.. option:: --rect, --no-rect
10
11
Store/normalize ctu distortion in analysis-save/load.
12
0 - Disabled.
13
1 - Save ctu distortion to the analysis file specified during :option:`--analysis-save`.
14
- Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`.
15
+ - Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`.
16
Default 0.
17
18
.. option:: --scale-factor
19
20
21
.. option:: --rdoq-level <0|1|2>, --no-rdoq-level
22
23
- Specify the amount of rate-distortion analysis to use within
24
- quantization::
25
+ Specify the amount of rate-distortion analysis to use within quantization::
26
27
- At level 0 rate-distortion cost is not considered in quant
28
-
29
- At level 1 rate-distortion cost is used to find optimal rounding
30
- values for each level (and allows psy-rdoq to be effective). It
31
- trades-off the signaling cost of the coefficient vs its post-inverse
32
- quant distortion from the pre-quant coefficient. When
33
- :option:`--psy-rdoq` is enabled, this formula is biased in favor of
34
- more energy in the residual (larger coefficient absolute levels)
35
-
36
- At level 2 rate-distortion cost is used to make decimate decisions
37
- on each 4x4 coding group, including the cost of signaling the group
38
- within the group bitmap. If the total distortion of not signaling
39
- the entire coding group is less than the rate cost, the block is
40
- decimated. Next, it applies rate-distortion cost analysis to the
41
- last non-zero coefficient, which can result in many (or all) of the
42
- coding groups being decimated. Psy-rdoq is less effective at
43
- preserving energy when RDOQ is at level 2, since it only has
44
- influence over the level distortion costs.
45
+ At level 0 rate-distortion cost is not considered in quant.
46
+
47
+ At level 1 rate-distortion cost is used to find optimal rounding values for each level (and allows psy-rdoq to be effective). It trades-off the signaling cost of the coefficient vs its post-inverse quant distortion from the pre-quant coefficient. When :option:`--psy-rdoq` is enabled, this formula is biased in favor of more energy in the residual (larger coefficient absolute levels).
48
+
49
+ At level 2 rate-distortion cost is used to make decimate decisions on each 4x4 coding group, including the cost of signaling the group within the group bitmap. If the total distortion of not signaling the entire coding group is less than the rate cost, the block is decimated. Next, it applies rate-distortion cost analysis to the last non-zero coefficient, which can result in many (or all) of the coding groups being decimated. Psy-rdoq is less effective at preserving energy when RDOQ is at level 2, since it only has influence over the level distortion costs.
50
51
.. option:: --tu-intra-depth <1..4>
52
53
54
55
.. option:: --me <integer|string>
56
57
- Motion search method. Generally, the higher the number the harder
58
- the ME method will try to find an optimal match. Diamond search is
59
- the simplest. Hexagon search is a little better. Uneven
60
- Multi-Hexagon is an adaption of the search method used by x264 for
61
- slower presets. Star is a three-step search adapted from the HM
62
- encoder: a star-pattern search followed by an optional radix scan
63
- followed by an optional star-search refinement. Full is an
64
- exhaustive search; an order of magnitude slower than all other
65
- searches but not much better than umh or star. SEA is similar to
66
- x264's ESA implementation and a speed optimization of full search.
67
- It is a three-step motion search where the DC calculation is
68
- followed by ADS calculation followed by SAD of the passed motion
69
- vector candidates.
70
+ Motion search method. Generally, the higher the number the harder the ME method
71
+ will try to find an optimal match. Diamond search is the simplest. Hexagon search
72
+ is a little better. Uneven Multi-Hexagon is an adaption of the search method used
73
+ by x264 for slower presets. Star is a three-step search adapted from the HM encoder: a
74
+ star-pattern search followed by an optional radix scan followed by an optional
75
+ star-search refinement. Full is an exhaustive search; an order of magnitude slower
76
+ than all other searches but not much better than umh or star. SEA is similar to x264's
77
+ ESA implementation and a speed optimization of full search. It is a three-step motion
78
+ search where the DC calculation is followed by ADS calculation followed by SAD of the
79
+ passed motion vector candidates.
80
81
0. dia
82
1. hex **(default)**
83
84
85
.. option:: --mcstf, --no-mcstf
86
87
- Enable Motion Compensated Temporal filtering.
88
+ Motion-compensated spatio-temporal filtering (MCSTF) improves the compression
89
+ efficiency of videos that contain a high level of noise. It introduces a
90
+ temporal filter before encoding and this filter is applied only to the I- and P-frames.
91
+ It utilizes previously generated motion vectors across different video content
92
+ resolutions to find the best temporal correspondence for low-pass filtering. Here,
93
+ motion estimation is applied between the central picture and each future or past
94
+ picture, thereby generating multiple motion-compensated predictions, which are then
95
+ combined by using adaptive filtering to produce a final noise-reduced picture.
96
Default: disabled
97
98
Spatial/intra options
99
100
whereas for the :option:`--scenecut`, inserts RADL at every scenecut.
101
Recommended value is 2-3. Default 0 (disabled).
102
103
- **Range of values: Between 0 and `--bframes`
104
+ **Range of values:** Between 0 and `--bframes`
105
106
.. option:: --ctu-info <0, 1, 2, 4, 6>
107
108
109
as *lslices*
110
111
**Values:** 0 - disabled. 1 is the same as 0. Max 16.
112
- Default: 8 for ultrafast, superfast, faster, fast, medium
113
- 4 for slow, slower
114
- disabled for veryslow, slower
115
+ Default: 8 for ultrafast, superfast, faster, fast, medium; 4 for slow, slower; disabled for veryslow, slower.
116
117
.. option:: --lookahead-threads <integer>
118
119
120
121
Values:
122
0 - flush the encoder only when all the input pictures are over.
123
- 1 - flush all the frames even when the input is not over.
124
- slicetype decision may change with this option.
125
+ 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.
126
2 - flush the slicetype decided frames only.
127
128
.. option:: --fades, --no-fades
129
130
Detect and handle fade-in regions. Default disabled.
131
132
+.. option:: --cra-nal
133
+
134
+	Force NAL type to CRA for all frames except the first; works only when :option:`--keyint` is 1.
135
+
136
Quality, rate control and rate distortion options
137
=================================================
138
139
140
0. disabled
141
1. AQ enabled
142
2. AQ enabled with auto-variance **(default)**
143
- 3. AQ enabled with auto-variance and bias to dark scenes. This is
144
- recommended for 8-bit encodes or low-bitrate 10-bit encodes, to
145
- prevent color banding/blocking.
146
+ 3. AQ enabled with auto-variance and bias to dark scenes. This is recommended for 8-bit encodes or low-bitrate 10-bit encodes, to prevent color banding/blocking.
147
4. AQ enabled with auto-variance and edge information.
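    For a feel of how the auto-variance modes above interact with :option:`--aq-strength`, here is a hedged C++ sketch of variance-based adaptive quantization; the pivot constant and the clamping range are assumptions and do not reflect x265's internal tuning::

        #include <cmath>

        // Rough sketch: flat blocks (low variance) get a negative QP offset so
        // they receive more bits, busy blocks a positive offset, scaled by the
        // configured AQ strength.
        static double aqQpOffset(double blockVariance, double aqStrength)
        {
            const double pivot = 14.0; // assumed "typical" log2 block energy
            double energy = std::log2(blockVariance + 1.0);
            double offset = aqStrength * (energy - pivot);
            if (offset > 7.0)  offset = 7.0;   // keep the delta-QP in a sane range
            if (offset < -7.0) offset = -7.0;
            return offset;
        }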
148
149
.. option:: --aq-strength <float>
150
151
Default 1.0.
152
**Range of values:** 0.0 to 3.0
153
154
-.. option:: --sbrc --no-sbrc
155
+.. option:: --sbrc, --no-sbrc
156
+
157
+ To enable and disable segment-based rate control. SBRC controls the overflow with
158
+ segment sizes, and it is based on the Capped CRF mode. Segment duration depends on
159
+ the keyframe interval specified. If unspecified, the default keyframe interval will
160
+ be used. Default: disabled. **Experimental Feature**
161
162
- To enable and disable segment based rate control.Segment duration depends on the
163
- keyframe interval specified.If unspecified,default keyframe interval will be used.
164
- Default: disabled.
165
166
.. option:: --hevc-aq
167
168
169
and also redundant steps are skipped.
170
In pass 1 analysis information like motion vector, depth, reference and prediction
171
modes of the final best CTU partition is stored for each CTU.
172
- Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`analysis-load`
173
+ Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`--analysis-load`
174
is enabled and both will be disabled when enabled together. This feature requires :option:`--pmode`/:option:`--pme`
175
to be disabled and hence pmode/pme will be disabled when enabled at the same time.
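    Purely as a hypothetical illustration, the C++ sketch below shows the kind of per-CTU record that pass 1 could store (motion vectors, depth, reference indices and prediction modes of the best partition); the field names and sizes are assumptions, and the real layout is the public x265_analysis_data structure in x265.h::

        #include <cstdint>

        // One record per 64x64 CTU, with one entry per 8x8 unit (8x8 grid = 64 units).
        struct CtuAnalysisSketch
        {
            int16_t mvx[64];      // best motion vector, x component, per 8x8 unit
            int16_t mvy[64];      // best motion vector, y component, per 8x8 unit
            uint8_t depth[64];    // CU depth chosen for each 8x8 unit
            int8_t  refIdx[64];   // reference picture index (-1 for intra)
            uint8_t predMode[64]; // intra / inter / skip decision
        };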
176
177
178
When :option:`--scenecut-aware-qp` is:
179
180
* 1 (Forward masking):
181
- --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
182
- or
183
- --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
184
- fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
185
- fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
186
+
187
+ --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
188
+
189
+ or
190
+
191
+ --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
192
+
193
* 2 (Backward masking):
194
- --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
195
- or
196
- --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
197
- bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
198
- bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
199
+
200
+ --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
201
x265_3.6.tar.gz/doc/reST/conf.py -> x265_4.0.tar.gz/doc/reST/conf.py
Changed
10
1
2
copyright = u'2014 MulticoreWare Inc'
3
4
# -- Options for HTML output ---------------------------------------------------
5
-html_theme = "default"
6
+html_theme = "sphinx_rtd_theme"
7
8
# One entry per manual page. List of tuples
9
# (source start file, name, description, authors, manual section).
10
x265_3.6.tar.gz/doc/reST/presets.rst -> x265_4.0.tar.gz/doc/reST/presets.rst
Changed
38
1
2
The presets adjust encoder parameters as shown in the following table.
3
Any parameters below that are specified in your command-line will be
4
changed from the value specified by the preset.
5
- 0. ultrafast
6
- 1. superfast
7
- 2. veryfast
8
- 3. faster
9
- 4. fast
10
- 5. medium **(default)**
11
- 6. slow
12
- 7. slower
13
- 8. veryslow
14
- 9. placebo
15
+
16
+ 0. ultrafast
17
+ 1. superfast
18
+ 2. veryfast
19
+ 3. faster
20
+ 4. fast
21
+ 5. medium **(default)**
22
+ 6. slow
23
+ 7. slower
24
+ 8. veryslow
25
+ 9. placebo
26
27
+-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
28
| preset | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
29
30
* :option:`--sao` 0
31
* :option:`--psy-rd` 4.0
32
* :option:`--psy-rdoq` 10.0
33
- * :option:`--recursion-skip` 0
34
+ * :option:`--rskip` 0
35
36
It also enables a specialised ratecontrol algorithm :option:`--rc-grain`
37
that strictly minimises QP fluctuations across frames, while still allowing
38
x265_3.6.tar.gz/doc/reST/releasenotes.rst -> x265_4.0.tar.gz/doc/reST/releasenotes.rst
Changed
116
1
2
Release Notes
3
*************
4
5
+Version 4.0
6
+===========
7
+
8
+Release date - 13th September, 2024.
9
+
10
+New feature
11
+-----------
12
+1. Alpha Channel feature.
13
+2. Screen Content Coding (SCC).
14
+3. MV-HEVC feature.
15
+
16
+Enhancements to existing features
17
+---------------------------------
18
+1. Added support for the VMAF v3.x.
19
+
20
+API changes
21
+-----------
22
+1. Add command line parameter for Alpha Channel feature :option:`--alpha`.
23
+2. Add command line parameter for SCC feature :option:`--scc 1`.
24
+3. Add command line parameters for the MV-HEVC feature :option:`--multiview-config "multiview_config.txt"`.
25
+
26
+Optimizations
27
+---------------------
28
+1. Arm SIMD optimizations: Several time-consuming scalar C functions now have SIMD implementations on Arm platforms. Existing Arm SIMD implementations have also been optimized. These optimizations result in up to 57% faster encoding compared to release 3.6.
29
+2. Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6 I8MM, and Armv9 SVE2 instruction set extensions. The following algorithms now have optimized SIMD implementations: SAD, SSE, DCT, SAO, convolution, quantization, intra_planar, intraFilter, intrapred DC and IDCT16x16.
30
+
31
+Bug fixes
32
+---------
33
+1. Fix for y4m pipe input broken.
34
+2. Fix SCC crash on multipass encode.
35
+3. Fix mcstf when :option:`--bframes` value was less than 5.
36
+4. Fix lowpass DCT for high bit depth.
37
+5. Added build support for Visual Studio 17.
38
+6. Fix issue in default code flow and memory leak.
39
+7. Framethreads tuning for Windows ARM devices.
40
+8. Fix scc crash on multipass encode.
41
+
42
+
43
Version 3.6
44
===========
45
46
47
48
New feature
49
-----------
50
-1. Segment based Ratecontrol (SBRC) feature
51
-2. Motion-Compensated Spatio-Temporal Filtering
52
-3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
53
-4. Histogram-Based Scene Change Detection
54
-5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
55
-6. Add temporal layer implementation(Hierarchical B-frame implementation)
56
-
57
+1. Segment based Ratecontrol (SBRC) feature.
58
+2. Motion-Compensated Spatio-Temporal Filtering.
59
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization).
60
+4. Histogram-Based Scene Change Detection.
61
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis (FGS).
62
+6. Add temporal layer implementation (Hierarchical B-frame implementation).
63
+
64
Enhancements to existing features
65
---------------------------------
66
-1. Added Dolby Vision 8.4 Profile Support
67
+1. Added Dolby Vision 8.4 Profile Support.
68
69
70
API changes
71
-----------
72
-1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
73
-2. Add command line parameter for mcstf feature: "--no-mctf".
74
-3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
75
-4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
76
-5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
77
-6. cli: add new option --cra-nal (Force nal type to CRA to all frames expect for the first frame, works only with keyint 1)
78
+1. Add command line parameter for SBRC feature :option:`--sbrc`.
79
+2. Add command line parameter for mcstf feature :option:`--mcstf`.
80
+3. Add command line parameters for the scene cut aware qp feature :option:`--scenecut-aware-qp` and :option:`--masking-strength`.
81
+4. Add command line parameters for Histogram-Based Scene Change Detection :option:`--hist-scenecut`.
82
+5. Add command line parameters for film grain characteristics as a SEI message to the bitstream :option:`--film-grain`.
83
+6. cli: add new option :option:`--cra-nal` (force NAL type to CRA for all frames except the first; works only when :option:`--keyint` is 1).
84
85
Optimizations
86
---------------------
87
-ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
88
-SVE/SVE2 optimizations
89
+1. ARM64 NEON optimizations: Several time-consuming C functions have been optimized for the target platform (aarch64). The overall performance increased by around 20%.
90
+2. SVE/SVE2 optimizations.
91
92
93
Bug fixes
94
---------
95
-1. Linux bug to utilize all the cores
96
-2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
97
-3. 32bit and 64bit builds generation for ARM
98
-4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
99
-5. Add x86 ASM implementation for subsampling luma
100
-6. Fix for abrladder segfault with load reuse level 1
101
-7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame
102
-8. Add MacOS aarch64 build support
103
-9. Fix boundary condition issue for Gaussian filter
104
+1. Linux bug to utilize all the cores.
105
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize.
106
+3. 32bit and 64bit builds generation for ARM.
107
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc.).
108
+5. Add x86 ASM implementation for subsampling luma.
109
+6. Fix for abrladder segfault with load reuse level 1.
110
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frames.
111
+8. Add MacOS aarch64 build support.
112
+9. Fix boundary condition issue for Gaussian filter.
113
114
115
Version 3.5
116
x265_3.6.tar.gz/doc/reST/svthevc.rst -> x265_4.0.tar.gz/doc/reST/svthevc.rst
Changed
19
1
2
3
.. _SvtHevc:
4
5
-x265 has support for open source HEVC encoder `SVT-HEVC <https://01.org/svt>`_
6
+x265 has support for open source HEVC encoder `SVT-HEVC <https://www.intel.com/content/www/us/en/developer/articles/technical/scalable-video-technology.html>`_
7
and can generate SVT-HEVC compliant bitstreams. SVT-HEVC encoder can be enabled at run time
8
using :option:`--svt`. Since SVT-HEVC params/CLI are not exposed outside, it has to be
9
configured only via x265 CLI options. The API's of SVT-HEVC are accessed through x265's API
10
11
12
**SVT-HEVC**
13
14
-1. Clone `SVT-HEVC <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file)
15
+1. Clone `SVT-HEVC-repo <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file)
16
2. Once build is successful, binaries can be found inside the *Bin* folder at its root directory ("/home/app/SVT-HEVC/Bin/Release/")
17
18
**x265**
19
x265_3.6.tar.gz/doc/reST/x265.rst -> x265_4.0.tar.gz/doc/reST/x265.rst
Changed
7
1
2
+:orphan:
3
+
4
x265 CLI Documentation
5
######################
6
7
x265_4.0.tar.gz/doc/requirements.txt
Added
5
1
2
+sphinx
3
+sphinx-rtd-theme
4
+# Add other dependencies here
5
x265_3.6.tar.gz/source/CMakeLists.txt -> x265_4.0.tar.gz/source/CMakeLists.txt
Changed
201
1
2
include(CheckFunctionExists)
3
include(CheckSymbolExists)
4
include(CheckCXXCompilerFlag)
5
+include(CheckCSourceCompiles)
6
+include(CheckCXXSourceCompiles)
7
8
option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF)
9
option(FPROFILE_USE "Compile executable using generated usage data" OFF)
10
11
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
12
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
13
# X265_BUILD must be incremented each time the public API is changed
14
-set(X265_BUILD 209)
15
+set(X265_BUILD 212)
16
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
17
"${PROJECT_BINARY_DIR}/x265.def")
18
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
19
20
set(ARM 1)
21
add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
22
elseif(ARM64MATCH GREATER "-1")
23
- #if(CROSS_COMPILE_ARM64)
24
- #message(STATUS "Cross compiling for ARM64 arch")
25
- #else()
26
- #set(CROSS_COMPILE_ARM64 0)
27
- #endif()
28
message(STATUS "Detected ARM64 target processor")
29
set(ARM64 1)
30
- add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
31
+
32
+ option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF)
33
+
34
+ # Options for cross compiling AArch64 optional extensions
35
+ option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF)
36
+ option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF)
37
+ option(CROSS_COMPILE_NEON_DOTPROD "Cross Compile for Neon DotProd Target" OFF)
38
+ option(CROSS_COMPILE_NEON_I8MM "Cross Compile for Neon I8MM Target" OFF)
39
else()
40
message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
41
message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
42
43
set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
44
endif()
45
endif()
46
- if(ARM64 OR CROSS_COMPILE_ARM64)
47
- find_package(Neon)
48
- find_package(SVE)
49
- find_package(SVE2)
50
- if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
51
- message(STATUS "Found SVE2")
52
- set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
53
- add_definitions(-DHAVE_SVE2)
54
- add_definitions(-DHAVE_SVE)
55
- add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
56
- elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
57
- message(STATUS "Found SVE")
58
- set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
59
- add_definitions(-DHAVE_SVE)
60
- add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
61
- elseif(CPU_HAS_NEON)
62
- message(STATUS "Found NEON")
63
- set(ARM_ARGS -fPIC -flax-vector-conversions)
64
- add_definitions(-DHAVE_NEON)
65
+ if(ARM64)
66
+ message(STATUS "Found Neon")
67
+ set(CPU_HAS_NEON 1)
68
+ add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON=1)
69
+
70
+ if(CROSS_COMPILE_ARM64)
71
+ # Handle cross-compilation options.
72
+ if(CROSS_COMPILE_NEON_DOTPROD)
73
+ set(CPU_HAS_NEON_DOTPROD 1)
74
+ endif()
75
+ if(CROSS_COMPILE_NEON_I8MM)
76
+ set(CPU_HAS_NEON_I8MM 1)
77
+ # Impose the constraint that Neon I8MM implies Neon DotProd.
78
+ set(CPU_HAS_NEON_DOTPROD 1)
79
+ endif()
80
+ if(CROSS_COMPILE_SVE)
81
+ set(CPU_HAS_SVE 1)
82
+ # Impose the constraint that SVE implies Neon DotProd and I8MM.
83
+ set(CPU_HAS_NEON_DOTPROD 1)
84
+ set(CPU_HAS_NEON_I8MM 1)
85
+ endif()
86
+ if(CROSS_COMPILE_SVE2)
87
+ set(CPU_HAS_SVE2 1)
88
+ # SVE2 implies SVE and Neon DotProd.
89
+ set(CPU_HAS_SVE 1)
90
+ set(CPU_HAS_NEON_DOTPROD 1)
91
+ # Impose the constraint that SVE2 implies Neon I8MM.
92
+ set(CPU_HAS_NEON_I8MM 1)
93
+ endif()
94
else()
95
- set(ARM_ARGS -fPIC -flax-vector-conversions)
96
- endif()
97
+ if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
98
+ find_package(NEON_DOTPROD)
99
+ find_package(NEON_I8MM)
100
+ find_package(SVE)
101
+ find_package(SVE2)
102
+ else()
103
+ message(STATUS "Compile time feature detection unsupported on this platform")
104
+ endif()
105
+ endif()
106
+
107
+ if(CPU_HAS_NEON_DOTPROD)
108
+ # Neon DotProd is mandatory from Armv8.4.
109
+ message(STATUS "Found Neon DotProd")
110
+ set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
111
+ add_definitions(-DHAVE_NEON_DOTPROD=1)
112
+ endif()
113
+ if(CPU_HAS_NEON_I8MM)
114
+ # Neon I8MM is mandatory from Armv8.6.
115
+ message(STATUS "Found Neon I8MM")
116
+ # Impose the constraint that Neon I8MM implies Neon DotProd.
117
+ if(NOT CPU_HAS_NEON_DOTPROD)
118
+ message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)")
119
+ endif()
120
+ set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
121
+ add_definitions(-DHAVE_NEON_I8MM=1)
122
+ endif()
123
+ if(CPU_HAS_SVE)
124
+ message(STATUS "Found SVE")
125
+ # Impose the constraint that SVE implies Neon I8MM.
126
+ if(NOT CPU_HAS_NEON_I8MM)
127
+ message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)")
128
+ endif()
129
+ set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
130
+ add_definitions(-DHAVE_SVE=1)
131
+ endif()
132
+ if(CPU_HAS_SVE2)
133
+ message(STATUS "Found SVE2")
134
+ # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
135
+ set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
136
+ add_definitions(-DHAVE_SVE2=1)
137
+ endif()
138
+ set(ARM_ARGS ${ARM_ARGS} -fPIC)
139
+ # Do not allow implicit vector type conversions in Clang builds (this
140
+ # is already the default in GCC builds).
141
+ check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE)
142
+ if(CC_HAS_FLAX_VEC_CONV_NONE)
143
+ set(ARM_ARGS ${ARM_ARGS} -flax-vector-conversions=none)
144
+ endif()
145
+ if(CPU_HAS_SVE)
146
+ set(SVE_HEADER_TEST "
147
+#ifndef __ARM_NEON_SVE_BRIDGE
148
+#error 1
149
+#endif
150
+#include <arm_sve.h>
151
+#include <arm_neon_sve_bridge.h>
152
+int main() { return 0; }")
153
+ set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
154
+ # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas
155
+ # ARM_ARGS is defined and used elsewhere as a ;-list.
156
+ foreach(ARM_ARG ${ARM_ARGS})
157
+ string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}")
158
+ endforeach()
159
+ check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED)
160
+ check_cxx_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_CXX_TEST_COMPILED)
161
+ set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
162
+ if(SVE_HEADER_C_TEST_COMPILED AND SVE_HEADER_CXX_TEST_COMPILED)
163
+ add_definitions(-DHAVE_SVE_BRIDGE=1)
164
+ set(HAVE_SVE_BRIDGE 1)
165
+ endif()
166
+ endif()
167
endif()
168
if(ENABLE_PIC)
169
list(APPEND ARM_ARGS -DPIC)
170
171
if (CC_HAS_FAST_MATH)
172
add_definitions(-ffast-math)
173
endif()
174
- check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN)
175
- if (CC_HAS_STACK_REALIGN)
176
- add_definitions(-mstackrealign)
177
+ if (NOT (ARM64 OR CROSS_COMPILE_ARM64))
178
+ check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN)
179
+ if (CC_HAS_STACK_REALIGN)
180
+ add_definitions(-mstackrealign)
181
+ endif()
182
endif()
183
# Disable exceptions. Reduce executable size, increase compability.
184
check_cxx_compiler_flag(-fno-exceptions CC_HAS_FNO_EXCEPTIONS_FLAG)
185
186
add_definitions(-DDETAILED_CU_STATS)
187
endif(DETAILED_CU_STATS)
188
189
+option(ENABLE_ALPHA "Enable alpha encoding in x265" OFF)
190
+if(ENABLE_ALPHA)
191
+ add_definitions(-DENABLE_ALPHA)
192
+endif()
193
+
194
+option(ENABLE_MULTIVIEW "Enable Multi-view encoding in HEVC" OFF)
195
+if(ENABLE_MULTIVIEW)
196
+ add_definitions(-DENABLE_MULTIVIEW)
197
+endif()
198
+
199
+option(ENABLE_SCC_EXT "Enable screen content coding extension in HEVC" OFF)
200
+if(ENABLE_SCC_EXT)
201
x265_3.6.tar.gz/source/abrEncApp.cpp -> x265_4.0.tar.gz/source/abrEncApp.cpp
Changed
201
1
2
m_passEnci->init(ret);
3
}
4
5
+ m_numInputViews = m_passEnc0->m_param->numViews;
6
if (!allocBuffers())
7
{
8
x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
9
10
11
bool AbrEncoder::allocBuffers()
12
{
13
+#if ENABLE_MULTIVIEW
14
+ m_inputPicBuffer = X265_MALLOC(x265_picture**, MAX_VIEWS);
15
+#else
16
m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
17
+#endif
18
m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
19
20
m_picWriteCnt = new ThreadSafeIntegerm_numEncodes;
21
22
m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
23
m_readFlag = X265_MALLOC(int*, m_numEncodes);
24
25
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
26
+#if ENABLE_MULTIVIEW
27
+ if (m_passEnc0->m_param->numViews > 1)
28
{
29
- m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize);
30
- for (uint32_t idx = 0; idx < m_queueSize; idx++)
31
+ for (uint8_t pass = 0; pass < m_numInputViews; pass++)
32
{
33
- m_inputPicBufferpassidx = x265_picture_alloc();
34
- x265_picture_init(m_passEncpass->m_param, m_inputPicBufferpassidx);
35
+ m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize);
36
+ for (uint32_t idx = 0; idx < m_queueSize; idx++)
37
+ {
38
+ m_inputPicBufferpassidx = x265_picture_alloc();
39
+ x265_picture_init(m_passEnc0->m_param, m_inputPicBufferpassidx);
40
+ }
41
+ if (pass == 0)
42
+ {
43
+ CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize);
44
+ m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize;
45
+ m_analysisWritepass = new ThreadSafeIntegerm_queueSize;
46
+ m_analysisReadpass = new ThreadSafeIntegerm_queueSize;
47
+ m_readFlagpass = X265_MALLOC(int, m_queueSize);
48
+ }
49
}
50
+ }
51
+ else
52
+ {
53
+#endif
54
+ for (uint8_t pass = 0; pass < m_numEncodes; pass++)
55
+ {
56
+ m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize);
57
+ for (uint32_t idx = 0; idx < m_queueSize; idx++)
58
+ {
59
+ m_inputPicBufferpassidx = x265_picture_alloc();
60
+ x265_picture_init(m_passEncpass->m_param, m_inputPicBufferpassidx);
61
+ }
62
63
- CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize);
64
- m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize;
65
- m_analysisWritepass = new ThreadSafeIntegerm_queueSize;
66
- m_analysisReadpass = new ThreadSafeIntegerm_queueSize;
67
- m_readFlagpass = X265_MALLOC(int, m_queueSize);
68
+ CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize);
69
+ m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize;
70
+ m_analysisWritepass = new ThreadSafeIntegerm_queueSize;
71
+ m_analysisReadpass = new ThreadSafeIntegerm_queueSize;
72
+ m_readFlagpass = X265_MALLOC(int, m_queueSize);
73
+ }
74
+#if ENABLE_MULTIVIEW
75
}
76
+#endif
77
return true;
78
fail:
79
return false;
80
81
void AbrEncoder::destroy()
82
{
83
x265_cleanup(); /* Free library singletons */
84
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
85
+#if ENABLE_MULTIVIEW
86
+ for (uint8_t pass = 0; pass < m_numInputViews; pass++)
87
{
88
for (uint32_t index = 0; index < m_queueSize; index++)
89
{
90
X265_FREE(m_inputPicBufferpassindex->planes0);
91
x265_picture_free(m_inputPicBufferpassindex);
92
}
93
+ X265_FREE(m_inputPicBufferpass);
94
95
+ if (pass == 0)
96
+ {
97
+ X265_FREE(m_analysisBufferpass);
98
+ X265_FREE(m_readFlagpass);
99
+ delete m_picIdxReadCntpass;
100
+ delete m_analysisWritepass;
101
+ delete m_analysisReadpass;
102
+ m_passEncpass->destroy();
103
+ delete m_passEncpass;
104
+ }
105
+ }
106
+#else
107
+ for (uint8_t pass = 0; pass < m_numEncodes; pass++)
108
+ {
109
+ for (uint32_t index = 0; index < m_queueSize; index++)
110
+ {
111
+ X265_FREE(m_inputPicBufferpassindex->planes0);
112
+ x265_picture_free(m_inputPicBufferpassindex);
113
+ }
114
X265_FREE(m_inputPicBufferpass);
115
+
116
X265_FREE(m_analysisBufferpass);
117
X265_FREE(m_readFlagpass);
118
delete m_picIdxReadCntpass;
119
120
m_passEncpass->destroy();
121
delete m_passEncpass;
122
}
123
+#endif
124
X265_FREE(m_inputPicBuffer);
125
X265_FREE(m_analysisBuffer);
126
X265_FREE(m_readFlag);
127
128
m_id = id;
129
m_cliopt = cliopt;
130
m_parent = parent;
131
- if(!(m_cliopt.enableScaler && m_id))
132
- m_input = m_cliopt.input;
133
+ if (!(m_cliopt.enableScaler && m_id))
134
+ {
135
+ for (int view = 0; view < m_cliopt.param->numViews; view++)
136
+ m_inputview = m_cliopt.inputview;
137
+ }
138
m_param = cliopt.param;
139
m_inputOver = false;
140
m_lastIdx = -1;
141
142
{
143
x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
144
m_ret = 2;
145
+ m_reader = NULL;
146
return -1;
147
}
148
149
150
}
151
152
153
- bool PassEncoder::readPicture(x265_picture *dstPic)
154
+ bool PassEncoder::readPicture(x265_picture* dstPic, int view)
155
{
156
/*Check and wait if there any input frames to read*/
157
int ipread = m_parent->m_picReadCntm_id.get();
158
159
}
160
161
162
- x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBufferm_idreadPos);
163
+ x265_picture* srcPic = (m_param->numViews > 1) ? (x265_picture*)(m_parent->m_inputPicBufferviewreadPos) : (x265_picture*)(m_parent->m_inputPicBufferm_idreadPos);
164
165
x265_picture *pic = (x265_picture*)(dstPic);
166
pic->colorSpace = srcPic->colorSpace;
167
168
pic->planes0 = srcPic->planes0;
169
pic->planes1 = srcPic->planes1;
170
pic->planes2 = srcPic->planes2;
171
+ pic->planes3 = srcPic->planes3;
172
+ pic->format = srcPic->format;
173
if (isAbrLoad)
174
pic->analysisData = *analysisData;
175
return true;
176
177
x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
178
strerror(errno), profileName);
179
180
- x265_picture pic_orig, pic_out;
181
- x265_picture *pic_in = &pic_orig;
182
+ x265_picture pic_origMAX_VIEWS;
183
+ x265_picture *pic_inMAX_VIEWS;
184
+ for (int view = 0; view < m_param->numViews; view++)
185
+ pic_inview = &pic_origview;
186
/* Allocate recon picture if analysis save/load is enabled */
187
std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
188
- x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
189
+ x265_picture* pic_reconMAX_LAYERS;
190
+ x265_picture pic_outMAX_LAYERS;
191
+
192
+ for (int i = 0; i < m_param->numLayers; i++)
193
+ pic_reconi = (m_cliopt.reconi || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_outi : NULL;
194
uint32_t inFrameCount = 0;
195
uint32_t outFrameCount = 0;
196
x265_nal *p_nal;
197
198
uint8_t *rpuPayload = NULL;
199
int inputPicNum = 1;
200
x265_picture picField1, picField2;
201
x265_3.6.tar.gz/source/abrEncApp.h -> x265_4.0.tar.gz/source/abrEncApp.h
Changed
36
1
2
{
3
public:
4
uint8_t m_numEncodes;
5
+ uint8_t m_numInputViews; // Number of inputs for multiview-extension
6
PassEncoder **m_passEnc;
7
uint32_t m_queueSize;
8
ThreadSafeInteger m_numActiveEncodes;
9
10
x265_picture **m_outputRecon;
11
12
CLIOptions m_cliopt;
13
- InputFile* m_input;
14
+ InputFile* m_inputMAX_VIEWS;
15
const char* m_reconPlayCmd;
16
FILE* m_qpfile;
17
FILE* m_zoneFile;
18
19
void startThreads();
20
void copyInfo(x265_analysis_data *src);
21
22
- bool readPicture(x265_picture*);
23
+ bool readPicture(x265_picture*, int view);
24
void destroy();
25
26
private:
27
28
public:
29
PassEncoder *m_parentEnc;
30
int m_id;
31
- InputFile* m_input;
32
+ InputFile* m_inputMAX_VIEWS;
33
int m_threadActive;
34
35
Reader(int id, PassEncoder *parentEnc);
36
x265_4.0.tar.gz/source/cmake/FindNEON_DOTPROD.cmake
Added
23
1
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check if Armv8.4 Neon DotProd is supported by the Arm CPU
5
+if(APPLE)
6
+ execute_process(COMMAND sysctl -a
7
+ COMMAND grep "hw.optional.arm.FEAT_DotProd: 1"
8
+ OUTPUT_VARIABLE has_dot_product
9
+ ERROR_QUIET
10
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+ execute_process(COMMAND cat /proc/cpuinfo
13
+ COMMAND grep Features
14
+ COMMAND grep asimddp
15
+ OUTPUT_VARIABLE has_dot_product
16
+ ERROR_QUIET
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(has_dot_product)
21
+ set(CPU_HAS_NEON_DOTPROD 1)
22
+endif()
23
x265_4.0.tar.gz/source/cmake/FindNEON_I8MM.cmake
Added
23
1
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check if Armv8.6 Neon I8MM is supported by the Arm CPU
5
+if(APPLE)
6
+ execute_process(COMMAND sysctl -a
7
+ COMMAND grep "hw.optional.arm.FEAT_I8MM: 1"
8
+ OUTPUT_VARIABLE has_i8mm
9
+ ERROR_QUIET
10
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+ execute_process(COMMAND cat /proc/cpuinfo
13
+ COMMAND grep Features
14
+ COMMAND grep i8mm
15
+ OUTPUT_VARIABLE has_i8mm
16
+ ERROR_QUIET
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(has_i8mm)
21
+ set(CPU_HAS_NEON_I8MM 1)
22
+endif()
23
x265_3.6.tar.gz/source/common/CMakeLists.txt -> x265_4.0.tar.gz/source/common/CMakeLists.txt
Changed
64
1
2
add_definitions(-DAUTO_VECTORIZE=1)
3
endif()
4
5
- set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
6
+ set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
7
+ set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
8
+ set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
9
+ set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
10
+ set(C_SRCS_SVE2 sao-prim-sve2.cpp)
11
enable_language(ASM)
12
13
# add ARM assembly/intrinsic files here
14
- set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
15
- set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
16
- set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
17
+ set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
18
+ set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
19
+ set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
20
+ set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
21
set(VEC_PRIMITIVES)
22
23
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
24
set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
25
set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
26
- foreach(SRC ${C_SRCS})
27
+ set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
28
+ foreach(SRC ${C_SRCS_NEON})
29
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
30
endforeach()
31
+
32
+ if(CPU_HAS_NEON_I8MM)
33
+ foreach(SRC ${C_SRCS_NEON_I8MM})
34
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
35
+ endforeach()
36
+ endif()
37
+
38
+ if(CPU_HAS_NEON_DOTPROD)
39
+ foreach(SRC ${C_SRCS_NEON_DOTPROD})
40
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
41
+ endforeach()
42
+ endif()
43
+
44
+ if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
45
+ foreach(SRC ${C_SRCS_SVE})
46
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
47
+ endforeach()
48
+ endif()
49
+
50
+ if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
51
+ foreach(SRC ${C_SRCS_SVE2})
52
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
53
+ endforeach()
54
+ endif()
55
+
56
source_group(Assembly FILES ${ASM_PRIMITIVES})
57
+
58
+ if(AARCH64_WARNINGS_AS_ERRORS)
59
+ set_source_files_properties(${ASM_PRIMITIVES} PROPERTIES COMPILE_FLAGS -Werror)
60
+ endif()
61
endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
62
63
if(POWER)
64
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.cpp
Changed
201
1
2
#include "arm64-utils.h"
3
#include <arm_neon.h>
4
5
-#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
6
namespace X265_NS
7
{
8
9
10
11
void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
12
{
13
- uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
14
- uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
15
-
16
- a0 = *(uint8x8_t *)(src + 0 * sstride);
17
- a1 = *(uint8x8_t *)(src + 1 * sstride);
18
- a2 = *(uint8x8_t *)(src + 2 * sstride);
19
- a3 = *(uint8x8_t *)(src + 3 * sstride);
20
- a4 = *(uint8x8_t *)(src + 4 * sstride);
21
- a5 = *(uint8x8_t *)(src + 5 * sstride);
22
- a6 = *(uint8x8_t *)(src + 6 * sstride);
23
- a7 = *(uint8x8_t *)(src + 7 * sstride);
24
-
25
- b0 = vtrn1_u32(a0, a4);
26
- b1 = vtrn1_u32(a1, a5);
27
- b2 = vtrn1_u32(a2, a6);
28
- b3 = vtrn1_u32(a3, a7);
29
- b4 = vtrn2_u32(a0, a4);
30
- b5 = vtrn2_u32(a1, a5);
31
- b6 = vtrn2_u32(a2, a6);
32
- b7 = vtrn2_u32(a3, a7);
33
-
34
- a0 = vtrn1_u16(b0, b2);
35
- a1 = vtrn1_u16(b1, b3);
36
- a2 = vtrn2_u16(b0, b2);
37
- a3 = vtrn2_u16(b1, b3);
38
- a4 = vtrn1_u16(b4, b6);
39
- a5 = vtrn1_u16(b5, b7);
40
- a6 = vtrn2_u16(b4, b6);
41
- a7 = vtrn2_u16(b5, b7);
42
-
43
- b0 = vtrn1_u8(a0, a1);
44
- b1 = vtrn2_u8(a0, a1);
45
- b2 = vtrn1_u8(a2, a3);
46
- b3 = vtrn2_u8(a2, a3);
47
- b4 = vtrn1_u8(a4, a5);
48
- b5 = vtrn2_u8(a4, a5);
49
- b6 = vtrn1_u8(a6, a7);
50
- b7 = vtrn2_u8(a6, a7);
51
-
52
- *(uint8x8_t *)(dst + 0 * dstride) = b0;
53
- *(uint8x8_t *)(dst + 1 * dstride) = b1;
54
- *(uint8x8_t *)(dst + 2 * dstride) = b2;
55
- *(uint8x8_t *)(dst + 3 * dstride) = b3;
56
- *(uint8x8_t *)(dst + 4 * dstride) = b4;
57
- *(uint8x8_t *)(dst + 5 * dstride) = b5;
58
- *(uint8x8_t *)(dst + 6 * dstride) = b6;
59
- *(uint8x8_t *)(dst + 7 * dstride) = b7;
60
+ uint8x8_t a0 = vld1_u8(src + 0 * sstride);
61
+ uint8x8_t a1 = vld1_u8(src + 1 * sstride);
62
+ uint8x8_t a2 = vld1_u8(src + 2 * sstride);
63
+ uint8x8_t a3 = vld1_u8(src + 3 * sstride);
64
+ uint8x8_t a4 = vld1_u8(src + 4 * sstride);
65
+ uint8x8_t a5 = vld1_u8(src + 5 * sstride);
66
+ uint8x8_t a6 = vld1_u8(src + 6 * sstride);
67
+ uint8x8_t a7 = vld1_u8(src + 7 * sstride);
68
+
69
+ uint32x2_t b0 = vtrn1_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
70
+ uint32x2_t b1 = vtrn1_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
71
+ uint32x2_t b2 = vtrn1_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
72
+ uint32x2_t b3 = vtrn1_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
73
+ uint32x2_t b4 = vtrn2_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
74
+ uint32x2_t b5 = vtrn2_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
75
+ uint32x2_t b6 = vtrn2_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
76
+ uint32x2_t b7 = vtrn2_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
77
+
78
+ uint16x4_t c0 = vtrn1_u16(vreinterpret_u16_u32(b0),
79
+ vreinterpret_u16_u32(b2));
80
+ uint16x4_t c1 = vtrn1_u16(vreinterpret_u16_u32(b1),
81
+ vreinterpret_u16_u32(b3));
82
+ uint16x4_t c2 = vtrn2_u16(vreinterpret_u16_u32(b0),
83
+ vreinterpret_u16_u32(b2));
84
+ uint16x4_t c3 = vtrn2_u16(vreinterpret_u16_u32(b1),
85
+ vreinterpret_u16_u32(b3));
86
+ uint16x4_t c4 = vtrn1_u16(vreinterpret_u16_u32(b4),
87
+ vreinterpret_u16_u32(b6));
88
+ uint16x4_t c5 = vtrn1_u16(vreinterpret_u16_u32(b5),
89
+ vreinterpret_u16_u32(b7));
90
+ uint16x4_t c6 = vtrn2_u16(vreinterpret_u16_u32(b4),
91
+ vreinterpret_u16_u32(b6));
92
+ uint16x4_t c7 = vtrn2_u16(vreinterpret_u16_u32(b5),
93
+ vreinterpret_u16_u32(b7));
94
+
95
+ uint8x8_t d0 = vtrn1_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
96
+ uint8x8_t d1 = vtrn2_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
97
+ uint8x8_t d2 = vtrn1_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
98
+ uint8x8_t d3 = vtrn2_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
99
+ uint8x8_t d4 = vtrn1_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
100
+ uint8x8_t d5 = vtrn2_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
101
+ uint8x8_t d6 = vtrn1_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
102
+ uint8x8_t d7 = vtrn2_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
103
+
104
+ vst1_u8(dst + 0 * dstride, d0);
105
+ vst1_u8(dst + 1 * dstride, d1);
106
+ vst1_u8(dst + 2 * dstride, d2);
107
+ vst1_u8(dst + 3 * dstride, d3);
108
+ vst1_u8(dst + 4 * dstride, d4);
109
+ vst1_u8(dst + 5 * dstride, d5);
110
+ vst1_u8(dst + 6 * dstride, d6);
111
+ vst1_u8(dst + 7 * dstride, d7);
112
}
113
114
115
116
117
void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
118
{
119
- uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
120
- uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
121
- uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
122
- uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
123
-
124
- a0 = *(uint16x8_t *)(src + 0 * sstride);
125
- a1 = *(uint16x8_t *)(src + 1 * sstride);
126
- a2 = *(uint16x8_t *)(src + 2 * sstride);
127
- a3 = *(uint16x8_t *)(src + 3 * sstride);
128
- a4 = *(uint16x8_t *)(src + 4 * sstride);
129
- a5 = *(uint16x8_t *)(src + 5 * sstride);
130
- a6 = *(uint16x8_t *)(src + 6 * sstride);
131
- a7 = *(uint16x8_t *)(src + 7 * sstride);
132
- a8 = *(uint16x8_t *)(src + 8 * sstride);
133
- a9 = *(uint16x8_t *)(src + 9 * sstride);
134
- aA = *(uint16x8_t *)(src + 10 * sstride);
135
- aB = *(uint16x8_t *)(src + 11 * sstride);
136
- aC = *(uint16x8_t *)(src + 12 * sstride);
137
- aD = *(uint16x8_t *)(src + 13 * sstride);
138
- aE = *(uint16x8_t *)(src + 14 * sstride);
139
- aF = *(uint16x8_t *)(src + 15 * sstride);
140
-
141
- b0 = vtrn1q_u64(a0, a8);
142
- b1 = vtrn1q_u64(a1, a9);
143
- b2 = vtrn1q_u64(a2, aA);
144
- b3 = vtrn1q_u64(a3, aB);
145
- b4 = vtrn1q_u64(a4, aC);
146
- b5 = vtrn1q_u64(a5, aD);
147
- b6 = vtrn1q_u64(a6, aE);
148
- b7 = vtrn1q_u64(a7, aF);
149
- b8 = vtrn2q_u64(a0, a8);
150
- b9 = vtrn2q_u64(a1, a9);
151
- bA = vtrn2q_u64(a2, aA);
152
- bB = vtrn2q_u64(a3, aB);
153
- bC = vtrn2q_u64(a4, aC);
154
- bD = vtrn2q_u64(a5, aD);
155
- bE = vtrn2q_u64(a6, aE);
156
- bF = vtrn2q_u64(a7, aF);
157
-
158
- c0 = vtrn1q_u32(b0, b4);
159
- c1 = vtrn1q_u32(b1, b5);
160
- c2 = vtrn1q_u32(b2, b6);
161
- c3 = vtrn1q_u32(b3, b7);
162
- c4 = vtrn2q_u32(b0, b4);
163
- c5 = vtrn2q_u32(b1, b5);
164
- c6 = vtrn2q_u32(b2, b6);
165
- c7 = vtrn2q_u32(b3, b7);
166
- c8 = vtrn1q_u32(b8, bC);
167
- c9 = vtrn1q_u32(b9, bD);
168
- cA = vtrn1q_u32(bA, bE);
169
- cB = vtrn1q_u32(bB, bF);
170
- cC = vtrn2q_u32(b8, bC);
171
- cD = vtrn2q_u32(b9, bD);
172
- cE = vtrn2q_u32(bA, bE);
173
- cF = vtrn2q_u32(bB, bF);
174
-
175
- d0 = vtrn1q_u16(c0, c2);
176
- d1 = vtrn1q_u16(c1, c3);
177
- d2 = vtrn2q_u16(c0, c2);
178
- d3 = vtrn2q_u16(c1, c3);
179
- d4 = vtrn1q_u16(c4, c6);
180
- d5 = vtrn1q_u16(c5, c7);
181
- d6 = vtrn2q_u16(c4, c6);
182
- d7 = vtrn2q_u16(c5, c7);
183
- d8 = vtrn1q_u16(c8, cA);
184
- d9 = vtrn1q_u16(c9, cB);
185
- dA = vtrn2q_u16(c8, cA);
186
- dB = vtrn2q_u16(c9, cB);
187
- dC = vtrn1q_u16(cC, cE);
188
- dD = vtrn1q_u16(cD, cF);
189
- dE = vtrn2q_u16(cC, cE);
190
- dF = vtrn2q_u16(cD, cF);
191
-
192
- *(uint16x8_t *)(dst + 0 * dstride) = vtrn1q_u8(d0, d1);
193
- *(uint16x8_t *)(dst + 1 * dstride) = vtrn2q_u8(d0, d1);
194
- *(uint16x8_t *)(dst + 2 * dstride) = vtrn1q_u8(d2, d3);
195
- *(uint16x8_t *)(dst + 3 * dstride) = vtrn2q_u8(d2, d3);
196
- *(uint16x8_t *)(dst + 4 * dstride) = vtrn1q_u8(d4, d5);
197
- *(uint16x8_t *)(dst + 5 * dstride) = vtrn2q_u8(d4, d5);
198
- *(uint16x8_t *)(dst + 6 * dstride) = vtrn1q_u8(d6, d7);
199
- *(uint16x8_t *)(dst + 7 * dstride) = vtrn2q_u8(d6, d7);
200
- *(uint16x8_t *)(dst + 8 * dstride) = vtrn1q_u8(d8, d9);
201
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.h
Changed
9
1
2
#ifndef __ARM64_UTILS_H__
3
#define __ARM64_UTILS_H__
4
5
+#include <stdint.h>
6
7
namespace X265_NS
8
{
9
x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_4.0.tar.gz/source/common/aarch64/asm-primitives.cpp
Changed
201
1
2
p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
3
p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
4
p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
5
-#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
6
- p.cuBLOCK_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
7
- p.cuBLOCK_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
8
- p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
9
- p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
10
#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
11
p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
12
#define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu)
13
-#define LUMA_TU_NEON(prim, fname) LUMA_TU_TYPED_NEON(prim, , fname)
14
#define LUMA_TU_CAN_USE_SVE(prim, fname) LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
15
16
#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
17
18
p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
19
p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
20
p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
21
-#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
22
- p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
23
- p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
24
- p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu)
25
-#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
26
- p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
27
- p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
28
- p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
29
- p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
30
- p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
31
- p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
32
- p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
33
- p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
34
- p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
35
- p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
36
- p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
37
- p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
38
- p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
39
- p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
40
- p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
41
- p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
42
- p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
43
- p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
44
- p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
45
- p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
46
- p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
47
- p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
48
-#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
49
- p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
50
- p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
51
- p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
52
- p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
53
- p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
54
- p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
55
- p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
56
- p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
57
- p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
58
- p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
59
- p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
60
- p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \
61
- p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
62
- p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
63
- p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
64
- p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
65
#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
66
p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
67
p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
68
69
p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## sve); \
70
p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
71
p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
72
-#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
73
- p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
74
- p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
75
- p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
76
- p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
77
- p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
78
- p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
79
- p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
80
- p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
81
- p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
82
- p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \
83
- p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
84
- p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
85
- p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
86
#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
87
p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
88
p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
89
90
p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
91
p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
92
p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
93
-#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
94
- p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
95
- p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
96
- p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon)
97
#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
98
p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## sve2); \
99
p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
100
101
p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
102
p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
103
p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
104
-#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
105
- p.puLUMA_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
106
- p.puLUMA_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
107
- p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
108
- p.puLUMA_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
109
- p.puLUMA_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
110
- p.puLUMA_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
111
- p.puLUMA_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
112
- p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
113
- p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
114
- p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
115
- p.puLUMA_16x4.prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
116
- p.puLUMA_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
117
- p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
118
- p.puLUMA_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
119
- p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
120
#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
121
p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
122
p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
123
124
p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
125
p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
126
p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
127
+#define LUMA_PU_TYPED_MULTIPLE_16(prim, fncdef, fname, cpu) \
128
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
129
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
130
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
131
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
132
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
133
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
134
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
135
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
136
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
137
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
138
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
139
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
140
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
141
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
142
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
143
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
144
#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
145
-#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
146
-#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
147
-#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
148
#define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
149
-#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
150
#define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
151
-#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
152
#define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
153
-#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
154
#define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
155
+#define LUMA_PU_MULTIPLE_16(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_16(prim, , fname, cpu)
156
157
158
#define ALL_LUMA_PU_T(prim, fname) \
159
160
p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
161
p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
162
p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu)
163
-#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname) \
164
- p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
165
- p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## neon); \
166
- p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
167
- p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim = fncdef PFX(fname ## _6x8_ ## neon); \
168
- p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
169
- p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
170
- p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## neon); \
171
- p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
172
- p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## neon); \
173
- p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
174
- p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
175
- p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
176
- p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim = fncdef PFX(fname ## _2x4_ ## neon); \
177
- p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
178
- p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
179
- p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
180
- p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
181
- p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(fname ## _8x6_ ## neon); \
182
- p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(fname ## _8x2_ ## neon); \
183
- p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim = fncdef PFX(fname ## _2x8_ ## neon); \
184
- p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
185
- p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon)
186
#define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
187
p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
188
p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
189
-#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname) \
190
- p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
191
- p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## neon); \
192
- p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
193
- p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon)
194
#define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu) \
195
p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
196
p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
197
198
p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
199
p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
200
p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu)
201
x265_3.6.tar.gz/source/common/aarch64/asm.S -> x265_4.0.tar.gz/source/common/aarch64/asm.S
Changed
40
1
2
3
#define PFX_C(name) JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
4
5
+// Alignment of stack arguments of size less than 8 bytes.
6
+#ifdef __APPLE__
7
+#define STACK_ARG_ALIGNMENT 4
8
+#else
9
+#define STACK_ARG_ALIGNMENT 8
10
+#endif
11
+
12
+// Get offset from SP of stack argument at index `idx`.
13
+#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT)
14
+
15
#ifdef __APPLE__
16
.macro endfunc
17
ELF .size \name, . - \name
18
19
vtrn \t3, \t4, \s3, \s4
20
.endm
21
22
-#endif
23
\ No newline at end of file
24
+
25
+.macro push_vec_regs
26
+ stp d8, d9, sp,#-16!
27
+ stp d10, d11, sp,#-16!
28
+ stp d12, d13, sp,#-16!
29
+ stp d14, d15, sp,#-16!
30
+.endm
31
+
32
+.macro pop_vec_regs
33
+ ldp d14, d15, sp, #16
34
+ ldp d12, d13, sp, #16
35
+ ldp d10, d11, sp, #16
36
+ ldp d8, d9, sp, #16
37
+.endm
38
+
39
+#endif
40
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8-sve.S
Changed
201
1
2
lsl x3, x3, #1
3
movrel x11, xtn_xtn2_table
4
ld1 {v31.16b}, x11
5
-.loop_csp32_sve:
6
+.Loop_csp32_sve:
7
sub w12, w12, #1
8
.rept 4
9
ld1 {v0.8h-v3.8h}, x2, x3
10
11
st1 {v0.16b-v1.16b}, x0, x1
12
st1 {v2.16b-v3.16b}, x0, x1
13
.endr
14
- cbnz w12, .loop_csp32_sve
15
+ cbnz w12, .Loop_csp32_sve
16
ret
17
.vl_gt_16_blockcopy_sp_32_32:
18
cmp x9, #48
19
20
bgt .vl_gt_16_blockcopy_ps_32_32
21
lsl x1, x1, #1
22
mov w12, #4
23
-.loop_cps32_sve:
24
+.Loop_cps32_sve:
25
sub w12, w12, #1
26
.rept 4
27
ld1 {v16.16b-v17.16b}, x2, x3
28
29
st1 {v0.8h-v3.8h}, x0, x1
30
st1 {v4.8h-v7.8h}, x0, x1
31
.endr
32
- cbnz w12, .loop_cps32_sve
33
+ cbnz w12, .Loop_cps32_sve
34
ret
35
.vl_gt_16_blockcopy_ps_32_32:
36
cmp x9, #48
37
38
lsl x1, x1, #1
39
sub x1, x1, #64
40
mov w12, #16
41
-.loop_cps64_sve:
42
+.Loop_cps64_sve:
43
sub w12, w12, #1
44
.rept 4
45
ld1 {v16.16b-v19.16b}, x2, x3
46
47
st1 {v0.8h-v3.8h}, x0, #64
48
st1 {v4.8h-v7.8h}, x0, x1
49
.endr
50
- cbnz w12, .loop_cps64_sve
51
+ cbnz w12, .Loop_cps64_sve
52
ret
53
.vl_gt_16_blockcopy_ps_64_64:
54
cmp x9, #48
55
56
lsl x1, x1, #1
57
lsl x3, x3, #1
58
mov w12, #4
59
-.loop_css32_sve:
60
+.Loop_css32_sve:
61
sub w12, w12, #1
62
.rept 8
63
ld1 {v0.8h-v3.8h}, x2, x3
64
st1 {v0.8h-v3.8h}, x0, x1
65
.endr
66
- cbnz w12, .loop_css32_sve
67
+ cbnz w12, .Loop_css32_sve
68
ret
69
.vl_gt_16_blockcopy_ss_32_32:
70
cmp x9, #48
71
72
lsl x3, x3, #1
73
sub x3, x3, #64
74
mov w12, #8
75
-.loop_css64_sve:
76
+.Loop_css64_sve:
77
sub w12, w12, #1
78
.rept 8
79
ld1 {v0.8h-v3.8h}, x2, #64
80
81
st1 {v0.8h-v3.8h}, x0, #64
82
st1 {v4.8h-v7.8h}, x0, x1
83
.endr
84
- cbnz w12, .loop_css64_sve
85
+ cbnz w12, .Loop_css64_sve
86
ret
87
.vl_gt_16_blockcopy_ss_64_64:
88
cmp x9, #48
89
90
lsl x1, x1, #1
91
lsl x3, x3, #1
92
mov w12, #8
93
-.loop_css32x64_sve:
94
+.Loop_css32x64_sve:
95
sub w12, w12, #1
96
.rept 8
97
ld1 {v0.8h-v3.8h}, x2, x3
98
st1 {v0.8h-v3.8h}, x0, x1
99
.endr
100
- cbnz w12, .loop_css32x64_sve
101
+ cbnz w12, .Loop_css32x64_sve
102
ret
103
.vl_gt_16_blockcopy_ss_32_64:
104
cmp x9, #48
105
106
bgt .vl_gt_16_blockcopy_ps_32_64
107
lsl x1, x1, #1
108
mov w12, #8
109
-.loop_cps32x64_sve:
110
+.Loop_cps32x64_sve:
111
sub w12, w12, #1
112
.rept 4
113
ld1 {v16.16b-v17.16b}, x2, x3
114
115
st1 {v0.8h-v3.8h}, x0, x1
116
st1 {v4.8h-v7.8h}, x0, x1
117
.endr
118
- cbnz w12, .loop_cps32x64_sve
119
+ cbnz w12, .Loop_cps32x64_sve
120
ret
121
.vl_gt_16_blockcopy_ps_32_64:
122
cmp x9, #48
123
124
rdvl x9, #1
125
cmp x9, #16
126
bgt .vl_gt_16_blockcopy_pp_32xN_\h
127
-.loop_sve_32x\h\():
128
+.Loop_sve_32x\h\():
129
sub w12, w12, #1
130
.rept 8
131
ld1 {v0.16b-v1.16b}, x2, x3
132
st1 {v0.16b-v1.16b}, x0, x1
133
.endr
134
- cbnz w12, .loop_sve_32x\h
135
+ cbnz w12, .Loop_sve_32x\h
136
ret
137
.vl_gt_16_blockcopy_pp_32xN_\h:
138
ptrue p0.b, vl32
139
140
rdvl x9, #1
141
cmp x9, #16
142
bgt .vl_gt_16_blockcopy_pp_64xN_\h
143
-.loop_sve_64x\h\():
144
+.Loop_sve_64x\h\():
145
sub w12, w12, #1
146
.rept 4
147
ld1 {v0.16b-v3.16b}, x2, x3
148
st1 {v0.16b-v3.16b}, x0, x1
149
.endr
150
- cbnz w12, .loop_sve_64x\h
151
+ cbnz w12, .Loop_sve_64x\h
152
ret
153
.vl_gt_16_blockcopy_pp_64xN_\h:
154
cmp x9, #48
155
156
bgt .vl_gt_16_cpy2Dto1D_shl_16x16
157
cpy2Dto1D_shl_start_sve
158
mov w12, #4
159
-.loop_cpy2Dto1D_shl_16_sve:
160
+.Loop_cpy2Dto1D_shl_16_sve:
161
sub w12, w12, #1
162
.rept 4
163
ld1 {v2.16b-v3.16b}, x1, x2
164
165
sshl v3.8h, v3.8h, v0.8h
166
st1 {v2.16b-v3.16b}, x0, #32
167
.endr
168
- cbnz w12, .loop_cpy2Dto1D_shl_16_sve
169
+ cbnz w12, .Loop_cpy2Dto1D_shl_16_sve
170
ret
171
.vl_gt_16_cpy2Dto1D_shl_16x16:
172
ptrue p0.h, vl16
173
174
bgt .vl_gt_16_cpy2Dto1D_shl_32x32
175
cpy2Dto1D_shl_start_sve
176
mov w12, #16
177
-.loop_cpy2Dto1D_shl_32_sve:
178
+.Loop_cpy2Dto1D_shl_32_sve:
179
sub w12, w12, #1
180
.rept 2
181
ld1 {v2.16b-v5.16b}, x1, x2
182
183
sshl v5.8h, v5.8h, v0.8h
184
st1 {v2.16b-v5.16b}, x0, #64
185
.endr
186
- cbnz w12, .loop_cpy2Dto1D_shl_32_sve
187
+ cbnz w12, .Loop_cpy2Dto1D_shl_32_sve
188
ret
189
.vl_gt_16_cpy2Dto1D_shl_32x32:
190
cmp x9, #48
191
192
cpy2Dto1D_shl_start_sve
193
mov w12, #32
194
sub x2, x2, #64
195
-.loop_cpy2Dto1D_shl_64_sve:
196
+.Loop_cpy2Dto1D_shl_64_sve:
197
sub w12, w12, #1
198
.rept 2
199
ld1 {v2.16b-v5.16b}, x1, #64
200
201
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8.S
Changed
201
1
2
lsl x3, x3, #1
3
movrel x11, xtn_xtn2_table
4
ld1 {v31.16b}, x11
5
-.loop_csp32:
6
+.Loop_csp32:
7
sub w12, w12, #1
8
.rept 4
9
ld1 {v0.8h-v3.8h}, x2, x3
10
11
st1 {v0.16b-v1.16b}, x0, x1
12
st1 {v2.16b-v3.16b}, x0, x1
13
.endr
14
- cbnz w12, .loop_csp32
15
+ cbnz w12, .Loop_csp32
16
ret
17
endfunc
18
19
20
sub x3, x3, #64
21
movrel x11, xtn_xtn2_table
22
ld1 {v31.16b}, x11
23
-.loop_csp64:
24
+.Loop_csp64:
25
sub w12, w12, #1
26
.rept 4
27
ld1 {v0.8h-v3.8h}, x2, #64
28
29
tbl v3.16b, {v6.16b,v7.16b}, v31.16b
30
st1 {v0.16b-v3.16b}, x0, x1
31
.endr
32
- cbnz w12, .loop_csp64
33
+ cbnz w12, .Loop_csp64
34
ret
35
endfunc
36
37
38
function PFX(blockcopy_ps_32x32_neon)
39
lsl x1, x1, #1
40
mov w12, #4
41
-.loop_cps32:
42
+.Loop_cps32:
43
sub w12, w12, #1
44
.rept 4
45
ld1 {v16.16b-v17.16b}, x2, x3
46
47
st1 {v0.8h-v3.8h}, x0, x1
48
st1 {v4.8h-v7.8h}, x0, x1
49
.endr
50
- cbnz w12, .loop_cps32
51
+ cbnz w12, .Loop_cps32
52
ret
53
endfunc
54
55
56
lsl x1, x1, #1
57
sub x1, x1, #64
58
mov w12, #16
59
-.loop_cps64:
60
+.Loop_cps64:
61
sub w12, w12, #1
62
.rept 4
63
ld1 {v16.16b-v19.16b}, x2, x3
64
65
st1 {v0.8h-v3.8h}, x0, #64
66
st1 {v4.8h-v7.8h}, x0, x1
67
.endr
68
- cbnz w12, .loop_cps64
69
+ cbnz w12, .Loop_cps64
70
ret
71
endfunc
72
73
74
lsl x1, x1, #1
75
lsl x3, x3, #1
76
mov w12, #4
77
-.loop_css32:
78
+.Loop_css32:
79
sub w12, w12, #1
80
.rept 8
81
ld1 {v0.8h-v3.8h}, x2, x3
82
st1 {v0.8h-v3.8h}, x0, x1
83
.endr
84
- cbnz w12, .loop_css32
85
+ cbnz w12, .Loop_css32
86
ret
87
endfunc
88
89
90
lsl x3, x3, #1
91
sub x3, x3, #64
92
mov w12, #8
93
-.loop_css64:
94
+.Loop_css64:
95
sub w12, w12, #1
96
.rept 8
97
ld1 {v0.8h-v3.8h}, x2, #64
98
99
st1 {v0.8h-v3.8h}, x0, #64
100
st1 {v4.8h-v7.8h}, x0, x1
101
.endr
102
- cbnz w12, .loop_css64
103
+ cbnz w12, .Loop_css64
104
ret
105
endfunc
106
107
108
lsl x1, x1, #1
109
lsl x3, x3, #1
110
mov w12, #8
111
-.loop_css32x64:
112
+.Loop_css32x64:
113
sub w12, w12, #1
114
.rept 8
115
ld1 {v0.8h-v3.8h}, x2, x3
116
st1 {v0.8h-v3.8h}, x0, x1
117
.endr
118
- cbnz w12, .loop_css32x64
119
+ cbnz w12, .Loop_css32x64
120
ret
121
endfunc
122
123
124
function PFX(blockcopy_ps_32x64_neon)
125
lsl x1, x1, #1
126
mov w12, #8
127
-.loop_cps32x64:
128
+.Loop_cps32x64:
129
sub w12, w12, #1
130
.rept 4
131
ld1 {v16.16b-v17.16b}, x2, x3
132
133
st1 {v0.8h-v3.8h}, x0, x1
134
st1 {v4.8h-v7.8h}, x0, x1
135
.endr
136
- cbnz w12, .loop_cps32x64
137
+ cbnz w12, .Loop_cps32x64
138
ret
139
endfunc
140
141
142
lsl x3, x3, #1
143
movrel x11, xtn_xtn2_table
144
ld1 {v31.16b}, x11
145
-.loop_csp32x64:
146
+.Loop_csp32x64:
147
sub w12, w12, #1
148
.rept 4
149
ld1 {v0.8h-v3.8h}, x2, x3
150
151
st1 {v0.16b-v1.16b}, x0, x1
152
st1 {v2.16b-v3.16b}, x0, x1
153
.endr
154
- cbnz w12, .loop_csp32x64
155
+ cbnz w12, .Loop_csp32x64
156
ret
157
endfunc
158
159
160
161
function PFX(blockcopy_pp_8x64_neon)
162
mov w12, #4
163
-.loop_pp_8x64:
164
+.Loop_pp_8x64:
165
sub w12, w12, #1
166
.rept 16
167
ld1 {v0.4h}, x2, x3
168
st1 {v0.4h}, x0, x1
169
.endr
170
- cbnz w12, .loop_pp_8x64
171
+ cbnz w12, .Loop_pp_8x64
172
ret
173
endfunc
174
175
176
.macro blockcopy_pp_16xN1_neon h
177
function PFX(blockcopy_pp_16x\h\()_neon)
178
mov w12, #\h / 8
179
-.loop_16x\h\():
180
+.Loop_16x\h\():
181
.rept 8
182
ld1 {v0.8h}, x2, x3
183
st1 {v0.8h}, x0, x1
184
.endr
185
sub w12, w12, #1
186
- cbnz w12, .loop_16x\h
187
+ cbnz w12, .Loop_16x\h
188
ret
189
endfunc
190
.endm
191
192
function PFX(blockcopy_pp_12x32_neon)
193
sub x1, x1, #8
194
mov w12, #4
195
-.loop_pp_12x32:
196
+.Loop_pp_12x32:
197
sub w12, w12, #1
198
.rept 8
199
ld1 {v0.16b}, x2, x3
200
str d0, x0, #8
201
x265_4.0.tar.gz/source/common/aarch64/dct-prim-sve.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ * Jonathan Wright <jonathan.wright@arm.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include "dct-prim.h"
27
+#include "neon-sve-bridge.h"
28
+#include <arm_neon.h>
29
+
30
+
31
+namespace
32
+{
33
+using namespace X265_NS;
34
+
35
+// First four elements (duplicated) of rows 1, 3, 5 and 7 in g_t8 (8x8 DCT
36
+// matrix.)
37
+const int16_t t8_odd[4][8] =
38
+{
39
+ { 89, 75, 50, 18, 89, 75, 50, 18 },
40
+ { 75, -18, -89, -50, 75, -18, -89, -50 },
41
+ { 50, -89, 18, 75, 50, -89, 18, 75 },
42
+ { 18, -50, 75, -89, 18, -50, 75, -89 },
43
+};
44
+
45
+template<int shift>
46
+static inline void partialButterfly8_sve(const int16_t *src, int16_t *dst)
47
+{
48
+ const int line = 8;
49
+
50
+    int16x8_t O[line / 2];
51
+    int32x4_t EE[line / 2];
52
+    int32x4_t EO[line / 2];
53
+
54
+ for (int i = 0; i < line; i += 2)
55
+ {
56
+ int16x8_t s_lo = vcombine_s16(vld1_s16(src + i * line),
57
+ vld1_s16(src + (i + 1) * line));
58
+ int16x8_t s_hi = vcombine_s16(
59
+ vrev64_s16(vld1_s16(src + i * line + 4)),
60
+ vrev64_s16(vld1_s16(src + (i + 1) * line + 4)));
61
+
62
+ int32x4_t E0 = vaddl_s16(vget_low_s16(s_lo), vget_low_s16(s_hi));
63
+ int32x4_t E1 = vaddl_s16(vget_high_s16(s_lo), vget_high_s16(s_hi));
64
+
65
+        O[i / 2] = vsubq_s16(s_lo, s_hi);
66
+
67
+ int32x4_t t0 = vreinterpretq_s32_s64(
68
+ vzip1q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1)));
69
+ int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
70
+ vzip2q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1))));
71
+
72
+        EE[i / 2] = vaddq_s32(t0, t1);
73
+        EO[i / 2] = vsubq_s32(t0, t1);
74
+ }
75
+
76
+ int16_t *d = dst;
77
+
78
+    int32x4_t c0 = vld1q_s32(t8_even[0]);
79
+    int32x4_t c2 = vld1q_s32(t8_even[1]);
80
+    int32x4_t c4 = vld1q_s32(t8_even[2]);
81
+    int32x4_t c6 = vld1q_s32(t8_even[3]);
82
+    int16x8_t c1 = vld1q_s16(t8_odd[0]);
83
+    int16x8_t c3 = vld1q_s16(t8_odd[1]);
84
+    int16x8_t c5 = vld1q_s16(t8_odd[2]);
85
+    int16x8_t c7 = vld1q_s16(t8_odd[3]);
86
+
87
+ for (int j = 0; j < line; j += 4)
88
+ {
89
+ // O
90
+ int64x2_t t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c1);
91
+ int64x2_t t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c1);
92
+ int32x4_t t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
93
+ int16x4_t res1 = vrshrn_n_s32(t0123, shift);
94
+ vst1_s16(d + 1 * line, res1);
95
+
96
+ t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c3);
97
+ t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c3);
98
+ t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
99
+ int16x4_t res3 = vrshrn_n_s32(t0123, shift);
100
+ vst1_s16(d + 3 * line, res3);
101
+
102
+ t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c5);
103
+ t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c5);
104
+ t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
105
+ int16x4_t res5 = vrshrn_n_s32(t0123, shift);
106
+ vst1_s16(d + 5 * line, res5);
107
+
108
+ t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c7);
109
+ t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c7);
110
+ t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
111
+ int16x4_t res7 = vrshrn_n_s32(t0123, shift);
112
+ vst1_s16(d + 7 * line, res7);
113
+
114
+ // EE and EO
115
+ int32x4_t t0 = vpaddq_s32(EEj / 2 + 0, EEj / 2 + 1);
116
+ int32x4_t t1 = vmulq_s32(c0, t0);
117
+ int16x4_t res0 = vrshrn_n_s32(t1, shift);
118
+ vst1_s16(d + 0 * line, res0);
119
+
120
+ int32x4_t t2 = vmulq_s32(c2, EOj / 2 + 0);
121
+ int32x4_t t3 = vmulq_s32(c2, EOj / 2 + 1);
122
+ int16x4_t res2 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift);
123
+ vst1_s16(d + 2 * line, res2);
124
+
125
+ int32x4_t t4 = vmulq_s32(c4, EEj / 2 + 0);
126
+ int32x4_t t5 = vmulq_s32(c4, EEj / 2 + 1);
127
+ int16x4_t res4 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift);
128
+ vst1_s16(d + 4 * line, res4);
129
+
130
+ int32x4_t t6 = vmulq_s32(c6, EOj / 2 + 0);
131
+ int32x4_t t7 = vmulq_s32(c6, EOj / 2 + 1);
132
+ int16x4_t res6 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift);
133
+ vst1_s16(d + 6 * line, res6);
134
+
135
+ d += 4;
136
+ }
137
+}
138
+
139
+template<int shift>
140
+static inline void partialButterfly16_sve(const int16_t *src, int16_t *dst)
141
+{
142
+ const int line = 16;
143
+
144
+    int16x8_t O[line];
145
+    int16x8_t EO[line / 2];
146
+    int32x4_t EEE[line];
147
+    int32x4_t EEO[line];
148
+
149
+ for (int i = 0; i < line; i += 2)
150
+ {
151
+ int16x8_t s0_lo = vld1q_s16(src + i * line);
152
+ int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
153
+
154
+ int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
155
+ int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
156
+
157
+ int32x4_t E02;
158
+ E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
159
+ E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
160
+
161
+ int32x4_t E12;
162
+ E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
163
+ E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
164
+
165
+ Oi + 0 = vsubq_s16(s0_lo, s0_hi);
166
+ Oi + 1 = vsubq_s16(s1_lo, s1_hi);
167
+
168
+ int16x4_t EO_lo = vmovn_s32(vsubq_s32(E00, rev32(E01)));
169
+ int16x4_t EO_hi = vmovn_s32(vsubq_s32(E10, rev32(E11)));
170
+ EOi / 2 = vcombine_s16(EO_lo, EO_hi);
171
+
172
+ int32x4_t EE0 = vaddq_s32(E00, rev32(E01));
173
+ int32x4_t EE1 = vaddq_s32(E10, rev32(E11));
174
+
175
+ int32x4_t t0 = vreinterpretq_s32_s64(
176
+ vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
177
+ int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
178
+ vzip2q_s64(vreinterpretq_s64_s32(EE0),
179
+ vreinterpretq_s64_s32(EE1))));
180
+
181
+ EEEi / 2 = vaddq_s32(t0, t1);
182
+ EEOi / 2 = vsubq_s32(t0, t1);
183
+ }
184
+
185
+ for (int i = 0; i < line; i += 4)
186
+ {
187
+ for (int k = 1; k < 16; k += 2)
188
+ {
189
+            int16x8_t c0_c4 = vld1q_s16(&g_t16[k][0]);
190
+
191
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 0]);
192
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 1]);
193
+            int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 2]);
194
+            int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 3]);
195
+
196
+ int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
197
+ int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3));
198
+ int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift);
199
+ vst1_s16(dst + k * line, res);
200
+ }
201
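The SVE2 kernel above vectorises the usual even/odd ("partial butterfly") decomposition of the forward DCT, with x265_sdotq_s16 accumulating each 4-tap product into a 64-bit lane. A scalar sketch of the same 8-point structure, shown only to clarify what the E/O and EE/EO intermediates hold (assumes g_t8, the 8x8 DCT coefficient matrix used elsewhere in x265):

    #include <cstdint>

    static void partialButterfly8_ref(const int16_t *src, int16_t *dst, int shift)
    {
        const int line = 8;
        const int add = 1 << (shift - 1);

        for (int j = 0; j < line; j++, src += line, dst++)
        {
            int E[4], O[4];
            for (int k = 0; k < 4; k++)   // mirrored sums and differences
            {
                E[k] = src[k] + src[7 - k];
                O[k] = src[k] - src[7 - k];
            }
            int EE[2] = { E[0] + E[3], E[1] + E[2] };
            int EO[2] = { E[0] - E[3], E[1] - E[2] };

            dst[0 * line] = (int16_t)((64 * (EE[0] + EE[1]) + add) >> shift);
            dst[4 * line] = (int16_t)((64 * (EE[0] - EE[1]) + add) >> shift);
            dst[2 * line] = (int16_t)((83 * EO[0] + 36 * EO[1] + add) >> shift);
            dst[6 * line] = (int16_t)((36 * EO[0] - 83 * EO[1] + add) >> shift);

            for (int k = 1; k < 8; k += 2)   // odd rows: the coefficients cached in t8_odd above
                dst[k * line] = (int16_t)((g_t8[k][0] * O[0] + g_t8[k][1] * O[1] +
                                           g_t8[k][2] * O[2] + g_t8[k][3] * O[3] + add) >> shift);
        }
    }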
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.cpp
Changed
201
1
2
3
#include <arm_neon.h>
4
5
+#define X265_PRAGMA(text) _Pragma(#text)
6
+#if defined(__clang__)
7
+#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(unroll(n))
8
+#elif defined(__GNUC__)
9
+#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(GCC unroll (n))
10
+#else
11
+#define X265_PRAGMA_UNROLL(n)
12
+#endif
13
+
14
+extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t srcStride);
15
+extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst, intptr_t dstStride);
16
17
namespace
18
{
19
using namespace X265_NS;
20
21
-
22
-static int16x8_t rev16(const int16x8_t a)
23
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
24
{
25
- static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
26
- return vqtbx1q_u8(a, a, tbl);
27
-}
28
+ int32x2_t s0, s1, s2, s3;
29
30
-static int32x4_t rev32(const int32x4_t a)
31
-{
32
- static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
33
- return vqtbx1q_u8(a, a, tbl);
34
-}
35
+ s0 = vtrn1_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2));
36
+ s1 = vtrn1_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3));
37
+ s2 = vtrn2_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2));
38
+ s3 = vtrn2_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3));
39
40
-static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
41
-{
42
- int16x4_t s0, s1, s2, s3;
43
- s0 = vtrn1_s32(x0, x2);
44
- s1 = vtrn1_s32(x1, x3);
45
- s2 = vtrn2_s32(x0, x2);
46
- s3 = vtrn2_s32(x1, x3);
47
-
48
- x0 = vtrn1_s16(s0, s1);
49
- x1 = vtrn2_s16(s0, s1);
50
- x2 = vtrn1_s16(s2, s3);
51
- x3 = vtrn2_s16(s2, s3);
52
+ x0 = vtrn1_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1));
53
+ x1 = vtrn2_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1));
54
+ x2 = vtrn1_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3));
55
+ x3 = vtrn2_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3));
56
}
57
58
59
60
int64x2_t vcost_sum_1 = vdupq_n_s64(0);
61
for (int y = 0; y < MLS_CG_SIZE; y++)
62
{
63
-        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
64
+        int16x4_t in = vld1_s16(&m_resiDctCoeff[blkPos]);
65
        int32x4_t mul = vmull_s16(in, in);
66
        int64x2_t cost0, cost1;
67
        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
68
        cost1 = vshll_high_n_s32(mul, scaleBits);
69
-        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
70
-        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
71
+        vst1q_s64(&costUncoded[blkPos + 0], cost0);
72
+        vst1q_s64(&costUncoded[blkPos + 2], cost1);
73
vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
74
vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
75
blkPos += trSize;
76
77
int32x4_t vpsy = vdupq_n_s32(*psyScale);
78
for (int y = 0; y < MLS_CG_SIZE; y++)
79
{
80
-        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
81
-        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
82
+        int32x4_t signCoef = vmovl_s16(vld1_s16(&m_resiDctCoeff[blkPos]));
83
+        int32x4_t fencCoef = vmovl_s16(vld1_s16(&m_fencDctCoeff[blkPos]));
84
+ int32x4_t predictedCoef = vsubq_s32(fencCoef, signCoef);
85
int64x2_t cost0, cost1;
86
cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
87
cost1 = vmull_high_s32(signCoef, signCoef);
88
89
}
90
cost0 = vsubq_s64(cost0, neg0);
91
cost1 = vsubq_s64(cost1, neg1);
92
-        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
93
-        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
94
+        vst1q_s64(&costUncoded[blkPos + 0], cost0);
95
+        vst1q_s64(&costUncoded[blkPos + 2], cost1);
96
vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
97
vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
98
99
100
int i = 0;
101
for (; (i + 8) <= numCoeff; i += 8)
102
{
103
-        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
104
-        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
105
+        int16x8_t in = vld1q_s16(&quantCoeff[i]);
106
+ uint16x8_t tst = vtstq_s16(in, in);
107
+ vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst));
108
}
109
for (; i < numCoeff; i++)
110
{
111
112
int j = 0;
113
for (; (j + 8) <= trSize; j += 8)
114
{
115
-            int16x8_t in = *(int16x8_t *)&residual[j];
116
-            *(int16x8_t *)&coeff[j] = in;
117
-            vcount = vaddq_s16(vcount, vtstq_s16(in, in));
118
+            int16x8_t in = vld1q_s16(&residual[j]);
119
+            vst1q_s16(&coeff[j], in);
120
+ uint16x8_t tst = vtstq_s16(in, in);
121
+ vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst));
122
}
123
for (; j < trSize; j++)
124
{
125
126
return numSig - vaddvq_s16(vcount);
127
}
128
129
-
130
-static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line)
131
+template<int shift>
132
+static inline void partialButterfly16_neon(const int16_t *src, int16_t *dst)
133
{
134
- int j, k;
135
- int32x4_t E2, O2;
136
- int32x4_t EE, EO;
137
- int32x2_t EEE, EEO;
138
- const int add = 1 << (shift - 1);
139
- const int32x4_t _vadd = {add, 0};
140
+ const int line = 16;
141
142
- for (j = 0; j < line; j++)
143
+    int16x8_t O[line];
144
+    int32x4_t EO[line];
145
+    int32x4_t EEE[line];
146
+    int32x4_t EEO[line];
147
+
148
+ for (int i = 0; i < line; i += 2)
149
{
150
- int16x8_t in0 = *(int16x8_t *)src;
151
- int16x8_t in1 = rev16(*(int16x8_t *)&src8);
152
+ int16x8_t s0_lo = vld1q_s16(src + i * line);
153
+ int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
154
155
- E0 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
156
- O0 = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1));
157
- E1 = vaddl_high_s16(in0, in1);
158
- O1 = vsubl_high_s16(in0, in1);
159
+ int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
160
+ int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
161
162
- for (k = 1; k < 16; k += 2)
163
- {
164
- int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
165
- int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16k4);
166
+ int32x4_t E02;
167
+ E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
168
+ E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
169
170
- int32x4_t res = _vadd;
171
- res = vmlaq_s32(res, c0, O0);
172
- res = vmlaq_s32(res, c1, O1);
173
- dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
174
- }
175
+ int32x4_t E12;
176
+ E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
177
+ E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
178
+
179
+ Oi + 0 = vsubq_s16(s0_lo, s0_hi);
180
+ Oi + 1 = vsubq_s16(s1_lo, s1_hi);
181
+
182
+ int32x4_t EE0 = vaddq_s32(E00, rev32(E01));
183
+ int32x4_t EE1 = vaddq_s32(E10, rev32(E11));
184
+ EOi + 0 = vsubq_s32(E00, rev32(E01));
185
+ EOi + 1 = vsubq_s32(E10, rev32(E11));
186
+
187
+ int32x4_t t0 = vreinterpretq_s32_s64(
188
+ vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
189
+ int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(vzip2q_s64(
190
+ vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1))));
191
192
- /* EE and EO */
193
- EE = vaddq_s32(E0, rev32(E1));
194
- EO = vsubq_s32(E0, rev32(E1));
195
196
- for (k = 2; k < 16; k += 4)
197
+ EEEi / 2 = vaddq_s32(t0, t1);
198
+ EEOi / 2 = vsubq_s32(t0, t1);
199
+ }
200
+
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.h
Changed
53
1
2
#include "primitives.h"
3
#include "contexts.h" // costCoeffNxN_c
4
#include "threading.h" // CLZ
5
+#include <arm_neon.h>
6
7
namespace X265_NS
8
{
9
+// First two columns of the 4x4 dct transform matrix, duplicated to 4x4 to allow
10
+// processing two lines at once.
11
+const int32_t t8_even[4][4] =
12
+{
13
+ { 64, 64, 64, 64 },
14
+ { 83, 36, 83, 36 },
15
+ { 64, -64, 64, -64 },
16
+ { 36, -83, 36, -83 },
17
+};
18
+
19
+const uint8_t rev16_tbl[16] =
20
+{
21
+ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
22
+};
23
+
24
+const uint8_t rev32_tbl[16] =
25
+{
26
+ 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
27
+};
28
+
29
+static inline int16x8_t rev16(const int16x8_t a)
30
+{
31
+ const uint8x16_t tbl = vld1q_u8(rev16_tbl);
32
+ const int8x16_t a_s8 = vreinterpretq_s8_s16(a);
33
+
34
+ return vreinterpretq_s16_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
35
+}
36
+
37
+static inline int32x4_t rev32(const int32x4_t a)
38
+{
39
+ const uint8x16_t tbl = vld1q_u8(rev32_tbl);
40
+ const int8x16_t a_s8 = vreinterpretq_s8_s32(a);
41
+
42
+ return vreinterpretq_s32_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
43
+}
44
+
45
// x265 private namespace
46
void setupDCTPrimitives_neon(EncoderPrimitives &p);
47
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
48
+void setupDCTPrimitives_sve(EncoderPrimitives &p);
49
+#endif
50
};
51
52
53
x265_4.0.tar.gz/source/common/aarch64/dct.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+
28
+#include "asm.S"
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+.set idct16_shift_1, 7
40
+.set idct16_shift_2, 12-(BIT_DEPTH-8)
41
+
42
+.set dct16_shift_1, 3+(BIT_DEPTH-8)
43
+.set dct16_shift_2, 10
44
+
45
+.align 4
46
+// NOTE: Hardcoded due to asm syntax issue, don't reorder!
47
+tbl_const_idct_0:
48
+ .hword 64, 83, 36, 89, 75, 50, 18, 0 // v0
49
+ .hword 90, 87, 80, 70, 57, 43, 25, 9 // v1
50
+// .hword 0=64, 1=83, 2=36, 3=89, 4=75, 5=50, 6=18, 7=00
51
+// .hword 0=90, 1=87, 2=80, 3=70, 4=57, 5=43, 6=25, 7= 9
52
+
53
+ .hword 64, 83, 64, 36 // v0
54
+ .hword 64, 36,-64,-83
55
+ .hword 64,-36,-64, 83 // v1
56
+ .hword 64,-83, 64,-36
57
+
58
+ .hword 89, 75, 50, 18 // v2
59
+ .hword 75,-18,-89,-50
60
+ .hword 50,-89, 18, 75 // v3
61
+ .hword 18,-50, 75,-89
62
+
63
+ .hword 90,+87,+80,+70, +57,+43,+25,+ 9 // v4
64
+ .hword 87,+57, +9,-43, -80,-90,-70,-25 // v5
65
+ .hword 80, +9,-70,-87, -25,+57,+90,+43 // v6
66
+ .hword 70,-43,-87, +9, +90,+25,-80,-57 // v7
67
+ .hword 57,-80,-25,+90, - 9,-87,+43,+70 // v8
68
+ .hword 43,-90,+57,+25, -87,+70,+ 9,-80 // v9
69
+ .hword 25,-70,+90,-80, +43,+ 9,-57,+87 // v16
70
+ .hword 9,-25,+43,-57, +70,-80,+87,-90 // v17
71
+
72
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 // v18
73
+
74
+tbl_const_dct_0:
75
+ // EE
76
+ .hword 64,+64,+64,+64 // v16
77
+ .hword 83,+36,-36,-83 // v17
78
+ .hword 64,-64,-64,+64 // v18
79
+ .hword 36,-83,+83,-36 // v19
80
+
81
+ // EO
82
+ .hword 89,+75,+50,+18 // v20
83
+ .hword 75,-18,-89,-50 // v21
84
+ .hword 50,-89,+18,+75 // v22
85
+ .hword 18,-50,+75,-89 // v23
86
+
87
+ // O
88
+ .hword 90,+87,+80,+70,+57,+43,+25, +9 // v24
89
+ .hword 87,+57, +9,-43,-80,-90,-70,-25 // v25
90
+ .hword 80, +9,-70,-87,-25,+57,+90,+43 // v26
91
+ .hword 70,-43,-87, +9,+90,+25,-80,-57 // v27
92
+ .hword 57,-80,-25,+90, -9,-87,+43,+70 // v28
93
+ .hword 43,-90,+57,+25,-87,+70, +9,-80 // v29
94
+ .hword 25,-70,+90,-80,+43, +9,-57,+87 // v30
95
+ .hword 9,-25,+43,-57,+70,-80,+87,-90 // v31
96
+
97
+ .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 // v0
98
+// .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 // v1
99
+
100
+ .word 64, 83, 36, 89, 75, 50, 18, 0 // v0, v1
101
+ .word 90, 87, 80, 70, 57, 43, 25, 9 // v2, v3
102
+
103
+
104
+// ***** idct 16x16 *****
105
+// void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
106
+function PFX(idct16_neon)
107
+// Register map
108
+// x0 = src
109
+// x1 = dst
110
+// x2 = dstStride
111
+// x8 = tbl_const_idct_0
112
+
113
+ stp d8, d9, sp,#-16!
114
+ sub sp, sp, #(16*16*2)
115
+
116
+ adr x8, tbl_const_idct_0
117
+ ldp q0, q1, x8
118
+
119
+ mov x5, sp
120
+ mov w4, #16
121
+
122
+ // Pass1
123
+5:
124
+ ldr d16, x0, #(0*16*2)
125
+ ldr d17, x0, #(2*16*2)
126
+ ldr d18, x0, #(4*16*2)
127
+ ldr d19, x0, #(6*16*2)
128
+ ldr d20, x0, #(8*16*2)
129
+ ldr d21, x0, #(10*16*2)
130
+ ldr d22, x0, #(12*16*2)
131
+ ldr d23, x0, #(14*16*2)
132
+
133
+// EEE0 = 64*src0*16+i + 64*src 8*16+i;
134
+// EEE1 = 64*src0*16+i - 64*src 8*16+i;
135
+// EEO0 = 83*src4*16+i + 36*src12*16+i;
136
+// EEO1 = 36*src4*16+i - 83*src12*16+i;
137
+ smull v24.4s, v16.4h, v0.h0 // EEE0 = 64*0
138
+ smull v26.4s, v18.4h, v0.h1 // EEO0 = 83*4
139
+ mov v25.16b, v24.16b // EEE1 = 64*0
140
+ smull v27.4s, v18.4h, v0.h2 // EEO1 = 36*4
141
+
142
+// EO0 = 89*src 2*16+i + 75*src 6*16+i + 50*src10*16+i + 18*src14*16+i;
143
+// EO1 = 75*src 2*16+i - 18*src 6*16+i - 89*src10*16+i - 50*src14*16+i;
144
+// EO2 = 50*src 2*16+i - 89*src 6*16+i + 18*src10*16+i + 75*src14*16+i;
145
+// EO3 = 18*src 2*16+i - 50*src 6*16+i + 75*src10*16+i - 89*src14*16+i;
146
+ smull v28.4s, v17.4h, v0.h3 // EO0 = 89*2
147
+ smull v29.4s, v17.4h, v0.h4 // EO1 = 75*2
148
+ smull v30.4s, v17.4h, v0.h5 // EO2 = 50*2
149
+ smull v31.4s, v17.4h, v0.h6 // EO3 = 18*2
150
+
151
+ smlal v28.4s, v19.4h, v0.h4 // EO0 = 89*2+75*6
152
+ smlsl v29.4s, v19.4h, v0.h6 // EO1 = 75*2-18*6
153
+ smlsl v30.4s, v19.4h, v0.h3 // EO2 = 50*2-89*6
154
+ smlsl v31.4s, v19.4h, v0.h5 // EO3 = 18*2-50*6
155
+
156
+ ldr d16, x0, #(1*16*2)
157
+ ldr d17, x0, #(3*16*2)
158
+ ldr d18, x0, #(5*16*2)
159
+ ldr d19, x0, #(7*16*2)
160
+
161
+ orr v2.8b, v20.8b, v21.8b
162
+ orr v2.8b, v2.8b, v22.8b
163
+ orr v2.8b, v2.8b, v23.8b
164
+ orr v3.8b, v18.8b, v19.8b
165
+ mov x6, v2.d0
166
+ mov x7, v3.d0
167
+
168
+// O0 = 90*src 1*16+i + 87*src 3*16+i + 80*src 5*16+i + 70*src 7*16+i + 57*src 9*16+i + 43*src11*16+i + 25*src13*16+i + 9*src15*16+i;
169
+// O1 = 87*src 1*16+i + 57*src 3*16+i + 9*src 5*16+i - 43*src 7*16+i - 80*src 9*16+i - 90*src11*16+i - 70*src13*16+i - 25*src15*16+i;
170
+// O2 = 80*src 1*16+i + 9*src 3*16+i - 70*src 5*16+i - 87*src 7*16+i - 25*src 9*16+i + 57*src11*16+i + 90*src13*16+i + 43*src15*16+i;
171
+// O3 = 70*src 1*16+i - 43*src 3*16+i - 87*src 5*16+i + 9*src 7*16+i + 90*src 9*16+i + 25*src11*16+i - 80*src13*16+i - 57*src15*16+i;
172
+// O4 = 57*src 1*16+i - 80*src 3*16+i - 25*src 5*16+i + 90*src 7*16+i - 9*src 9*16+i - 87*src11*16+i + 43*src13*16+i + 70*src15*16+i;
173
+// O5 = 43*src 1*16+i - 90*src 3*16+i + 57*src 5*16+i + 25*src 7*16+i - 87*src 9*16+i + 70*src11*16+i + 9*src13*16+i - 80*src15*16+i;
174
+// O6 = 25*src 1*16+i - 70*src 3*16+i + 90*src 5*16+i - 80*src 7*16+i + 43*src 9*16+i + 9*src11*16+i - 57*src13*16+i + 87*src15*16+i;
175
+// O7 = 9*src 1*16+i - 25*src 3*16+i + 43*src 5*16+i - 57*src 7*16+i + 70*src 9*16+i - 80*src11*16+i + 87*src13*16+i - 90*src15*16+i;
176
+ smull v2.4s, v16.4h, v1.h0 // v2 = O0 = 90*1
177
+ smull v3.4s, v16.4h, v1.h1 // v3 = O1 = 87*1
178
+ smull v4.4s, v16.4h, v1.h2 // v4 = O2 = 80*1
179
+ smull v5.4s, v16.4h, v1.h3 // v5 = O3 = 70*1
180
+ smull v6.4s, v16.4h, v1.h4 // v6 = O4 = 57*1
181
+ smull v7.4s, v16.4h, v1.h5 // v7 = O5 = 43*1
182
+ smull v8.4s, v16.4h, v1.h6 // v8 = O6 = 25*1
183
+ smull v9.4s, v16.4h, v1.h7 // v9 = O7 = 9*1
184
+
185
+ smlal v2.4s, v17.4h, v1.h1 // v2 = O0 = 90*1+87*3
186
+ smlal v3.4s, v17.4h, v1.h4 // v3 = O1 = 87*1+57*3
187
+ smlal v4.4s, v17.4h, v1.h7 // v4 = O2 = 80*1+ 9*3
188
+ smlsl v5.4s, v17.4h, v1.h5 // v5 = O3 = 70*1-43*3
189
+ smlsl v6.4s, v17.4h, v1.h2 // v6 = O4 = 57*1-80*3
190
+ smlsl v7.4s, v17.4h, v1.h0 // v7 = O5 = 43*1-90*3
191
+ smlsl v8.4s, v17.4h, v1.h3 // v8 = O6 = 25*1-70*3
192
+ smlsl v9.4s, v17.4h, v1.h6 // v9 = O7 = 9*1-25*3
193
+
194
+ //cmp x7, #0
195
+ //beq 1f
196
+ cbz x7, 1f
197
+
198
+ smlal v2.4s, v18.4h, v1.h2 // v2 = O0 = 90*1+87*3+80*5
199
+ smlal v3.4s, v18.4h, v1.h7 // v3 = O1 = 87*1+57*3+ 9*5
200
+ smlsl v4.4s, v18.4h, v1.h3 // v4 = O2 = 80*1+ 9*3-70*5
201
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "filter-neon-dotprod.h"
26
+
27
+#if !HIGH_BIT_DEPTH
28
+#include "mem-neon.h"
29
+#include <arm_neon.h>
30
+
31
+namespace {
32
+static const uint8_t dotprod_permute_tbl[48] = {
33
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
34
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
35
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
36
+};
37
+
38
+static const uint8_t dot_prod_merge_block_tbl[48] = {
39
+ // Shift left and insert new last column in transposed 4x4 block.
40
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
41
+ // Shift left and insert two new columns in transposed 4x4 block.
42
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
43
+ // Shift left and insert three new columns in transposed 4x4 block.
44
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
45
+};
46
+
47
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
48
+ const int32x4_t constant, const uint8x16x3_t tbl)
49
+{
50
+ // Transform sample range from uint8_t to int8_t for signed dot product.
51
+ int8x16_t samples_s8 =
52
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
53
+
54
+ // Permute input samples for dot product.
55
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
56
+ int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val0);
57
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
58
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1);
59
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
60
+ int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2);
61
+
62
+ int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
63
+ int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
64
+ dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
65
+ dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
66
+
67
+ // Narrow and combine.
68
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
69
+ vmovn_s32(dotprod_hi));
70
+ return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
71
+}
72
+
73
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
74
+ int8x16_t *d)
75
+{
76
+ // Transform sample range from uint8_t to int8_t for signed dot product.
77
+ int8x8_t samples_s84;
78
+ samples_s80 = vreinterpret_s8_u8(vsub_u8(samples0, vdup_n_u8(128)));
79
+ samples_s81 = vreinterpret_s8_u8(vsub_u8(samples1, vdup_n_u8(128)));
80
+ samples_s82 = vreinterpret_s8_u8(vsub_u8(samples2, vdup_n_u8(128)));
81
+ samples_s83 = vreinterpret_s8_u8(vsub_u8(samples3, vdup_n_u8(128)));
82
+
83
+ // Permute input samples for dot product.
84
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
85
+ d0 = vqtbl1q_s8(vcombine_s8(samples_s80, vdup_n_s8(0)), tbl.val0);
86
+ d1 = vqtbl1q_s8(vcombine_s8(samples_s81, vdup_n_s8(0)), tbl.val0);
87
+ d2 = vqtbl1q_s8(vcombine_s8(samples_s82, vdup_n_s8(0)), tbl.val0);
88
+ d3 = vqtbl1q_s8(vcombine_s8(samples_s83, vdup_n_s8(0)), tbl.val0);
89
+}
90
+
91
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
92
+ const int32x4_t constant,
93
+ const uint8x16x3_t tbl,
94
+ int8x16_t &perm_samples_0)
95
+{
96
+ // Transform sample range from uint8_t to int8_t for signed dot product.
97
+ int8x16_t samples_s8 =
98
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
99
+
100
+ // Permute input samples for dot product.
101
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
102
+ // Already in perm_samples_0.
103
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
104
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1);
105
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
106
+ int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2);
107
+
108
+ int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
109
+ int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
110
+ dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
111
+ dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
112
+
113
+ // Save for re-use in next iteration.
114
+ perm_samples_0 = perm_samples_2;
115
+
116
+ // Narrow and combine.
117
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
118
+ vmovn_s32(dotprod_hi));
119
+ return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
120
+}
121
+
122
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
123
+ const uint8x16x3_t tbl)
124
+{
125
+ // Transform sample range from uint8_t to int8_t for signed dot product.
126
+ int8x16_t samples_s8 =
127
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
128
+
129
+ // Permute input samples for dot product.
130
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
131
+ int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val0);
132
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
133
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1);
134
+
135
+ // Correction accounting for sample range transform cancels to 0.
136
+ int32x4_t constant = vdupq_n_s32(0);
137
+ int32x4_t dotprod = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
138
+ dotprod = vdotq_lane_s32(dotprod, perm_samples_1, filter, 1);
139
+
140
+ // Narrow.
141
+ return vmovn_s32(dotprod);
142
+}
143
+
144
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
145
+ const uint8x16x3_t tbl)
146
+{
147
+ // Transform sample range from uint8_t to int8_t for signed dot product.
148
+ int8x16_t samples_s8 =
149
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
150
+
151
+ // Permute input samples for dot product.
152
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
153
+ int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val0);
154
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
155
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1);
156
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
157
+ int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2);
158
+
159
+ // Correction accounting for sample range transform cancels to 0.
160
+ int32x4_t constant = vdupq_n_s32(0);
161
+ int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
162
+ int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
163
+ dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
164
+ dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
165
+
166
+ // Narrow and combine.
167
+ return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
168
+}
169
+
170
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
171
+ const uint8x16x3_t tbl,
172
+ int8x16_t &perm_samples_0)
173
+{
174
+ // Transform sample range from uint8_t to int8_t for signed dot product.
175
+ int8x16_t samples_s8 =
176
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
177
+
178
+ // Permute input samples for dot product.
179
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
180
+ // Already in perm_samples_0.
181
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
182
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1);
183
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
184
+ int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2);
185
+
186
+ // Correction accounting for sample range transform cancels to 0.
187
+ int32x4_t constant = vdupq_n_s32(0);
188
+ int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
189
+ int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
190
+ dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
191
+ dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
192
+
193
+ // Save for re-use in next iteration.
194
+ perm_samples_0 = perm_samples_2;
195
+
196
+ // Narrow and combine.
197
+ return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
198
+}
199
+
200
+uint8x8_t inline filter4_8_pp(uint8x16_t samples, const int8x8_t filter,
201
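The repeated "transform sample range from uint8_t to int8_t" step in the kernels above rests on a simple identity: subtracting 128 from every sample before a signed dot product and then adding back 128 times the sum of the filter taps leaves the convolution unchanged. Since the interpolation taps sum to 64, that correction is a compile-time constant which can be folded into the rounding term, and in the ps variants it cancels entirely (the "correction ... cancels to 0" comments). A scalar sketch of the identity (hypothetical helper, not from the source):

    #include <cstdint>

    // 8-tap FIR on unsigned samples, evaluated via signed (sample - 128) terms.
    static int filter8ViaSigned(const uint8_t *s, const int8_t *coeff)
    {
        int sumCoeff = 0, acc = 0;
        for (int k = 0; k < 8; k++)
        {
            sumCoeff += coeff[k];
            acc += coeff[k] * (int(s[k]) - 128);   // what the sdot path computes
        }
        return acc + 128 * sumCoeff;               // correction restores the unsigned result
    }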
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.h
Added
39
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
26
+#define X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
27
+
28
+#if defined(HAVE_NEON_DOTPROD)
29
+
30
+#include "primitives.h"
31
+
32
+namespace X265_NS {
33
+void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p);
34
+}
35
+
36
+#endif // defined(HAVE_NEON_DOTPROD)
37
+
38
+#endif // X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
39
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#if defined(HAVE_NEON_I8MM)
26
+#include "filter-neon-i8mm.h"
27
+#if !HIGH_BIT_DEPTH
28
+
29
+#include "mem-neon.h"
30
+
31
+#include <arm_neon.h>
32
+
33
+namespace {
34
+static const uint8_t dotprod_permute_tbl[48] = {
35
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
36
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
37
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
38
+};
39
+
40
+static const uint8_t matmul_permute_tbl[2][32] = {
41
+ // Permute for luma filter 3.
42
+ { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9,
43
+ 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 },
44
+ // Permute for luma filter 1.
45
+ { 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10,
46
+ 5, 6, 7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12, 13, 14 }
47
+};
48
+
49
+static const int8_t matmul_luma_filter[2][16] = {
50
+ { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 },
51
+ { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 }
52
+};
53
+
54
+static const uint8_t dot_prod_merge_block_tbl[48] = {
55
+ // Shift left and insert new last column in transposed 4x4 block.
56
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
57
+ // Shift left and insert two new columns in transposed 4x4 block.
58
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
59
+ // Shift left and insert three new columns in transposed 4x4 block.
60
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
61
+};
62
+
63
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
64
+ const uint8x16x3_t tbl)
65
+{
66
+ // Permute input samples for dot product.
67
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
68
+ uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0);
69
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
70
+ uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1);
71
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
72
+ uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val2);
73
+
74
+ int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
75
+ dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
76
+ int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
77
+ dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1);
78
+
79
+ // Narrow and combine.
80
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
81
+ vmovn_s32(dotprod_hi));
82
+ return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
83
+}
84
+
85
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
86
+ uint8x16_t *d)
87
+{
88
+ // Permute input samples for dot product.
89
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
90
+ d0 = vqtbl1q_u8(vcombine_u8(samples0, vdup_n_u8(0)), tbl.val0);
91
+ d1 = vqtbl1q_u8(vcombine_u8(samples1, vdup_n_u8(0)), tbl.val0);
92
+ d2 = vqtbl1q_u8(vcombine_u8(samples2, vdup_n_u8(0)), tbl.val0);
93
+ d3 = vqtbl1q_u8(vcombine_u8(samples3, vdup_n_u8(0)), tbl.val0);
94
+}
95
+
96
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
97
+ const uint8x16x3_t tbl, uint8x16_t &perm_s0)
98
+{
99
+ // Permute input samples for dot product.
100
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
101
+ // Already in perm_s0.
102
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
103
+ uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1);
104
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
105
+ uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val2);
106
+
107
+ int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
108
+ dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
109
+ int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
110
+ dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
111
+
112
+ // Save for re-use in next iteration.
113
+ perm_s0 = perm_s2;
114
+
115
+ // Narrow and combine.
116
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
117
+ vmovn_s32(dotprod_hi));
118
+ return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
119
+}
120
+
121
+uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter,
122
+ const uint8x16x2_t tbl)
123
+{
124
+ // Permute input samples for 8x2 by 2x8 matrix multiply.
125
+ uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0);
126
+ uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1);
127
+
128
+ int32x4_t matmul_lo = vusmmlaq_s32(vdupq_n_s32(0), perm_s0, filter);
129
+ int32x4_t matmul_hi = vusmmlaq_s32(vdupq_n_s32(0), perm_s1, filter);
130
+
131
+ // Narrow and combine.
132
+ int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi));
133
+ return vqrshrun_n_s16(matmul, IF_FILTER_PREC);
134
+}
135
+
136
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
137
+ const int16x8_t constant, const uint8x16x3_t tbl)
138
+{
139
+ // Permute input samples for dot product.
140
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
141
+ uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0);
142
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
143
+ uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1);
144
+
145
+ int32x4_t dotprod = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
146
+ dotprod = vusdotq_lane_s32(dotprod, perm_s1, filter, 1);
147
+
148
+ // Narrow.
149
+ return vadd_s16(vmovn_s32(dotprod), vget_low_s16(constant));
150
+}
151
+
152
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
153
+ const int16x8_t constant, const uint8x16x3_t tbl)
154
+{
155
+ // Permute input samples for dot product.
156
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
157
+ uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0);
158
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
159
+ uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1);
160
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
161
+ uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val2);
162
+
163
+ int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
164
+ dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
165
+ int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
166
+ dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1);
167
+
168
+ // Narrow and combine.
169
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
170
+ vmovn_s32(dotprod_hi));
171
+ return vaddq_s16(dotprod, constant);
172
+}
173
+
174
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
175
+ const int16x8_t constant,
176
+ const uint8x16x3_t tbl, uint8x16_t &perm_s0)
177
+{
178
+ // Permute input samples for dot product.
179
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
180
+ // Already in perm_s0.
181
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
182
+ uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1);
183
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
184
+ uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val2);
185
+
186
+ int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
187
+ dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
188
+ int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
189
+ dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
190
+
191
+ // Save for re-use in next iteration.
192
+ perm_s0 = perm_s2;
193
+
194
+ // Narrow and combine.
195
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
196
+ vmovn_s32(dotprod_hi));
197
+ return vaddq_s16(dotprod, constant);
198
+}
199
+
200
+int16x8_t inline filter8_8_ps_matmul(uint8x16_t samples, const int8x16_t filter,
201
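filter8_8_pp_matmul above maps the 8-tap convolution onto the I8MM USMMLA instruction: the permuted sample register supplies two overlapping 8-sample windows, the filter register (matmul_luma_filter) supplies the taps at two alignments offset by one sample, and a single instruction therefore accumulates four neighbouring dot products into one int32x4_t. A scalar model of the 2x8-by-8x2 products one such instruction computes (illustrative only, not the source's code):

    #include <cstdint>

    // rows = two unsigned 8-sample windows, cols = two signed tap alignments
    static void usmmlaModel(const uint8_t win0[8], const uint8_t win1[8],
                            const int8_t taps0[8], const int8_t taps1[8],
                            int32_t out[4])
    {
        out[0] = out[1] = out[2] = out[3] = 0;
        for (int k = 0; k < 8; k++)
        {
            out[0] += win0[k] * taps0[k];   // output pixel n
            out[1] += win0[k] * taps1[k];   // output pixel n + 1
            out[2] += win1[k] * taps0[k];   // output pixel n + 2
            out[3] += win1[k] * taps1[k];   // output pixel n + 3
        }
    }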
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.h
Added
39
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_FILTER_NEON_I8MM_H
26
+#define X265_FILTER_NEON_I8MM_H
27
+
28
+#if defined(HAVE_NEON_I8MM)
29
+
30
+#include "primitives.h"
31
+
32
+namespace X265_NS {
33
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p);
34
+}
35
+
36
+#endif // defined(HAVE_NEON_I8MM)
37
+
38
+#endif // X265_FILTER_NEON_I8MM_H
39
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/filter-prim.cpp
Changed
201
1
2
#if HAVE_NEON
3
4
#include "filter-prim.h"
5
+#include "mem-neon.h"
6
+
7
#include <arm_neon.h>
8
9
-namespace
10
+namespace {
11
+void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f,
12
+ const int32x4_t c, int32x4_t &d0, int32x4_t &d1)
13
+{
14
+ if (coeffIdx == 4)
15
+ {
16
+ // { -4, 36, 36, -4 }
17
+ int16x8_t t0 = vaddq_s16(s1, s2);
18
+ int16x8_t t1 = vaddq_s16(s0, s3);
19
+ d0 = vmlal_n_s16(c, vget_low_s16(t0), 36);
20
+ d0 = vmlsl_n_s16(d0, vget_low_s16(t1), 4);
21
+
22
+ d1 = vmlal_n_s16(c, vget_high_s16(t0), 36);
23
+ d1 = vmlsl_n_s16(d1, vget_high_s16(t1), 4);
24
+ }
25
+ else
26
+ {
27
+ d0 = vmlal_lane_s16(c, vget_low_s16(s0), f, 0);
28
+ d0 = vmlal_lane_s16(d0, vget_low_s16(s1), f, 1);
29
+ d0 = vmlal_lane_s16(d0, vget_low_s16(s2), f, 2);
30
+ d0 = vmlal_lane_s16(d0, vget_low_s16(s3), f, 3);
31
+
32
+ d1 = vmlal_lane_s16(c, vget_high_s16(s0), f, 0);
33
+ d1 = vmlal_lane_s16(d1, vget_high_s16(s1), f, 1);
34
+ d1 = vmlal_lane_s16(d1, vget_high_s16(s2), f, 2);
35
+ d1 = vmlal_lane_s16(d1, vget_high_s16(s3), f, 3);
36
+ }
37
+}
38
+
39
+template<int coeffIdx>
40
+void inline filter8_s16x4(const int16x4_t *s, const int32x4_t c, int32x4_t &d)
41
+{
42
+ if (coeffIdx == 1)
43
+ {
44
+ // { -1, 4, -10, 58, 17, -5, 1, 0 }
45
+ d = vsubl_s16(s6, s0);
46
+ d = vaddq_s32(d, c);
47
+ d = vmlal_n_s16(d, s1, 4);
48
+ d = vmlsl_n_s16(d, s2, 10);
49
+ d = vmlal_n_s16(d, s3, 58);
50
+ d = vmlal_n_s16(d, s4, 17);
51
+ d = vmlsl_n_s16(d, s5, 5);
52
+ }
53
+ else if (coeffIdx == 2)
54
+ {
55
+ // { -1, 4, -11, 40, 40, -11, 4, -1 }
56
+ int32x4_t t0 = vaddl_s16(s3, s4);
57
+ int32x4_t t1 = vaddl_s16(s2, s5);
58
+ int32x4_t t2 = vaddl_s16(s1, s6);
59
+ int32x4_t t3 = vaddl_s16(s0, s7);
60
+
61
+ d = vmlaq_n_s32(c, t0, 40);
62
+ d = vmlaq_n_s32(d, t1, -11);
63
+ d = vmlaq_n_s32(d, t2, 4);
64
+ d = vmlaq_n_s32(d, t3, -1);
65
+ }
66
+ else
67
+ {
68
+ // { 0, 1, -5, 17, 58, -10, 4, -1 }
69
+ d = vsubl_s16(s1, s7);
70
+ d = vaddq_s32(d, c);
71
+ d = vmlal_n_s16(d, s6, 4);
72
+ d = vmlsl_n_s16(d, s5, 10);
73
+ d = vmlal_n_s16(d, s4, 58);
74
+ d = vmlal_n_s16(d, s3, 17);
75
+ d = vmlsl_n_s16(d, s2, 5);
76
+ }
77
+}
78
+
79
+template<int coeffIdx>
80
+void inline filter8_s16x8(const int16x8_t *s, const int32x4_t c, int32x4_t &d0,
81
+ int32x4_t &d1)
82
+{
83
+ if (coeffIdx == 1)
84
+ {
85
+ // { -1, 4, -10, 58, 17, -5, 1, 0 }
86
+ d0 = vsubl_s16(vget_low_s16(s6), vget_low_s16(s0));
87
+ d0 = vaddq_s32(d0, c);
88
+ d0 = vmlal_n_s16(d0, vget_low_s16(s1), 4);
89
+ d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 10);
90
+ d0 = vmlal_n_s16(d0, vget_low_s16(s3), 58);
91
+ d0 = vmlal_n_s16(d0, vget_low_s16(s4), 17);
92
+ d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 5);
93
+
94
+ d1 = vsubl_s16(vget_high_s16(s6), vget_high_s16(s0));
95
+ d1 = vaddq_s32(d1, c);
96
+ d1 = vmlal_n_s16(d1, vget_high_s16(s1), 4);
97
+ d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 10);
98
+ d1 = vmlal_n_s16(d1, vget_high_s16(s3), 58);
99
+ d1 = vmlal_n_s16(d1, vget_high_s16(s4), 17);
100
+ d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 5);
101
+ }
102
+ else if (coeffIdx == 2)
103
+ {
104
+ // { -1, 4, -11, 40, 40, -11, 4, -1 }
105
+ int32x4_t t0 = vaddl_s16(vget_low_s16(s3), vget_low_s16(s4));
106
+ int32x4_t t1 = vaddl_s16(vget_low_s16(s2), vget_low_s16(s5));
107
+ int32x4_t t2 = vaddl_s16(vget_low_s16(s1), vget_low_s16(s6));
108
+ int32x4_t t3 = vaddl_s16(vget_low_s16(s0), vget_low_s16(s7));
109
+
110
+ d0 = vmlaq_n_s32(c, t0, 40);
111
+ d0 = vmlaq_n_s32(d0, t1, -11);
112
+ d0 = vmlaq_n_s32(d0, t2, 4);
113
+ d0 = vmlaq_n_s32(d0, t3, -1);
114
+
115
+ int32x4_t t4 = vaddl_s16(vget_high_s16(s3), vget_high_s16(s4));
116
+ int32x4_t t5 = vaddl_s16(vget_high_s16(s2), vget_high_s16(s5));
117
+ int32x4_t t6 = vaddl_s16(vget_high_s16(s1), vget_high_s16(s6));
118
+ int32x4_t t7 = vaddl_s16(vget_high_s16(s0), vget_high_s16(s7));
119
+
120
+ d1 = vmlaq_n_s32(c, t4, 40);
121
+ d1 = vmlaq_n_s32(d1, t5, -11);
122
+ d1 = vmlaq_n_s32(d1, t6, 4);
123
+ d1 = vmlaq_n_s32(d1, t7, -1);
124
+ }
125
+ else
126
+ {
127
+ // { 0, 1, -5, 17, 58, -10, 4, -1 }
128
+ d0 = vsubl_s16(vget_low_s16(s1), vget_low_s16(s7));
129
+ d0 = vaddq_s32(d0, c);
130
+ d0 = vmlal_n_s16(d0, vget_low_s16(s6), 4);
131
+ d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 10);
132
+ d0 = vmlal_n_s16(d0, vget_low_s16(s4), 58);
133
+ d0 = vmlal_n_s16(d0, vget_low_s16(s3), 17);
134
+ d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 5);
135
+
136
+ d1 = vsubl_s16(vget_high_s16(s[1]), vget_high_s16(s[7]));
137
+ d1 = vaddq_s32(d1, c);
138
+ d1 = vmlal_n_s16(d1, vget_high_s16(s[6]), 4);
139
+ d1 = vmlsl_n_s16(d1, vget_high_s16(s[5]), 10);
140
+ d1 = vmlal_n_s16(d1, vget_high_s16(s[4]), 58);
141
+ d1 = vmlal_n_s16(d1, vget_high_s16(s[3]), 17);
142
+ d1 = vmlsl_n_s16(d1, vget_high_s16(s[2]), 5);
143
+ }
144
+}
145
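The tap sets quoted in the comments above are the HEVC luma 8-tap interpolation filters for the 1/4, 1/2 and 3/4 fractional positions. As a point of reference only (not part of the patch; the helper name and the 'taps' argument are made up), the NEON helpers compute the same weighted sum as this scalar sketch, with rounding and narrowing left to the caller:

    // Scalar model of filter8_s16x4 / filter8_s16x8 for one output sample.
    static inline int32_t filter8_scalar(const int16_t *s, const int8_t taps[8])
    {
        int32_t sum = 0;
        for (int k = 0; k < 8; k++)
            sum += taps[k] * s[k];   // e.g. taps = { -1, 4, -11, 40, 40, -11, 4, -1 } for coeffIdx == 2
        return sum;                  // caller adds the constant 'c' and shifts by IF_FILTER_PREC
    }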
+
146
+template<int width, int height>
147
+void interp4_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst,
148
+ intptr_t dstStride, int coeffIdx)
149
+{
150
+ const int N_TAPS = 4;
151
+ src -= (N_TAPS / 2 - 1) * srcStride;
152
+
153
+ const int16x4_t filter = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]);
154
+
155
+ // Zero constant in order to use filter helper functions (optimised away).
156
+ const int32x4_t c = vdupq_n_s32(0);
157
+
158
+ if (width == 12)
159
+ {
160
+ const int16_t *s = src;
161
+ int16_t *d = dst;
162
+
163
+ int16x8_t in[7];
164
+ load_s16x8xn<3>(s, srcStride, in);
165
+ s += 3 * srcStride;
166
+
167
+ for (int row = 0; (row + 4) <= height; row += 4)
168
+ {
169
+ load_s16x8xn<4>(s, srcStride, in + 3);
170
+
171
+ int32x4_t sum_lo[4];
172
+ int32x4_t sum_hi[4];
173
+ filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], sum_hi[0]);
174
+ filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], sum_hi[1]);
175
+ filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo[2], sum_hi[2]);
176
+ filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo[3], sum_hi[3]);
177
+
178
+ int16x8_t sum[4];
179
+ sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC),
180
+ vshrn_n_s32(sum_hi[0], IF_FILTER_PREC));
181
+ sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC),
182
+ vshrn_n_s32(sum_hi[1], IF_FILTER_PREC));
183
+ sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC),
184
+ vshrn_n_s32(sum_hi[2], IF_FILTER_PREC));
185
+ sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC),
186
+ vshrn_n_s32(sum_hi[3], IF_FILTER_PREC));
187
+
188
+ store_s16x8xn<4>(d, dstStride, sum);
189
+
190
+ in[0] = in[4];
191
+ in[1] = in[5];
192
+ in[2] = in[6];
193
+
194
+ s += 4 * srcStride;
195
+ d += 4 * dstStride;
196
+ }
197
+
198
+ src += 8;
199
+ dst += 8;
200
+ s = src;
201
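The width == 12 path above is cut off by the diff viewer, but the structure is the usual sliding-window vertical filter: keep N_TAPS - 1 rows in the in[] array, load four new rows per iteration, filter, then slide the window. For orientation only, one output sample of this 4-tap "ss" (int16 in, int16 out) path looks like the scalar sketch below; the function and variable names here are illustrative, not taken from the patch, while g_chromaFilter and IF_FILTER_PREC are the existing x265 table and constant:

    static inline int16_t interp4_vert_ss_scalar(const int16_t *src, intptr_t srcStride,
                                                 const int16_t filter[4])
    {
        int32_t sum = 0;
        for (int k = 0; k < 4; k++)
            sum += filter[k] * src[k * srcStride];  // filter = g_chromaFilter[coeffIdx]
        return (int16_t)(sum >> IF_FILTER_PREC);    // same shift as the vshrn_n_s32 calls above
    }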
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h -> x265_4.0.tar.gz/source/common/aarch64/fun-decls.h
Changed
201
1
2
ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
3
ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
4
5
+#define FUNCDEF_PU_MULT_16(ret, name, cpu, ...) \
6
+ ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
7
+ ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
8
+ ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
9
+ ret PFX(name ## _16x8_ ## cpu)(__VA_ARGS__); \
10
+ ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
11
+ ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
12
+ ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
13
+ ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
14
+ ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
15
+ ret PFX(name ## _16x4_ ## cpu)(__VA_ARGS__); \
16
+ ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
17
+ ret PFX(name ## _32x8_ ## cpu)(__VA_ARGS__); \
18
+ ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
19
+ ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
20
+ ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
21
+ ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
22
+
23
#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
24
FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
25
ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
26
27
FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
28
FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
29
FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
30
- FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
31
- FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
32
- FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
33
- FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
34
- FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
35
- FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
36
- FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
37
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
38
FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
39
- FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
40
- FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
41
- FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
42
- FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
43
- FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
44
- FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
45
- FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
46
- FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
47
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
48
FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
49
FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
50
51
DECLS(sve);
52
DECLS(sve2);
53
54
+FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
55
+FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
56
+FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
57
+FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
58
59
-void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
60
+void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
61
62
-uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
63
-uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
64
-uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
65
-uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
66
+uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
67
+uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
68
+uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
69
+uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride));
70
71
-void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
72
-void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
73
-void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
74
-void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
75
+void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
76
+void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
77
+void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
78
+void PFX(getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
79
80
-void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
81
-void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
82
+void PFX(scale1D_128to64_neon(pixel *dst, const pixel *src));
83
+void PFX(scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride));
84
85
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
86
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
87
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
88
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
89
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
90
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
91
-int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
92
-int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
93
-int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
94
-int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
95
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
96
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
97
-int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
98
-int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
99
-int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
100
-int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
101
-int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
102
-int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
103
-int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
104
-int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
105
-int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
106
-int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
107
-int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
108
-int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
109
-int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
110
-int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
111
-int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
112
-int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
113
-int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
114
-int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
115
-int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
116
-int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
117
+int PFX(pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
118
+int PFX(pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
119
+int PFX(pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
120
+int PFX(pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
121
+int PFX(pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
122
+int PFX(pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
123
+int PFX(pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
124
+int PFX(pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
125
+int PFX(pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
126
+int PFX(pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
127
+int PFX(pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
128
+int PFX(pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
129
+int PFX(pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
130
+int PFX(pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
131
+int PFX(pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
132
+int PFX(pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
133
+int PFX(pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
134
+int PFX(pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
135
+int PFX(pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
136
+int PFX(pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
137
+int PFX(pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
138
+int PFX(pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
139
+int PFX(pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
140
+int PFX(pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
141
+int PFX(pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
142
+int PFX(pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
143
+int PFX(pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
144
+int PFX(pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
145
+int PFX(pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
146
+int PFX(pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
147
+int PFX(pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
148
+int PFX(pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
149
150
-int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
151
-int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
152
-int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
153
-int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
154
-int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
155
-int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
156
-int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
157
+int PFX(pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
158
+int PFX(pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
159
+int PFX(pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
160
+int PFX(pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
161
+int PFX(pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
162
+int PFX(pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
163
+int PFX(pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
164
165
uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
166
uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
167
168
-void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
169
-void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
170
+void PFX(dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift));
171
+void PFX(dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift));
172
173
-void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24);
174
+void PFX(ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24));
175
176
int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
177
int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
178
179
int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
180
uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
181
182
-uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
183
-uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
184
-uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
185
-uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
186
-
187
-void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
188
-void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
189
+uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
190
+uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
191
+uint64_t PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride));
192
+uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride));
193
194
-void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
195
-void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
196
+void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
197
+void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
198
199
-int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
-int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/intrapred-prim.cpp
Changed
201
1
2
#include "primitives.h"
3
4
5
-#if 1
6
+#if HAVE_NEON
7
#include "arm64-utils.h"
8
#include <arm_neon.h>
9
10
11
{
12
13
14
+template<int tuSize>
15
+void intraFilter_neon(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
16
+{
17
+ const int tuSize2 = tuSize << 1;
18
+ pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
19
+
20
+ uint16x8_t two_vec = vdupq_n_u16(2);
21
+#if !HIGH_BIT_DEPTH
22
+ {
23
+ for(int i = 0; i < tuSize2 + tuSize2; i+=8)
24
+ {
25
+ uint16x8_t sample1 = vmovl_u8(vld1_u8(&samples[i]));
26
+ uint16x8_t sample2 = vmovl_u8(vld1_u8(&samples[i - 1]));
27
+ uint16x8_t sample3 = vmovl_u8(vld1_u8(&samples[i + 1]));
28
+
29
+ uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
30
+ uint16x8_t result2 = vaddq_u16(sample3, two_vec);
31
+ uint16x8_t result3 = vaddq_u16(result1,result2);
32
+ vst1_u8(&filtered[i], vmovn_u16(vshrq_n_u16(result3, 2)));
33
+ }
34
+ }
35
+#else
36
+ {
37
+ for(int i = 0; i < tuSize2 + tuSize2; i+=8)
38
+ {
39
+ uint16x8_t sample1 = vld1q_u16(&samples[i]);
40
+ uint16x8_t sample2 = vld1q_u16(&samples[i - 1]);
41
+ uint16x8_t sample3 = vld1q_u16(&samples[i + 1]);
42
+
43
+ uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
44
+ uint16x8_t result2 = vaddq_u16(sample3, two_vec);
45
+ uint16x8_t result3 = vaddq_u16(result1,result2);
46
+ vst1q_u16(&filtered[i], vshrq_n_u16(result3, 2));
47
+ }
48
+ }
49
+#endif
50
+ // filtering top
51
+ filtered[tuSize2] = topLast;
52
+
53
+ // filtering top-left
54
+ filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
55
+
56
+ // filtering left
57
+ filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
58
+ filtered[tuSize2 + tuSize2] = leftLast;
59
+}
60
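For reference, the loop above is the standard 1:2:1 smoothing of the 2*tuSize top and 2*tuSize left reference samples. A scalar sketch of the same operation, illustrative only and using uint16_t in place of the pixel typedef:

    static void intraFilter_scalar(const uint16_t *samples, uint16_t *filtered, int tuSize)
    {
        const int tuSize2 = tuSize << 1;
        // Interior positions: (left + 2 * centre + right + 2) >> 2.
        for (int i = 1; i < tuSize2 + tuSize2; i++)
            filtered[i] = (samples[i - 1] + 2 * samples[i] + samples[i + 1] + 2) >> 2;
        // Corners and the top/left junction are then patched exactly as in the function above.
        filtered[0] = (2 * samples[0] + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
        filtered[tuSize2 + 1] = (samples[0] + 2 * samples[tuSize2 + 1] + samples[tuSize2 + 2] + 2) >> 2;
        filtered[tuSize2] = samples[tuSize2];                     // last top sample kept
        filtered[tuSize2 + tuSize2] = samples[tuSize2 + tuSize2]; // last left sample kept
    }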
61
template<int width>
62
void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
63
64
{
65
if (width >= 8 && sizeof(pixel) == 1)
66
{
67
- const int16x8_t f0 = vdupq_n_s16(32 - fraction);
68
- const int16x8_t f1 = vdupq_n_s16(fraction);
69
+ // We have to cast to the 'real' type so that this block
70
+ // will compile for both low and high bitdepth.
71
+ const uint8_t *ref_u8 = (const uint8_t *)ref + offset;
72
+ uint8_t *dst_u8 = (uint8_t *)dst;
73
+
74
+ // f0 and f1 are unsigned (fraction is in range [0, 31]).
75
+ const uint8x8_t f0 = vdup_n_u8(32 - fraction);
76
+ const uint8x8_t f1 = vdup_n_u8(fraction);
77
for (int x = 0; x < width; x += 8)
78
{
79
- uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
80
- uint8x8_t in1 = *(uint8x8_t *)&ref[offset + x + 1];
81
- int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
82
- lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
83
- lo = vshrq_n_s16(lo, 5);
84
- *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
85
+ uint8x8_t in0 = vld1_u8(ref_u8 + x);
86
+ uint8x8_t in1 = vld1_u8(ref_u8 + x + 1);
87
+ uint16x8_t lo = vmlal_u8(vdupq_n_u16(16), in0, f0);
88
+ lo = vmlal_u8(lo, in1, f1);
89
+ uint8x8_t res = vshrn_n_u16(lo, 5);
90
+ vst1_u8(dst_u8 + y * dstStride + x, res);
91
}
92
}
93
else if (width >= 4 && sizeof(pixel) == 2)
94
{
95
- const int32x4_t f0 = vdupq_n_s32(32 - fraction);
96
- const int32x4_t f1 = vdupq_n_s32(fraction);
97
+ // We have to cast to the 'real' type so that this block
98
+ // will compile for both low and high bitdepth.
99
+ const uint16_t *ref_u16 = (const uint16_t *)ref + offset;
100
+ uint16_t *dst_u16 = (uint16_t *)dst;
101
+
102
+ // f0 and f1 are unsigned (fraction is in range [0, 31]).
103
+ const uint16x4_t f0 = vdup_n_u16(32 - fraction);
104
+ const uint16x4_t f1 = vdup_n_u16(fraction);
105
for (int x = 0; x < width; x += 4)
106
{
107
- uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
108
- uint16x4_t in1 = *(uint16x4_t *)&ref[offset + x + 1];
109
- int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
110
- lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
111
- lo = vshrq_n_s32(lo, 5);
112
- *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
113
+ uint16x4_t in0 = vld1_u16(ref_u16 + x);
114
+ uint16x4_t in1 = vld1_u16(ref_u16 + x + 1);
115
+ uint32x4_t lo = vmlal_u16(vdupq_n_u32(16), in0, f0);
116
+ lo = vmlal_u16(lo, in1, f1);
117
+ uint16x4_t res = vshrn_n_u32(lo, 5);
118
+ vst1_u16(dst_u16 + y * dstStride + x, res);
119
}
120
}
121
else
122
123
}
124
}
125
126
+#endif
127
template<int log2Size>
128
void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
129
{
130
131
}
132
}
133
}
134
+
135
+template<int log2Size>
136
+void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int /*dirMode*/, int /*bFilter*/)
137
+{
138
+ const int blkSize = 1 << log2Size;
139
+
140
+ const pixel* above = srcPix + 1;
141
+ const pixel* left = srcPix + (2 * blkSize + 1);
142
+
143
+ switch (blkSize) {
144
+ case 8:
145
+ {
146
+ const uint16_t log2SizePlusOne = log2Size + 1;
147
+ uint16x8_t blkSizeVec = vdupq_n_u16(blkSize);
148
+ uint16x8_t topRight = vdupq_n_u16(above[blkSize]);
149
+ uint16_t bottomLeft = left[blkSize];
150
+ uint16x8_t oneVec = vdupq_n_u16(1);
151
+ uint16x8_t blkSizeSubOneVec = vdupq_n_u16(blkSize - 1);
152
+
153
+ for (int y = 0; y < blkSize; y++) {
154
+ // (blkSize - 1 - y)
155
+ uint16x8_t vlkSizeYVec = vdupq_n_u16(blkSize - 1 - y);
156
+ // (y + 1) * bottomLeft
157
+ uint16x8_t bottomLeftYVec = vdupq_n_u16((y + 1) * bottomLeft);
158
+ // left[y]
159
+ uint16x8_t leftYVec = vdupq_n_u16(left[y]);
160
+
161
+ for (int x = 0; x < blkSize; x += 8) {
162
+ int idx = y * dstStride + x;
163
+ uint16x8_t xvec = { (uint16_t)(x + 0), (uint16_t)(x + 1),
164
+ (uint16_t)(x + 2), (uint16_t)(x + 3),
165
+ (uint16_t)(x + 4), (uint16_t)(x + 5),
166
+ (uint16_t)(x + 6), (uint16_t)(x + 7) };
167
+
168
+ // (blkSize - 1 - y) * above[x]
169
+ uint16x8_t aboveVec = { (uint16_t)(above[x + 0]),
170
+ (uint16_t)(above[x + 1]),
171
+ (uint16_t)(above[x + 2]),
172
+ (uint16_t)(above[x + 3]),
173
+ (uint16_t)(above[x + 4]),
174
+ (uint16_t)(above[x + 5]),
175
+ (uint16_t)(above[x + 6]),
176
+ (uint16_t)(above[x + 7]) };
177
+
178
+ aboveVec = vmulq_u16(aboveVec, vlkSizeYVec);
179
+
180
+ // (blkSize - 1 - x) * left[y]
181
+ uint16x8_t first = vsubq_u16(blkSizeSubOneVec, xvec);
182
+ first = vmulq_u16(first, leftYVec);
183
+
184
+ // (x + 1) * topRight
185
+ uint16x8_t second = vaddq_u16(xvec, oneVec);
186
+ second = vmulq_u16(second, topRight);
187
+
188
+ uint16x8_t resVec = vaddq_u16(first, second);
189
+ resVec = vaddq_u16(resVec, aboveVec);
190
+ resVec = vaddq_u16(resVec, bottomLeftYVec);
191
+ resVec = vaddq_u16(resVec, blkSizeVec);
192
+ resVec = vshrq_n_u16(resVec, log2SizePlusOne);
193
+
194
+ for (int i = 0; i < 8; i++)
195
+ dst[idx + i] = (pixel)resVec[i];
196
+ }
197
+}
198
+ }
199
+ break;
200
+ case 4:
201
x265_4.0.tar.gz/source/common/aarch64/intrapred.S
Added
173
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+
28
+#include "asm.S"
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.align 4
41
+tbl_const_1to8_7to0:
42
+ .byte 1, 2, 3, 4, 5, 6, 7, 8
43
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
44
+ .byte 9, 10, 11, 12, 13, 14, 15, 16
45
+ .byte 15, 14, 13, 12, 11, 10, 9, 8
46
+
47
+// ***** planar_pred *****
48
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
49
+function PFX(intra_pred_planar8_neon)
50
+// Register map
51
+// x0 = dst
52
+// x1 = dstStride
53
+// x2 = *srcPix
54
+// x3 = leftx
55
+// x4 = tmp
56
+// v0 = above7:0
57
+// v1 = left7:0
58
+// v2 = topRight = rep(aboveblkSize)
59
+// v3 = bottomLeft = rep(leftblkSize)
60
+// v4 = const8 7 6 5 4 3 2 1
61
+// v5 = const7 6 5 4 3 2 1 0
62
+
63
+//{
64
+// const int blkSize = 1 << log2Size;
65
+// const pixel* above = srcPix + 1;
66
+// const pixel* left = srcPix + (2 * blkSize + 1);
67
+// pixel topRight = above[blkSize];
68
+// pixel bottomLeft = left[blkSize];
69
+// for (int y = 0; y < blkSize; y++)
70
+// for (int x = 0; x < blkSize; x++)
71
+// dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 - y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
72
+//}
73
+
74
+ ldurb w3, x2, #(1+8) // topRight
75
+ ldurb w4, x2, #(2*8+1+8) // bottomLeft
76
+ dup v2.8b, w3 // v2 = topRight_b
77
+ dup v3.8h, w4 // v3 = bottomLeft_h
78
+ ldr x3, x2, #(2*8+1) // x3 = leftx_b
79
+ ldr d0, x2, #1 // v0 = abovex_b
80
+
81
+ adr x4, tbl_const_1to8_7to0
82
+ ldr d4, x4 // v4 = const_b8 7 6 5 4 3 2 1
83
+ ldr d5, x4, #8 // v5 = const_b7 6 5 4 3 2 1 0
84
+
85
+ ushll v6.8h, v0.8b, #3 // v6 = 8 * abovex
86
+ usubw v0.8h, v3.8h, v0.8b // v0 = bottomLeft - abovex
87
+
88
+ umlal v6.8h, v4.8b, v2.8b // v6 = 8 * abovex + (x + 1) * topRight
89
+
90
+ mov w4, #8
91
+
92
+1:
93
+ dup v1.8b, w3
94
+ lsr x3, x3, #8
95
+ add v6.8h, v6.8h, v0.8h // v6 = (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft
96
+ mov v3.16b, v6.16b
97
+ umlal v3.8h, v5.8b, v1.8b // v3 = (blkSize - 1 - x) * lefty=0 + (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft
98
+ rshrn v3.8b, v3.8h, #4
99
+ sub w4, w4, #1
100
+ st1 {v3.8b}, x0, x1
101
+ cbnz w4, 1b
102
+
103
+ ret
104
+endfunc
105
+
106
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
107
+function PFX(intra_pred_planar16_neon)
108
+// Register map
109
+// x0 = dst
110
+// x1 = dstStride
111
+// x2 = *srcPix
112
+// x3 = leftx
113
+// x4 = tmp
114
+// v0 = above7:0
115
+// v1 = left7:0
116
+// v2 = topRight = rep(aboveblkSize)
117
+// v3 = bottomLeft = rep(leftblkSize)
118
+// v4 = const16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
119
+// v5 = const15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
120
+
121
+//{
122
+// const int blkSize = 1 << log2Size;
123
+// const pixel* above = srcPix + 1;
124
+// const pixel* left = srcPix + (2 * blkSize + 1);
125
+// pixel topRight = above[blkSize];
126
+// pixel bottomLeft = left[blkSize];
127
+// for (int y = 0; y < blkSize; y++)
128
+// for (int x = 0; x < blkSize; x++)
129
+// dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 - y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
130
+//}
131
+
132
+ ldurb w3, x2, #(1+16) // topRight
133
+ ldurb w4, x2, #(2*16+1+16) // bottomLeft
134
+ ldr q0, x2, #(2*16+1) // v0 = leftx_b
135
+ ldr q1, x2, #1 // v1 = abovex_b
136
+ dup v2.16b, w3 // v2 = topRight_b
137
+ dup v3.8h, w4 // v3 = bottomLeft_h
138
+
139
+ adr x4, tbl_const_1to8_7to0
140
+ ld2 {v4.2d, v5.2d}, x4 // v4 = const_b16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
141
+ ext v5.16b, v5.16b, v5.16b, #8 // v5 = const_b15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
142
+
143
+ ushll v16.8h, v1.8b, #4 // v16,v17 = 16 * abovex
144
+ ushll2 v17.8h, v1.16b, #4
145
+ usubw v6.8h, v3.8h, v1.8b // v6,v7 = bottomLeft - abovex
146
+ usubw2 v7.8h, v3.8h, v1.16b
147
+
148
+ umlal v16.8h, v4.8b, v2.8b // v16,v17 = 16 * abovex + (x + 1) * topRight
149
+ umlal2 v17.8h, v4.16b, v2.16b
150
+
151
+ mov w4, #16
152
+
153
+1:
154
+ dup v1.16b, v0.b0 // v1 = leftx_b
155
+ ext v0.16b, v0.16b, v0.16b, #1
156
+
157
+ add v16.8h, v16.8h, v6.8h // v16,v17 = (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft
158
+ add v17.8h, v17.8h, v7.8h
159
+
160
+ mov v18.16b, v16.16b
161
+ mov v19.16b, v17.16b
162
+
163
+ umlal v18.8h, v5.8b, v1.8b // v3 = (blkSize - 1 - x) * lefty=0 + (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft
164
+ umlal2 v19.8h, v5.16b, v1.16b
165
+ rshrn v18.8b, v18.8h, #5
166
+ rshrn2 v18.16b, v19.8h, #5
167
+ st1 {v18.16b}, x0, x1
168
+ sub w4, w4, #1
169
+ cbnz w4, 1b
170
+
171
+ ret
172
+endfunc
173
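The commented reference inside both assembly functions above is the HEVC planar prediction. Restated as a compilable sketch (illustrative only, using uint8_t in place of the pixel typedef and a hypothetical function name):

    // Planar prediction: each output is a weighted blend of the above row, the
    // left column, the top-right and the bottom-left reference samples.
    static void planar_pred_ref(uint8_t *dst, intptr_t dstStride,
                                const uint8_t *srcPix, int log2Size)
    {
        const int blkSize = 1 << log2Size;
        const uint8_t *above = srcPix + 1;
        const uint8_t *left  = srcPix + (2 * blkSize + 1);
        const int topRight   = above[blkSize];
        const int bottomLeft = left[blkSize];

        for (int y = 0; y < blkSize; y++)
            for (int x = 0; x < blkSize; x++)
                dst[y * dstStride + x] = (uint8_t)(((blkSize - 1 - x) * left[y] +
                                                    (blkSize - 1 - y) * above[x] +
                                                    (x + 1) * topRight +
                                                    (y + 1) * bottomLeft +
                                                    blkSize) >> (log2Size + 1));
    }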
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/loopfilter-prim.cpp
Changed
201
1
2
+#include "common.h"
3
#include "loopfilter-prim.h"
4
5
#define PIXEL_MIN 0
6
7
{
8
9
10
-/* get the sign of input variable (TODO: this is a dup, make common) */
11
-static inline int8_t signOf(int x)
12
-{
13
- return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
14
-}
15
-
16
static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
17
{
18
- int16x8_t in = vsubl_u8(in0, in1);
19
+ int16x8_t in = vreinterpretq_s16_u16(vsubl_u8(in0, in1));
20
+
21
return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
22
}
23
24
25
int x = 0;
26
for (; (x + 8) <= endX; x += 8)
27
{
28
- *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
29
+ int8x8_t sign = sign_diff_neon(vld1_u8(src1 + x), vld1_u8(src2 + x));
30
+ vst1_s8(dst + x, sign);
31
}
32
33
for (; x < endX; x++)
34
{
35
- dst[x] = signOf(src1[x] - src2[x]);
36
+ dst[x] = x265_signOf(src1[x] - src2[x]);
37
}
38
}
39
40
41
int8x8x2_t shifter;
42
shifter.val[1][0] = signLeft[0];
43
static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
44
- int8x8_t tbl = *(int8x8_t *)offsetEo;
45
+ int8x8_t tbl = vld1_s8(offsetEo);
46
for (; (x + 8) <= width; x += 8)
47
{
48
- uint8x8_t in = *(uint8x8_t *)&recx;
49
- vsignRight = sign_diff_neon(in, *(uint8x8_t *)&recx + 1);
50
+ uint8x8_t in = vld1_u8(rec + x);
51
+ vsignRight = sign_diff_neon(in, vld1_u8(rec + x + 1));
52
shifter.val0 = vneg_s8(vsignRight);
53
int8x8_t tmp = shifter.val0;
54
int8x8_t edge = vtbl2_s8(shifter, index);
55
int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
56
shifter.val10 = tmp7;
57
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
58
- t1 = vaddw_u8(t1, in);
59
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
60
- t1 = vminq_s16(t1, vdupq_n_s16(255));
61
- *(uint8x8_t *)&recx = vmovn_u16(t1);
62
+ t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
63
+ in));
64
+ vst1_u8(rec + x, vqmovun_s16(t1));
65
}
66
signLeft0 = shifter.val10;
67
}
68
69
70
if (width >= 8)
71
{
72
- int8x8_t tbl = *(int8x8_t *)offsetEo;
73
+ int8x8_t tbl = vld1_s8(offsetEo);
74
+ const int8x8_t c = vdup_n_s8(2);
75
+
76
for (; (x + 8) <= width; x += 8)
77
{
78
- uint8x8_t in0 = *(uint8x8_t *)&recx;
79
- uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
80
+ uint8x8_t in0 = vld1_u8(rec + x);
81
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
82
int8x8_t vsignDown = sign_diff_neon(in0, in1);
83
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
84
- *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown);
85
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
86
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
87
+ vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
88
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
89
- t1 = vaddw_u8(t1, in0);
90
- *(uint8x8_t *)&recx = vqmovun_s16(t1);
91
+ t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
92
+ in0));
93
+ vst1_u8(rec + x, vqmovun_s16(t1));
94
}
95
}
96
for (; x < width; x++)
97
{
98
- signDown = signOf(recx - recx + stride);
99
+ signDown = x265_signOf(recx - recx + stride);
100
edgeType = signDown + upBuff1x + 2;
101
upBuff1x = -signDown;
102
recx = x265_clip(recx + offsetEoedgeType);
103
104
int x = 0;
105
if (width >= 8)
106
{
107
- int8x8_t tbl = *(int8x8_t *)offsetEo;
108
+ int8x8_t tbl = vld1_s8(offsetEo);
109
+ const int8x8_t c = vdup_n_s8(2);
110
+
111
for (; (x + 8) <= width; x += 8)
112
{
113
- uint8x8_t in0 = *(uint8x8_t *)&recx;
114
- uint8x8_t in1 = *(uint8x8_t *)&recx + stride;
115
+ uint8x8_t in0 = vld1_u8(rec + x);
116
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
117
int8x8_t vsignDown = sign_diff_neon(in0, in1);
118
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2));
119
- *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown);
120
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
121
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
122
+ vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
123
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
124
- t1 = vaddw_u8(t1, in0);
125
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
126
- t1 = vminq_s16(t1, vdupq_n_s16(255));
127
- *(uint8x8_t *)&recx = vmovn_u16(t1);
128
-
129
+ t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
130
+ in0));
131
+ vst1_u8(rec + x, vqmovun_s16(t1));
132
}
133
}
134
for (; x < width; x++)
135
{
136
- signDown = signOf(recx - recx + stride);
137
+ signDown = x265_signOf(recx - recx + stride);
138
edgeType = signDown + upBuff1x + 2;
139
upBuff1x = -signDown;
140
recx = x265_clip(recx + offsetEoedgeType);
141
142
{
143
int x;
144
145
- if (abs(buff1 - bufft) < 16)
146
+ if (abs(static_cast<int>(buff1 - bufft)) < 16)
147
{
148
for (x = 0; x < width; x++)
149
{
150
- int8_t signDown = signOf(recx - recx + stride + 1);
151
+ int8_t signDown = x265_signOf(recx - recx + stride + 1);
152
int edgeType = signDown + buff1x + 2;
153
bufftx + 1 = -signDown;
154
recx = x265_clip(recx + offsetEoedgeType);;
155
156
}
157
else
158
{
159
- int8x8_t tbl = *(int8x8_t *)offsetEo;
160
+ int8x8_t tbl = vld1_s8(offsetEo);
161
+ const int8x8_t c = vdup_n_s8(2);
162
+
163
x = 0;
164
for (; (x + 8) <= width; x += 8)
165
{
166
- uint8x8_t in0 = *(uint8x8_t *)&recx;
167
- uint8x8_t in1 = *(uint8x8_t *)&recx + stride + 1;
168
+ uint8x8_t in0 = vld1_u8(rec + x);
169
+ uint8x8_t in1 = vld1_u8(rec + x + stride + 1);
170
int8x8_t vsignDown = sign_diff_neon(in0, in1);
171
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1x), vdup_n_s8(2));
172
- *(int8x8_t *)&bufftx + 1 = vneg_s8(vsignDown);
173
+ int8x8_t vsignUp = vld1_s8(buff1 + x);
174
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
175
+ vst1_s8(bufft + x + 1, vneg_s8(vsignDown));
176
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
177
- t1 = vaddw_u8(t1, in0);
178
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
179
- t1 = vminq_s16(t1, vdupq_n_s16(255));
180
- *(uint8x8_t *)&recx = vmovn_u16(t1);
181
+ t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
182
+ in0));
183
+ vst1_u8(rec + x, vqmovun_s16(t1));
184
}
185
for (; x < width; x++)
186
{
187
- int8_t signDown = signOf(recx - recx + stride + 1);
188
+ int8_t signDown = x265_signOf(recx - recx + stride + 1);
189
int edgeType = signDown + buff1x + 2;
190
bufftx + 1 = -signDown;
191
recx = x265_clip(recx + offsetEoedgeType);;
192
193
{
194
int8_t signDown;
195
int8_t edgeType;
196
- int8x8_t tbl = *(int8x8_t *)offsetEo;
197
+ int8x8_t tbl = vld1_s8(offsetEo);
198
+ const int8x8_t c = vdup_n_s8(2);
199
200
int x = startX + 1;
201
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a-sve2.S
Changed
201
1
2
mov x11, #0
3
whilelt p0.b, x11, x10
4
mov w12, #8
5
-.loop_gt_32_pixel_avg_pp_48x64:
6
+.Loop_gt_32_pixel_avg_pp_48x64:
7
sub w12, w12, #1
8
.rept 8
9
ld1b {z0.b}, p0/z, x2
10
11
st1b {z0.b}, p0, x0
12
add x0, x0, x1
13
.endr
14
- cbnz w12, .loop_gt_32_pixel_avg_pp_48x64
15
+ cbnz w12, .Loop_gt_32_pixel_avg_pp_48x64
16
ret
17
endfunc
18
19
20
mov w12, #\h / 2
21
ptrue p0.b, vl16
22
ptrue p2.h, vl6
23
-.loop_sve2_addavg_6x\h\():
24
+.Loop_sve2_addavg_6x\h\():
25
sub w12, w12, #1
26
ld1b {z0.b}, p0/z, x0
27
ld1b {z1.b}, p0/z, x1
28
29
add x2, x2, x5
30
st1b {z2.h}, p2, x2
31
add x2, x2, x5
32
- cbnz w12, .loop_sve2_addavg_6x\h
33
+ cbnz w12, .Loop_sve2_addavg_6x\h
34
ret
35
endfunc
36
.endm
37
38
function PFX(addAvg_8x\h\()_sve2)
39
mov w12, #\h / 2
40
ptrue p0.b, vl16
41
-.loop_sve2_addavg_8x\h\():
42
+.Loop_sve2_addavg_8x\h\():
43
sub w12, w12, #1
44
ld1b {z0.b}, p0/z, x0
45
ld1b {z1.b}, p0/z, x1
46
47
add x2, x2, x5
48
st1b {z2.h}, p0, x2
49
add x2, x2, x5
50
- cbnz w12, .loop_sve2_addavg_8x\h
51
+ cbnz w12, .Loop_sve2_addavg_8x\h
52
ret
53
endfunc
54
.endm
55
56
bgt .vl_gt_16_addAvg_12x\h
57
ptrue p0.b, vl16
58
ptrue p1.b, vl8
59
-.loop_sve2_addavg_12x\h\():
60
+.Loop_sve2_addavg_12x\h\():
61
sub w12, w12, #1
62
ld1b {z0.b}, p0/z, x0
63
ld1b {z1.b}, p0/z, x1
64
65
st1b {z0.h}, p0, x2
66
st1b {z2.h}, p1, x2, #1, mul vl
67
add x2, x2, x5
68
- cbnz w12, .loop_sve2_addavg_12x\h
69
+ cbnz w12, .Loop_sve2_addavg_12x\h
70
ret
71
.vl_gt_16_addAvg_12x\h\():
72
mov x10, #24
73
mov x11, #0
74
whilelt p0.b, x11, x10
75
-.loop_sve2_gt_16_addavg_12x\h\():
76
+.Loop_sve2_gt_16_addavg_12x\h\():
77
sub w12, w12, #1
78
ld1b {z0.b}, p0/z, x0
79
ld1b {z1.b}, p0/z, x1
80
81
add z2.b, z2.b, #0x80
82
st1b {z0.h}, p0, x2
83
add x2, x2, x5
84
- cbnz w12, .loop_sve2_gt_16_addavg_12x\h
85
+ cbnz w12, .Loop_sve2_gt_16_addavg_12x\h
86
ret
87
endfunc
88
.endm
89
90
cmp x9, #16
91
bgt .vl_gt_16_addAvg_16x\h
92
ptrue p0.b, vl16
93
-.loop_eq_16_sve2_addavg_16x\h\():
94
+.Loop_eq_16_sve2_addavg_16x\h\():
95
sub w12, w12, #1
96
ld1b {z0.b}, p0/z, x0
97
ld1b {z1.b}, p0/z, x1
98
99
st1b {z0.h}, p0, x2
100
st1b {z2.h}, p0, x2, #1, mul vl
101
add x2, x2, x5
102
- cbnz w12, .loop_eq_16_sve2_addavg_16x\h
103
+ cbnz w12, .Loop_eq_16_sve2_addavg_16x\h
104
ret
105
.vl_gt_16_addAvg_16x\h\():
106
cmp x9, #32
107
bgt .vl_gt_32_addAvg_16x\h
108
ptrue p0.b, vl32
109
-.loop_gt_16_sve2_addavg_16x\h\():
110
+.Loop_gt_16_sve2_addavg_16x\h\():
111
sub w12, w12, #1
112
ld1b {z0.b}, p0/z, x0
113
ld1b {z1.b}, p0/z, x1
114
115
add z0.b, z0.b, #0x80
116
st1b {z0.h}, p1, x2
117
add x2, x2, x5
118
- cbnz w12, .loop_gt_16_sve2_addavg_16x\h
119
+ cbnz w12, .Loop_gt_16_sve2_addavg_16x\h
120
ret
121
.vl_gt_32_addAvg_16x\h\():
122
mov x10, #48
123
mov x11, #0
124
whilelt p0.b, x11, x10
125
-.loop_gt_32_sve2_addavg_16x\h\():
126
+.Loop_gt_32_sve2_addavg_16x\h\():
127
sub w12, w12, #1
128
ld1b {z0.b}, p0/z, x0
129
add x0, x0, x3, lsl #1
130
131
add z0.b, z0.b, #0x80
132
st1b {z0.h}, p0, x2
133
add x2, x2, x5
134
- cbnz w12, .loop_gt_32_sve2_addavg_16x\h
135
+ cbnz w12, .Loop_gt_32_sve2_addavg_16x\h
136
ret
137
endfunc
138
.endm
139
140
cmp x9, #16
141
bgt .vl_gt_16_addAvg_24x\h
142
addAvg_start
143
-.loop_eq_16_sve2_addavg_24x\h\():
144
+.Loop_eq_16_sve2_addavg_24x\h\():
145
sub w12, w12, #1
146
ld1 {v0.16b-v2.16b}, x0, x3
147
ld1 {v3.16b-v5.16b}, x1, x4
148
149
sqxtun v1.8b, v1.8h
150
sqxtun v2.8b, v2.8h
151
st1 {v0.8b-v2.8b}, x2, x5
152
- cbnz w12, .loop_eq_16_sve2_addavg_24x\h
153
+ cbnz w12, .Loop_eq_16_sve2_addavg_24x\h
154
ret
155
.vl_gt_16_addAvg_24x\h\():
156
cmp x9, #48
157
bgt .vl_gt_48_addAvg_24x\h
158
ptrue p0.b, vl32
159
ptrue p1.b, vl16
160
-.loop_gt_16_sve2_addavg_24x\h\():
161
+.Loop_gt_16_sve2_addavg_24x\h\():
162
sub w12, w12, #1
163
ld1b {z0.b}, p0/z, x0
164
ld1b {z1.b}, p1/z, x0, #1, mul vl
165
166
st1b {z0.h}, p0, x2
167
st1b {z1.h}, p1, x2, #1, mul vl
168
add x2, x2, x5
169
- cbnz w12, .loop_gt_16_sve2_addavg_24x\h
170
+ cbnz w12, .Loop_gt_16_sve2_addavg_24x\h
171
ret
172
.vl_gt_48_addAvg_24x\h\():
173
mov x10, #48
174
mov x11, #0
175
whilelt p0.b, x11, x10
176
-.loop_gt_48_sve2_addavg_24x\h\():
177
+.Loop_gt_48_sve2_addavg_24x\h\():
178
sub w12, w12, #1
179
ld1b {z0.b}, p0/z, x0
180
ld1b {z2.b}, p0/z, x1
181
182
add z0.b, z0.b, #0x80
183
st1b {z0.h}, p0, x2
184
add x2, x2, x5
185
- cbnz w12, .loop_gt_48_sve2_addavg_24x\h
186
+ cbnz w12, .Loop_gt_48_sve2_addavg_24x\h
187
ret
188
endfunc
189
.endm
190
191
cmp x9, #16
192
bgt .vl_gt_16_addAvg_32x\h
193
ptrue p0.b, vl16
194
-.loop_eq_16_sve2_addavg_32x\h\():
195
+.Loop_eq_16_sve2_addavg_32x\h\():
196
sub w12, w12, #1
197
ld1b {z0.b}, p0/z, x0
198
ld1b {z1.b}, p0/z, x0, #1, mul vl
199
200
st1b {z2.h}, p0, x2, #2, mul vl
201
x265_3.6.tar.gz/source/common/aarch64/mc-a.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a.S
Changed
145
1
2
addAvg_start
3
mov w12, #\h / 2
4
sub x5, x5, #4
5
-.loop_addavg_6x\h:
6
+.Loop_addavg_6x\h:
7
sub w12, w12, #1
8
ld1 {v0.16b}, x0, x3
9
ld1 {v1.16b}, x1, x4
10
11
st1 {v0.h}2, x2, x5
12
str s1, x2, #4
13
st1 {v1.h}2, x2, x5
14
- cbnz w12, .loop_addavg_6x\h
15
+ cbnz w12, .Loop_addavg_6x\h
16
ret
17
endfunc
18
.endm
19
20
function PFX(addAvg_8x\h\()_neon)
21
addAvg_start
22
mov w12, #\h / 2
23
-.loop_addavg_8x\h:
24
+.Loop_addavg_8x\h:
25
sub w12, w12, #1
26
ld1 {v0.16b}, x0, x3
27
ld1 {v1.16b}, x1, x4
28
29
sqxtun v1.8b, v1.8h
30
st1 {v0.8b}, x2, x5
31
st1 {v1.8b}, x2, x5
32
- cbnz w12, .loop_addavg_8x\h
33
+ cbnz w12, .Loop_addavg_8x\h
34
ret
35
endfunc
36
.endm
37
38
sub x4, x4, #16
39
sub x5, x5, #8
40
mov w12, #\h
41
-.loop_addAvg_12X\h\():
42
+.Loop_addAvg_12X\h\():
43
sub w12, w12, #1
44
ld1 {v0.16b}, x0, #16
45
ld1 {v1.16b}, x1, #16
46
47
sqxtun v1.8b, v1.8h
48
st1 {v0.8b}, x2, #8
49
st1 {v1.s}0, x2, x5
50
- cbnz w12, .loop_addAvg_12X\h
51
+ cbnz w12, .Loop_addAvg_12X\h
52
ret
53
endfunc
54
.endm
55
56
function PFX(addAvg_16x\h\()_neon)
57
addAvg_start
58
mov w12, #\h
59
-.loop_addavg_16x\h:
60
+.Loop_addavg_16x\h:
61
sub w12, w12, #1
62
ld1 {v0.8h-v1.8h}, x0, x3
63
ld1 {v2.8h-v3.8h}, x1, x4
64
65
sqxtun v0.8b, v0.8h
66
sqxtun2 v0.16b, v1.8h
67
st1 {v0.16b}, x2, x5
68
- cbnz w12, .loop_addavg_16x\h
69
+ cbnz w12, .Loop_addavg_16x\h
70
ret
71
endfunc
72
.endm
73
74
function PFX(addAvg_24x\h\()_neon)
75
addAvg_start
76
mov w12, #\h
77
-.loop_addavg_24x\h\():
78
+.Loop_addavg_24x\h\():
79
sub w12, w12, #1
80
ld1 {v0.16b-v2.16b}, x0, x3
81
ld1 {v3.16b-v5.16b}, x1, x4
82
83
sqxtun v1.8b, v1.8h
84
sqxtun v2.8b, v2.8h
85
st1 {v0.8b-v2.8b}, x2, x5
86
- cbnz w12, .loop_addavg_24x\h
87
+ cbnz w12, .Loop_addavg_24x\h
88
ret
89
endfunc
90
.endm
91
92
function PFX(addAvg_32x\h\()_neon)
93
addAvg_start
94
mov w12, #\h
95
-.loop_addavg_32x\h\():
96
+.Loop_addavg_32x\h\():
97
sub w12, w12, #1
98
ld1 {v0.8h-v3.8h}, x0, x3
99
ld1 {v4.8h-v7.8h}, x1, x4
100
101
sqxtun v2.8b, v2.8h
102
sqxtun v3.8b, v3.8h
103
st1 {v0.8b-v3.8b}, x2, x5
104
- cbnz w12, .loop_addavg_32x\h
105
+ cbnz w12, .Loop_addavg_32x\h
106
ret
107
endfunc
108
.endm
109
110
sub x3, x3, #64
111
sub x4, x4, #64
112
mov w12, #64
113
-.loop_addavg_48x64:
114
+.Loop_addavg_48x64:
115
sub w12, w12, #1
116
ld1 {v0.8h-v3.8h}, x0, #64
117
ld1 {v4.8h-v7.8h}, x1, #64
118
119
sqxtun v2.8b, v20.8h
120
sqxtun2 v2.16b, v21.8h
121
st1 {v0.16b-v2.16b}, x2, x5
122
- cbnz w12, .loop_addavg_48x64
123
+ cbnz w12, .Loop_addavg_48x64
124
ret
125
endfunc
126
127
128
mov w12, #\h
129
sub x3, x3, #64
130
sub x4, x4, #64
131
-.loop_addavg_64x\h\():
132
+.Loop_addavg_64x\h\():
133
sub w12, w12, #1
134
ld1 {v0.8h-v3.8h}, x0, #64
135
ld1 {v4.8h-v7.8h}, x1, #64
136
137
sqxtun v3.8b, v22.8h
138
sqxtun2 v3.16b, v23.8h
139
st1 {v0.16b-v3.16b}, x2, x5
140
- cbnz w12, .loop_addavg_64x\h
141
+ cbnz w12, .Loop_addavg_64x\h
142
ret
143
endfunc
144
.endm
145
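The addAvg kernels above implement bi-prediction averaging: two int16 prediction blocks are summed, a rounding offset is added, and the result is shifted back to pixel precision and saturated (the sqxtun instructions). A rough scalar model, assuming the usual x265 IF_INTERNAL_PREC / IF_INTERNAL_OFFS constants (sketch only, names and constants are not taken from the patch):

    static void addAvg_scalar(const int16_t *src0, const int16_t *src1, uint8_t *dst,
                              intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                              int width, int height)
    {
        const int shift  = 14 + 1 - 8;                     // IF_INTERNAL_PREC + 1 - bit depth (8-bit build)
        const int offset = (1 << (shift - 1)) + 2 * 8192;  // rounding term + 2 * IF_INTERNAL_OFFS
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shift;
                dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // the saturating narrow
            }
            src0 += src0Stride; src1 += src1Stride; dst += dstStride;
        }
    }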
x265_4.0.tar.gz/source/common/aarch64/mem-neon.h
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_MEM_NEON_H
26
+#define X265_COMMON_AARCH64_MEM_NEON_H
27
+
28
+#include <arm_neon.h>
29
+#include <cassert>
30
+#include <stdint.h>
31
+
32
+// Load 4 bytes into the low half of a uint8x8_t, zero the upper half.
33
+static uint8x8_t inline load_u8x4x1(const uint8_t *s)
34
+{
35
+ uint8x8_t ret = vdup_n_u8(0);
36
+
37
+ ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
38
+ vreinterpret_u32_u8(ret), 0));
39
+ return ret;
40
+}
41
+
42
+static uint8x8_t inline load_u8x4x2(const uint8_t *s, intptr_t stride)
43
+{
44
+ uint8x8_t ret = vdup_n_u8(0);
45
+
46
+ ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
47
+ vreinterpret_u32_u8(ret), 0));
48
+ s += stride;
49
+ ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
50
+ vreinterpret_u32_u8(ret), 1));
51
+
52
+ return ret;
53
+}
54
+
55
+// Store 4 bytes from the low half of a uint8x8_t.
56
+static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
57
+{
58
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
59
+}
60
+
61
+// Store N blocks of 32-bits from (N / 2) D-Registers.
62
+template<int N>
63
+static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride,
64
+ const uint8x8_t *s)
65
+{
66
+ assert(N % 2 == 0);
67
+ for (int i = 0; i < N / 2; ++i)
68
+ {
69
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 0);
70
+ d += stride;
71
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 1);
72
+ d += stride;
73
+ }
74
+}
75
+
76
+template<int N>
77
+static void inline load_u8x8xn(const uint8_t *src, const intptr_t stride,
78
+ uint8x8_t *dst)
79
+{
80
+ for (int i = 0; i < N; ++i)
81
+ {
82
+ dst[i] = vld1_u8(src);
83
+ src += stride;
84
+ }
85
+}
86
+
87
+template<int N>
88
+static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
89
+ uint8x16_t *dst)
90
+{
91
+ for (int i = 0; i < N; ++i)
92
+ {
93
+ dst[i] = vld1q_u8(src);
94
+ src += stride;
95
+ }
96
+}
97
+
98
+template<int N>
99
+static void inline store_u8x2xn(uint8_t *dst, intptr_t dst_stride,
100
+ const uint8x8_t *src)
101
+{
102
+ for (int i = 0; i < N; ++i)
103
+ {
104
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(src[i]), 0);
105
+ dst += dst_stride;
106
+ }
107
+}
108
+
109
+template<int N>
110
+static void inline store_u8x4xn(uint8_t *dst, intptr_t dst_stride,
111
+ const uint8x8_t *src)
112
+{
113
+ for (int i = 0; i < N; ++i)
114
+ {
115
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(src[i]), 0);
116
+ dst += dst_stride;
117
+ }
118
+}
119
+
120
+template<int N>
121
+static void inline store_u8x6xn(uint8_t *dst, intptr_t dst_stride,
122
+ const uint8x8_t *src)
123
+{
124
+ for (int i = 0; i < N; ++i)
125
+ {
126
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(src[i]), 0);
127
+ vst1_lane_u16((uint16_t *)(dst + 4), vreinterpret_u16_u8(src[i]), 2);
128
+ dst += dst_stride;
129
+ }
130
+}
131
+
132
+template<int N>
133
+static void inline store_u8x8xn(uint8_t *dst, intptr_t dst_stride,
134
+ const uint8x8_t *src)
135
+{
136
+ for (int i = 0; i < N; ++i)
137
+ {
138
+ vst1_u8(dst, src[i]);
139
+ dst += dst_stride;
140
+ }
141
+}
142
+
143
+template<int N, int M>
144
+static void inline store_u8xnxm(uint8_t *dst, intptr_t dst_stride,
145
+ const uint8x8_t *src)
146
+{
147
+ switch (N)
148
+ {
149
+ case 2: return store_u8x2xn<M>(dst, dst_stride, src);
150
+ case 4: return store_u8x4xn<M>(dst, dst_stride, src);
151
+ case 6: return store_u8x6xn<M>(dst, dst_stride, src);
152
+ case 8: return store_u8x8xn<M>(dst, dst_stride, src);
153
+ }
154
+}
155
+
156
+template<int N>
157
+static void inline store_u8x16xn(uint8_t *dst, intptr_t dst_stride,
158
+ const uint8x16_t *src)
159
+{
160
+ for (int i = 0; i < N; ++i)
161
+ {
162
+ vst1q_u8(dst, src[i]);
163
+ dst += dst_stride;
164
+ }
165
+}
166
+
167
+template<int N>
168
+static void inline load_s16x4xn(const int16_t *src, const intptr_t stride,
169
+ int16x4_t *dst)
170
+{
171
+ for (int i = 0; i < N; ++i)
172
+ {
173
+ dst[i] = vld1_s16(src);
174
+ src += stride;
175
+ }
176
+}
177
+
178
+template<int N>
179
+static void inline load_s16x8xn(const int16_t *src, const intptr_t stride,
180
+ int16x8_t *dst)
181
+{
182
+ for (int i = 0; i < N; ++i)
183
+ {
184
+ dst[i] = vld1q_s16(src);
185
+ src += stride;
186
+ }
187
+}
188
+
189
+template<int N>
190
+static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
191
+ const int16x4_t *src)
192
+{
193
+ for (int i = 0; i < N; ++i)
194
+ {
195
+ vst1_lane_s32((int32_t*)dst, vreinterpret_s32_s16(src[i]), 0);
196
+ dst += dst_stride;
197
+ }
198
+}
199
+
200
+template<int N>
201
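mem-neon.h collects small templated load/store wrappers so the intrinsic kernels can read or write N rows of a block with one call; the row count is a compile-time template argument and the loops unroll away. A minimal usage sketch (hypothetical caller, not from the patch):

    // Copy an 8x4 block of bytes using the helpers above.
    static void copy_8x4(uint8_t *dst, intptr_t dstStride,
                         const uint8_t *src, intptr_t srcStride)
    {
        uint8x8_t rows[4];
        load_u8x8xn<4>(src, srcStride, rows);    // rows[i] = 8 bytes from src + i * srcStride
        store_u8x8xn<4>(dst, dstStride, rows);   // writes the 4 rows back out
    }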
x265_4.0.tar.gz/source/common/aarch64/neon-sve-bridge.h
Added
69
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ * Jonathan Wright <jonathan.wright@arm.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
27
+#define X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
28
+
29
+#include <arm_neon.h>
30
+
31
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
32
+#include <arm_sve.h>
33
+#include <arm_neon_sve_bridge.h>
34
+
35
+/* We can access instructions that are exclusive to the SVE or SVE2 instruction
36
+ * sets from a predominantly Neon context by making use of the Neon-SVE bridge
37
+ * intrinsics to reinterpret Neon vectors as SVE vectors - with the high part of
38
+ * the SVE vector (if it's longer than 128 bits) being "don't care".
39
+ *
40
+ * While sub-optimal on machines that have SVE vector length > 128-bit - as the
41
+ * remainder of the vector is unused - this approach is still beneficial when
42
+ * compared to a Neon-only implementation. */
43
+
44
+static inline int32x4_t x265_vld1sh_s32(const int16_t *ptr)
45
+{
46
+ return svget_neonq_s32(svld1sh_s32(svptrue_pat_b32(SV_VL4), ptr));
47
+}
48
+
49
+static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y)
50
+{
51
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
52
+ svset_neonq_s16(svundef_s16(), x),
53
+ svset_neonq_s16(svundef_s16(), y)));
54
+}
55
+
56
+static inline int8x16_t x265_sve_mask(const int x, const int endX,
57
+ const int8x16_t in)
58
+{
59
+ // Use predicate to shift "unused lanes" outside of range [-2, 2]
60
+ svbool_t svpred = svwhilelt_b8(x, endX);
61
+ svint8_t edge_type = svsel_s8(svpred, svset_neonq_s8(svundef_s8(), in),
62
+ svdup_n_s8(-3));
63
+ return svget_neonq_s8(edge_type);
64
+}
65
+
66
+#endif // defined(HAVE_SVE) && HAVE_SVE_BRIDGE
67
+
68
+#endif // X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
69
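Note on the bridge helpers above: x265_sdotq_s16() uses the SVE SDOT form that folds four int16 products into each 64-bit accumulator lane. A minimal scalar reference for it, useful when sanity-checking the bridge helper (the name sdotq_s16_ref is ours, not part of x265):

#include <arm_neon.h>
#include <cstdint>

// Scalar model of x265_sdotq_s16(): 64-bit lane j gains the dot product of
// int16 pairs 4*j .. 4*j+3 from x and y.
static inline int64x2_t sdotq_s16_ref(int64x2_t acc, int16x8_t x, int16x8_t y)
{
    int64_t a[2];
    int16_t xb[8], yb[8];
    vst1q_s64(a, acc);
    vst1q_s16(xb, x);
    vst1q_s16(yb, y);
    for (int j = 0; j < 2; j++)
        for (int k = 0; k < 4; k++)
            a[j] += (int64_t)xb[4 * j + k] * yb[4 * j + k];
    return vld1q_s64(a);
}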
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S -> x265_4.0.tar.gz/source/common/aarch64/p2s-sve.S
Changed
55
1
2
#else
3
p2s_start
4
mov x9, #\h
5
-.loop_filter_sve_P2S_32x\h:
6
+.Loop_filter_sve_P2S_32x\h:
7
sub x9, x9, #1
8
ld1 {v0.16b-v1.16b}, x0, x1
9
ushll v22.8h, v0.8b, #P2S_SHIFT
10
11
add v24.8h, v24.8h, v31.8h
12
add v25.8h, v25.8h, v31.8h
13
st1 {v22.16b-v25.16b}, x2, x3
14
- cbnz x9, .loop_filter_sve_P2S_32x\h
15
+ cbnz x9, .Loop_filter_sve_P2S_32x\h
16
ret
17
#endif
18
endfunc
19
20
p2s_start
21
sub x3, x3, #64
22
mov x9, #\h
23
-.loop_filter_sve_P2S_64x\h:
24
+.Loop_filter_sve_P2S_64x\h:
25
sub x9, x9, #1
26
ld1 {v0.16b-v3.16b}, x0, x1
27
ushll v16.8h, v0.8b, #P2S_SHIFT
28
29
add v23.8h, v23.8h, v31.8h
30
st1 {v16.16b-v19.16b}, x2, #64
31
st1 {v20.16b-v23.16b}, x2, x3
32
- cbnz x9, .loop_filter_sve_P2S_64x\h
33
+ cbnz x9, .Loop_filter_sve_P2S_64x\h
34
ret
35
#endif
36
endfunc
37
38
p2s_start
39
sub x3, x3, #64
40
mov x9, #64
41
-.loop_filterP2S_sve_48x64:
42
+.Loop_filterP2S_sve_48x64:
43
sub x9, x9, #1
44
ld1 {v0.16b-v2.16b}, x0, x1
45
ushll v16.8h, v0.8b, #P2S_SHIFT
46
47
add v21.8h, v21.8h, v31.8h
48
st1 {v16.16b-v19.16b}, x2, #64
49
st1 {v20.16b-v21.16b}, x2, x3
50
- cbnz x9, .loop_filterP2S_sve_48x64
51
+ cbnz x9, .Loop_filterP2S_sve_48x64
52
ret
53
#endif
54
endfunc
55
x265_3.6.tar.gz/source/common/aarch64/p2s.S -> x265_4.0.tar.gz/source/common/aarch64/p2s.S
Changed
54
1
2
function PFX(filterPixelToShort_32x\h\()_neon)
3
p2s_start
4
mov x9, #\h
5
-.loop_filterP2S_32x\h:
6
+.Loop_filterP2S_32x\h:
7
sub x9, x9, #1
8
#if HIGH_BIT_DEPTH
9
ld1 {v0.16b-v3.16b}, x0, x1
10
11
add v24.8h, v24.8h, v31.8h
12
add v25.8h, v25.8h, v31.8h
13
st1 {v22.16b-v25.16b}, x2, x3
14
- cbnz x9, .loop_filterP2S_32x\h
15
+ cbnz x9, .Loop_filterP2S_32x\h
16
ret
17
endfunc
18
.endm
19
20
#endif
21
sub x3, x3, #64
22
mov x9, #\h
23
-.loop_filterP2S_64x\h:
24
+.Loop_filterP2S_64x\h:
25
sub x9, x9, #1
26
#if HIGH_BIT_DEPTH
27
ld1 {v0.16b-v3.16b}, x0, #64
28
29
add v23.8h, v23.8h, v31.8h
30
st1 {v16.16b-v19.16b}, x2, #64
31
st1 {v20.16b-v23.16b}, x2, x3
32
- cbnz x9, .loop_filterP2S_64x\h
33
+ cbnz x9, .Loop_filterP2S_64x\h
34
ret
35
endfunc
36
.endm
37
38
#endif
39
sub x3, x3, #64
40
mov x9, #64
41
-.loop_filterP2S_48x64:
42
+.Loop_filterP2S_48x64:
43
sub x9, x9, #1
44
#if HIGH_BIT_DEPTH
45
ld1 {v0.16b-v3.16b}, x0, #64
46
47
add v21.8h, v21.8h, v31.8h
48
st1 {v16.16b-v19.16b}, x2, #64
49
st1 {v20.16b-v21.16b}, x2, x3
50
- cbnz x9, .loop_filterP2S_48x64
51
+ cbnz x9, .Loop_filterP2S_48x64
52
ret
53
endfunc
54
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/pixel-prim.cpp
Changed
201
1
2
#include "arm64-utils.h"
3
#if HAVE_NEON
4
5
+#include "mem-neon.h"
6
+
7
#include <arm_neon.h>
8
9
using namespace X265_NS;
10
11
sub = vsubq_s16(a, b);
12
}
13
14
-static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
15
+static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
16
+ const int16x8_t s1, const int16x8_t s2)
17
{
18
t1 = vtrn1q_s16(s1, s2);
19
t2 = vtrn2q_s16(s1, s2);
20
}
21
22
-static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
23
+static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
24
+ const int16x8_t s1, const int16x8_t s2)
25
{
26
- t1 = vtrn1q_s32(s1, s2);
27
- t2 = vtrn2q_s32(s1, s2);
28
+ int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
29
+ int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
30
+
31
+ t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
32
+ t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
33
}
34
35
-#if (X265_DEPTH <= 10)
36
-static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
37
+static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
38
+ const int16x8_t s1, const int16x8_t s2)
39
{
40
- t1 = vtrn1q_s64(s1, s2);
41
- t2 = vtrn2q_s64(s1, s2);
42
-}
43
-#endif
44
+ int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
45
+ int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
46
47
+ t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
48
+ t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
49
+}
50
51
static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
52
int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d)
53
54
SUMSUB_AB(v4 , v6 , v16, v18);
55
SUMSUB_AB(v5 , v7 , v17, v19);
56
57
- v0 = vtrn1q_s16(v4, v5);
58
- v1 = vtrn2q_s16(v4, v5);
59
- v2 = vtrn1q_s16(v6, v7);
60
- v3 = vtrn2q_s16(v6, v7);
61
+ transpose_8h_8h(v0, v1, v4, v5);
62
+ transpose_8h_8h(v2, v3, v6, v7);
63
64
SUMSUB_AB(v16, v17, v0, v1);
65
SUMSUB_AB(v18, v19, v2, v3);
66
67
- v0 = vtrn1q_s32(v16, v18);
68
- v1 = vtrn2q_s32(v16, v18);
69
- v2 = vtrn1q_s32(v17, v19);
70
- v3 = vtrn2q_s32(v17, v19);
71
+ transpose_4s_8h(v0, v1, v16, v18);
72
+ transpose_4s_8h(v2, v3, v17, v19);
73
74
- v0 = vabsq_s16(v0);
75
- v1 = vabsq_s16(v1);
76
- v2 = vabsq_s16(v2);
77
- v3 = vabsq_s16(v3);
78
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
79
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
80
+ uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
81
+ uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
82
83
- v0 = vmaxq_u16(v0, v1);
84
- v1 = vmaxq_u16(v2, v3);
85
+ uint16x8_t max0 = vmaxq_u16(abs0, abs1);
86
+ uint16x8_t max1 = vmaxq_u16(abs2, abs3);
87
88
- v0 = vaddq_u16(v0, v1);
89
- return vaddlvq_u16(v0);
90
+ uint16x8_t sum = vaddq_u16(max0, max1);
91
+ return vaddlvq_u16(sum);
92
}
93
94
static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
95
96
int16x8_t v2, v3;
97
SUMSUB_AB(v2, v3, v0, v1);
98
99
- v0 = vzip1q_s64(v2, v3);
100
- v1 = vzip2q_s64(v2, v3);
101
+ transpose_2d_8h(v0, v1, v2, v3);
102
SUMSUB_AB(v2, v3, v0, v1);
103
104
- v0 = vtrn1q_s16(v2, v3);
105
- v1 = vtrn2q_s16(v2, v3);
106
+ transpose_8h_8h(v0, v1, v2, v3);
107
SUMSUB_AB(v2, v3, v0, v1);
108
109
- v0 = vtrn1q_s32(v2, v3);
110
- v1 = vtrn2q_s32(v2, v3);
111
+ transpose_4s_8h(v0, v1, v2, v3);
112
113
- v0 = vabsq_s16(v0);
114
- v1 = vabsq_s16(v1);
115
- v0 = vmaxq_u16(v0, v1);
116
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
117
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
118
+ uint16x8_t max = vmaxq_u16(abs0, abs1);
119
120
- return vaddlvq_s16(v0);
121
+ return vaddlvq_u16(max);
122
}
123
124
static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
125
126
127
HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
128
129
- transpose_8h(v0, v1, v16, v17);
130
- transpose_8h(v2, v3, v18, v19);
131
- transpose_8h(v4, v5, v20, v21);
132
- transpose_8h(v6, v7, v22, v23);
133
+ transpose_8h_8h(v0, v1, v16, v17);
134
+ transpose_8h_8h(v2, v3, v18, v19);
135
+ transpose_8h_8h(v4, v5, v20, v21);
136
+ transpose_8h_8h(v6, v7, v22, v23);
137
138
SUMSUB_AB(v16, v17, v0, v1);
139
SUMSUB_AB(v18, v19, v2, v3);
140
SUMSUB_AB(v20, v21, v4, v5);
141
SUMSUB_AB(v22, v23, v6, v7);
142
143
- transpose_4s(v0, v2, v16, v18);
144
- transpose_4s(v1, v3, v17, v19);
145
- transpose_4s(v4, v6, v20, v22);
146
- transpose_4s(v5, v7, v21, v23);
147
-
148
- v0 = vabsq_s16(v0);
149
- v1 = vabsq_s16(v1);
150
- v2 = vabsq_s16(v2);
151
- v3 = vabsq_s16(v3);
152
- v4 = vabsq_s16(v4);
153
- v5 = vabsq_s16(v5);
154
- v6 = vabsq_s16(v6);
155
- v7 = vabsq_s16(v7);
156
-
157
- v0 = vmaxq_u16(v0, v2);
158
- v1 = vmaxq_u16(v1, v3);
159
- v2 = vmaxq_u16(v4, v6);
160
- v3 = vmaxq_u16(v5, v7);
161
-
162
+ transpose_4s_8h(v0, v2, v16, v18);
163
+ transpose_4s_8h(v1, v3, v17, v19);
164
+ transpose_4s_8h(v4, v6, v20, v22);
165
+ transpose_4s_8h(v5, v7, v21, v23);
166
+
167
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
168
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
169
+ uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
170
+ uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
171
+ uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
172
+ uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
173
+ uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
174
+ uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
175
+
176
+ v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
177
+ v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
178
+ v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
179
+ v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
180
}
181
182
#if HIGH_BIT_DEPTH
183
184
#if (X265_DEPTH > 10)
185
-static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
186
+static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
187
+ const int32x4_t s1, const int32x4_t s2)
188
{
189
- t1 = vtrn1q_s64(s1, s2);
190
- t2 = vtrn2q_s64(s1, s2);
191
+ int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
192
+ int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
193
+
194
+ t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
195
+ t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
196
}
197
198
static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
199
200
int16x8_t v16, v17;
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve.S
Changed
107
1
2
ld1b {z7.h}, p0/z, x2, x11
3
add x0, x0, x1
4
add x2, x2, x3
5
- ld1b {z29.h}, p0/z, x0
6
- ld1b {z9.h}, p0/z, x0, x11
7
- ld1b {z10.h}, p0/z, x2
8
- ld1b {z11.h}, p0/z, x2, x11
9
- add x0, x0, x1
10
- add x2, x2, x3
11
- ld1b {z12.h}, p0/z, x0
12
- ld1b {z13.h}, p0/z, x0, x11
13
- ld1b {z14.h}, p0/z, x2
14
- ld1b {z15.h}, p0/z, x2, x11
15
- add x0, x0, x1
16
- add x2, x2, x3
17
-
18
sub \v0\().h, z0.h, z2.h
19
sub \v4\().h, z1.h, z3.h
20
sub \v1\().h, z4.h, z6.h
21
sub \v5\().h, z5.h, z7.h
22
- sub \v2\().h, z29.h, z10.h
23
- sub \v6\().h, z9.h, z11.h
24
- sub \v3\().h, z12.h, z14.h
25
- sub \v7\().h, z13.h, z15.h
26
+
27
+ ld1b {z0.h}, p0/z, x0
28
+ ld1b {z1.h}, p0/z, x0, x11
29
+ ld1b {z2.h}, p0/z, x2
30
+ ld1b {z3.h}, p0/z, x2, x11
31
+ add x0, x0, x1
32
+ add x2, x2, x3
33
+ ld1b {z4.h}, p0/z, x0
34
+ ld1b {z5.h}, p0/z, x0, x11
35
+ ld1b {z6.h}, p0/z, x2
36
+ ld1b {z7.h}, p0/z, x2, x11
37
+ add x0, x0, x1
38
+ add x2, x2, x3
39
+ sub \v2\().h, z0.h, z2.h
40
+ sub \v6\().h, z1.h, z3.h
41
+ sub \v3\().h, z4.h, z6.h
42
+ sub \v7\().h, z5.h, z7.h
43
.endm
44
45
// one vertical hadamard pass and two horizontal
46
47
mov x0, x7
48
ret x10
49
endfunc
50
-
51
-/********* ssim ***********/
52
-// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
53
-// No need to fully use sve instructions for this function
54
-function PFX(quant_sve)
55
- mov w9, #1
56
- lsl w9, w9, w4
57
- mov z0.s, w9
58
- neg w9, w4
59
- mov z1.s, w9
60
- add w9, w9, #8
61
- mov z2.s, w9
62
- mov z3.s, w5
63
-
64
- lsr w6, w6, #2
65
- eor z4.d, z4.d, z4.d
66
- eor w10, w10, w10
67
- eor z17.d, z17.d, z17.d
68
-
69
-.loop_quant_sve:
70
- ld1 {v18.4h}, x0, #8
71
- ld1 {v7.4s}, x1, #16
72
- sxtl v6.4s, v18.4h
73
-
74
- cmlt v5.4s, v6.4s, #0
75
-
76
- abs v6.4s, v6.4s
77
-
78
-
79
- mul v6.4s, v6.4s, v7.4s
80
-
81
- add v7.4s, v6.4s, v3.4s
82
- sshl v7.4s, v7.4s, v1.4s
83
-
84
- mls v6.4s, v7.4s, v0.s0
85
- sshl v16.4s, v6.4s, v2.4s
86
- st1 {v16.4s}, x2, #16
87
-
88
- // numsig
89
- cmeq v16.4s, v7.4s, v17.4s
90
- add v4.4s, v4.4s, v16.4s
91
- add w10, w10, #4
92
-
93
- // level *= sign
94
- eor z16.d, z7.d, z5.d
95
- sub v16.4s, v16.4s, v5.4s
96
- sqxtn v5.4h, v16.4s
97
- st1 {v5.4h}, x3, #8
98
-
99
- subs w6, w6, #1
100
- b.ne .loop_quant_sve
101
-
102
- addv s4, v4.4s
103
- mov w9, v4.s0
104
- add w0, w10, w9
105
- ret
106
-endfunc
107
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve2.S
Changed
201
1
2
bgt .vl_gt_16_pixel_var_16x16
3
pixel_var_start
4
mov w12, #16
5
-.loop_var_16_sve2:
6
+.Loop_var_16_sve2:
7
sub w12, w12, #1
8
ld1 {v4.16b}, x0, x1
9
pixel_var_1 v4
10
- cbnz w12, .loop_var_16_sve2
11
+ cbnz w12, .Loop_var_16_sve2
12
pixel_var_end
13
ret
14
.vl_gt_16_pixel_var_16x16:
15
16
bgt .vl_gt_16_pixel_var_32x32
17
pixel_var_start
18
mov w12, #32
19
-.loop_var_32_sve2:
20
+.Loop_var_32_sve2:
21
sub w12, w12, #1
22
ld1 {v4.16b-v5.16b}, x0, x1
23
pixel_var_1 v4
24
pixel_var_1 v5
25
- cbnz w12, .loop_var_32_sve2
26
+ cbnz w12, .Loop_var_32_sve2
27
pixel_var_end
28
ret
29
.vl_gt_16_pixel_var_32x32:
30
31
bgt .vl_gt_16_pixel_var_64x64
32
pixel_var_start
33
mov w12, #64
34
-.loop_var_64_sve2:
35
+.Loop_var_64_sve2:
36
sub w12, w12, #1
37
ld1 {v4.16b-v7.16b}, x0, x1
38
pixel_var_1 v4
39
pixel_var_1 v5
40
pixel_var_1 v6
41
pixel_var_1 v7
42
- cbnz w12, .loop_var_64_sve2
43
+ cbnz w12, .Loop_var_64_sve2
44
pixel_var_end
45
ret
46
.vl_gt_16_pixel_var_64x64:
47
48
bgt .vl_gt_16_getResidual32
49
lsl x4, x3, #1
50
mov w12, #4
51
-.loop_residual_32:
52
+.Loop_residual_32:
53
sub w12, w12, #1
54
.rept 4
55
ld1 {v0.16b-v1.16b}, x0, x3
56
57
st1 {v16.8h-v19.8h}, x2, x4
58
st1 {v20.8h-v23.8h}, x2, x4
59
.endr
60
- cbnz w12, .loop_residual_32
61
+ cbnz w12, .Loop_residual_32
62
ret
63
.vl_gt_16_getResidual32:
64
cmp x9, #48
65
66
bgt .vl_gt_16_pixel_sub_ps_32x32
67
lsl x1, x1, #1
68
mov w12, #4
69
-.loop_sub_ps_32_sve2:
70
+.Loop_sub_ps_32_sve2:
71
sub w12, w12, #1
72
.rept 4
73
ld1 {v0.16b-v1.16b}, x2, x4
74
75
st1 {v16.8h-v19.8h}, x0, x1
76
st1 {v20.8h-v23.8h}, x0, x1
77
.endr
78
- cbnz w12, .loop_sub_ps_32_sve2
79
+ cbnz w12, .Loop_sub_ps_32_sve2
80
ret
81
.vl_gt_16_pixel_sub_ps_32x32:
82
cmp x9, #48
83
84
lsl x1, x1, #1
85
sub x1, x1, #64
86
mov w12, #16
87
-.loop_sub_ps_64_sve2:
88
+.Loop_sub_ps_64_sve2:
89
sub w12, w12, #1
90
.rept 4
91
ld1 {v0.16b-v3.16b}, x2, x4
92
93
st1 {v16.8h-v19.8h}, x0, #64
94
st1 {v20.8h-v23.8h}, x0, x1
95
.endr
96
- cbnz w12, .loop_sub_ps_64_sve2
97
+ cbnz w12, .Loop_sub_ps_64_sve2
98
ret
99
.vl_gt_16_pixel_sub_ps_64x64:
100
rdvl x9, #1
101
102
bgt .vl_gt_16_pixel_sub_ps_32x64
103
lsl x1, x1, #1
104
mov w12, #8
105
-.loop_sub_ps_32x64_sve2:
106
+.Loop_sub_ps_32x64_sve2:
107
sub w12, w12, #1
108
.rept 4
109
ld1 {v0.16b-v1.16b}, x2, x4
110
111
st1 {v16.8h-v19.8h}, x0, x1
112
st1 {v20.8h-v23.8h}, x0, x1
113
.endr
114
- cbnz w12, .loop_sub_ps_32x64_sve2
115
+ cbnz w12, .Loop_sub_ps_32x64_sve2
116
ret
117
.vl_gt_16_pixel_sub_ps_32x64:
118
cmp x9, #48
119
120
bgt .vl_gt_16_pixel_add_ps_32x\h
121
lsl x5, x5, #1
122
mov w12, #\h / 4
123
-.loop_add_ps__sve2_32x\h\():
124
+.Loop_add_ps__sve2_32x\h\():
125
sub w12, w12, #1
126
.rept 4
127
ld1 {v0.16b-v1.16b}, x2, x4
128
129
sqxtun2 v5.16b, v27.8h
130
st1 {v4.16b-v5.16b}, x0, x1
131
.endr
132
- cbnz w12, .loop_add_ps__sve2_32x\h
133
+ cbnz w12, .Loop_add_ps__sve2_32x\h
134
ret
135
.vl_gt_16_pixel_add_ps_32x\h\():
136
cmp x9, #48
137
138
bgt .vl_gt_16_ssimDist16
139
ssimDist_start
140
ptrue p0.s, vl4
141
-.loop_ssimDist16_sve2:
142
+.Loop_ssimDist16_sve2:
143
sub w12, w12, #1
144
ld1b {z4.s}, p0/z, x0
145
ld1b {z5.s}, p0/z, x0, #1, mul vl
146
147
add x2, x2, x3
148
ssimDist_1_sve2 z4, z5, z8, z9
149
ssimDist_1_sve2 z6, z7, z10, z11
150
- cbnz w12, .loop_ssimDist16_sve2
151
+ cbnz w12, .Loop_ssimDist16_sve2
152
ssimDist_end
153
ret
154
.vl_gt_16_ssimDist16:
155
156
bgt .vl_gt_16_ssimDist32
157
ssimDist_start
158
ptrue p0.s, vl4
159
-.loop_ssimDist32_sve2:
160
+.Loop_ssimDist32_sve2:
161
sub w12, w12, #1
162
ld1b {z2.s}, p0/z, x0
163
ld1b {z3.s}, p0/z, x0, #1, mul vl
164
165
ssimDist_1_sve2 z4, z5, z12, z13
166
ssimDist_1_sve2 z6, z7, z14, z15
167
ssimDist_1_sve2 z8, z9, z30, z31
168
- cbnz w12, .loop_ssimDist32_sve2
169
+ cbnz w12, .Loop_ssimDist32_sve2
170
ssimDist_end
171
ret
172
.vl_gt_16_ssimDist32:
173
174
bgt .vl_gt_16_ssimDist64
175
ssimDist_start
176
ptrue p0.s, vl4
177
-.loop_ssimDist64_sve2:
178
+.Loop_ssimDist64_sve2:
179
sub w12, w12, #1
180
ld1b {z2.s}, p0/z, x0
181
ld1b {z3.s}, p0/z, x0, #1, mul vl
182
183
ssimDist_1_sve2 z8, z9, z29, z30
184
add x0, x0, x1
185
add x2, x2, x3
186
- cbnz w12, .loop_ssimDist64_sve2
187
+ cbnz w12, .Loop_ssimDist64_sve2
188
ssimDist_end
189
ret
190
.vl_gt_16_ssimDist64:
191
192
bgt .vl_gt_16_normFact16
193
normFact_start
194
ptrue p0.s, vl4
195
-.loop_normFact16_sve2:
196
+.Loop_normFact16_sve2:
197
sub w12, w12, #1
198
ld1b {z4.s}, p0/z, x0
199
ld1b {z5.s}, p0/z, x0, #1, mul vl
200
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util.S
Changed
201
1
2
function PFX(pixel_var_16x16_neon)
3
pixel_var_start
4
mov w12, #16
5
-.loop_var_16:
6
+.Loop_var_16:
7
sub w12, w12, #1
8
ld1 {v4.16b}, x0, x1
9
pixel_var_1 v4
10
- cbnz w12, .loop_var_16
11
+ cbnz w12, .Loop_var_16
12
pixel_var_end
13
ret
14
endfunc
15
16
function PFX(pixel_var_32x32_neon)
17
pixel_var_start
18
mov w12, #32
19
-.loop_var_32:
20
+.Loop_var_32:
21
sub w12, w12, #1
22
ld1 {v4.16b-v5.16b}, x0, x1
23
pixel_var_1 v4
24
pixel_var_1 v5
25
- cbnz w12, .loop_var_32
26
+ cbnz w12, .Loop_var_32
27
pixel_var_end
28
ret
29
endfunc
30
31
function PFX(pixel_var_64x64_neon)
32
pixel_var_start
33
mov w12, #64
34
-.loop_var_64:
35
+.Loop_var_64:
36
sub w12, w12, #1
37
ld1 {v4.16b-v7.16b}, x0, x1
38
pixel_var_1 v4
39
pixel_var_1 v5
40
pixel_var_1 v6
41
pixel_var_1 v7
42
- cbnz w12, .loop_var_64
43
+ cbnz w12, .Loop_var_64
44
pixel_var_end
45
ret
46
endfunc
47
48
function PFX(getResidual32_neon)
49
lsl x4, x3, #1
50
mov w12, #4
51
-.loop_residual_32:
52
+.Loop_residual_32:
53
sub w12, w12, #1
54
.rept 4
55
ld1 {v0.16b-v1.16b}, x0, x3
56
57
st1 {v16.8h-v19.8h}, x2, x4
58
st1 {v20.8h-v23.8h}, x2, x4
59
.endr
60
- cbnz w12, .loop_residual_32
61
+ cbnz w12, .Loop_residual_32
62
ret
63
endfunc
64
65
66
function PFX(pixel_sub_ps_32x32_neon)
67
lsl x1, x1, #1
68
mov w12, #4
69
-.loop_sub_ps_32:
70
+.Loop_sub_ps_32:
71
sub w12, w12, #1
72
.rept 4
73
ld1 {v0.16b-v1.16b}, x2, x4
74
75
st1 {v16.8h-v19.8h}, x0, x1
76
st1 {v20.8h-v23.8h}, x0, x1
77
.endr
78
- cbnz w12, .loop_sub_ps_32
79
+ cbnz w12, .Loop_sub_ps_32
80
ret
81
endfunc
82
83
84
lsl x1, x1, #1
85
sub x1, x1, #64
86
mov w12, #16
87
-.loop_sub_ps_64:
88
+.Loop_sub_ps_64:
89
sub w12, w12, #1
90
.rept 4
91
ld1 {v0.16b-v3.16b}, x2, x4
92
93
st1 {v16.8h-v19.8h}, x0, #64
94
st1 {v20.8h-v23.8h}, x0, x1
95
.endr
96
- cbnz w12, .loop_sub_ps_64
97
+ cbnz w12, .Loop_sub_ps_64
98
ret
99
endfunc
100
101
102
function PFX(pixel_sub_ps_32x64_neon)
103
lsl x1, x1, #1
104
mov w12, #8
105
-.loop_sub_ps_32x64:
106
+.Loop_sub_ps_32x64:
107
sub w12, w12, #1
108
.rept 4
109
ld1 {v0.16b-v1.16b}, x2, x4
110
111
st1 {v16.8h-v19.8h}, x0, x1
112
st1 {v20.8h-v23.8h}, x0, x1
113
.endr
114
- cbnz w12, .loop_sub_ps_32x64
115
+ cbnz w12, .Loop_sub_ps_32x64
116
ret
117
endfunc
118
119
120
function PFX(pixel_add_ps_16x\h\()_neon)
121
lsl x5, x5, #1
122
mov w12, #\h / 8
123
-.loop_add_ps_16x\h\():
124
+.Loop_add_ps_16x\h\():
125
sub w12, w12, #1
126
.rept 4
127
ld1 {v0.16b}, x2, x4
128
129
st1 {v4.16b}, x0, x1
130
st1 {v5.16b}, x0, x1
131
.endr
132
- cbnz w12, .loop_add_ps_16x\h
133
+ cbnz w12, .Loop_add_ps_16x\h
134
ret
135
endfunc
136
.endm
137
138
function PFX(pixel_add_ps_32x\h\()_neon)
139
lsl x5, x5, #1
140
mov w12, #\h / 4
141
-.loop_add_ps_32x\h\():
142
+.Loop_add_ps_32x\h\():
143
sub w12, w12, #1
144
.rept 4
145
ld1 {v0.16b-v1.16b}, x2, x4
146
147
sqxtun2 v5.16b, v27.8h
148
st1 {v4.16b-v5.16b}, x0, x1
149
.endr
150
- cbnz w12, .loop_add_ps_32x\h
151
+ cbnz w12, .Loop_add_ps_32x\h
152
ret
153
endfunc
154
.endm
155
156
lsl x5, x5, #1
157
sub x5, x5, #64
158
mov w12, #32
159
-.loop_add_ps_64x64:
160
+.Loop_add_ps_64x64:
161
sub w12, w12, #1
162
.rept 2
163
ld1 {v0.16b-v3.16b}, x2, x4
164
165
sqxtun2 v3.16b, v7.8h
166
st1 {v0.16b-v3.16b}, x0, x1
167
.endr
168
- cbnz w12, .loop_add_ps_64x64
169
+ cbnz w12, .Loop_add_ps_64x64
170
ret
171
endfunc
172
173
174
// void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
175
function PFX(scale2D_64to32_neon)
176
mov w12, #32
177
-.loop_scale2D:
178
+.Loop_scale2D:
179
ld1 {v0.16b-v3.16b}, x1, x2
180
sub w12, w12, #1
181
ld1 {v4.16b-v7.16b}, x1, x2
182
183
uqrshrn v1.8b, v2.8h, #2
184
uqrshrn2 v1.16b, v3.8h, #2
185
st1 {v0.16b-v1.16b}, x0, #32
186
- cbnz w12, .loop_scale2D
187
+ cbnz w12, .Loop_scale2D
188
ret
189
endfunc
190
191
192
function PFX(pixel_planecopy_cp_neon)
193
dup v2.16b, w6
194
sub x5, x5, #1
195
-.loop_h:
196
+.Loop_h:
197
mov x6, x0
198
mov x12, x2
199
mov x7, #0
200
-.loop_w:
201
x265_3.6.tar.gz/source/common/aarch64/sad-a.S -> x265_4.0.tar.gz/source/common/aarch64/sad-a.S
Changed
201
1
2
/*****************************************************************************
3
- * Copyright (C) 2020-2021 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2024 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
* Sebastian Pop <spop@amazon.com>
8
+ Hari Limaye <hari.limaye@arm.com>
9
*
10
* This program is free software; you can redistribute it and/or modify
11
* it under the terms of the GNU General Public License as published by
12
13
*****************************************************************************/
14
15
#include "asm.S"
16
-#include "sad-a-common.S"
17
18
#ifdef __APPLE__
19
.section __RODATA,__rodata
20
21
22
.text
23
24
+.macro SAD_START_4 f
25
+ ldr s0, x0
26
+ ldr s1, x2
27
+ add x0, x0, x1
28
+ add x2, x2, x3
29
+ ld1 {v0.s}1, x0, x1
30
+ ld1 {v1.s}1, x2, x3
31
+ \f v16.8h, v0.8b, v1.8b
32
+.endm
33
+
34
+.macro SAD_4 h
35
+.rept \h / 2 - 1
36
+ SAD_START_4 uabal
37
+.endr
38
+.endm
39
+
40
+.macro SAD_START_8 f
41
+ ld1 {v0.8b}, x0, x1
42
+ ld1 {v1.8b}, x2, x3
43
+ \f v16.8h, v0.8b, v1.8b
44
+.endm
45
+
46
+.macro SAD_8 h
47
+.rept \h - 3
48
+ SAD_START_8 uabal
49
+.endr
50
+ ldr d0, x0
51
+ ldr d1, x2
52
+ uabal v16.8h, v0.8b, v1.8b
53
+ ldr d0, x0, x1
54
+ ldr d1, x2, x3
55
+ uabal v16.8h, v0.8b, v1.8b
56
+.endm
57
+
58
+.macro SAD_START_16
59
+ movi v16.16b, #0
60
+ movi v17.16b, #0
61
+.endm
62
+
63
+.macro SAD_16
64
+ ld1 {v0.16b}, x0, x1
65
+ ld1 {v1.16b}, x2, x3
66
+ ld1 {v2.16b}, x0, x1
67
+ ld1 {v3.16b}, x2, x3
68
+ uabd v20.16b, v0.16b, v1.16b
69
+ uadalp v16.8h, v20.16b
70
+ uabd v21.16b, v2.16b, v3.16b
71
+ uadalp v17.8h, v21.16b
72
+.endm
73
+
74
+.macro SAD_END_16
75
+ add v16.8h, v16.8h, v17.8h
76
+ uaddlv s0, v16.8h
77
+ fmov x0, d0
78
+ ret
79
+.endm
80
+
81
+.macro SAD_START_32
82
+ movi v16.16b, #0
83
+ movi v17.16b, #0
84
+ movi v18.16b, #0
85
+ movi v19.16b, #0
86
+.endm
87
+
88
+.macro SAD_32
89
+ ld1 {v0.16b-v1.16b}, x0, x1
90
+ ld1 {v2.16b-v3.16b}, x2, x3
91
+ ld1 {v4.16b-v5.16b}, x0, x1
92
+ ld1 {v6.16b-v7.16b}, x2, x3
93
+ uabd v20.16b, v0.16b, v2.16b
94
+ uadalp v16.8h, v20.16b
95
+ uabd v21.16b, v1.16b, v3.16b
96
+ uadalp v17.8h, v21.16b
97
+ uabd v22.16b, v4.16b, v6.16b
98
+ uadalp v18.8h, v22.16b
99
+ uabd v23.16b, v5.16b, v7.16b
100
+ uadalp v19.8h, v23.16b
101
+.endm
102
+
103
+.macro SAD_END_32
104
+ add v16.8h, v16.8h, v17.8h
105
+ add v17.8h, v18.8h, v19.8h
106
+ add v16.8h, v16.8h, v17.8h
107
+ uaddlv s0, v16.8h
108
+ fmov w0, s0
109
+ ret
110
+.endm
111
+
112
+.macro SAD_START_64
113
+ movi v16.16b, #0
114
+ movi v17.16b, #0
115
+ movi v18.16b, #0
116
+ movi v19.16b, #0
117
+.endm
118
+
119
+.macro SAD_64
120
+ ld1 {v0.16b-v3.16b}, x0, x1
121
+ ld1 {v4.16b-v7.16b}, x2, x3
122
+ ld1 {v24.16b-v27.16b}, x0, x1
123
+ ld1 {v28.16b-v31.16b}, x2, x3
124
+ uabd v20.16b, v0.16b, v4.16b
125
+ uadalp v16.8h, v20.16b
126
+ uabd v21.16b, v1.16b, v5.16b
127
+ uadalp v17.8h, v21.16b
128
+ uabd v22.16b, v2.16b, v6.16b
129
+ uadalp v18.8h, v22.16b
130
+ uabd v23.16b, v3.16b, v7.16b
131
+ uadalp v19.8h, v23.16b
132
+ uabd v20.16b, v24.16b, v28.16b
133
+ uadalp v16.8h, v20.16b
134
+ uabd v21.16b, v25.16b, v29.16b
135
+ uadalp v17.8h, v21.16b
136
+ uabd v22.16b, v26.16b, v30.16b
137
+ uadalp v18.8h, v22.16b
138
+ uabd v23.16b, v27.16b, v31.16b
139
+ uadalp v19.8h, v23.16b
140
+.endm
141
+
142
+.macro SAD_END_64
143
+ uaddlp v16.4s, v16.8h
144
+ uadalp v16.4s, v17.8h
145
+ uadalp v16.4s, v18.8h
146
+ uadalp v16.4s, v19.8h
147
+ uaddlv d0, v16.4s
148
+ fmov x0, d0
149
+ ret
150
+.endm
151
+
152
+.macro SAD_START_12
153
+ movrel x12, sad12_mask
154
+ ld1 {v31.16b}, x12
155
+ movi v16.16b, #0
156
+ movi v17.16b, #0
157
+.endm
158
+
159
+.macro SAD_12
160
+ ld1 {v0.16b}, x0, x1
161
+ and v0.16b, v0.16b, v31.16b
162
+ ld1 {v1.16b}, x2, x3
163
+ and v1.16b, v1.16b, v31.16b
164
+ ld1 {v2.16b}, x0, x1
165
+ and v2.16b, v2.16b, v31.16b
166
+ ld1 {v3.16b}, x2, x3
167
+ and v3.16b, v3.16b, v31.16b
168
+ uabd v20.16b, v0.16b, v1.16b
169
+ uadalp v16.8h, v20.16b
170
+ uabd v21.16b, v2.16b, v3.16b
171
+ uadalp v17.8h, v21.16b
172
+.endm
173
+
174
+.macro SAD_END_12
175
+ add v16.8h, v16.8h, v17.8h
176
+ uaddlv s0, v16.8h
177
+ fmov w0, s0
178
+ ret
179
+.endm
180
+
181
+.macro SAD_START_24
182
+ movi v16.16b, #0
183
+ movi v17.16b, #0
184
+ sub x1, x1, #16
185
+ sub x3, x3, #16
186
+.endm
187
+
188
+.macro SAD_24
189
+ ld1 {v0.16b}, x0, #16
190
+ ld1 {v1.8b}, x0, x1
191
+ ld1 {v2.16b}, x2, #16
192
+ ld1 {v3.8b}, x2, x3
193
+ ld1 {v4.16b}, x0, #16
194
+ ld1 {v5.8b}, x0, x1
195
+ ld1 {v6.16b}, x2, #16
196
+ ld1 {v7.8b}, x2, x3
197
+ uabd v20.16b, v0.16b, v2.16b
198
+ uadalp v16.8h, v20.16b
199
+ uabal v17.8h, v1.8b, v3.8b
200
+ uabd v20.16b, v4.16b, v6.16b
201
x265_4.0.tar.gz/source/common/aarch64/sad-neon-dotprod.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8.2-a+dotprod
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+// Fully unrolled with single accumulator for smaller block heights.
40
+.macro SAD_NEON_DOTPROD_16_S h
41
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
42
+ movi v0.16b, #0
43
+ movi v1.16b, #1
44
+.rept \h - 2
45
+ ldr q2, x0
46
+ ldr q3, x2
47
+ add x0, x0, x1
48
+ add x2, x2, x3
49
+ uabd v4.16b, v2.16b, v3.16b
50
+ udot v0.4s, v4.16b, v1.16b
51
+.endr
52
+ ldr q2, x0
53
+ ldr q3, x2
54
+ uabd v4.16b, v2.16b, v3.16b
55
+ udot v0.4s, v4.16b, v1.16b
56
+ ldr q2, x0, x1
57
+ ldr q3, x2, x3
58
+ uabd v4.16b, v2.16b, v3.16b
59
+ udot v0.4s, v4.16b, v1.16b
60
+
61
+ addv s0, v0.4s
62
+ fmov w0, s0
63
+ ret
64
+endfunc
65
+.endm
66
+
67
+.macro SAD_NEON_DOTPROD_START
68
+ // v31: 1 across all lanes for use in UDOT instructions.
69
+ movi v31.16b, #1
70
+ movi v16.16b, #0
71
+ movi v17.16b, #0
72
+.endm
73
+
74
+.macro SAD_NEON_DOTPROD_END
75
+ add v16.4s, v16.4s, v17.4s
76
+ addv s0, v16.4s
77
+ fmov w0, s0
78
+ ret
79
+.endm
80
+
81
+// Fully unrolled.
82
+.macro SAD_NEON_DOTPROD_16 h
83
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
84
+ SAD_NEON_DOTPROD_START
85
+.rept \h / 2
86
+ ld1 {v0.16b}, x0, x1
87
+ ld1 {v1.16b}, x0, x1
88
+ ld1 {v2.16b}, x2, x3
89
+ ld1 {v3.16b}, x2, x3
90
+ uabd v20.16b, v0.16b, v2.16b
91
+ udot v16.4s, v20.16b, v31.16b
92
+ uabd v21.16b, v1.16b, v3.16b
93
+ udot v17.4s, v21.16b, v31.16b
94
+.endr
95
+ SAD_NEON_DOTPROD_END
96
+endfunc
97
+.endm
98
+
99
+// Process four rows of width 32.
100
+.macro SAD_NEON_DOTPROD_32
101
+.rept 4
102
+ ld1 {v0.16b-v1.16b}, x0, x1
103
+ ld1 {v2.16b-v3.16b}, x2, x3
104
+ uabd v20.16b, v0.16b, v2.16b
105
+ udot v16.4s, v20.16b, v31.16b
106
+ uabd v21.16b, v1.16b, v3.16b
107
+ udot v17.4s, v21.16b, v31.16b
108
+.endr
109
+.endm
110
+
111
+// Process four rows of width 48.
112
+.macro SAD_NEON_DOTPROD_48
113
+.rept 4
114
+ ld1 {v0.16b-v2.16b}, x0, x1
115
+ ld1 {v4.16b-v6.16b}, x2, x3
116
+ uabd v20.16b, v0.16b, v4.16b
117
+ udot v16.4s, v20.16b, v31.16b
118
+ uabd v21.16b, v1.16b, v5.16b
119
+ udot v17.4s, v21.16b, v31.16b
120
+ uabd v20.16b, v2.16b, v6.16b
121
+ udot v16.4s, v20.16b, v31.16b
122
+.endr
123
+.endm
124
+
125
+// Process four rows of width 64.
126
+.macro SAD_NEON_DOTPROD_64
127
+.rept 4
128
+ ld1 {v0.16b-v3.16b}, x0, x1
129
+ ld1 {v4.16b-v7.16b}, x2, x3
130
+ uabd v20.16b, v0.16b, v4.16b
131
+ udot v16.4s, v20.16b, v31.16b
132
+ uabd v21.16b, v1.16b, v5.16b
133
+ udot v17.4s, v21.16b, v31.16b
134
+ uabd v20.16b, v2.16b, v6.16b
135
+ udot v16.4s, v20.16b, v31.16b
136
+ uabd v21.16b, v3.16b, v7.16b
137
+ udot v17.4s, v21.16b, v31.16b
138
+.endr
139
+.endm
140
+
141
+// Loop unrolled to process 4 rows per iteration.
142
+.macro SAD_NEON_DOTPROD_LOOP w, h
143
+function PFX(pixel_sad_\w\()x\h\()_neon_dotprod)
144
+ SAD_NEON_DOTPROD_START
145
+ mov w9, #\h/4
146
+.Loop_\w\()x\h:
147
+ sub w9, w9, #1
148
+
149
+ SAD_NEON_DOTPROD_\w
150
+
151
+ cbnz w9, .Loop_\w\()x\h
152
+ SAD_NEON_DOTPROD_END
153
+endfunc
154
+.endm
155
+
156
+SAD_NEON_DOTPROD_16_S 4
157
+SAD_NEON_DOTPROD_16_S 8
158
+SAD_NEON_DOTPROD_16_S 12
159
+SAD_NEON_DOTPROD_16_S 16
160
+SAD_NEON_DOTPROD_16 32
161
+SAD_NEON_DOTPROD_16 64
162
+SAD_NEON_DOTPROD_LOOP 32, 8
163
+SAD_NEON_DOTPROD_LOOP 32, 16
164
+SAD_NEON_DOTPROD_LOOP 32, 24
165
+SAD_NEON_DOTPROD_LOOP 32, 32
166
+SAD_NEON_DOTPROD_LOOP 32, 64
167
+SAD_NEON_DOTPROD_LOOP 48, 64
168
+SAD_NEON_DOTPROD_LOOP 64, 16
169
+SAD_NEON_DOTPROD_LOOP 64, 32
170
+SAD_NEON_DOTPROD_LOOP 64, 48
171
+SAD_NEON_DOTPROD_LOOP 64, 64
172
+
173
+.macro PREP_ARGS_SAD_X_NEON_DOTPROD x
174
+ mov x9, #FENC_STRIDE
175
+
176
+// Make function arguments for x == 3 look like x == 4.
177
+.if \x == 3
178
+ mov x6, x5
179
+ mov x5, x4
180
+.endif
181
+
182
+ // v31: 1 across all lanes for use in UDOT instructions.
183
+ movi v31.16b, #1
184
+.endm
185
+
186
+.macro SAD_X_NEON_DOTPROD_START x
187
+ movi v16.4s, #0
188
+ movi v17.4s, #0
189
+ movi v18.4s, #0
190
+.if \x == 4
191
+ movi v19.4s, #0
192
+.endif
193
+.endm
194
+
195
+.macro SAD_X_NEON_DOTPROD_END x
196
+.if \x == 3
197
+ addv s0, v16.4s
198
+ addv s1, v17.4s
199
+ addv s2, v18.4s
200
+ stp s0, s1, x6
201
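The SAD kernels in sad-neon-dotprod.S above all follow one pattern: UABD produces per-byte absolute differences, and UDOT against an all-ones vector (v31) folds four of those bytes into each 32-bit accumulator lane per instruction, avoiding the widening adds of the plain Neon path. A rough C intrinsics sketch of one 16-pixel row step (illustrative names only; requires a DotProd-enabled target such as -march=armv8.2-a+dotprod):

#include <arm_neon.h>
#include <cstdint>

// One 16-pixel row of SAD folded into a uint32x4_t accumulator; the caller
// reduces with vaddvq_u32() after the last row.
static inline uint32x4_t sad16_row_dotprod(uint32x4_t acc,
                                           const uint8_t *pix1,
                                           const uint8_t *pix2)
{
    const uint8x16_t ones = vdupq_n_u8(1);                          // plays the role of v31
    uint8x16_t abs_diff = vabdq_u8(vld1q_u8(pix1), vld1q_u8(pix2)); // UABD
    return vdotq_u32(acc, abs_diff, ones);                          // UDOT: 4 bytes per 32-bit lane
}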
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+
27
+/*
28
+ * Compute Edge Offset statistics (count and stats).
29
+ * To save some instructions compute count and stats as negative values - since
30
+ * output of Neon comparison instructions for a matched condition is all 1s (-1).
31
+ */
32
+static inline void compute_eo_stats(const int8x16_t edge_type,
33
+ const int16_t *diff, int16x8_t *count,
34
+ int64x2_t *stats)
35
+{
36
+ // Create a mask for each edge type.
37
+ int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
38
+ int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
39
+ int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
40
+ int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
41
+ int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
42
+
43
+ // Compute negative counts for each edge type.
44
+ count[0] = vpadalq_s8(count[0], mask0);
45
+ count[1] = vpadalq_s8(count[1], mask1);
46
+ count[2] = vpadalq_s8(count[2], mask2);
47
+ count[3] = vpadalq_s8(count[3], mask3);
48
+ count[4] = vpadalq_s8(count[4], mask4);
49
+
50
+ // Widen the masks to 16-bit.
51
+ int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
52
+ int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
53
+ int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
54
+ int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
55
+ int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
56
+ int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
57
+ int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
58
+ int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
59
+ int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
60
+ int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
61
+
62
+ int16x8_t diff_lo = vld1q_s16(diff);
63
+ int16x8_t diff_hi = vld1q_s16(diff + 8);
64
+
65
+ // Compute negative stats for each edge type.
66
+ stats[0] = x265_sdotq_s16(stats[0], diff_lo, mask0_lo);
67
+ stats[0] = x265_sdotq_s16(stats[0], diff_hi, mask0_hi);
68
+ stats[1] = x265_sdotq_s16(stats[1], diff_lo, mask1_lo);
69
+ stats[1] = x265_sdotq_s16(stats[1], diff_hi, mask1_hi);
70
+ stats[2] = x265_sdotq_s16(stats[2], diff_lo, mask2_lo);
71
+ stats[2] = x265_sdotq_s16(stats[2], diff_hi, mask2_hi);
72
+ stats[3] = x265_sdotq_s16(stats[3], diff_lo, mask3_lo);
73
+ stats[3] = x265_sdotq_s16(stats[3], diff_hi, mask3_hi);
74
+ stats[4] = x265_sdotq_s16(stats[4], diff_lo, mask4_lo);
75
+ stats[4] = x265_sdotq_s16(stats[4], diff_hi, mask4_hi);
76
+}
77
+
78
+/*
79
+ * Reduce and store Edge Offset statistics (count and stats).
80
+ */
81
+static inline void reduce_eo_stats(int64x2_t *vstats, int16x8_t *vcount,
82
+ int32_t *stats, int32_t *count)
83
+{
84
+ // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
85
+ int16x8_t c01 = vpaddq_s16(vcount[2], vcount[0]);
86
+ int16x8_t c23 = vpaddq_s16(vcount[1], vcount[3]);
87
+ int16x8_t c0123 = vpaddq_s16(c01, c23);
88
+ // Subtract from current count, as we calculate the negation.
89
+ vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123)));
90
+ count[4] -= vaddvq_s16(vcount[4]);
91
+
92
+ int32x4_t s01 = vcombine_s32(vmovn_s64(vstats[2]), vmovn_s64(vstats[0]));
93
+ int32x4_t s23 = vcombine_s32(vmovn_s64(vstats[1]), vmovn_s64(vstats[3]));
94
+ int32x4_t s0123 = vpaddq_s32(s01, s23);
95
+ // Subtract from current stats, as we calculate the negation.
96
+ vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
97
+ stats[4] -= vaddvq_s64(vstats[4]);
98
+}
99
+
100
+namespace X265_NS {
101
+void saoCuStatsE0_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
102
+ int endX, int endY, int32_t *stats, int32_t *count)
103
+{
104
+ // Separate buffers for each edge type, so that we can vectorise.
105
+ int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
106
+ vdupq_n_s16(0), vdupq_n_s16(0) };
107
+ int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
108
+ vdupq_n_s64(0), vdupq_n_s64(0) };
109
+
110
+ for (int y = 0; y < endY; y++)
111
+ {
112
+ // Calculate negated sign_left(x) directly, to save negation when
113
+ // reusing sign_right(x) as sign_left(x + 1).
114
+ int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec[-1] - rec[0]));
115
+ for (int x = 0; x < endX; x += 16)
116
+ {
117
+ int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1);
118
+
119
+ // neg_sign_left(x) = sign_right(x + 1), reusing one from previous
120
+ // iteration.
121
+ neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15);
122
+
123
+ // Subtract instead of add, as sign_left is negated.
124
+ int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left);
125
+
126
+ // For reuse in the next iteration.
127
+ neg_sign_left = sign_right;
128
+
129
+ edge_type = x265_sve_mask(x, endX, edge_type);
130
+ compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats);
131
+ }
132
+
133
+ diff += MAX_CU_SIZE;
134
+ rec += stride;
135
+ }
136
+
137
+ reduce_eo_stats(tmp_stats, tmp_count, stats, count);
138
+}
139
+
140
+void saoCuStatsE1_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
141
+ int8_t *upBuff1, int endX, int endY, int32_t *stats,
142
+ int32_t *count)
143
+{
144
+ // Separate buffers for each edge type, so that we can vectorise.
145
+ int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
146
+ vdupq_n_s16(0), vdupq_n_s16(0) };
147
+ int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
148
+ vdupq_n_s64(0), vdupq_n_s64(0) };
149
+
150
+ // Negate upBuff1 (sign_up), so we can subtract and save repeated negations.
151
+ for (int x = 0; x < endX; x += 16)
152
+ {
153
+ vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
154
+ }
155
+
156
+ for (int y = 0; y < endY; y++)
157
+ {
158
+ for (int x = 0; x < endX; x += 16)
159
+ {
160
+ int8x16_t sign_up = vld1q_s8(upBuff1 + x);
161
+ int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride);
162
+
163
+ // Subtract instead of add, as sign_up is negated.
164
+ int8x16_t edge_type = vsubq_s8(sign_down, sign_up);
165
+
166
+ // For reuse in the next iteration.
167
+ vst1q_s8(upBuff1 + x, sign_down);
168
+
169
+ edge_type = x265_sve_mask(x, endX, edge_type);
170
+ compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats);
171
+ }
172
+
173
+ diff += MAX_CU_SIZE;
174
+ rec += stride;
175
+ }
176
+
177
+ reduce_eo_stats(tmp_stats, tmp_count, stats, count);
178
+}
179
+
180
+void saoCuStatsE2_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
181
+ int8_t *upBuff1, int8_t *upBufft, int endX, int endY,
182
+ int32_t *stats, int32_t *count)
183
+{
184
+ // Separate buffers for each edge type, so that we can vectorise.
185
+ int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
186
+ vdupq_n_s16(0), vdupq_n_s16(0) };
187
+ int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
188
+ vdupq_n_s64(0), vdupq_n_s64(0) };
189
+
190
+ // Negate upBuff1 (sign_up) so we can subtract and save repeated negations.
191
+ for (int x = 0; x < endX; x += 16)
192
+ {
193
+ vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
194
+ }
195
+
196
+ for (int y = 0; y < endY; y++)
197
+ {
198
+ upBufft[0] = x265_signOf(rec[-1] - rec[stride]);
199
+ for (int x = 0; x < endX; x += 16)
200
+ {
201
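The Edge Offset statistics code in sao-prim-sve.cpp above exploits the fact that a Neon compare writes all-ones (-1) to matching lanes: accumulating the raw masks gives negative counts, and dotting them against diff gives negative partial sums, so reduce_eo_stats() subtracts once at the end instead of negating every mask. A scalar sketch of what one 16-lane step contributes (our own illustration; the real store additionally permutes classes through s_eoTable):

#include <cstdint>

// edge_type[i] is in [-2, 2] for live lanes (masked-off lanes are pushed out
// of range by x265_sve_mask); diff[i] is the reconstructed-minus-original value.
static void eo_step_ref(const int8_t edge_type[16], const int16_t diff[16],
                        int32_t count[5], int32_t stats[5])
{
    for (int i = 0; i < 16; i++)
    {
        int cls = edge_type[i] + 2;    // map [-2, 2] onto 0..4
        if (cls < 0 || cls > 4)
            continue;                  // lane masked off
        count[cls] += 1;               // SIMD path accumulates -1s instead
        stats[cls] += diff[i];         // and fixes the sign in the reduction
    }
}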
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve2.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+
27
+static inline uint8x16_t sve_count(int8x16_t in)
28
+{
29
+ // We do not care about initialising the values in the rest of the vector,
30
+ // for VL > 128, as HISTSEG counts matching elements in 128-bit segments.
31
+ svint8_t edge_type = svset_neonq_s8(svundef_s8(), in);
32
+
33
+ // Use an arbitrary value outside of range [-2, 2] for lanes we don't
34
+ // need to use the result from.
35
+ const int DC = -3;
36
+ // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
37
+ // We use (edge_class - 2) resulting in {0, -2, -1, 1, 2}
38
+ int8x16_t idx = { 0, -2, -1, 1, 2, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC,
39
+ DC };
40
+ svint8_t svidx = svset_neonq_s8(svundef_s8(), idx);
41
+
42
+ svuint8_t count = svhistseg_s8(svidx, edge_type);
43
+ return svget_neonq_u8(count);
44
+}
45
+
46
+/*
47
+ * Compute Edge Offset statistics (stats array).
48
+ * To save some instructions compute stats as negative values - since output of
49
+ * Neon comparison instructions for a matched condition is all 1s (-1).
50
+ */
51
+static inline void compute_eo_stats(const int8x16_t edge_type,
52
+ const int16_t *diff, int64x2_t *stats)
53
+{
54
+ // Create a mask for each edge type.
55
+ int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
56
+ int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
57
+ int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
58
+ int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
59
+ int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
60
+
61
+ // Widen the masks to 16-bit.
62
+ int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
63
+ int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
64
+ int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
65
+ int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
66
+ int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
67
+ int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
68
+ int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
69
+ int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
70
+ int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
71
+ int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
72
+
73
+ int16x8_t diff_lo = vld1q_s16(diff);
74
+ int16x8_t diff_hi = vld1q_s16(diff + 8);
75
+
76
+ // Compute negative stats for each edge type.
77
+ stats[0] = x265_sdotq_s16(stats[0], diff_lo, mask0_lo);
78
+ stats[0] = x265_sdotq_s16(stats[0], diff_hi, mask0_hi);
79
+ stats[1] = x265_sdotq_s16(stats[1], diff_lo, mask1_lo);
80
+ stats[1] = x265_sdotq_s16(stats[1], diff_hi, mask1_hi);
81
+ stats[2] = x265_sdotq_s16(stats[2], diff_lo, mask2_lo);
82
+ stats[2] = x265_sdotq_s16(stats[2], diff_hi, mask2_hi);
83
+ stats[3] = x265_sdotq_s16(stats[3], diff_lo, mask3_lo);
84
+ stats[3] = x265_sdotq_s16(stats[3], diff_hi, mask3_hi);
85
+ stats[4] = x265_sdotq_s16(stats[4], diff_lo, mask4_lo);
86
+ stats[4] = x265_sdotq_s16(stats[4], diff_hi, mask4_hi);
87
+}
88
+
89
+/*
90
+ * Reduce and store Edge Offset statistics (count and stats).
91
+ */
92
+static inline void reduce_eo_stats(int64x2_t *vstats, uint16x8_t vcount,
93
+ int32_t *stats, int32_t *count)
94
+{
95
+ // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
96
+ // We already have the count values in the correct order for the store,
97
+ // so widen to 32-bit and accumulate to the destination.
98
+ int32x4_t c0123 = vmovl_s16(vget_low_s16(vreinterpretq_s16_u16(vcount)));
99
+ vst1q_s32(count, vaddq_s32(vld1q_s32(count), c0123));
100
+ count[4] += vcount[4];
101
+
102
+ int32x4_t s01 = vcombine_s32(vmovn_s64(vstats[2]), vmovn_s64(vstats[0]));
103
+ int32x4_t s23 = vcombine_s32(vmovn_s64(vstats[1]), vmovn_s64(vstats[3]));
104
+ int32x4_t s0123 = vpaddq_s32(s01, s23);
105
+ // Subtract from current stats, as we calculate the negation.
106
+ vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
107
+ stats[4] -= vaddvq_s64(vstats[4]);
108
+}
109
+
110
+namespace X265_NS {
111
+void saoCuStatsE0_sve2(const int16_t *diff, const pixel *rec, intptr_t stride,
112
+ int endX, int endY, int32_t *stats, int32_t *count)
113
+{
114
+ // Separate buffers for each edge type, so that we can vectorise.
115
+ int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
116
+ vdupq_n_s64(0), vdupq_n_s64(0) };
117
+ uint16x8_t count_acc_u16 = vdupq_n_u16(0);
118
+
119
+ for (int y = 0; y < endY; y++)
120
+ {
121
+ uint8x16_t count_acc_u8 = vdupq_n_u8(0);
122
+
123
+ // Calculate negated sign_left(x) directly, to save negation when
124
+ // reusing sign_right(x) as sign_left(x + 1).
125
+ int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec[-1] - rec[0]));
126
+ for (int x = 0; x < endX; x += 16)
127
+ {
128
+ int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1);
129
+
130
+ // neg_sign_left(x) = sign_right(x + 1), reusing one from previous
131
+ // iteration.
132
+ neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15);
133
+
134
+ // Subtract instead of add, as sign_left is negated.
135
+ int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left);
136
+
137
+ // For reuse in the next iteration.
138
+ neg_sign_left = sign_right;
139
+
140
+ edge_type = x265_sve_mask(x, endX, edge_type);
141
+ count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type));
142
+ compute_eo_stats(edge_type, diff + x, tmp_stats);
143
+ }
144
+
145
+ // The width (endX) can be a maximum of 64, so we can safely
146
+ // widen from 8-bit count accumulators after one inner loop iteration.
147
+ // Technically the largest an accumulator could reach after one inner
148
+ // loop iteration is 64, if every input value had the same edge type, so
149
+ // we could complete two iterations (2 * 64 = 128) before widening.
150
+ count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8));
151
+
152
+ diff += MAX_CU_SIZE;
153
+ rec += stride;
154
+ }
155
+
156
+ reduce_eo_stats(tmp_stats, count_acc_u16, stats, count);
157
+}
158
+
159
+void saoCuStatsE1_sve2(const int16_t *diff, const pixel *rec, intptr_t stride,
160
+ int8_t *upBuff1, int endX, int endY, int32_t *stats,
161
+ int32_t *count)
162
+{
163
+ // Separate buffers for each edge type, so that we can vectorise.
164
+ int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
165
+ vdupq_n_s64(0), vdupq_n_s64(0) };
166
+ uint16x8_t count_acc_u16 = vdupq_n_u16(0);
167
+
168
+ // Negate upBuff1 (sign_up), so we can subtract and save repeated negations.
169
+ for (int x = 0; x < endX; x += 16)
170
+ {
171
+ vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
172
+ }
173
+
174
+ for (int y = 0; y < endY; y++)
175
+ {
176
+ uint8x16_t count_acc_u8 = vdupq_n_u8(0);
177
+
178
+ for (int x = 0; x < endX; x += 16)
179
+ {
180
+ int8x16_t sign_up = vld1q_s8(upBuff1 + x);
181
+ int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride);
182
+
183
+ // Subtract instead of add, as sign_up is negated.
184
+ int8x16_t edge_type = vsubq_s8(sign_down, sign_up);
185
+
186
+ // For reuse in the next iteration.
187
+ vst1q_s8(upBuff1 + x, sign_down);
188
+
189
+ edge_type = x265_sve_mask(x, endX, edge_type);
190
+ count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type));
191
+ compute_eo_stats(edge_type, diff + x, tmp_stats);
192
+ }
193
+
194
+ // The width (endX) can be a maximum of 64, so we can safely
195
+ // widen from 8-bit count accumulators after one inner loop iteration.
196
+ // Technically the largest an accumulator could reach after one inner
197
+ // loop iteration is 64, if every input value had the same edge type, so
198
+ // we could complete two iterations (2 * 64 = 128) before widening.
199
+ count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8));
200
+
201
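Compared with sao-prim-sve.cpp, the SVE2 variant above drops the five per-class count accumulators entirely: sve_count() lets a single HISTSEG instruction count all five edge classes at once by histogramming the 16 edge_type bytes against the index vector {0, -2, -1, 1, 2, ...}, which is already laid out in s_eoTable store order. A scalar model of that HISTSEG use within one 128-bit segment (our own sketch):

#include <cstdint>

// Lane i of the result counts how many of the 16 edge_type bytes equal idx[i].
static void histseg_ref(const int8_t idx[16], const int8_t edge_type[16],
                        uint8_t out[16])
{
    for (int i = 0; i < 16; i++)
    {
        uint8_t n = 0;
        for (int j = 0; j < 16; j++)
            n += (uint8_t)(edge_type[j] == idx[i]);
        out[i] = n;
    }
}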
x265_4.0.tar.gz/source/common/aarch64/sao-prim.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+#include "sao.h"
27
+#include <arm_neon.h>
28
+
29
+// Predicate mask indices.
30
+static const int8_t quad_reg_byte_indices[16] = {
31
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
32
+};
33
+
34
+static inline int8x16_t mask_inactive_elems(const int rem, int8x16_t edge_type)
35
+{
36
+ // Compute a predicate mask where the bits of an element are 0 if the index
37
+ // is less than the remainder (active), and 1 otherwise.
38
+ const int8x16_t indices = vld1q_s8(quad_reg_byte_indices);
39
+ int8x16_t pred = vreinterpretq_s8_u8(vcgeq_s8(indices, vdupq_n_s8(rem)));
40
+
41
+ // Use predicate mask to shift "unused lanes" outside of range [-2, 2]
42
+ pred = vshlq_n_s8(pred, 3);
43
+ return veorq_s8(edge_type, pred);
44
+}
45
+
46
+/*
47
+ * Compute Edge Offset statistics (count and stats).
48
+ * To save some instructions compute count and stats as negative values - since
49
+ * output of Neon comparison instructions for a matched condition is all 1s (-1).
50
+ */
51
+static inline void compute_eo_stats(const int8x16_t edge_type,
52
+ const int16_t *diff, int16x8_t *count,
53
+ int32x4_t *stats)
54
+{
55
+ // Create a mask for each edge type.
56
+ int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
57
+ int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
58
+ int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
59
+ int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
60
+ int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
61
+
62
+ // Compute negative counts for each edge type.
63
+ count[0] = vpadalq_s8(count[0], mask0);
64
+ count[1] = vpadalq_s8(count[1], mask1);
65
+ count[2] = vpadalq_s8(count[2], mask2);
66
+ count[3] = vpadalq_s8(count[3], mask3);
67
+ count[4] = vpadalq_s8(count[4], mask4);
68
+
69
+ // Widen the masks to 16-bit.
70
+ int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
71
+ int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
72
+ int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
73
+ int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
74
+ int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
75
+ int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
76
+ int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
77
+ int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
78
+ int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
79
+ int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
80
+
81
+ int16x8_t diff_lo = vld1q_s16(diff);
82
+ int16x8_t diff_hi = vld1q_s16(diff + 8);
83
+
84
+ // Compute negative stats for each edge type.
85
+ int16x8_t stats0 = vmulq_s16(diff_lo, mask0_lo);
86
+ int16x8_t stats1 = vmulq_s16(diff_lo, mask1_lo);
87
+ int16x8_t stats2 = vmulq_s16(diff_lo, mask2_lo);
88
+ int16x8_t stats3 = vmulq_s16(diff_lo, mask3_lo);
89
+ int16x8_t stats4 = vmulq_s16(diff_lo, mask4_lo);
90
+ stats0 = vmlaq_s16(stats0, diff_hi, mask0_hi);
91
+ stats1 = vmlaq_s16(stats1, diff_hi, mask1_hi);
92
+ stats2 = vmlaq_s16(stats2, diff_hi, mask2_hi);
93
+ stats3 = vmlaq_s16(stats3, diff_hi, mask3_hi);
94
+ stats4 = vmlaq_s16(stats4, diff_hi, mask4_hi);
95
+
96
+ stats[0] = vpadalq_s16(stats[0], stats0);
97
+ stats[1] = vpadalq_s16(stats[1], stats1);
98
+ stats[2] = vpadalq_s16(stats[2], stats2);
99
+ stats[3] = vpadalq_s16(stats[3], stats3);
100
+ stats[4] = vpadalq_s16(stats[4], stats4);
101
+}
102
+
103
+/*
104
+ * Reduce and store Edge Offset statistics (count and stats).
105
+ */
106
+static inline void reduce_eo_stats(int32x4_t *vstats, int16x8_t *vcount,
107
+ int32_t *stats, int32_t *count)
108
+{
109
+ // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
110
+ int16x8_t c01 = vpaddq_s16(vcount[2], vcount[0]);
111
+ int16x8_t c23 = vpaddq_s16(vcount[1], vcount[3]);
112
+ int16x8_t c0123 = vpaddq_s16(c01, c23);
113
+
114
+ // Subtract from current count, as we calculate the negation.
115
+ vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123)));
116
+ count[4] -= vaddvq_s16(vcount[4]);
117
+
118
+ int32x4_t s01 = vpaddq_s32(vstats[2], vstats[0]);
119
+ int32x4_t s23 = vpaddq_s32(vstats[1], vstats[3]);
120
+ int32x4_t s0123 = vpaddq_s32(s01, s23);
121
+
122
+ // Subtract from current stats, as we calculate the negation.
123
+ vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
124
+ stats[4] -= vaddvq_s32(vstats[4]);
125
+}
126
+
127
+namespace X265_NS {
128
+void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
129
+ int endX, int endY, int32_t *stats, int32_t *count)
130
+{
131
+#if HIGH_BIT_DEPTH
132
+ const int n_elem = 4;
133
+ const int elem_width = 16;
134
+#else
135
+ const int n_elem = 8;
136
+ const int elem_width = 8;
137
+#endif
138
+
139
+ // Additional temporary buffer for accumulation.
140
+ int32_t stats_tmp[32] = { 0 };
141
+ int32_t count_tmp[32] = { 0 };
142
+
143
+ // Byte-addressable pointers to buffers, to optimise address calculation.
144
+ uint8_t *stats_b[2] = {
145
+ reinterpret_cast<uint8_t *>(stats),
146
+ reinterpret_cast<uint8_t *>(stats_tmp),
147
+ };
148
+ uint8_t *count_b[2] = {
149
+ reinterpret_cast<uint8_t *>(count),
150
+ reinterpret_cast<uint8_t *>(count_tmp),
151
+ };
152
+
153
+ // Combine shift for index calculation with shift for address calculation.
154
+ const int right_shift = X265_DEPTH - X265_NS::SAO::SAO_BO_BITS;
155
+ const int left_shift = 2;
156
+ const int shift = right_shift - left_shift;
157
+ // Mask out bits 7, 1 & 0 to account for combination of shifts.
158
+ const int mask = 0x7c;
159
+
160
+ // Compute statistics into temporary buffers.
161
+ for (int y = 0; y < endY; y++)
162
+ {
163
+ int x = 0;
164
+ for (; x + n_elem < endX; x += n_elem)
165
+ {
166
+ uint64_t class_idx_64 =
167
+ *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
168
+
169
+ for (int i = 0; i < n_elem; ++i)
170
+ {
171
+ const int idx = i & 1;
172
+ const int off = (class_idx_64 >> (i * elem_width)) & mask;
173
+ *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
174
+ *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
175
+ }
176
+ }
177
+
178
+ if (x < endX)
179
+ {
180
+ uint64_t class_idx_64 =
181
+ *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
182
+
183
+ for (int i = 0; (i + x) < endX; ++i)
184
+ {
185
+ const int idx = i & 1;
186
+ const int off = (class_idx_64 >> (i * elem_width)) & mask;
187
+ *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
188
+ *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
189
+ }
190
+ }
191
+
192
+ diff += MAX_CU_SIZE;
193
+ rec += stride;
194
+ }
195
+
196
+ // Reduce temporary buffers to destination using Neon.
197
+ for (int i = 0; i < 32; i += 4)
198
+ {
199
+ int32x4_t s0 = vld1q_s32(stats_tmp + i);
200
+ int32x4_t s1 = vld1q_s32(stats + i);
201
x265_4.0.tar.gz/source/common/aarch64/sao-prim.h
Added
72
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_SAO_PRIM_H
26
+#define X265_COMMON_AARCH64_SAO_PRIM_H
27
+
28
+#include "neon-sve-bridge.h"
29
+#include "primitives.h"
30
+#include <arm_neon.h>
31
+
32
+static inline int8x16_t signOf_neon(const pixel *a, const pixel *b)
33
+{
34
+#if HIGH_BIT_DEPTH
35
+ uint16x8_t s0_lo = vld1q_u16(a);
36
+ uint16x8_t s0_hi = vld1q_u16(a + 8);
37
+ uint16x8_t s1_lo = vld1q_u16(b);
38
+ uint16x8_t s1_hi = vld1q_u16(b + 8);
39
+
40
+ // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
41
+ int16x8_t cmp0_lo = vreinterpretq_s16_u16(vcgtq_u16(s0_lo, s1_lo));
42
+ int16x8_t cmp0_hi = vreinterpretq_s16_u16(vcgtq_u16(s0_hi, s1_hi));
43
+ int16x8_t cmp1_lo = vreinterpretq_s16_u16(vcgtq_u16(s1_lo, s0_lo));
44
+ int16x8_t cmp1_hi = vreinterpretq_s16_u16(vcgtq_u16(s1_hi, s0_hi));
45
+
46
+ int8x16_t cmp0 = vcombine_s8(vmovn_s16(cmp0_lo), vmovn_s16(cmp0_hi));
47
+ int8x16_t cmp1 = vcombine_s8(vmovn_s16(cmp1_lo), vmovn_s16(cmp1_hi));
48
+#else // HIGH_BIT_DEPTH
49
+ uint8x16_t s0 = vld1q_u8(a);
50
+ uint8x16_t s1 = vld1q_u8(b);
51
+
52
+ // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
53
+ int8x16_t cmp0 = vreinterpretq_s8_u8(vcgtq_u8(s0, s1));
54
+ int8x16_t cmp1 = vreinterpretq_s8_u8(vcgtq_u8(s1, s0));
55
+#endif // HIGH_BIT_DEPTH
56
+ return vorrq_s8(vnegq_s8(cmp0), cmp1);
57
+}
58
+
59
+namespace X265_NS {
60
+void setupSaoPrimitives_neon(EncoderPrimitives &p);
61
+
62
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
63
+void setupSaoPrimitives_sve(EncoderPrimitives &p);
64
+#endif
65
+
66
+#if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE
67
+void setupSaoPrimitives_sve2(EncoderPrimitives &p);
68
+#endif
69
+}
70
+
71
+#endif // X265_COMMON_AARCH64_SAO_PRIM_H
72
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-common.S
Changed
12
1
2
.arch armv8-a
3
4
.macro ret_v0_w0
5
- trn2 v1.2d, v0.2d, v0.2d
6
- add v0.2s, v0.2s, v1.2s
7
- addp v0.2s, v0.2s, v0.2s
8
+ addv s0, v0.4s
9
fmov w0, s0
10
ret
11
.endm
12
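The simplified ret_v0_w0 macro above now performs the whole horizontal reduction with a single addv across the four 32-bit lanes instead of the old trn2/add/addp sequence. A minimal C++ sketch of the equivalent reduction via the ACLE intrinsic (illustration only, not code from the diff):

    #include <arm_neon.h>

    // Equivalent of "addv s0, v0.4s" + "fmov w0, s0": sum all four 32-bit lanes
    // of the SSE accumulator and return the scalar result.
    static inline int32_t reduce_sse_accumulator(int32x4_t acc)
    {
        return vaddvq_s32(acc);
    }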
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-sve2.S
Changed
201
1
2
3
.text
4
5
-function PFX(pixel_sse_pp_32x32_sve2)
6
- rdvl x9, #1
7
- cmp x9, #16
8
- bgt .vl_gt_16_pixel_sse_pp_32x32
9
- mov w12, #8
10
- movi v0.16b, #0
11
- movi v1.16b, #0
12
-.loop_sse_pp_32_sve2:
13
- sub w12, w12, #1
14
-.rept 4
15
- ld1 {v16.16b,v17.16b}, x0, x1
16
- ld1 {v18.16b,v19.16b}, x2, x3
17
- usubl v2.8h, v16.8b, v18.8b
18
- usubl2 v3.8h, v16.16b, v18.16b
19
- usubl v4.8h, v17.8b, v19.8b
20
- usubl2 v5.8h, v17.16b, v19.16b
21
- smlal v0.4s, v2.4h, v2.4h
22
- smlal2 v1.4s, v2.8h, v2.8h
23
- smlal v0.4s, v3.4h, v3.4h
24
- smlal2 v1.4s, v3.8h, v3.8h
25
- smlal v0.4s, v4.4h, v4.4h
26
- smlal2 v1.4s, v4.8h, v4.8h
27
- smlal v0.4s, v5.4h, v5.4h
28
- smlal2 v1.4s, v5.8h, v5.8h
29
-.endr
30
- cbnz w12, .loop_sse_pp_32_sve2
31
- add v0.4s, v0.4s, v1.4s
32
- ret_v0_w0
33
-.vl_gt_16_pixel_sse_pp_32x32:
34
- ptrue p0.b, vl32
35
- ld1b {z16.b}, p0/z, x0
36
- ld1b {z18.b}, p0/z, x2
37
- add x0, x0, x1
38
- add x2, x2, x3
39
- usublb z1.h, z16.b, z18.b
40
- usublt z2.h, z16.b, z18.b
41
- smullb z0.s, z1.h, z1.h
42
- smlalt z0.s, z1.h, z1.h
43
- smlalb z0.s, z2.h, z2.h
44
- smlalt z0.s, z2.h, z2.h
45
-.rept 31
46
- ld1b {z16.b}, p0/z, x0
47
- ld1b {z18.b}, p0/z, x2
48
- add x0, x0, x1
49
- add x2, x2, x3
50
- usublb z1.h, z16.b, z18.b
51
- usublt z2.h, z16.b, z18.b
52
- smullb z0.s, z1.h, z1.h
53
- smlalt z0.s, z1.h, z1.h
54
- smlalb z0.s, z2.h, z2.h
55
- smlalt z0.s, z2.h, z2.h
56
-.endr
57
- uaddv d3, p0, z0.s
58
- fmov w0, s3
59
- ret
60
-endfunc
61
-
62
-function PFX(pixel_sse_pp_32x64_sve2)
63
- rdvl x9, #1
64
- cmp x9, #16
65
- bgt .vl_gt_16_pixel_sse_pp_32x64
66
- ptrue p0.b, vl16
67
- ld1b {z16.b}, p0/z, x0
68
- ld1b {z17.b}, p0/z, x0, #1, mul vl
69
- ld1b {z18.b}, p0/z, x2
70
- ld1b {z19.b}, p0/z, x2, #1, mul vl
71
- add x0, x0, x1
72
- add x2, x2, x3
73
- usublb z1.h, z16.b, z18.b
74
- usublt z2.h, z16.b, z18.b
75
- usublb z3.h, z17.b, z19.b
76
- usublt z4.h, z17.b, z19.b
77
- smullb z20.s, z1.h, z1.h
78
- smullt z21.s, z1.h, z1.h
79
- smlalb z20.s, z2.h, z2.h
80
- smlalt z21.s, z2.h, z2.h
81
- smlalb z20.s, z3.h, z3.h
82
- smlalt z21.s, z3.h, z3.h
83
- smlalb z20.s, z4.h, z4.h
84
- smlalt z21.s, z4.h, z4.h
85
-.rept 63
86
- ld1b {z16.b}, p0/z, x0
87
- ld1b {z17.b}, p0/z, x0, #1, mul vl
88
- ld1b {z18.b}, p0/z, x2
89
- ld1b {z19.b}, p0/z, x2, #1, mul vl
90
- add x0, x0, x1
91
- add x2, x2, x3
92
- usublb z1.h, z16.b, z18.b
93
- usublt z2.h, z16.b, z18.b
94
- usublb z3.h, z17.b, z19.b
95
- usublt z4.h, z17.b, z19.b
96
- smlalb z20.s, z1.h, z1.h
97
- smlalt z21.s, z1.h, z1.h
98
- smlalb z20.s, z2.h, z2.h
99
- smlalt z21.s, z2.h, z2.h
100
- smlalb z20.s, z3.h, z3.h
101
- smlalt z21.s, z3.h, z3.h
102
- smlalb z20.s, z4.h, z4.h
103
- smlalt z21.s, z4.h, z4.h
104
-.endr
105
- uaddv d3, p0, z20.s
106
- fmov w0, s3
107
- uaddv d4, p0, z21.s
108
- fmov w1, s4
109
- add w0, w0, w1
110
- ret
111
-.vl_gt_16_pixel_sse_pp_32x64:
112
- ptrue p0.b, vl32
113
- ld1b {z16.b}, p0/z, x0
114
- ld1b {z18.b}, p0/z, x2
115
- add x0, x0, x1
116
- add x2, x2, x3
117
- usublb z1.h, z16.b, z18.b
118
- usublt z2.h, z16.b, z18.b
119
- smullb z20.s, z1.h, z1.h
120
- smullt z21.s, z1.h, z1.h
121
- smlalb z20.s, z2.h, z2.h
122
- smlalt z21.s, z2.h, z2.h
123
-.rept 63
124
- ld1b {z16.b}, p0/z, x0
125
- ld1b {z18.b}, p0/z, x2
126
- add x0, x0, x1
127
- add x2, x2, x3
128
- usublb z1.h, z16.b, z18.b
129
- usublt z2.h, z16.b, z18.b
130
- smlalb z20.s, z1.h, z1.h
131
- smlalt z21.s, z1.h, z1.h
132
- smlalb z20.s, z2.h, z2.h
133
- smlalt z21.s, z2.h, z2.h
134
-.endr
135
- uaddv d3, p0, z20.s
136
- fmov w0, s3
137
- uaddv d4, p0, z21.s
138
- fmov w1, s4
139
- add w0, w0, w1
140
- ret
141
-endfunc
142
-
143
-function PFX(pixel_sse_pp_64x64_sve2)
144
- rdvl x9, #1
145
- cmp x9, #16
146
- bgt .vl_gt_16_pixel_sse_pp_64x64
147
- mov w12, #16
148
- movi v0.16b, #0
149
- movi v1.16b, #0
150
-
151
-.loop_sse_pp_64_sve2:
152
- sub w12, w12, #1
153
-.rept 4
154
- ld1 {v16.16b-v19.16b}, x0, x1
155
- ld1 {v20.16b-v23.16b}, x2, x3
156
-
157
- usubl v2.8h, v16.8b, v20.8b
158
- usubl2 v3.8h, v16.16b, v20.16b
159
- usubl v4.8h, v17.8b, v21.8b
160
- usubl2 v5.8h, v17.16b, v21.16b
161
- smlal v0.4s, v2.4h, v2.4h
162
- smlal2 v1.4s, v2.8h, v2.8h
163
- smlal v0.4s, v3.4h, v3.4h
164
- smlal2 v1.4s, v3.8h, v3.8h
165
- smlal v0.4s, v4.4h, v4.4h
166
- smlal2 v1.4s, v4.8h, v4.8h
167
- smlal v0.4s, v5.4h, v5.4h
168
- smlal2 v1.4s, v5.8h, v5.8h
169
-
170
- usubl v2.8h, v18.8b, v22.8b
171
- usubl2 v3.8h, v18.16b, v22.16b
172
- usubl v4.8h, v19.8b, v23.8b
173
- usubl2 v5.8h, v19.16b, v23.16b
174
- smlal v0.4s, v2.4h, v2.4h
175
- smlal2 v1.4s, v2.8h, v2.8h
176
- smlal v0.4s, v3.4h, v3.4h
177
- smlal2 v1.4s, v3.8h, v3.8h
178
- smlal v0.4s, v4.4h, v4.4h
179
- smlal2 v1.4s, v4.8h, v4.8h
180
- smlal v0.4s, v5.4h, v5.4h
181
- smlal2 v1.4s, v5.8h, v5.8h
182
-.endr
183
- cbnz w12, .loop_sse_pp_64_sve2
184
- add v0.4s, v0.4s, v1.4s
185
- ret_v0_w0
186
-.vl_gt_16_pixel_sse_pp_64x64:
187
- cmp x9, #48
188
- bgt .vl_gt_48_pixel_sse_pp_64x64
189
- ptrue p0.b, vl32
190
- ld1b {z16.b}, p0/z, x0
191
- ld1b {z17.b}, p0/z, x0, #1, mul vl
192
- ld1b {z20.b}, p0/z, x2
193
- ld1b {z21.b}, p0/z, x2, #1, mul vl
194
- add x0, x0, x1
195
- add x2, x2, x3
196
- usublb z1.h, z16.b, z20.b
197
- usublt z2.h, z16.b, z20.b
198
- usublb z3.h, z17.b, z21.b
199
- usublt z4.h, z17.b, z21.b
200
- smullb z24.s, z1.h, z1.h
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a.S
Changed
201
1
2
* Copyright (C) 2021 MulticoreWare, Inc
3
*
4
* Authors: Sebastian Pop <spop@amazon.com>
5
+ * Hari Limaye <hari.limaye@arm.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
10
11
.text
12
13
-function PFX(pixel_sse_pp_4x4_neon)
14
- ld1 {v16.s}0, x0, x1
15
- ld1 {v17.s}0, x2, x3
16
- ld1 {v18.s}0, x0, x1
17
- ld1 {v19.s}0, x2, x3
18
- ld1 {v20.s}0, x0, x1
19
- ld1 {v21.s}0, x2, x3
20
- ld1 {v22.s}0, x0, x1
21
- ld1 {v23.s}0, x2, x3
22
-
23
- usubl v1.8h, v16.8b, v17.8b
24
- usubl v2.8h, v18.8b, v19.8b
25
- usubl v3.8h, v20.8b, v21.8b
26
- usubl v4.8h, v22.8b, v23.8b
27
-
28
- smull v0.4s, v1.4h, v1.4h
29
- smlal v0.4s, v2.4h, v2.4h
30
- smlal v0.4s, v3.4h, v3.4h
31
- smlal v0.4s, v4.4h, v4.4h
32
- ret_v0_w0
33
-endfunc
34
+// Fully unrolled.
35
+.macro SSE_PP_4xN h
36
+function PFX(pixel_sse_pp_4x\h\()_neon)
37
+ movi v0.4s, #0
38
+.rept \h / 2
39
+ ldr s16, [x0]
40
+ ldr s17, [x2]
41
+ add x0, x0, x1
42
+ add x2, x2, x3
43
+ ld1 {v16.s}[1], [x0], x1
44
+ ld1 {v17.s}[1], [x2], x3
45
46
-function PFX(pixel_sse_pp_4x8_neon)
47
- ld1 {v16.s}0, x0, x1
48
- ld1 {v17.s}0, x2, x3
49
- usubl v1.8h, v16.8b, v17.8b
50
- ld1 {v16.s}0, x0, x1
51
- ld1 {v17.s}0, x2, x3
52
- smull v0.4s, v1.4h, v1.4h
53
-.rept 6
54
- usubl v1.8h, v16.8b, v17.8b
55
- ld1 {v16.s}0, x0, x1
56
- smlal v0.4s, v1.4h, v1.4h
57
- ld1 {v17.s}0, x2, x3
58
+ uabd v1.8b, v16.8b, v17.8b
59
+ umull v20.8h, v1.8b, v1.8b
60
+ uadalp v0.4s, v20.8h
61
.endr
62
- usubl v1.8h, v16.8b, v17.8b
63
- smlal v0.4s, v1.4h, v1.4h
64
ret_v0_w0
65
endfunc
66
+.endm
67
68
-function PFX(pixel_sse_pp_8x8_neon)
69
- ld1 {v16.8b}, x0, x1
70
- ld1 {v17.8b}, x2, x3
71
- usubl v1.8h, v16.8b, v17.8b
72
- ld1 {v16.8b}, x0, x1
73
- smull v0.4s, v1.4h, v1.4h
74
- smlal2 v0.4s, v1.8h, v1.8h
75
- ld1 {v17.8b}, x2, x3
76
-
77
-.rept 6
78
- usubl v1.8h, v16.8b, v17.8b
79
- ld1 {v16.8b}, x0, x1
80
- smlal v0.4s, v1.4h, v1.4h
81
- smlal2 v0.4s, v1.8h, v1.8h
82
- ld1 {v17.8b}, x2, x3
83
-.endr
84
- usubl v1.8h, v16.8b, v17.8b
85
- smlal v0.4s, v1.4h, v1.4h
86
- smlal2 v0.4s, v1.8h, v1.8h
87
- ret_v0_w0
88
-endfunc
89
+SSE_PP_4xN 4
90
+SSE_PP_4xN 8
91
92
-function PFX(pixel_sse_pp_8x16_neon)
93
- ld1 {v16.8b}, x0, x1
94
- ld1 {v17.8b}, x2, x3
95
- usubl v1.8h, v16.8b, v17.8b
96
+// Fully unrolled.
97
+.macro SSE_PP_8xN h
98
+function PFX(pixel_sse_pp_8x\h\()_neon)
99
+ movi v0.4s, #0
100
+.rept \h
101
ld1 {v16.8b}, x0, x1
102
- smull v0.4s, v1.4h, v1.4h
103
- smlal2 v0.4s, v1.8h, v1.8h
104
ld1 {v17.8b}, x2, x3
105
106
-.rept 14
107
- usubl v1.8h, v16.8b, v17.8b
108
- ld1 {v16.8b}, x0, x1
109
- smlal v0.4s, v1.4h, v1.4h
110
- smlal2 v0.4s, v1.8h, v1.8h
111
- ld1 {v17.8b}, x2, x3
112
+ uabd v1.8b, v16.8b, v17.8b
113
+ umull v20.8h, v1.8b, v1.8b
114
+ uadalp v0.4s, v20.8h
115
.endr
116
- usubl v1.8h, v16.8b, v17.8b
117
- smlal v0.4s, v1.4h, v1.4h
118
- smlal2 v0.4s, v1.8h, v1.8h
119
ret_v0_w0
120
endfunc
121
+.endm
122
+
123
+SSE_PP_8xN 8
124
+SSE_PP_8xN 16
125
126
-.macro sse_pp_16xN h
127
+// Fully unrolled.
128
+.macro SSE_PP_16xN h
129
function PFX(pixel_sse_pp_16x\h\()_neon)
130
+ movi v0.4s, #0
131
+ movi v1.4s, #0
132
+.rept \h
133
ld1 {v16.16b}, x0, x1
134
ld1 {v17.16b}, x2, x3
135
- usubl v1.8h, v16.8b, v17.8b
136
- usubl2 v2.8h, v16.16b, v17.16b
137
- ld1 {v16.16b}, x0, x1
138
- ld1 {v17.16b}, x2, x3
139
- smull v0.4s, v1.4h, v1.4h
140
- smlal2 v0.4s, v1.8h, v1.8h
141
- smlal v0.4s, v2.4h, v2.4h
142
- smlal2 v0.4s, v2.8h, v2.8h
143
-.rept \h - 2
144
- usubl v1.8h, v16.8b, v17.8b
145
- usubl2 v2.8h, v16.16b, v17.16b
146
- ld1 {v16.16b}, x0, x1
147
- smlal v0.4s, v1.4h, v1.4h
148
- smlal2 v0.4s, v1.8h, v1.8h
149
- ld1 {v17.16b}, x2, x3
150
- smlal v0.4s, v2.4h, v2.4h
151
- smlal2 v0.4s, v2.8h, v2.8h
152
+
153
+ uabd v2.16b, v16.16b, v17.16b
154
+ umull v20.8h, v2.8b, v2.8b
155
+ uadalp v0.4s, v20.8h
156
+ umull2 v21.8h, v2.16b, v2.16b
157
+ uadalp v1.4s, v21.8h
158
.endr
159
- usubl v1.8h, v16.8b, v17.8b
160
- usubl2 v2.8h, v16.16b, v17.16b
161
- smlal v0.4s, v1.4h, v1.4h
162
- smlal2 v0.4s, v1.8h, v1.8h
163
- smlal v0.4s, v2.4h, v2.4h
164
- smlal2 v0.4s, v2.8h, v2.8h
165
+ add v0.4s, v0.4s, v1.4s
166
ret_v0_w0
167
endfunc
168
.endm
169
170
-sse_pp_16xN 16
171
-sse_pp_16xN 32
172
+SSE_PP_16xN 16
173
+SSE_PP_16xN 32
174
175
-function PFX(pixel_sse_pp_32x32_neon)
176
- mov w12, #8
177
- movi v0.16b, #0
178
- movi v1.16b, #0
179
-.loop_sse_pp_32:
180
- sub w12, w12, #1
181
+// Loop unrolled to process 4 rows per iteration.
182
+function PFX(pixel_sse_pp_32xh_neon), export=0
183
+ movi v0.4s, #0
184
+ movi v1.4s, #0
185
+.Loop_sse_pp_32xh:
186
+ sub w4, w4, #1
187
.rept 4
188
ld1 {v16.16b,v17.16b}, x0, x1
189
ld1 {v18.16b,v19.16b}, x2, x3
190
- usubl v2.8h, v16.8b, v18.8b
191
- usubl2 v3.8h, v16.16b, v18.16b
192
- usubl v4.8h, v17.8b, v19.8b
193
- usubl2 v5.8h, v17.16b, v19.16b
194
- smlal v0.4s, v2.4h, v2.4h
195
- smlal2 v1.4s, v2.8h, v2.8h
196
- smlal v0.4s, v3.4h, v3.4h
197
- smlal2 v1.4s, v3.8h, v3.8h
198
- smlal v0.4s, v4.4h, v4.4h
199
- smlal2 v1.4s, v4.8h, v4.8h
200
- smlal v0.4s, v5.4h, v5.4h
201
x265_4.0.tar.gz/source/common/aarch64/ssd-neon-dotprod.S
Added
171
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8.2-a+dotprod
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+// Fully unrolled.
40
+.macro SSE_PP_4xN h
41
+function PFX(pixel_sse_pp_4x\h\()_neon_dotprod)
42
+ movi v0.4s, #0
43
+.rept \h / 4
44
+ ldr s16, [x0]
45
+ ldr s17, [x2]
46
+ add x0, x0, x1
47
+ add x2, x2, x3
48
+ ld1 {v16.s}[1], [x0], x1
49
+ ld1 {v16.s}[2], [x0], x1
50
+ ld1 {v16.s}[3], [x0], x1
51
+ ld1 {v17.s}[1], [x2], x3
52
+ ld1 {v17.s}[2], [x2], x3
53
+ ld1 {v17.s}[3], [x2], x3
54
+
55
+ uabd v1.16b, v16.16b, v17.16b
56
+ udot v0.4s, v1.16b, v1.16b
57
+.endr
58
+ addv s0, v0.4s
59
+ fmov w0, s0
60
+ ret
61
+endfunc
62
+.endm
63
+
64
+SSE_PP_4xN 4
65
+SSE_PP_4xN 8
66
+
67
+// Fully unrolled.
68
+.macro SSE_PP_8xN h
69
+function PFX(pixel_sse_pp_8x\h\()_neon_dotprod)
70
+ movi v0.4s, #0
71
+.rept \h
72
+ ld1 {v16.8b}, [x0], x1
73
+ ld1 {v17.8b}, [x2], x3
74
+
75
+ uabd v1.8b, v16.8b, v17.8b
76
+ udot v0.2s, v1.8b, v1.8b
77
+.endr
78
+ addv s0, v0.4s
79
+ fmov w0, s0
80
+ ret
81
+endfunc
82
+.endm
83
+
84
+SSE_PP_8xN 8
85
+SSE_PP_8xN 16
86
+
87
+// Fully unrolled.
88
+.macro SSE_PP_16xN h
89
+function PFX(pixel_sse_pp_16x\h\()_neon_dotprod)
90
+ movi v0.4s, #0
91
+ movi v1.4s, #0
92
+.rept \h / 2
93
+ ld1 {v16.16b}, [x0], x1
94
+ ld1 {v17.16b}, [x2], x3
95
+ ld1 {v18.16b}, [x0], x1
96
+ ld1 {v19.16b}, [x2], x3
97
+
98
+ uabd v2.16b, v16.16b, v17.16b
99
+ udot v0.4s, v2.16b, v2.16b
100
+ uabd v3.16b, v18.16b, v19.16b
101
+ udot v1.4s, v3.16b, v3.16b
102
+.endr
103
+ add v0.4s, v0.4s, v1.4s
104
+ addv s0, v0.4s
105
+ fmov w0, s0
106
+ ret
107
+endfunc
108
+.endm
109
+
110
+SSE_PP_16xN 16
111
+SSE_PP_16xN 32
112
+
113
+// Loop unrolled to process 4 rows per iteration.
114
+function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
115
+ movi v0.4s, #0
116
+ movi v1.4s, #0
117
+.Loop_sse_pp_32xh:
118
+ sub w4, w4, #1
119
+.rept 4
120
+ ld1 {v16.16b,v17.16b}, [x0], x1
121
+ ld1 {v18.16b,v19.16b}, [x2], x3
122
+
123
+ uabd v2.16b, v16.16b, v18.16b
124
+ udot v0.4s, v2.16b, v2.16b
125
+ uabd v3.16b, v17.16b, v19.16b
126
+ udot v1.4s, v3.16b, v3.16b
127
+.endr
128
+ cbnz w4, .Loop_sse_pp_32xh
129
+ add v0.4s, v0.4s, v1.4s
130
+ addv s0, v0.4s
131
+ fmov w0, s0
132
+ ret
133
+endfunc
134
+
135
+.macro SSE_PP_32xN h
136
+function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
137
+ mov w4, \h / 4
138
+ b PFX(pixel_sse_pp_32xh_neon_dotprod)
139
+endfunc
140
+.endm
141
+
142
+SSE_PP_32xN 32
143
+SSE_PP_32xN 64
144
+
145
+// Loop unrolled to process 4 rows per iteration.
146
+function PFX(pixel_sse_pp_64x64_neon_dotprod)
147
+ mov w12, #16
148
+ movi v0.4s, #0
149
+ movi v1.4s, #0
150
+.Loop_sse_pp_64:
151
+ sub w12, w12, #1
152
+.rept 4
153
+ ld1 {v16.16b-v19.16b}, [x0], x1
154
+ ld1 {v20.16b-v23.16b}, [x2], x3
155
+
156
+ uabd v2.16b, v16.16b, v20.16b
157
+ udot v0.4s, v2.16b, v2.16b
158
+ uabd v3.16b, v17.16b, v21.16b
159
+ udot v1.4s, v3.16b, v3.16b
160
+ uabd v4.16b, v18.16b, v22.16b
161
+ udot v0.4s, v4.16b, v4.16b
162
+ uabd v5.16b, v19.16b, v23.16b
163
+ udot v1.4s, v5.16b, v5.16b
164
+.endr
165
+ cbnz w12, .Loop_sse_pp_64
166
+ add v0.4s, v0.4s, v1.4s
167
+ addv s0, v0.4s
168
+ fmov w0, s0
169
+ ret
170
+endfunc
171
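The *_neon_dotprod kernels above all follow the same idiom: uabd produces the per-byte absolute difference (which always fits in 8 bits), and udot then multiplies each byte by itself and accumulates groups of four products into 32-bit lanes, replacing the widen/multiply/accumulate chains of the plain Neon versions. A hedged C++ intrinsics sketch of one 16-pixel step (function name is illustrative, not from the source; requires a compile target with the DotProd extension, e.g. -march=armv8.2-a+dotprod):

    #include <arm_neon.h>

    // One 16-pixel step of sum-of-squared-differences using the DotProd extension.
    static inline uint32x4_t sse_pp_step_dotprod(uint32x4_t acc,
                                                 uint8x16_t pix0, uint8x16_t pix1)
    {
        uint8x16_t absdiff = vabdq_u8(pix0, pix1);   // uabd
        return vdotq_u32(acc, absdiff, absdiff);     // udot: acc += sum(absdiff^2) per lane group
    }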
x265_3.6.tar.gz/source/common/arm/blockcopy8.S -> x265_4.0.tar.gz/source/common/arm/blockcopy8.S
Changed
19
1
2
vmov q2, q12
3
vmov q3, q14
4
5
-.loop:
6
+.Loop:
7
vldm r0!, {q8-q15}
8
subs r1, #1
9
10
11
vadd.s8 q1, q10
12
vadd.s8 q2, q12
13
vadd.s8 q3, q14
14
- bgt .loop
15
+ bgt .Loop
16
17
// sum
18
vadd.s8 q0, q1
19
x265_3.6.tar.gz/source/common/arm/dct-a.S -> x265_4.0.tar.gz/source/common/arm/dct-a.S
Changed
37
1
2
mov lr, #4*16*2
3
4
// DCT-1D
5
-.loop1:
6
+.Loop1:
7
// Row0-3
8
vld1.16 {q8-q9}, r0, :64, r2 // q8 = 07 06 05 04 03 02 01 00, q9 = 0F 0E 0D 0C 0B 0A 09 08
9
vld1.16 {q10-q11}, r0, :64, r2 // q10 = 17 16 15 14 13 12 11 10, q11 = 1F 1E 1D 1C 1B 1A 19 18
10
11
// loop into next process group
12
sub r3, #3*4*16*2
13
subs r12, #1
14
- bgt .loop1
15
+ bgt .Loop1
16
17
18
// DCT-2D
19
20
mov r3, #16*2*2
21
mov r12, #16/4 // Process 4 rows every loop
22
23
-.loop2:
24
+.Loop2:
25
vldm r2, {q8-q15}
26
27
// d16 = 30 20 10 00
28
29
30
sub r1, #(17*16-4)*2
31
subs r12, #1
32
- bgt .loop2
33
+ bgt .Loop2
34
35
add sp, #16*16*2
36
vpop {q4-q7}
37
x265_3.6.tar.gz/source/common/arm/ipfilter8.S -> x265_4.0.tar.gz/source/common/arm/ipfilter8.S
Changed
201
1
2
vmov.u16 q1, #8192
3
vneg.s16 q1, q1
4
mov r12, #8
5
-.loop_filterP2S_32x16:
6
+.Loop_filterP2S_32x16:
7
subs r12, #1
8
.rept 2
9
vld1.u8 {q9-q10}, r0, r1
10
11
vmla.s16 q3, q10, q0
12
vst1.16 {q2-q3}, r2, r3
13
.endr
14
- bgt .loop_filterP2S_32x16
15
+ bgt .Loop_filterP2S_32x16
16
bx lr
17
endfunc
18
19
20
vmov.u16 q1, #8192
21
vneg.s16 q1, q1
22
mov r12, #12
23
-.loop_filterP2S_32x24:
24
+.Loop_filterP2S_32x24:
25
subs r12, #1
26
.rept 2
27
vld1.u8 {q9-q10}, r0, r1
28
29
vmla.s16 q3, q10, q0
30
vst1.16 {q2-q3}, r2, r3
31
.endr
32
- bgt .loop_filterP2S_32x24
33
+ bgt .Loop_filterP2S_32x24
34
bx lr
35
endfunc
36
37
38
vmov.u16 q1, #8192
39
vneg.s16 q1, q1
40
mov r12, #16
41
-.loop_filterP2S_32x32:
42
+.Loop_filterP2S_32x32:
43
subs r12, #1
44
.rept 2
45
vld1.u8 {q9-q10}, r0, r1
46
47
vmla.s16 q3, q10, q0
48
vst1.16 {q2-q3}, r2, r3
49
.endr
50
- bgt .loop_filterP2S_32x32
51
+ bgt .Loop_filterP2S_32x32
52
bx lr
53
endfunc
54
55
56
vmov.u16 q1, #8192
57
vneg.s16 q1, q1
58
mov r12, #32
59
-.loop_filterP2S_32x64:
60
+.Loop_filterP2S_32x64:
61
subs r12, #1
62
.rept 2
63
vld1.u8 {q9-q10}, r0, r1
64
65
vmla.s16 q3, q10, q0
66
vst1.16 {q2-q3}, r2, r3
67
.endr
68
- bgt .loop_filterP2S_32x64
69
+ bgt .Loop_filterP2S_32x64
70
bx lr
71
endfunc
72
73
74
vmov.u16 q1, #8192
75
vneg.s16 q1, q1
76
mov r12, #8
77
-.loop_filterP2S_64x16:
78
+.Loop_filterP2S_64x16:
79
subs r12, #1
80
.rept 2
81
vld1.u8 {q9-q10}, r0!
82
83
vmla.s16 q3, q10, q0
84
vst1.16 {q2-q3}, r2, r3
85
.endr
86
- bgt .loop_filterP2S_64x16
87
+ bgt .Loop_filterP2S_64x16
88
bx lr
89
endfunc
90
91
92
vmov.u16 q1, #8192
93
vneg.s16 q1, q1
94
mov r12, #16
95
-.loop_filterP2S_64x32:
96
+.Loop_filterP2S_64x32:
97
subs r12, #1
98
.rept 2
99
vld1.u8 {q9-q10}, r0!
100
101
vmla.s16 q3, q10, q0
102
vst1.16 {q2-q3}, r2, r3
103
.endr
104
- bgt .loop_filterP2S_64x32
105
+ bgt .Loop_filterP2S_64x32
106
bx lr
107
endfunc
108
109
110
vmov.u16 q1, #8192
111
vneg.s16 q1, q1
112
mov r12, #24
113
-.loop_filterP2S_64x48:
114
+.Loop_filterP2S_64x48:
115
subs r12, #1
116
.rept 2
117
vld1.u8 {q9-q10}, r0!
118
119
vmla.s16 q3, q10, q0
120
vst1.16 {q2-q3}, r2, r3
121
.endr
122
- bgt .loop_filterP2S_64x48
123
+ bgt .Loop_filterP2S_64x48
124
bx lr
125
endfunc
126
127
128
vmov.u16 q1, #8192
129
vneg.s16 q1, q1
130
mov r12, #32
131
-.loop_filterP2S_64x64:
132
+.Loop_filterP2S_64x64:
133
subs r12, #1
134
.rept 2
135
vld1.u8 {q9-q10}, r0!
136
137
vmla.s16 q3, q10, q0
138
vst1.16 {q2-q3}, r2, r3
139
.endr
140
- bgt .loop_filterP2S_64x64
141
+ bgt .Loop_filterP2S_64x64
142
bx lr
143
endfunc
144
145
146
vmov.u16 q1, #8192
147
vneg.s16 q1, q1
148
mov r12, #32
149
-.loop_filterP2S_48x64:
150
+.Loop_filterP2S_48x64:
151
subs r12, #1
152
.rept 2
153
vld1.u8 {q9-q10}, r0!
154
155
vmla.s16 q3, q9, q0
156
vst1.16 {q2-q3}, r2, r3
157
.endr
158
- bgt .loop_filterP2S_48x64
159
+ bgt .Loop_filterP2S_48x64
160
bx lr
161
endfunc
162
163
164
vmovl.u8 q2, d4
165
vmovl.u8 q3, d6
166
167
-.loop_4x\h:
168
+.Loop_4x\h:
169
// TODO: read extra 1 row for speed optimize, may made crash on OS X platform!
170
vld1.u32 {d160}, r0, r1
171
vld1.u32 {d161}, r0, r1
172
173
vst1.u32 {d181}, r2, r3
174
175
subs r12, #2
176
- bne .loop_4x4
177
+ bne .Loop_4x4
178
179
pop {pc}
180
.ltorg
181
182
183
.macro FILTER_VPP a b filterv
184
185
-.loop_\filterv\()_\a\()x\b:
186
+.Loop_\filterv\()_\a\()x\b:
187
188
mov r7, r2
189
mov r6, r0
190
eor r8, r8
191
192
-.loop_w8_\filterv\()_\a\()x\b:
193
+.Loop_w8_\filterv\()_\a\()x\b:
194
195
add r6, r0, r8
196
197
198
199
add r8, #8
200
cmp r8, #\a
201
x265_3.6.tar.gz/source/common/arm/mc-a.S -> x265_4.0.tar.gz/source/common/arm/mc-a.S
Changed
37
1
2
vsri.s16 q1, #1
3
vneg.s16 q0, q0
4
mov r3, #4
5
-.loop_cpy2Dto1D_shr_16:
6
+.Loop_cpy2Dto1D_shr_16:
7
subs r3, #1
8
.rept 4
9
vld1.s16 {q2-q3}, r1, r2
10
11
vshl.s16 q3, q0
12
vst1.16 {q2-q3}, r0!
13
.endr
14
- bgt .loop_cpy2Dto1D_shr_16
15
+ bgt .Loop_cpy2Dto1D_shr_16
16
bx lr
17
endfunc
18
19
20
vsri.s16 q1, #1
21
vneg.s16 q0, q0
22
mov r3, 16
23
-.loop_cpy2Dto1D_shr_32:
24
+.Loop_cpy2Dto1D_shr_32:
25
subs r3, #1
26
.rept 2
27
vld1.s16 {q2-q3}, r1!
28
29
vst1.16 {q2-q3}, r0!
30
vst1.16 {q8-q9}, r0!
31
.endr
32
- bgt .loop_cpy2Dto1D_shr_32
33
+ bgt .Loop_cpy2Dto1D_shr_32
34
bx lr
35
endfunc
36
37
x265_3.6.tar.gz/source/common/arm/pixel-util.S -> x265_4.0.tar.gz/source/common/arm/pixel-util.S
Changed
116
1
2
vdup.8 q2, r12
3
sub r5, #1
4
5
-.loop_h:
6
+.Loop_h:
7
mov r6, r0
8
mov r12, r2
9
eor r7, r7
10
-.loop_w:
11
+.Loop_w:
12
vld1.u8 {q0}, r6!
13
vshl.u8 q0, q0, q2
14
vst1.u8 {q0}, r12!
15
16
add r7, #16
17
cmp r7, r4
18
- blt .loop_w
19
+ blt .Loop_w
20
21
add r0, r1
22
add r2, r3
23
24
subs r5, #1
25
- bgt .loop_h
26
+ bgt .Loop_h
27
28
// handle last row
29
mov r5, r4
30
lsr r5, #3
31
32
-.loopW8:
33
+.LoopW8:
34
vld1.u8 d0, r0!
35
vshl.u8 d0, d0, d4
36
vst1.u8 d0, r2!
37
subs r4, r4, #8
38
subs r5, #1
39
- bgt .loopW8
40
+ bgt .LoopW8
41
42
mov r5,#8
43
sub r5, r4
44
45
eor r5, r5
46
veor.s32 q12, q12
47
48
-.loop_quant:
49
+.Loop_quant:
50
51
vld1.s16 d16, r0!
52
vmovl.s16 q9, d16 // q9= coefblockpos
53
54
vst1.s16 d16, r3!
55
56
subs r4, #1
57
- bne .loop_quant
58
+ bne .Loop_quant
59
60
vadd.u32 d8, d9
61
vpadd.u32 d8, d8
62
63
eor r4, r4
64
veor.s32 q12, q12
65
66
-.loop_nquant:
67
+.Loop_nquant:
68
69
vld1.s16 d16, r0!
70
vmovl.s16 q9, d16 // q9= coefblockpos
71
72
vst1.s16 d17, r2!
73
74
subs r3, #1
75
- bne .loop_nquant
76
+ bne .Loop_nquant
77
78
vadd.u32 d8, d9
79
vpadd.u32 d8, d8
80
81
mov r10, #4
82
eor r9, r9
83
84
-.loop_32:
85
+.Loop_32:
86
87
sa8d_16x16 r4
88
89
90
sub r2, r2, #24
91
92
subs r10, #1
93
- bgt .loop_32
94
+ bgt .Loop_32
95
96
mov r0, r9
97
vpop {d8-d11}
98
99
mov r10, #4
100
eor r9, r9
101
102
-.loop_1:
103
+.Loop_1:
104
105
sa8d_16x16 r4
106
107
108
sub r2, r2, #56
109
110
subs r10, #1
111
- bgt .loop_1
112
+ bgt .Loop_1
113
114
mov r0, r9
115
vpop {d8-d11}
116
x265_3.6.tar.gz/source/common/arm/sad-a.S -> x265_4.0.tar.gz/source/common/arm/sad-a.S
Changed
151
1
2
vabal.u8 q9, d5, d7
3
mov r12, #(\h-2)/2
4
5
-.loop_16x\h:
6
+.Loop_16x\h:
7
8
subs r12, #1
9
vld1.8 {q0}, r0, r1
10
11
vabal.u8 q9, d1, d3
12
vabal.u8 q8, d4, d6
13
vabal.u8 q9, d5, d7
14
- bne .loop_16x\h
15
+ bne .Loop_16x\h
16
17
vadd.u16 q8, q8, q9
18
.if \h == 64
19
20
veor.u8 q11, q11
21
mov r12, #\h/8
22
23
-.loop_32x\h:
24
+.Loop_32x\h:
25
26
subs r12, #1
27
.rept 4
28
29
vabal.u8 q10, d26, d30
30
vabal.u8 q11, d27, d31
31
.endr
32
- bne .loop_32x\h
33
+ bne .Loop_32x\h
34
35
vadd.u16 q8, q8, q9
36
vadd.u16 q10, q10, q11
37
38
sub r3, r12
39
mov r12, #\h/8
40
41
-.loop_64x\h:
42
+.Loop_64x\h:
43
44
subs r12, #1
45
.rept 4
46
47
vabal.u8 q10, d26, d30
48
vabal.u8 q11, d27, d31
49
.endr
50
- bne .loop_64x\h
51
+ bne .Loop_64x\h
52
53
vadd.u16 q8, q8, q9
54
vadd.u16 q10, q10, q11
55
56
sub r3, #16
57
mov r12, #8
58
59
-.loop_24x32:
60
+.Loop_24x32:
61
62
subs r12, #1
63
.rept 4
64
65
vld1.8 {d1}, r2, r3
66
vabal.u8 q10, d0, d1
67
.endr
68
- bne .loop_24x32
69
+ bne .Loop_24x32
70
71
vadd.u16 q8, q8, q9
72
vadd.u16 d16, d16, d17
73
74
sub r3, #32
75
mov r12, #16
76
77
-.loop_48x64:
78
+.Loop_48x64:
79
80
subs r12, #1
81
.rept 4
82
83
vabal.u8 q14, d4, d20
84
vabal.u8 q15, d5, d21
85
.endr
86
- bne .loop_48x64
87
+ bne .Loop_48x64
88
89
vadd.u16 q3, q3, q11
90
vadd.u16 d6, d6, d7
91
92
veor.u8 q15, q15
93
.endif
94
95
-.loop_sad_x\x\()_16x\h:
96
+.Loop_sad_x\x\()_16x\h:
97
.rept 8
98
SAD_X_16 \x
99
.endr
100
subs r6, #1
101
- bne .loop_sad_x\x\()_16x\h
102
+ bne .Loop_sad_x\x\()_16x\h
103
104
vadd.u16 q8, q8, q9
105
vadd.u16 q10, q10, q11
106
107
veor.u8 q14, q14
108
veor.u8 q15, q15
109
.endif
110
-.loop_sad_x\x\()_64x\h:
111
+.Loop_sad_x\x\()_64x\h:
112
.rept 8
113
SAD_X_64 \x
114
.endr
115
subs r6, #1
116
- bne .loop_sad_x\x\()_64x\h
117
+ bne .Loop_sad_x\x\()_64x\h
118
119
.if \h <= 16
120
vadd.u16 q8, q8, q9
121
122
veor.u8 q15, q15
123
.endif
124
125
-.loop_sad_x\x\()_48x64:
126
+.Loop_sad_x\x\()_48x64:
127
.rept 8
128
SAD_X_48 \x
129
.endr
130
subs r6, #1
131
- bne .loop_sad_x\x\()_48x64
132
+ bne .Loop_sad_x\x\()_48x64
133
134
vpaddl.u16 q8, q8
135
vpaddl.u16 q9, q9
136
137
veor.u8 q15, q15
138
.endif
139
140
-.loop_sad_x\x\()_24x32:
141
+.Loop_sad_x\x\()_24x32:
142
.rept 8
143
SAD_X_24 \x
144
.endr
145
subs r6, #1
146
- bne .loop_sad_x\x\()_24x32
147
+ bne .Loop_sad_x\x\()_24x32
148
149
vadd.u16 q8, q8, q9
150
vadd.u16 q10, q10, q11
151
x265_3.6.tar.gz/source/common/arm/ssd-a.S -> x265_4.0.tar.gz/source/common/arm/ssd-a.S
Changed
127
1
2
veor.u8 q0, q0
3
veor.u8 q1, q1
4
5
-.loop_sse_pp_32:
6
+.Loop_sse_pp_32:
7
subs r12, #1
8
.rept 4
9
vld1.64 {q8-q9}, r0, r1
10
11
vmlal.s16 q0, d26, d26
12
vmlal.s16 q1, d27, d27
13
.endr
14
- bne .loop_sse_pp_32
15
+ bne .Loop_sse_pp_32
16
vadd.s32 q0, q1
17
vadd.s32 d0, d0, d1
18
vpadd.s32 d0, d0, d0
19
20
veor.u8 q0, q0
21
veor.u8 q1, q1
22
23
-.loop_sse_pp_64:
24
+.Loop_sse_pp_64:
25
subs r12, #1
26
.rept 4
27
vld1.64 {q8-q9}, r0!
28
29
vmlal.s16 q0, d26, d26
30
vmlal.s16 q1, d27, d27
31
.endr
32
- bne .loop_sse_pp_64
33
+ bne .Loop_sse_pp_64
34
vadd.s32 q0, q1
35
vadd.s32 d0, d0, d1
36
vpadd.s32 d0, d0, d0
37
38
veor.u8 q0, q0
39
veor.u8 q1, q1
40
41
-.loop_sse_ss_16:
42
+.Loop_sse_ss_16:
43
subs r12, #1
44
.rept 4
45
vld1.s16 {q8-q9}, r0, r1
46
47
vmlal.s16 q0, d18, d18
48
vmlal.s16 q1, d19, d19
49
.endr
50
- bne .loop_sse_ss_16
51
+ bne .Loop_sse_ss_16
52
vadd.s32 q0, q1
53
vadd.s32 d0, d0, d1
54
vpadd.s32 d0, d0, d0
55
56
veor.u8 q0, q0
57
veor.u8 q1, q1
58
59
-.loop_sse_ss_32:
60
+.Loop_sse_ss_32:
61
subs r12, #1
62
.rept 4
63
vld1.s16 {q8-q9}, r0!
64
65
vmlal.s16 q0, d18, d18
66
vmlal.s16 q1, d19, d19
67
.endr
68
- bne .loop_sse_ss_32
69
+ bne .Loop_sse_ss_32
70
vadd.s32 q0, q1
71
vadd.s32 d0, d0, d1
72
vpadd.s32 d0, d0, d0
73
74
veor.u8 q0, q0
75
veor.u8 q1, q1
76
77
-.loop_sse_ss_64:
78
+.Loop_sse_ss_64:
79
subs r12, #1
80
.rept 2
81
vld1.s16 {q8-q9}, r0!
82
83
vmlal.s16 q0, d18, d18
84
vmlal.s16 q1, d19, d19
85
.endr
86
- bne .loop_sse_ss_64
87
+ bne .Loop_sse_ss_64
88
vadd.s32 q0, q1
89
vadd.s32 d0, d0, d1
90
vpadd.s32 d0, d0, d0
91
92
veor.u8 q0, q0
93
veor.u8 q1, q1
94
95
-.loop_ssd_s_16:
96
+.Loop_ssd_s_16:
97
subs r12, #1
98
.rept 2
99
vld1.s16 {q8-q9}, r0, r1
100
101
vmlal.s16 q0, d22, d22
102
vmlal.s16 q1, d23, d23
103
.endr
104
- bne .loop_ssd_s_16
105
+ bne .Loop_ssd_s_16
106
vadd.s32 q0, q1
107
vadd.s32 d0, d0, d1
108
vpadd.s32 d0, d0, d0
109
110
veor.u8 q0, q0
111
veor.u8 q1, q1
112
113
-.loop_ssd_s_32:
114
+.Loop_ssd_s_32:
115
subs r12, #1
116
.rept 4
117
vld1.s16 {q8-q9}, r0!
118
119
vmlal.s16 q0, d22, d22
120
vmlal.s16 q1, d23, d23
121
.endr
122
- bne .loop_ssd_s_32
123
+ bne .Loop_ssd_s_32
124
vadd.s32 q0, q1
125
vadd.s32 d0, d0, d1
126
vpadd.s32 d0, d0, d0
127
x265_3.6.tar.gz/source/common/common.h -> x265_4.0.tar.gz/source/common/common.h
Changed
14
1
2
template<typename T> /* clip to pixel range, 0..255 or 0..1023 */
3
inline pixel x265_clip(T x) { return (pixel)x265_min<T>(T((1 << X265_DEPTH) - 1), x265_max<T>(T(0), x)); }
4
5
+/* get the sign of input variable */
6
+static inline int8_t x265_signOf(int32_t x)
7
+{
8
+ return (x >> 31) | ((int32_t)((((uint32_t) - x)) >> 31));
9
+}
10
+
11
typedef int16_t coeff_t; // transform coefficient
12
13
#define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
14
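The new x265_signOf above is the branchless sign helper that the SAO loop-filter code previously kept as a local duplicate: the arithmetic shift of x contributes -1 for negative inputs and the logical shift of (uint32_t)-x contributes 1 for positive inputs, so the OR of the two is -1, 0 or 1. A short illustrative check (assumes common.h is included and the helper is visible in the current scope; not code from the source):

    #include <cassert>

    static void check_signOf()
    {
        assert(x265_signOf(7)  == 1);   //  0 | 1
        assert(x265_signOf(-7) == -1);  // -1 | anything
        assert(x265_signOf(0)  == 0);   //  0 | 0
    }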
x265_3.6.tar.gz/source/common/cpu.cpp -> x265_4.0.tar.gz/source/common/cpu.cpp
Changed
45
1
2
#if defined(HAVE_SVE2)
3
{ "SVE2", X265_CPU_SVE2 },
4
#endif
5
+#if defined(HAVE_NEON_DOTPROD)
6
+ { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
7
+#endif
8
+#if defined(HAVE_NEON_I8MM)
9
+ { "Neon_I8MM", X265_CPU_NEON_I8MM },
10
+#endif
11
#elif X265_ARCH_POWER8
12
{ "Altivec", X265_CPU_ALTIVEC },
13
14
15
{
16
int flags = 0;
17
18
- #if defined(HAVE_SVE2)
19
- flags |= X265_CPU_SVE2;
20
- flags |= X265_CPU_SVE;
21
+ #if HAVE_NEON
22
flags |= X265_CPU_NEON;
23
- #elif defined(HAVE_SVE)
24
+ #endif
25
+ #if HAVE_NEON_DOTPROD
26
+ flags |= X265_CPU_NEON_DOTPROD;
27
+ #endif
28
+ #if HAVE_NEON_I8MM
29
+ flags |= X265_CPU_NEON_I8MM;
30
+ #endif
31
+ #if HAVE_SVE
32
flags |= X265_CPU_SVE;
33
- flags |= X265_CPU_NEON;
34
- #elif HAVE_NEON
35
- flags |= X265_CPU_NEON;
36
#endif
37
-
38
+ #if HAVE_SVE2
39
+ flags |= X265_CPU_SVE2;
40
+ #endif
41
+
42
return flags;
43
}
44
45
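The reworked detection above ORs one X265_CPU_* bit per compiled-in extension into a single mask instead of the old if/elif chain, so Neon, DotProd, I8MM, SVE and SVE2 can all be reported independently. A minimal sketch of how such a mask is typically consumed (illustrative only; the real primitive setup lives elsewhere in the encoder):

    // flags is the bitmask returned by the detection code above.
    void selectAarch64Kernels(int flags)
    {
        if (flags & X265_CPU_NEON)         { /* baseline Neon kernels */ }
        if (flags & X265_CPU_NEON_DOTPROD) { /* e.g. the sse_pp kernels in ssd-neon-dotprod.S */ }
        if (flags & X265_CPU_SVE2)         { /* SVE2 kernels */ }
    }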
x265_3.6.tar.gz/source/common/cudata.cpp -> x265_4.0.tar.gz/source/common/cudata.cpp
Changed
201
1
2
m_bFirstRowInSlice = (uint8_t)firstRowInSlice;
3
m_bLastRowInSlice = (uint8_t)lastRowInSlice;
4
m_bLastCuInSlice = (uint8_t)lastCuInSlice;
5
+#if ENABLE_SCC_EXT
6
+ m_lastIntraBCMv[0].set(0, 0);
7
+ m_lastIntraBCMv[1].set(0, 0);
8
+#endif
9
10
/* sequential memsets */
11
m_partSet((uint8_t*)m_qp, (uint8_t)qp);
12
13
}
14
15
// initialize Sub partition
16
-void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp)
17
+void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv[2])
18
{
19
m_absIdxInCTU = cuGeom.absPartIdx;
20
m_encData = ctu.m_encData;
21
22
/* initialize the remaining CU data in one memset */
23
memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 13 : BytesPerPartition - 9) * m_numPartitions);
24
memset(m_distortion, 0, m_numPartitions * sizeof(sse_t));
25
+
26
+#if ENABLE_SCC_EXT
27
+ if (lastIntraBCMv)
28
+ {
29
+ for (int i = 0; i < 2; i++)
30
+ m_lastIntraBCMv[i] = lastIntraBCMv[i];
31
+ }
32
+#endif
33
}
34
35
/* Copy the results of a sub-part (split) CU to the parent CU */
36
37
memcpy(m_trCoeff1 + tmpC2, subCU.m_trCoeff1, sizeof(coeff_t) * tmpC);
38
memcpy(m_trCoeff2 + tmpC2, subCU.m_trCoeff2, sizeof(coeff_t) * tmpC);
39
}
40
+#if ENABLE_SCC_EXT
41
+ for (int i = 0; i < 2; i++)
42
+ m_lastIntraBCMv[i] = subCU.m_lastIntraBCMv[i];
43
+#endif
44
}
45
46
/* If a sub-CU part is not present (off the edge of the picture) its depth and
47
48
return maxNumMergeCand;
49
}
50
}
51
+#if ENABLE_SCC_EXT
52
+ if (m_slice->m_bTemporalMvp)
53
+#else
54
if (m_slice->m_sps->bTemporalMVPEnabled)
55
+#endif
56
{
57
uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
58
MV colmv;
59
60
}
61
}
62
}
63
- int numRefIdx = (isInterB) ? X265_MIN(m_slice->m_numRefIdx0, m_slice->m_numRefIdx1) : m_slice->m_numRefIdx0;
64
+ int numRefIdx0 = m_slice->m_numRefIdx[0];
65
+#if ENABLE_SCC_EXT
66
+ if (m_slice->m_param->bEnableSCC)
67
+ numRefIdx0--;
68
+#endif
69
+ int numRefIdx = (isInterB) ? X265_MIN(numRefIdx0, m_slice->m_numRefIdx[1]) : numRefIdx0;
70
int r = 0;
71
int refcnt = 0;
72
- while (count < maxNumMergeCand)
73
+ while (numRefIdx && (count < maxNumMergeCand))
74
{
75
candDircount = 1;
76
candMvFieldcount0.mv.word = 0;
77
78
}
79
80
// Create the PMV list. Called for each reference index.
81
-int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv) const
82
+int CUData::getPMV(InterNeighbourMV* neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx, uint32_t absPartIdx) const
83
{
84
MV directMVMD_ABOVE_LEFT + 1;
85
MV indirectMVMD_ABOVE_LEFT + 1;
86
bool validDirectMD_ABOVE_LEFT + 1;
87
bool validIndirectMD_ABOVE_LEFT + 1;
88
89
- // Left candidate.
90
- validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
91
- validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
92
- // Top candidate.
93
- validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
94
- validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
95
- validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
96
-
97
- // Left candidate.
98
- validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
99
- validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
100
- // Top candidate.
101
- validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
102
- validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
103
- validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
104
+#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
105
+ if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC)
106
+ {
107
+ // Left candidate.
108
+ if ((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable)
109
+ {
110
+ validIndirectMD_ABOVE_RIGHT = validIndirectMD_ABOVE = validIndirectMD_ABOVE_LEFT = false;
111
+
112
+ validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
113
+ validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
114
+
115
+ validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
116
+ validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
117
+ }
118
+
119
+ // Top candidate.
120
+ validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
121
+ validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
122
+ validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
123
+
124
+ // Top candidate.
125
+ if (!((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable))
126
+ {
127
+ validDirectMD_BELOW_LEFT = validDirectMD_LEFT = validIndirectMD_BELOW_LEFT = validIndirectMD_LEFT = false;
128
+ validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
129
+ validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
130
+ validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
131
+ }
132
+ }
133
+ else
134
+#endif
135
+ {
136
+ // Left candidate.
137
+ validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
138
+ validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
139
+ // Top candidate.
140
+ validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
141
+ validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
142
+ validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
143
+
144
+ // Left candidate.
145
+ validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
146
+ validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
147
+ // Top candidate.
148
+ validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
149
+ validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
150
+ validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
151
+ }
152
153
int num = 0;
154
// Left predictor search
155
156
157
// Get the collocated candidate. At this step, either the first candidate
158
// was found or its value is 0.
159
- if (m_slice->m_sps->bTemporalMVPEnabled && num < 2)
160
+#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT
161
+ if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC)
162
{
163
- int tempRefIdx = neighboursMD_COLLOCATED.refIdxpicList;
164
- if (tempRefIdx != -1)
165
+ if (m_slice->m_bTemporalMvp && num < 2)
166
{
167
- uint32_t cuAddr = neighboursMD_COLLOCATED.cuAddrpicList;
168
- const Frame* colPic = m_slice->m_refFrameListm_slice->isInterB() && !m_slice->m_colFromL0Flagm_slice->m_colRefIdx;
169
- const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
170
+ int refId = refIdx;
171
+ uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;
172
+ uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
173
+ bool isValid;
174
+
175
+ // co-located RightBottom temporal predictor (H)
176
+ int ctuIdx = -1;
177
178
- // Scale the vector
179
- int colRefPOC = colCU->m_slice->m_refPOCListtempRefIdx >> 4tempRefIdx & 0xf;
180
- int colPOC = colCU->m_slice->m_poc;
181
+ // image boundary check
182
+ if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelXpartIdxRB + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples &&
183
+ m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelYpartIdxRB + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
184
+ {
185
+ uint32_t absPartIdxRB = g_zscanToRasterpartIdxRB;
186
+ uint32_t numUnits = s_numPartInCUSize;
187
+ bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1); // is not at the last column of CTU
188
+ bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1); // is not at the last row of CTU
189
190
- int curRefPOC = m_slice->m_refPOCListpicListrefIdx;
191
- int curPOC = m_slice->m_poc;
192
- pmvnumMvc++ = amvpCandnum++ = scaleMvByPOCDist(neighboursMD_COLLOCATED.mvpicList, curPOC, curRefPOC, colPOC, colRefPOC);
193
+ if (bNotLastCol && bNotLastRow)
194
+ {
195
+ absPartAddr = g_rasterToZscanabsPartIdxRB + RASTER_SIZE + 1;
196
+ ctuIdx = m_cuAddr;
197
+ }
198
+ else if (bNotLastCol)
199
+ absPartAddr = g_rasterToZscan(absPartIdxRB + 1) & (numUnits - 1);
200
+ else if (bNotLastRow)
201
x265_3.6.tar.gz/source/common/cudata.h -> x265_4.0.tar.gz/source/common/cudata.h
Changed
79
1
2
class Slice;
3
struct TUEntropyCodingParameters;
4
struct CUDataMemPool;
5
+#if ENABLE_SCC_EXT
6
+struct IBC;
7
+#endif
8
9
enum PartSize
10
{
11
12
// Collocated right bottom CU addr.
13
uint32_t cuAddr2;
14
15
+ bool isAvailable;
16
+
17
// For spatial prediction, this field contains the reference index
18
// in each list (-1 if not available).
19
//
20
21
union { int16_t refIdx2; int32_t unifiedRef; };
22
};
23
24
+struct IBC
25
+{
26
+ int m_numBVs;
27
+ int m_numBV16s;
28
+ MV m_BVs[64];
29
+ MV m_lastIntraBCMv[2];
30
+};
31
+
32
typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32)
33
typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32)
34
35
36
uint32_t* m_collectCUVariance;
37
uint32_t* m_collectCUCount;
38
39
+#if ENABLE_SCC_EXT
40
+ MV m_lastIntraBCMv[2];
41
+#endif
42
+
43
CUData();
44
45
void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance);
46
static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArrayCUGeom::MAX_GEOMS);
47
48
void initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCUInSlice);
49
- void initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp);
50
+ void initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv[2] = 0);
51
void initLosslessCU(const CUData& cu, const CUGeom& cuGeom);
52
53
void copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx);
54
55
int8_t getRefQP(uint32_t currAbsIdxInCTU) const;
56
uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)2, uint8_t* candDir) const;
57
void clipMv(MV& outMV) const;
58
- int getPMV(InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv) const;
59
+ int getPMV(InterNeighbourMV* neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx = 0, uint32_t absPartIdx = 0) const;
60
void getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const;
61
void getIntraTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const;
62
void getInterTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const;
63
64
const CUData* getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
65
const CUData* getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
66
67
+#if ENABLE_SCC_EXT
68
+ void getIntraBCMVPsEncOnly(uint32_t absPartIdx, MV* MvPred, int& nbPred, int puIdx);
69
+ bool getDerivedBV(uint32_t absPartIdx, const MV& currentMv, MV& derivedMv, uint32_t width, uint32_t height);
70
+ bool isIntraBC(const CUData* cu, uint32_t absPartIdx) const;
71
+ bool getColMVPIBC(int ctuRsAddr, int partUnitIdx, MV& rcMv);
72
+ void roundMergeCandidates(MVField(*candMvField)[2], int iCount) const;
73
+ bool is8x8BipredRestriction(MV mvL0, MV mvL1, int iRefIdxL0, int iRefIdxL1) const;
74
+#endif
75
+
76
protected:
77
78
template<typename T>
79
x265_3.6.tar.gz/source/common/dct.cpp -> x265_4.0.tar.gz/source/common/dct.cpp
Changed
100
1
2
}
3
}
4
5
-static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
6
+namespace X265_NS {
7
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
8
{
9
const int shift_1st = 1 + X265_DEPTH - 8;
10
const int shift_2nd = 8;
11
12
fastForwardDst(coef, dst, shift_2nd);
13
}
14
15
-static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
16
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
17
{
18
const int shift_1st = 1 + X265_DEPTH - 8;
19
const int shift_2nd = 8;
20
21
partialButterfly4(coef, dst, shift_2nd, 4);
22
}
23
24
-static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
25
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
26
{
27
const int shift_1st = 2 + X265_DEPTH - 8;
28
const int shift_2nd = 9;
29
30
partialButterfly8(coef, dst, shift_2nd, 8);
31
}
32
33
-static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
34
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
35
{
36
const int shift_1st = 3 + X265_DEPTH - 8;
37
const int shift_2nd = 10;
38
39
partialButterfly16(coef, dst, shift_2nd, 16);
40
}
41
42
-static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
43
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
44
{
45
const int shift_1st = 4 + X265_DEPTH - 8;
46
const int shift_2nd = 11;
47
48
partialButterfly32(coef, dst, shift_2nd, 32);
49
}
50
51
-static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
52
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
53
{
54
const int shift_1st = 7;
55
const int shift_2nd = 12 - (X265_DEPTH - 8);
56
57
}
58
}
59
60
-static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
61
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
62
{
63
const int shift_1st = 7;
64
const int shift_2nd = 12 - (X265_DEPTH - 8);
65
66
}
67
}
68
69
-static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
70
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
71
{
72
const int shift_1st = 7;
73
const int shift_2nd = 12 - (X265_DEPTH - 8);
74
75
}
76
}
77
78
-static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
79
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
80
{
81
const int shift_1st = 7;
82
const int shift_2nd = 12 - (X265_DEPTH - 8);
83
84
}
85
}
86
87
-static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
88
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
89
{
90
const int shift_1st = 7;
91
const int shift_2nd = 12 - (X265_DEPTH - 8);
92
93
memcpy(&dsti * dstStride, &blocki * 32, 32 * sizeof(int16_t));
94
}
95
}
96
+} // namespace X265_NS
97
98
static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
99
{
100
x265_3.6.tar.gz/source/common/deblock.cpp -> x265_4.0.tar.gz/source/common/deblock.cpp
Changed
19
1
2
3
void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength)
4
{
5
- PicYuv* reconPic = cuQ->m_encData->m_reconPic;
6
+ PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
7
pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
8
intptr_t stride = reconPic->m_stride;
9
const PPS* pps = cuQ->m_slice->m_pps;
10
11
: ((g_zscanToPelYabsPartIdx + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
12
"invalid edge\n");
13
14
- PicYuv* reconPic = cuQ->m_encData->m_reconPic;
15
+ PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
16
intptr_t stride = reconPic->m_strideC;
17
intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
18
bool bCheckNoFilter = pps->bTransquantBypassEnabled;
19
x265_3.6.tar.gz/source/common/frame.cpp -> x265_4.0.tar.gz/source/common/frame.cpp
Changed
147
1
2
m_reconColCount = NULL;
3
m_countRefEncoders = 0;
4
m_encData = NULL;
5
- m_reconPic = NULL;
6
+ for (int i = 0; i < NUM_RECON_VERSION; i++)
7
+ m_reconPic[i] = NULL;
8
m_quantOffsets = NULL;
9
m_next = NULL;
10
m_prev = NULL;
11
12
13
m_tempLayer = 0;
14
m_sameLayerRefPic = false;
15
+
16
+ m_viewId = 0;
17
+ m_valid = 0;
18
+ m_nextSubDPB = NULL;
19
+ m_prevSubDPB = NULL;
20
}
21
22
bool Frame::create(x265_param *param, float* quantOffsets)
23
24
if (m_param->bEnableTemporalFilter)
25
{
26
m_mcstf = new TemporalFilter;
27
+ m_mcstf->m_range = param->mcstfFrameRange;
28
m_mcstf->init(param);
29
30
m_fencPicSubsampled2 = new PicYuv;
31
32
bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
33
{
34
m_encData = new FrameData;
35
- m_reconPic = new PicYuv;
36
m_param = param;
37
- m_encData->m_reconPic = m_reconPic;
38
- bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param);
39
+ for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
40
+ {
41
+ m_reconPic[i] = new PicYuv;
42
+ m_encData->m_reconPic[i] = m_reconPic[i];
43
+ }
44
+ bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic[0]->create(param) && (param->bEnableSCC ? (param->bEnableSCC && m_reconPic[1]->create(param)) : 1);
45
if (ok)
46
{
47
- /* initialize right border of m_reconpicYuv as SAO may read beyond the
48
+ /* initialize right border of m_reconPicYuv as SAO may read beyond the
49
* end of the picture accessing uninitialized pixels */
50
int maxHeight = sps.numCuInHeight * param->maxCUSize;
51
- memset(m_reconPic->m_picOrg0, 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
52
+ memset(m_reconPic[0]->m_picOrg[0], 0, sizeof(pixel)* m_reconPic[0]->m_stride * maxHeight);
53
54
- /* use pre-calculated cu/pu offsets cached in the SPS structure */
55
- m_reconPic->m_cuOffsetY = sps.cuOffsetY;
56
- m_reconPic->m_buOffsetY = sps.buOffsetY;
57
-
58
- if (param->internalCsp != X265_CSP_I400)
59
+ for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
60
{
61
- memset(m_reconPic->m_picOrg1, 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
62
- memset(m_reconPic->m_picOrg2, 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
63
-
64
/* use pre-calculated cu/pu offsets cached in the SPS structure */
65
- m_reconPic->m_cuOffsetC = sps.cuOffsetC;
66
- m_reconPic->m_buOffsetC = sps.buOffsetC;
67
+ m_reconPici->m_cuOffsetY = sps.cuOffsetY;
68
+ m_reconPici->m_buOffsetY = sps.buOffsetY;
69
+
70
+ if (param->internalCsp != X265_CSP_I400)
71
+ {
72
+ memset(m_reconPici->m_picOrg1, 0, sizeof(pixel) * m_reconPici->m_strideC * (maxHeight >> m_reconPici->m_vChromaShift));
73
+ memset(m_reconPici->m_picOrg2, 0, sizeof(pixel) * m_reconPici->m_strideC * (maxHeight >> m_reconPici->m_vChromaShift));
74
+
75
+ /* use pre-calculated cu/pu offsets cached in the SPS structure */
76
+ m_reconPici->m_cuOffsetC = sps.cuOffsetC;
77
+ m_reconPici->m_buOffsetC = sps.buOffsetC;
78
+ }
79
}
80
}
81
return ok;
82
83
void Frame::reinit(const SPS& sps)
84
{
85
m_bChromaExtended = false;
86
- m_reconPic = m_encData->m_reconPic;
87
+ for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
88
+ m_reconPic[i] = m_encData->m_reconPic[i];
89
m_encData->reinit(sps);
90
}
91
92
93
m_encData = NULL;
94
}
95
96
+#if ENABLE_MULTIVIEW
97
+ //Destroy interlayer References
98
+ if (refPicSetInterLayer0.size())
99
+ {
100
+ Frame* iterFrame = refPicSetInterLayer0.first();
101
+
102
+ while (iterFrame)
103
+ {
104
+ Frame* curFrame = iterFrame;
105
+ iterFrame = iterFrame->m_nextSubDPB;
106
+ refPicSetInterLayer0.removeSubDPB(*curFrame);
107
+ iterFrame = refPicSetInterLayer0.first();
108
+ }
109
+ }
110
+
111
+ if (refPicSetInterLayer1.size())
112
+ {
113
+ Frame* iterFrame = refPicSetInterLayer1.first();
114
+
115
+ while (iterFrame)
116
+ {
117
+ Frame* curFrame = iterFrame;
118
+ iterFrame = iterFrame->m_nextSubDPB;
119
+ refPicSetInterLayer1.removeSubDPB(*curFrame);
120
+ iterFrame = refPicSetInterLayer1.first();
121
+ }
122
+ }
123
+#endif
124
+
125
if (m_fencPic)
126
{
127
if (m_param->bCopyPicToFrame)
128
129
X265_FREE(m_isSubSampled);
130
}
131
132
- if (m_reconPic)
133
+ for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
134
{
135
- m_reconPic->destroy();
136
- delete m_reconPic;
137
- m_reconPic = NULL;
138
+ if (m_reconPic[i])
139
+ {
140
+ m_reconPic[i]->destroy();
141
+ delete m_reconPic[i];
142
+ m_reconPic[i] = NULL;
143
+ }
144
}
145
146
if (m_reconRowFlag)
147
x265_3.6.tar.gz/source/common/frame.h -> x265_4.0.tar.gz/source/common/frame.h
Changed
33
1
2
/* These two items will be NULL until the Frame begins to be encoded, at which point
3
* it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
4
FrameData* m_encData;
5
- PicYuv* m_reconPic;
6
+ PicYuv* m_reconPic[NUM_RECON_VERSION];
7
8
/* Data associated with x265_picture */
9
PicYuv* m_fencPic;
10
PicYuv* m_fencPicSubsampled2;
11
PicYuv* m_fencPicSubsampled4;
12
13
+ PicList refPicSetInterLayer0;
14
+ PicList refPicSetInterLayer1;
15
+
16
int m_poc;
17
int m_encodeOrder;
18
int m_gopOffset;
19
20
int8_t m_gopId;
21
bool m_sameLayerRefPic;
22
23
+ int m_sLayerId;
24
+ bool m_valid;
25
+
26
+ int m_viewId;
27
+ Frame* m_nextSubDPB; // PicList doubly linked list pointers
28
+ Frame* m_prevSubDPB;
29
+
30
Frame();
31
32
bool create(x265_param *param, float* quantOffsets);
33
x265_3.6.tar.gz/source/common/framedata.h -> x265_4.0.tar.gz/source/common/framedata.h
Changed
10
1
2
const x265_param* m_param;
3
4
FrameData* m_freeListNext;
5
- PicYuv* m_reconPic;
6
+ PicYuv* m_reconPic[NUM_RECON_VERSION];
7
bool m_bHasReferences; /* used during DPB/RPS updates */
8
int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */
9
JobProvider* m_jobProvider;
10
x265_3.6.tar.gz/source/common/ipfilter.cpp -> x265_4.0.tar.gz/source/common/ipfilter.cpp
Changed
23
1
2
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
3
#endif
4
5
-namespace {
6
-// file local namespace
7
+namespace X265_NS {
8
+// x265 private namespace
9
10
template<int width, int height>
11
void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
12
13
interp_horiz_ps_c<N, width, height>(src, srcStride, immed, width, idxX, 1);
14
filterVertical_sp_c<N>(immed + (N / 2 - 1) * width, width, dst, dstStride, width, height, idxY);
15
}
16
-}
17
-
18
-namespace X265_NS {
19
-// x265 private namespace
20
21
#define CHROMA_420(W, H) \
22
p.chromaX265_CSP_I420.puCHROMA_420_ ## W ## x ## H.filter_hpp = interp_horiz_pp_c<4, W, H>; \
23
x265_3.6.tar.gz/source/common/loopfilter.cpp -> x265_4.0.tar.gz/source/common/loopfilter.cpp
Changed
55
1
2
3
namespace {
4
5
-/* get the sign of input variable (TODO: this is a dup, make common) */
6
-inline int8_t signOf(int x)
7
-{
8
- return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
9
-}
10
-
11
static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
12
{
13
for (int x = 0; x < endX; x++)
14
- dstx = signOf(src1x - src2x);
15
+ dst[x] = x265_signOf(src1[x] - src2[x]);
16
}
17
18
static void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
19
20
21
for (x = 0; x < width; x++)
22
{
23
- signDown = signOf(recx - recx + stride);
24
+ signDown = x265_signOf(rec[x] - rec[x + stride]);
25
edgeType = signDown + upBuff1x + 2;
26
upBuff1x = -signDown;
27
recx = x265_clip(recx + offsetEoedgeType);
28
29
{
30
for (x = 0; x < width; x++)
31
{
32
- signDown = signOf(recx - recx + stride);
33
+ signDown = x265_signOf(rec[x] - rec[x + stride]);
34
edgeType = signDown + upBuff1x + 2;
35
upBuff1x = -signDown;
36
recx = x265_clip(recx + offsetEoedgeType);
37
38
int x;
39
for (x = 0; x < width; x++)
40
{
41
- int8_t signDown = signOf(recx - recx + stride + 1);
42
+ int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
43
int edgeType = signDown + buff1x + 2;
44
bufftx + 1 = -signDown;
45
recx = x265_clip(recx + offsetEoedgeType);;
46
47
48
for (int x = startX + 1; x < endX; x++)
49
{
50
- signDown = signOf(recx - recx + stride);
51
+ signDown = x265_signOf(rec[x] - rec[x + stride]);
52
edgeType = signDown + upBuff1x + 2;
53
upBuff1x - 1 = -signDown;
54
recx = x265_clip(recx + offsetEoedgeType);
55
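The loopfilter hunk above drops the file-local signOf() in favour of a shared x265_signOf(). A minimal stand-alone sketch of that branchless sign helper, taken from the removed lines (assumes nothing beyond standard C++):

    #include <cstdint>
    #include <cassert>

    // Branchless sign: returns -1, 0 or +1. Mirrors the helper removed above
    // (now shared as x265_signOf in x265's common code).
    static inline int8_t signOf(int x)
    {
        return (int8_t)((x >> 31) | ((int)(((uint32_t)-x) >> 31)));
    }

    int main()
    {
        assert(signOf(-7) == -1 && signOf(0) == 0 && signOf(42) == 1);
        return 0;
    }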
x265_3.6.tar.gz/source/common/lowpassdct.cpp -> x265_4.0.tar.gz/source/common/lowpassdct.cpp
Changed
28
1
2
}
3
4
// replace first coef with total block average
5
- dst[0] = totalSum << 1;
6
+ dst[0] = (X265_DEPTH == 8) ? (totalSum << 1) : (totalSum >> ((X265_DEPTH - 9)));
7
}
8
9
static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
10
11
{
12
memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t));
13
}
14
- dst[0] = static_cast<int16_t>(totalSum >> 1);
15
+ dst[0] = static_cast<int16_t>(totalSum >> (1 + (X265_DEPTH - 8)));
16
}
17
18
static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
19
20
{
21
memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t));
22
}
23
- dst[0] = static_cast<int16_t>(totalSum >> 3);
24
+ dst[0] = static_cast<int16_t>(totalSum >> (3 + (X265_DEPTH - 8)));
25
}
26
27
namespace X265_NS {
28
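The lowpassdct hunks change only the scaling of the DC replacement: the shift on the block sum now depends on X265_DEPTH (this is the "Fix lowpass DCT for high bit depth" entry in the changelog). A small self-contained sketch of that arithmetic, with the relation inferred from the diff rather than from the full DCT derivation:

    #include <cassert>

    // Depth-dependent DC scaling from the hunk above: at higher bit depths the
    // extra (depth - 8) bits of headroom have to be removed from the block sum.
    static int lowPassDC8x8(int totalSum, int depth)
    {
        return (depth == 8) ? (totalSum << 1) : (totalSum >> (depth - 9));
    }

    static int lowPassDC16x16(int totalSum, int depth)
    {
        return totalSum >> (1 + (depth - 8));
    }

    int main()
    {
        // At 8 bits the 16x16 path divides the sum by 2, at 10 bits by 8.
        assert(lowPassDC16x16(256, 8)  == 128);
        assert(lowPassDC16x16(256, 10) == 32);
        assert(lowPassDC8x8(100, 10)   == 50);
        return 0;
    }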
x265_3.6.tar.gz/source/common/param.cpp -> x265_4.0.tar.gz/source/common/param.cpp
Changed
201
1
2
param->bEnableSceneCutAwareQp = 0;
3
param->fwdMaxScenecutWindow = 1200;
4
param->bwdMaxScenecutWindow = 600;
5
+ param->mcstfFrameRange = 2;
6
for (int i = 0; i < 6; i++)
7
{
8
int deltas[6] = { 5, 4, 3, 2, 1, 0 };
9
10
param->bEnableTemporalFilter = 0;
11
param->temporalFilterStrength = 0.95;
12
13
+ /*Alpha Channel Encoding*/
14
+ param->bEnableAlpha = 0;
15
+ param->numScalableLayers = 1;
16
+
17
#ifdef SVT_HEVC
18
param->svtHevcParam = svtParam;
19
svt_param_default(param);
20
21
/* Film grain characteristics model filename */
22
param->filmGrain = NULL;
23
param->bEnableSBRC = 0;
24
+
25
+ /* Multi-View Encoding*/
26
+ param->numViews = 1;
27
+ param->format = 0;
28
+
29
+ param->numLayers = 1;
30
+
31
+ /* SCC */
32
+ param->bEnableSCC = 0;
33
}
34
35
int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
36
37
38
if (!strcmp(preset, "ultrafast"))
39
{
40
+ param->mcstfFrameRange = 1;
41
param->maxNumMergeCand = 2;
42
param->bIntraInBFrames = 0;
43
param->lookaheadDepth = 5;
44
45
}
46
else if (!strcmp(preset, "superfast"))
47
{
48
+ param->mcstfFrameRange = 1;
49
param->maxNumMergeCand = 2;
50
param->bIntraInBFrames = 0;
51
param->lookaheadDepth = 10;
52
53
}
54
else if (!strcmp(preset, "veryfast"))
55
{
56
+ param->mcstfFrameRange = 1;
57
param->maxNumMergeCand = 2;
58
param->limitReferences = 3;
59
param->bIntraInBFrames = 0;
60
61
}
62
else if (!strcmp(preset, "faster"))
63
{
64
+ param->mcstfFrameRange = 1;
65
param->maxNumMergeCand = 2;
66
param->limitReferences = 3;
67
param->bIntraInBFrames = 0;
68
69
}
70
else if (!strcmp(preset, "fast"))
71
{
72
+ param->mcstfFrameRange = 1;
73
param->maxNumMergeCand = 2;
74
param->limitReferences = 3;
75
param->bEnableEarlySkip = 0;
76
77
}
78
else if (!strcmp(preset, "medium"))
79
{
80
+ param->mcstfFrameRange = 1;
81
/* defaults */
82
}
83
else if (!strcmp(preset, "slow"))
84
85
OPT("film-grain") p->filmGrain = (char* )value;
86
OPT("mcstf") p->bEnableTemporalFilter = atobool(value);
87
OPT("sbrc") p->bEnableSBRC = atobool(value);
88
+#if ENABLE_ALPHA
89
+ OPT("alpha")
90
+ {
91
+ if (atobool(value))
92
+ {
93
+ p->bEnableAlpha = 1;
94
+ p->numScalableLayers = 2;
95
+ p->numLayers = 2;
96
+ }
97
+ }
98
+#endif
99
+#if ENABLE_MULTIVIEW
100
+ OPT("format")
101
+ p->format = atoi(value);
102
+ OPT("num-views")
103
+ {
104
+ p->numViews = atoi(value);
105
+ }
106
+#endif
107
+#if ENABLE_SCC_EXT
108
+ OPT("scc")
109
+ {
110
+ p->bEnableSCC = atoi(value);
111
+ if (p->bEnableSCC)
112
+ p->bEnableWeightedPred = false;
113
+ }
114
+#endif
115
else
116
return X265_PARAM_BAD_NAME;
117
}
118
119
CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f,
120
"Minimum edge density percentage for a CU should be an integer between 0 to 100");
121
}
122
- CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead,
123
+ CHECK(param->bframes && (param->bEnableTemporalFilter ? (param->bframes > param->lookaheadDepth) : (param->bframes >= param->lookaheadDepth)) && !param->rc.bStatRead,
124
"Lookahead depth must be greater than the max consecutive bframe count");
125
CHECK(param->bframes < 0,
126
"bframe count should be greater than zero");
127
128
}
129
}
130
CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. It must be one of the X265_DATA_SHARE_MODES enum values\n" );
131
+#if ENABLE_ALPHA
132
+ if (param->bEnableAlpha)
133
+ {
134
+ CHECK((param->internalCsp != X265_CSP_I420), "Alpha encode supported only with i420a colorspace");
135
+ CHECK((param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Alpha encode doesnot support multipass feature");
136
+ }
137
+#endif
138
+#if ENABLE_MULTIVIEW
139
+ CHECK((param->numViews > 2), "Multi-View Encoding currently support only 2 views");
140
+ CHECK((param->numViews > 1) && (param->internalBitDepth != 8), "BitDepthConstraint must be 8 for Multiview main profile");
141
+ CHECK((param->numViews > 1) && (param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Multiview encode doesnot support multipass feature");
142
+#endif
143
+#if ENABLE_SCC_EXT
144
+ CHECK(!!param->bEnableSCC&& param->rdLevel != 6, "Enabling scc extension in x265 requires rdlevel of 6 ");
145
+#endif
146
return check_failed;
147
}
148
149
150
TOOLOPT(param->rc.bStatWrite, "stats-write");
151
TOOLOPT(param->rc.bStatRead, "stats-read");
152
TOOLOPT(param->bSingleSeiNal, "single-sei");
153
+#if ENABLE_ALPHA
154
+ TOOLOPT(param->numScalableLayers > 1, "alpha");
155
+#endif
156
+#if ENABLE_MULTIVIEW
157
+ TOOLOPT(param->numViews > 1, "multi-view");
158
+#endif
159
#if ENABLE_HDR10_PLUS
160
TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
161
#endif
162
163
if (p->filmGrain)
164
s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename
165
BOOL(p->bEnableTemporalFilter, "mcstf");
166
+#if ENABLE_ALPHA
167
+ BOOL(p->bEnableAlpha, "alpha");
168
+#endif
169
+#if ENABLE_MULTIVIEW
170
+ s += sprintf(s, " num-views=%d", p->numViews);
171
+ s += sprintf(s, " format=%d", p->format);
172
+#endif
173
+#if ENABLE_SCC_EXT
174
+ s += sprintf(s, "scc=%d", p->bEnableSCC);
175
+#endif
176
BOOL(p->bEnableSBRC, "sbrc");
177
#undef BOOL
178
return buf;
179
180
181
void x265_copy_params(x265_param* dst, x265_param* src)
182
{
183
+ dst->mcstfFrameRange = src->mcstfFrameRange;
184
dst->cpuid = src->cpuid;
185
dst->frameNumThreads = src->frameNumThreads;
186
if (src->numaPools) dst->numaPools = strdup(src->numaPools);
187
188
dst->confWinRightOffset = src->confWinRightOffset;
189
dst->confWinBottomOffset = src->confWinBottomOffset;
190
dst->bliveVBV2pass = src->bliveVBV2pass;
191
+#if ENABLE_ALPHA
192
+ dst->bEnableAlpha = src->bEnableAlpha;
193
+ dst->numScalableLayers = src->numScalableLayers;
194
+#endif
195
+#if ENABLE_MULTIVIEW
196
+ dst->numViews = src->numViews;
197
+ dst->format = src->format;
198
+#endif
199
+ dst->numLayers = src->numLayers;
200
+#if ENABLE_SCC_EXT
201
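The param.cpp hunk above wires the new "alpha", "num-views"/"format" and "scc" keys into the option parser. A minimal usage sketch against the public API follows; it assumes an x265 4.0 build configured with ENABLE_ALPHA, ENABLE_MULTIVIEW and ENABLE_SCC_EXT, otherwise these keys fall through to X265_PARAM_BAD_NAME exactly as shown in the hunk:

    #include <x265.h>
    #include <cstdio>

    int main()
    {
        x265_param* p = x265_param_alloc();
        x265_param_default_preset(p, "medium", NULL);

        if (x265_param_parse(p, "alpha", "1") != 0)
            std::fprintf(stderr, "alpha not enabled in this build\n");
        if (x265_param_parse(p, "num-views", "2") != 0)
            std::fprintf(stderr, "multiview not enabled in this build\n");
        if (x265_param_parse(p, "scc", "1") != 0)
            std::fprintf(stderr, "SCC not enabled in this build\n");

        x265_param_free(p);
        return 0;
    }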
x265_3.6.tar.gz/source/common/piclist.cpp -> x265_4.0.tar.gz/source/common/piclist.cpp
Changed
160
1
2
m_count++;
3
}
4
5
+#if ENABLE_MULTIVIEW
6
+Frame* PicList::popFrontSubDPB()
7
+{
8
+ if (m_start)
9
+ {
10
+ Frame* temp = m_start;
11
+ m_count--;
12
+
13
+ if (m_count)
14
+ {
15
+ m_start = m_start->m_nextSubDPB;
16
+ m_start->m_prevSubDPB = NULL;
17
+ }
18
+ else
19
+ {
20
+ m_start = m_end = NULL;
21
+ }
22
+ temp->m_next = temp->m_prev = NULL;
23
+ return temp;
24
+ }
25
+ else
26
+ return NULL;
27
+}
28
+
29
+void PicList::pushBackSubDPB(Frame& curFrame)
30
+{
31
+ X265_CHECK(!curFrame.m_nextSubDPB && !curFrame.m_prevSubDPB, "piclist: picture already in Sub DPB list\n"); // ensure frame is not in a list
32
+ curFrame.m_nextSubDPB = NULL;
33
+ curFrame.m_prevSubDPB = m_end;
34
+
35
+ if (m_count)
36
+ {
37
+ m_end->m_nextSubDPB = &curFrame;
38
+ m_end = &curFrame;
39
+ }
40
+ else
41
+ {
42
+ m_start = m_end = &curFrame;
43
+ }
44
+ m_count++;
45
+}
46
+
47
+void PicList::removeSubDPB(Frame& curFrame)
48
+{
49
+#if _DEBUG
50
+ Frame* tmp = m_start;
51
+ while (tmp && tmp != &curFrame)
52
+ {
53
+ tmp = tmp->m_nextSubDPB;
54
+ }
55
+
56
+ X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list
57
+#endif
58
+
59
+ m_count--;
60
+ if (m_count)
61
+ {
62
+ if (m_start == &curFrame)
63
+ m_start = curFrame.m_nextSubDPB;
64
+ if (m_end == &curFrame)
65
+ m_end = curFrame.m_prevSubDPB;
66
+
67
+ if (curFrame.m_nextSubDPB)
68
+ curFrame.m_nextSubDPB->m_prevSubDPB = curFrame.m_prevSubDPB;
69
+ if (curFrame.m_prevSubDPB)
70
+ curFrame.m_prevSubDPB->m_nextSubDPB = curFrame.m_nextSubDPB;
71
+ }
72
+ else
73
+ {
74
+ m_start = m_end = NULL;
75
+ }
76
+
77
+ curFrame.m_nextSubDPB = curFrame.m_prevSubDPB = NULL;
78
+}
79
+#endif
80
+
81
void PicList::pushBackMCSTF(Frame& curFrame)
82
{
83
X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
84
85
return NULL;
86
}
87
88
-Frame* PicList::getPOC(int poc)
89
+Frame* PicList::getPOC(int poc, int sLayerId)
90
{
91
Frame *curFrame = m_start;
92
- while (curFrame && curFrame->m_poc != poc)
93
+ int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
94
+ while (curFrame && (curFrame->m_poc != poc || layer != sLayerId))
95
+ {
96
curFrame = curFrame->m_next;
97
+ if(curFrame)
98
+ layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
99
+ }
100
return curFrame;
101
}
102
103
104
return NULL;
105
}
106
107
-Frame* PicList::getCurFrame(void)
108
+Frame* PicList::getCurFrame(int sLayer)
109
{
110
Frame *curFrame = m_start;
111
- if (curFrame != NULL)
112
+ int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
113
+ if (layer == sLayer && curFrame != NULL)
114
return curFrame;
115
else
116
return NULL;
117
118
curFrame.m_next = curFrame.m_prev = NULL;
119
}
120
121
+
122
+Frame* PicList::removeFrame(Frame& curFrame)
123
+{
124
+ Frame* tmp = &curFrame;
125
+#if _DEBUG
126
+ tmp = m_start;
127
+ while (tmp && tmp != &curFrame)
128
+ {
129
+ tmp = tmp->m_next;
130
+ }
131
+
132
+ X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list
133
+#endif
134
+
135
+ m_count--;
136
+ if (m_count)
137
+ {
138
+ if (m_start == &curFrame)
139
+ m_start = curFrame.m_next;
140
+ if (m_end == &curFrame)
141
+ m_end = curFrame.m_prev;
142
+
143
+ if (curFrame.m_next)
144
+ curFrame.m_next->m_prev = curFrame.m_prev;
145
+ if (curFrame.m_prev)
146
+ curFrame.m_prev->m_next = curFrame.m_next;
147
+ }
148
+ else
149
+ {
150
+ m_start = m_end = NULL;
151
+ }
152
+
153
+ curFrame.m_next = curFrame.m_prev = NULL;
154
+ return tmp;
155
+}
156
+
157
void PicList::removeMCSTF(Frame& curFrame)
158
{
159
#if _DEBUG
160
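The piclist.cpp additions above are ordinary head/tail doubly linked list operations over the new sub-DPB pointers. A simplified, self-contained model of the same link/unlink logic (stub types and illustrative names only, not the x265 ones):

    #include <cassert>
    #include <cstddef>

    struct Node { Node* prev = nullptr; Node* next = nullptr; };

    struct List
    {
        Node* head = nullptr; Node* tail = nullptr; int count = 0;

        void pushBack(Node& n)                 // same shape as pushBackSubDPB()
        {
            n.next = nullptr; n.prev = tail;
            if (count) tail->next = &n; else head = &n;
            tail = &n; count++;
        }
        void remove(Node& n)                   // same shape as removeSubDPB()
        {
            if (n.prev) n.prev->next = n.next; else head = n.next;
            if (n.next) n.next->prev = n.prev; else tail = n.prev;
            n.prev = n.next = nullptr; count--;
        }
    };

    int main()
    {
        List l; Node a, b, c;
        l.pushBack(a); l.pushBack(b); l.pushBack(c);
        l.remove(b);                           // unlink from the middle
        assert(l.count == 2 && a.next == &c && c.prev == &a);
        return 0;
    }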
x265_3.6.tar.gz/source/common/piclist.h -> x265_4.0.tar.gz/source/common/piclist.h
Changed
45
1
2
/** Push picture to end of the list */
3
void pushBack(Frame& pic);
4
void pushBackMCSTF(Frame& pic);
5
+#if ENABLE_MULTIVIEW
6
+ void pushBackSubDPB(Frame& pic);
7
+#endif
8
9
/** Push picture to beginning of the list */
10
void pushFront(Frame& pic);
11
void pushFrontMCSTF(Frame& pic);
12
+#if ENABLE_MULTIVIEW
13
+ Frame* popFrontSubDPB();
14
+#endif
15
16
/** Pop picture from end of the list */
17
Frame* popBack();
18
19
Frame* popFront();
20
21
/** Find frame with specified POC */
22
- Frame* getPOC(int poc);
23
+ Frame* getPOC(int poc, int sLayerId = 0);
24
/* Find next MCSTF frame with specified POC */
25
Frame* getPOCMCSTF(int poc);
26
27
/** Get the current Frame from the list **/
28
- Frame* getCurFrame(void);
29
+ Frame* getCurFrame(int sLayer);
30
31
/** Remove picture from list */
32
void remove(Frame& pic);
33
+
34
+ /** Remove picture from list */
35
+ Frame* removeFrame(Frame& pic);
36
/* Remove MCSTF picture from list */
37
void removeMCSTF(Frame& pic);
38
+#if ENABLE_MULTIVIEW
39
+ /** Remove picture from Sub list */
40
+ void removeSubDPB(Frame& pic);
41
+#endif
42
43
Frame* first() { return m_start; }
44
45
x265_3.6.tar.gz/source/common/picyuv.cpp -> x265_4.0.tar.gz/source/common/picyuv.cpp
Changed
201
1
2
3
/* Copy pixels from an x265_picture into internal PicYuv instance.
4
* Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
5
-void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
6
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady, bool isBase)
7
{
8
/* m_picWidth is the width that is being encoded, padx indicates how many
9
* of those pixels are padding to reach multiple of MinCU(4) size.
10
11
#else /* Case for (X265_DEPTH == 8) */
12
// TODO: Does we need this path? may merge into above in future
13
{
14
- pixel *yPixel = m_picOrg[0];
15
- uint8_t *yChar = (uint8_t*)pic.planes[0];
16
-
17
- for (int r = 0; r < height; r++)
18
+ if (isBase || param.numViews > 1)
19
{
20
- memcpy(yPixel, yChar, width * sizeof(pixel));
21
+ int offsetX, offsetY;
22
+ offsetX = (!isBase && pic.format == 1 ? width : 0);
23
+ offsetY = (!isBase && pic.format == 2 ? pic.stride[0] * height : 0);
24
+ pixel *yPixel = m_picOrg[0];
25
+ uint8_t* yChar = (uint8_t*)pic.planes[0] + offsetX + offsetY;
26
27
- yPixel += m_stride;
28
- yChar += pic.stride0 / sizeof(*yChar);
29
- }
30
+ for (int r = 0; r < height; r++)
31
+ {
32
+ memcpy(yPixel, yChar, width * sizeof(pixel));
33
34
- if (param.internalCsp != X265_CSP_I400)
35
+ yPixel += m_stride;
36
+ yChar += pic.stride0 / sizeof(*yChar);
37
+ }
38
+
39
+ if (param.internalCsp != X265_CSP_I400)
40
+ {
41
+ offsetX = offsetX >> m_hChromaShift;
42
+ int offsetYU = (!isBase && pic.format == 2 ? pic.stride1 * (height >> m_vChromaShift) : 0);
43
+ int offsetYV = (!isBase && pic.format == 2 ? pic.stride2 * (height >> m_vChromaShift) : 0);
44
+
45
+ pixel *uPixel = m_picOrg1;
46
+ pixel *vPixel = m_picOrg2;
47
+
48
+ uint8_t* uChar = (uint8_t*)pic.planes1 + offsetX + offsetYU;
49
+ uint8_t* vChar = (uint8_t*)pic.planes2 + offsetX + offsetYV;
50
+
51
+ for (int r = 0; r < height >> m_vChromaShift; r++)
52
+ {
53
+ memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
54
+ memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
55
+
56
+ uPixel += m_strideC;
57
+ vPixel += m_strideC;
58
+ uChar += pic.stride1 / sizeof(*uChar);
59
+ vChar += pic.stride2 / sizeof(*vChar);
60
+ }
61
+ }
62
+ }
63
+#if ENABLE_ALPHA
64
+ if (!isBase && param.bEnableAlpha)
65
{
66
- pixel *uPixel = m_picOrg[1];
67
- pixel *vPixel = m_picOrg[2];
68
+ pixel* aPixel = m_picOrg[0];
69
+ uint8_t* aChar = (uint8_t*)pic.planes[3];
70
71
- uint8_t *uChar = (uint8_t*)pic.planes1;
72
- uint8_t *vChar = (uint8_t*)pic.planes2;
73
+ for (int r = 0; r < height; r++)
74
+ {
75
+ memcpy(aPixel, aChar, width * sizeof(pixel));
76
+
77
+ aPixel += m_stride;
78
+ aChar += pic.stride0 / sizeof(*aChar);
79
+ }
80
+
81
+ pixel* uPixel = m_picOrg1;
82
+ pixel* vPixel = m_picOrg2;
83
84
for (int r = 0; r < height >> m_vChromaShift; r++)
85
{
86
- memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
87
- memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
88
+ memset(uPixel, 128, (width >> m_hChromaShift) * sizeof(pixel));
89
+ memset(vPixel, 128, (width >> m_hChromaShift) * sizeof(pixel));
90
91
uPixel += m_strideC;
92
vPixel += m_strideC;
93
- uChar += pic.stride1 / sizeof(*uChar);
94
- vChar += pic.stride2 / sizeof(*vChar);
95
}
96
}
97
+#endif
98
}
99
#endif /* (X265_DEPTH > 8) */
100
}
101
else /* pic.bitDepth > 8 */
102
{
103
/* defensive programming, mask off bits that are supposed to be zero */
104
- uint16_t mask = (1 << X265_DEPTH) - 1;
105
- int shift = abs(pic.bitDepth - X265_DEPTH);
106
- pixel *yPixel = m_picOrg0;
107
+ if (isBase)
108
+ {
109
+ uint16_t mask = (1 << X265_DEPTH) - 1;
110
+ int shift = abs(pic.bitDepth - X265_DEPTH);
111
+ pixel* yPixel = m_picOrg0;
112
113
- uint16_t *yShort = (uint16_t*)pic.planes0;
114
+ uint16_t* yShort = (uint16_t*)pic.planes0;
115
116
- if (pic.bitDepth > X265_DEPTH)
117
- {
118
- /* shift right and mask pixels to final size */
119
- primitives.planecopy_sp(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
120
- }
121
- else /* Case for (pic.bitDepth <= X265_DEPTH) */
122
- {
123
- /* shift left and mask pixels to final size */
124
- primitives.planecopy_sp_shl(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
125
- }
126
+ if (pic.bitDepth > X265_DEPTH)
127
+ {
128
+ /* shift right and mask pixels to final size */
129
+ primitives.planecopy_sp(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
130
+ }
131
+ else /* Case for (pic.bitDepth <= X265_DEPTH) */
132
+ {
133
+ /* shift left and mask pixels to final size */
134
+ primitives.planecopy_sp_shl(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
135
+ }
136
137
- if (param.internalCsp != X265_CSP_I400)
138
+ if (param.internalCsp != X265_CSP_I400)
139
+ {
140
+ pixel* uPixel = m_picOrg1;
141
+ pixel* vPixel = m_picOrg2;
142
+
143
+ uint16_t* uShort = (uint16_t*)pic.planes1;
144
+ uint16_t* vShort = (uint16_t*)pic.planes2;
145
+
146
+ if (pic.bitDepth > X265_DEPTH)
147
+ {
148
+ primitives.planecopy_sp(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
149
+ primitives.planecopy_sp(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
150
+ }
151
+ else /* Case for (pic.bitDepth <= X265_DEPTH) */
152
+ {
153
+ primitives.planecopy_sp_shl(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
154
+ primitives.planecopy_sp_shl(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
155
+ }
156
+ }
157
+ }
158
+#if ENABLE_ALPHA
159
+ if (!isBase && param.bEnableAlpha)
160
{
161
- pixel *uPixel = m_picOrg1;
162
- pixel *vPixel = m_picOrg2;
163
+ /* defensive programming, mask off bits that are supposed to be zero */
164
+ uint16_t mask = (1 << X265_DEPTH) - 1;
165
+ int shift = abs(pic.bitDepth - X265_DEPTH);
166
+ pixel* yPixel = m_picOrg0;
167
168
- uint16_t *uShort = (uint16_t*)pic.planes1;
169
- uint16_t *vShort = (uint16_t*)pic.planes2;
170
+ uint16_t* yShort = (uint16_t*)pic.planes3;
171
172
if (pic.bitDepth > X265_DEPTH)
173
{
174
- primitives.planecopy_sp(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
175
- primitives.planecopy_sp(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
176
+ /* shift right and mask pixels to final size */
177
+ primitives.planecopy_sp(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
178
}
179
else /* Case for (pic.bitDepth <= X265_DEPTH) */
180
{
181
- primitives.planecopy_sp_shl(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
182
- primitives.planecopy_sp_shl(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
183
+ /* shift left and mask pixels to final size */
184
+ primitives.planecopy_sp_shl(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
185
+ }
186
+
187
+ if (param.internalCsp != X265_CSP_I400)
188
+ {
189
+ pixel* uPixel = m_picOrg1;
190
+ pixel* vPixel = m_picOrg2;
191
+
192
+ for (int r = 0; r < height >> m_vChromaShift; r++)
193
+ {
194
+ for (int c = 0; c < (width >> m_hChromaShift); c++)
195
+ {
196
+ uPixel[c] = ((1 << X265_DEPTH) >> 1);
197
+ vPixel[c] = ((1 << X265_DEPTH) >> 1);
198
+ }
199
+ uPixel += m_strideC;
200
+ vPixel += m_strideC;
201
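In the alpha path above, the auxiliary layer carries the alpha plane in the luma channel and fills both chroma planes with the neutral midpoint: memset(..., 128, ...) at 8 bits and ((1 << X265_DEPTH) >> 1) at higher depths. A one-function sketch of that value:

    #include <cassert>

    // Neutral ("no colour") chroma midpoint used for the alpha layer above.
    static int neutralChroma(int bitDepth)
    {
        return (1 << bitDepth) >> 1;
    }

    int main()
    {
        assert(neutralChroma(8)  == 128);
        assert(neutralChroma(10) == 512);
        return 0;
    }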
x265_3.6.tar.gz/source/common/picyuv.h -> x265_4.0.tar.gz/source/common/picyuv.h
Changed
10
1
2
void destroy();
3
int getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
4
5
- void copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
6
+ void copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady, bool isBase = true);
7
void copyFromFrame(PicYuv* source);
8
9
intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetCctuAddr + m_buOffsetCabsPartIdx; }
10
x265_3.6.tar.gz/source/common/pixel.cpp -> x265_4.0.tar.gz/source/common/pixel.cpp
Changed
23
1
2
{
3
int satd = 0;
4
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
6
- pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
7
-#endif
8
-
9
for (int row = 0; row < h; row += 4)
10
for (int col = 0; col < w; col += 4)
11
satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
12
13
{
14
int satd = 0;
15
16
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
17
- pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
18
-#endif
19
-
20
for (int row = 0; row < h; row += 4)
21
for (int col = 0; col < w; col += 8)
22
satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
23
x265_3.6.tar.gz/source/common/predict.cpp -> x265_4.0.tar.gz/source/common/predict.cpp
Changed
98
1
2
}
3
else
4
{
5
- if (bLuma)
6
- predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
7
- if (bChroma)
8
- predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
9
+#if ENABLE_SCC_EXT
10
+ if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
11
+ {
12
+ if (bLuma)
13
+ predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
15
+ if (bChroma)
16
+ predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
16
+ }
17
+ else
18
+#endif
19
+ {
20
+ if (bLuma)
21
+ predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
22
+ if (bChroma)
23
+ predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
24
+ }
25
}
26
}
27
else
28
29
30
if (bLuma)
31
{
32
- predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
33
+#if ENABLE_SCC_EXT
34
+ if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1))
35
+ predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
36
+ else
37
+#endif
38
+ predInterLumaShort(pu, m_predShortYuv0, *cu.m_slice->m_refReconPicList0refIdx0, mv0);
39
predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
40
}
41
if (bChroma)
42
{
43
- predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
44
+#if ENABLE_SCC_EXT
45
+ if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1))
46
+ predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
47
+ else
48
+#endif
49
+ predInterChromaShort(pu, m_predShortYuv0, *cu.m_slice->m_refReconPicList0refIdx0, mv0);
50
predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
51
}
52
53
54
}
55
else
56
{
57
- if (bLuma)
58
- predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0);
59
- if (bChroma)
60
- predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0);
61
+#if ENABLE_SCC_EXT
62
+ if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1))
63
+ {
64
+ if (bLuma)
65
+ predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0);
66
+ if (bChroma)
67
+ predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0);
68
+ }
69
+ else
70
+#endif
71
+ {
72
+ if (bLuma)
73
+ predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0);
74
+ if (bChroma)
75
+ predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0);
76
+ }
77
}
78
}
79
else
80
81
int tuSize = 1 << intraNeighbors.log2TrSize;
82
int tuSize2 = tuSize << 1;
83
84
- PicYuv* reconPic = cu.m_encData->m_reconPic;
85
+ PicYuv* reconPic = cu.m_encData->m_reconPic[0];
86
pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
87
intptr_t picStride = reconPic->m_stride;
88
89
90
91
void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
92
{
93
- PicYuv* reconPic = cu.m_encData->m_reconPic;
94
+ PicYuv* reconPic = cu.m_encData->m_reconPic0;
95
const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
96
intptr_t picStride = reconPic->m_strideC;
97
98
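The predict.cpp hunks all hinge on one rule: with SCC enabled the current picture sits in the last slot of reference list 0, and selecting that index routes prediction to the frame's second reconstruction (m_reconPic[1]). A tiny sketch of the index check only; the surrounding prediction calls are unchanged x265 code and the naming here is illustrative:

    #include <cassert>

    // Last L0 entry + SCC => the reference is the current picture itself.
    static bool usesCurrentPictureRef(bool sccEnabled, int refIdx, int numRefIdxL0)
    {
        return sccEnabled && refIdx == numRefIdxL0 - 1;
    }

    int main()
    {
        assert(usesCurrentPictureRef(true, 2, 3));    // routed to m_reconPic[1]
        assert(!usesCurrentPictureRef(true, 1, 3));   // normal temporal reference
        assert(!usesCurrentPictureRef(false, 2, 3));  // SCC disabled
        return 0;
    }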
x265_3.6.tar.gz/source/common/primitives.cpp -> x265_4.0.tar.gz/source/common/primitives.cpp
Changed
12
1
2
primitives.cu[i].intra_pred_allangs = NULL;
3
4
#if ENABLE_ASSEMBLY
5
-#if X265_ARCH_X86
6
- setupInstrinsicPrimitives(primitives, param->cpuid);
7
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
8
+ setupIntrinsicPrimitives(primitives, param->cpuid);
9
#endif
10
setupAssemblyPrimitives(primitives, param->cpuid);
11
#endif
12
x265_3.6.tar.gz/source/common/primitives.h -> x265_4.0.tar.gz/source/common/primitives.h
Changed
15
1
2
}
3
4
void setupCPrimitives(EncoderPrimitives &p);
5
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
6
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
7
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
8
void setupAliasPrimitives(EncoderPrimitives &p);
9
-#if X265_ARCH_ARM64
10
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
11
-#endif
12
#if HAVE_ALTIVEC
13
void setupPixelPrimitives_altivec(EncoderPrimitives &p);
14
void setupDCTPrimitives_altivec(EncoderPrimitives &p);
15
x265_3.6.tar.gz/source/common/slice.cpp -> x265_4.0.tar.gz/source/common/slice.cpp
Changed
201
1
2
3
using namespace X265_NS;
4
5
-void Slice::setRefPicList(PicList& picList)
6
+#if ENABLE_MULTIVIEW
7
+void Slice::createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1)
8
{
9
+
10
+ for (int i = 0; i < 1; i++)
11
+ {
12
+ int layerIdRef = 0;// getRefPicLayerId(i);
13
+ Frame* refPic = picList.getPOC(m_poc, 0);
14
+ int viewIdCur = 0;
15
+ int viewIdZero = 1;
16
+ int viewIdRef = 1;
17
+
18
+ if ((viewIdCur <= viewIdZero && viewIdCur <= viewIdRef) || (viewIdCur >= viewIdZero && viewIdCur >= viewIdRef))
19
+ {
20
+ refPicSetInterLayer0.pushBackSubDPB(*refPic);
21
+ }
22
+ else
23
+ {
24
+ refPicSetInterLayer1.pushBackSubDPB(*refPic);
25
+ }
26
+ }
27
+}
28
+#endif
29
+
30
+void Slice::setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int sLayerId)
31
+{
32
+ bool checkNumPocTotalCurr = m_param->bEnableSCC ? false : true;
33
if (m_sliceType == I_SLICE)
34
{
35
memset(m_refFrameList, 0, sizeof(m_refFrameList));
36
memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
37
memset(m_refPOCList, 0, sizeof(m_refPOCList));
38
m_numRefIdx[1] = m_numRefIdx[0] = 0;
39
+
40
+#if ENABLE_SCC_EXT
41
+ if (!checkNumPocTotalCurr)
42
+ {
43
+ if (m_rps.numberOfPictures == 0)
44
+ {
45
+ Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1));
46
+ if (prevPic->m_poc != X265_MAX(0, m_poc - 1))
47
+ {
48
+ prevPic = picList.getPOC(m_poc);
49
+ }
50
+ m_lastEncPic = prevPic;
51
+ }
52
+ return;
53
+ }
54
+#endif
55
+
56
return;
57
}
58
59
+#if ENABLE_SCC_EXT || ENABLE_MULTIVIEW || ENABLE_ALPHA
60
+ /*Reset the number of references for I-slice marked as P-slice*/
61
+ if ((m_param->bEnableSCC || sLayerId) && m_sliceType != m_origSliceType)
62
+ {
63
+ memset(m_refFrameList, 0, sizeof(m_refFrameList));
64
+ memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
65
+ memset(m_refPOCList, 0, sizeof(m_refPOCList));
66
+ m_numRefIdx[0] = 1;
67
+ }
68
+#endif
69
+
70
+#if ENABLE_SCC_EXT
71
+ if (!checkNumPocTotalCurr && m_rps.numberOfPictures == 0)
72
+ {
73
+ Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1));
74
+ if (prevPic->m_poc != X265_MAX(0, m_poc - 1))
75
+ {
76
+ prevPic = picList.getPOC(m_poc);
77
+
78
+ }
79
+ m_lastEncPic = prevPic;
80
+ }
81
+#endif
82
+
83
Frame* refPic = NULL;
84
Frame* refPicSetStCurr0[MAX_NUM_REF];
85
Frame* refPicSetStCurr1[MAX_NUM_REF];
86
87
88
for (i = 0; i < m_rps.numberOfNegativePictures; i++)
89
{
90
- if (m_rps.bUsed[i])
91
+ if (m_rps.bUsed[i] && m_origSliceType != I_SLICE)
92
{
93
- refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i]);
94
+ refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i], m_rps.deltaPOC[i] ? sLayerId : 0);
95
refPicSetStCurr0[numPocStCurr0] = refPic;
96
numPocStCurr0++;
97
}
98
99
100
for (; i < m_rps.numberOfNegativePictures + m_rps.numberOfPositivePictures; i++)
101
{
102
- if (m_rps.bUsedi)
103
+ if (m_rps.bUsedi && m_origSliceType != I_SLICE)
104
{
105
- refPic = picList.getPOC(m_poc + m_rps.deltaPOCi);
106
+ refPic = picList.getPOC(m_poc + m_rps.deltaPOCi, m_rps.deltaPOCi ? sLayerId : 0);
107
refPicSetStCurr1[numPocStCurr1] = refPic;
108
numPocStCurr1++;
109
}
110
111
// ref_pic_list_init
112
Frame* rpsCurrList0[MAX_NUM_REF + 1];
113
Frame* rpsCurrList1[MAX_NUM_REF + 1];
114
+#if ENABLE_MULTIVIEW
115
+ int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr + refPicSetInterLayer0.size() + refPicSetInterLayer1.size();
116
+#else
117
int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr;
118
+#endif
119
+
120
+#if ENABLE_SCC_EXT
121
+ if (m_param->bEnableSCC)
122
+ numPocTotalCurr++;
123
+#endif
124
125
int cIdx = 0;
126
for (i = 0; i < numPocStCurr0; i++, cIdx++)
127
rpsCurrList0cIdx = refPicSetStCurr0i;
128
129
+#if ENABLE_MULTIVIEW
130
+ if (m_param->numViews > 1)
131
+ for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++)
132
+ rpsCurrList0cIdx = refPicSetInterLayer0.getPOC(m_poc, 0);
133
+#endif
134
+
135
for (i = 0; i < numPocStCurr1; i++, cIdx++)
136
rpsCurrList0cIdx = refPicSetStCurr1i;
137
138
for (i = 0; i < numPocLtCurr; i++, cIdx++)
139
rpsCurrList0cIdx = refPicSetLtCurri;
140
141
+#if ENABLE_MULTIVIEW
142
+ if (m_param->numViews > 1)
143
+ for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++)
144
+ rpsCurrList0cIdx = refPicSetInterLayer1.getPOC(m_poc, 0);
145
+#endif
146
+
147
+#if ENABLE_SCC_EXT
148
+ if (m_param->bEnableSCC)
149
+ rpsCurrList0cIdx++ = picList.getPOC(m_poc);
150
+#endif
151
+
152
X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n");
153
154
if (m_sliceType == B_SLICE)
155
156
for (i = 0; i < numPocStCurr1; i++, cIdx++)
157
rpsCurrList1cIdx = refPicSetStCurr1i;
158
159
+#if ENABLE_MULTIVIEW
160
+ if (m_param->numViews > 1)
161
+ for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++)
162
+ rpsCurrList1cIdx = refPicSetInterLayer1.getPOC(m_poc, 0);
163
+#endif
164
+
165
for (i = 0; i < numPocStCurr0; i++, cIdx++)
166
rpsCurrList1cIdx = refPicSetStCurr0i;
167
168
for (i = 0; i < numPocLtCurr; i++, cIdx++)
169
rpsCurrList1cIdx = refPicSetLtCurri;
170
171
+#if ENABLE_MULTIVIEW
172
+ if (m_param->numViews > 1)
173
+ for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++)
174
+ rpsCurrList1cIdx = refPicSetInterLayer0.getPOC(m_poc, 0);
175
+#endif
176
+
177
+#if ENABLE_SCC_EXT
178
+ if (m_param->bEnableSCC)
179
+ rpsCurrList1cIdx++ = picList.getPOC(m_poc);
180
+#endif
181
+
182
X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n");
183
}
184
185
186
cIdx = rIdx % numPocTotalCurr;
187
X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
188
m_refFrameList0rIdx = rpsCurrList0cIdx;
189
+#if ENABLE_MULTIVIEW
190
+ m_refFrameList0rIdx = rpsCurrList0cIdx;
191
+#endif
192
}
193
194
+#if ENABLE_SCC_EXT
195
+ if (m_param->bEnableSCC && numPocTotalCurr > m_numRefIdx[0])
196
+ {
197
+ m_refFrameList[0][m_numRefIdx[0] - 1] = picList.getPOC(m_poc);
198
+ }
199
+#endif
200
+
201
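For orientation, the bookkeeping in setRefPicList() above boils down to a candidate count: short-term references before and after the current POC, long-term references, any inter-layer references, plus one extra slot when SCC appends the current picture. A hedged sketch of that sum, using plain parameters in place of the build-time guarded members:

    #include <cassert>

    static int numPocTotalCurr(int stCurr0, int stCurr1, int ltCurr,
                               int interLayer0, int interLayer1, bool scc)
    {
        return stCurr0 + stCurr1 + ltCurr + interLayer0 + interLayer1 + (scc ? 1 : 0);
    }

    int main()
    {
        assert(numPocTotalCurr(2, 1, 0, 0, 0, false) == 3);  // plain HEVC
        assert(numPocTotalCurr(2, 1, 0, 1, 0, false) == 4);  // one inter-layer reference
        assert(numPocTotalCurr(2, 0, 0, 0, 0, true)  == 3);  // SCC adds the current picture
        return 0;
    }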
x265_3.6.tar.gz/source/common/slice.h -> x265_4.0.tar.gz/source/common/slice.h
Changed
124
1
2
MAIN10 = 2,
3
MAINSTILLPICTURE = 3,
4
MAINREXT = 4,
5
- HIGHTHROUGHPUTREXT = 5
6
+ HIGHTHROUGHPUTREXT = 5,
7
+ MULTIVIEWMAIN = 6,
8
+ SCALABLEMAIN = 7,
9
+ SCALABLEMAIN10 = 8,
10
+ MAINSCC = 9
11
};
12
}
13
14
15
16
struct ProfileTierLevel
17
{
18
- int profileIdc;
19
+ int profileIdc[MAX_LAYERS];
20
int levelIdc;
21
uint32_t minCrForLevel;
22
uint32_t maxLumaSrForLevel;
23
24
uint32_t numReorderPics[MAX_T_LAYERS];
25
uint32_t maxDecPicBuffering[MAX_T_LAYERS];
26
uint32_t maxLatencyIncrease[MAX_T_LAYERS];
27
+ int m_numLayers;
28
+ int m_numViews;
29
+ bool vps_extension_flag;
30
+
31
+#if (ENABLE_ALPHA || ENABLE_MULTIVIEW)
32
+ bool splitting_flag;
33
+ int m_scalabilityMask[MAX_VPS_NUM_SCALABILITY_TYPES];
34
+ int scalabilityTypes;
35
+ uint8_t m_dimensionIdLen[MAX_VPS_NUM_SCALABILITY_TYPES];
36
+ uint8_t m_dimensionId[MAX_VPS_LAYER_ID_PLUS1][MAX_VPS_NUM_SCALABILITY_TYPES];
37
+ bool m_nuhLayerIdPresentFlag;
38
+ uint8_t m_layerIdInNuh[MAX_VPS_LAYER_ID_PLUS1];
39
+ uint8_t m_layerIdInVps[MAX_VPS_LAYER_ID_PLUS1];
40
+ int m_viewIdLen;
41
+ int m_vpsNumLayerSetsMinus1;
42
+ int m_numLayersInIdList[1023];
43
+#endif
44
+
45
+#if ENABLE_MULTIVIEW
46
+ int m_layerIdIncludedFlag;
47
+#endif
48
};
49
50
struct Window
51
52
53
Window conformanceWindow;
54
VUI vuiParameters;
55
+ bool sps_extension_flag;
56
+
57
+#if ENABLE_MULTIVIEW
58
+ int setSpsExtOrMaxSubLayersMinus1;
59
+ int maxViews;
60
+ bool vui_parameters_present_flag;
61
+#endif
62
63
SPS()
64
{
65
66
67
int numRefIdxDefault[2];
68
bool pps_slice_chroma_qp_offsets_present_flag;
69
+
70
+ bool pps_extension_flag;
71
+ int maxViews;
72
+
73
+ int profileIdc;
74
};
75
76
struct WeightParam
77
78
79
NalUnitType m_nalUnitType;
80
SliceType m_sliceType;
81
+ SliceType m_origSliceType;
82
int m_sliceQp;
83
int m_chromaQpOffset[2];
84
int m_poc;
85
86
int m_fieldNum;
87
Frame* m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
88
89
+#if ENABLE_SCC_EXT
90
+ Frame* m_lastEncPic;
91
+ bool m_bLMvdL1Zero;
92
+ bool m_useIntegerMv;
93
+#endif
94
+ bool m_bTemporalMvp;
95
+
96
Slice()
97
{
98
m_lastIDR = 0;
99
100
m_rpsIdx = -1;
101
m_chromaQpOffset[0] = m_chromaQpOffset[1] = 0;
102
m_fieldNum = 0;
103
+#if ENABLE_SCC_EXT
104
+ m_lastEncPic = NULL;
105
+ m_useIntegerMv = false;
106
+#endif
107
+ m_bTemporalMvp = false;
108
}
109
110
void disableWeights();
111
112
- void setRefPicList(PicList& picList);
113
+ void setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int viewId);
114
+#if ENABLE_MULTIVIEW
115
+ void createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1);
116
+#endif
117
+
118
+#if ENABLE_SCC_EXT
119
+ bool isOnlyCurrentPictureAsReference() const;
120
+#endif
121
122
bool getRapPicFlag() const
123
{
124
x265_3.6.tar.gz/source/common/threadpool.cpp -> x265_4.0.tar.gz/source/common/threadpool.cpp
Changed
13
1
2
else if (cpuCount >= 16)
3
p->frameNumThreads = 4;
4
else if (cpuCount >= 8)
5
+#if _WIN32 && X265_ARCH_ARM64
6
+ p->frameNumThreads = cpuCount;
7
+#else
8
p->frameNumThreads = 3;
9
+#endif
10
else if (cpuCount >= 4)
11
p->frameNumThreads = 2;
12
else
13
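The threadpool.cpp change special-cases Windows on Arm64 when choosing the default frame-thread count. A sketch of the mapping; only the branches visible in this hunk are certain, the remaining tiers are taken from surrounding context and may differ in the full source:

    #include <cassert>

    static int defaultFrameThreads(int cpuCount, bool winArm64)
    {
        if (cpuCount >= 16) return 4;
        if (cpuCount >= 8)  return winArm64 ? cpuCount : 3;   // new Arm64-on-Windows case
        if (cpuCount >= 4)  return 2;
        return 1;                                             // assumed fallback
    }

    int main()
    {
        assert(defaultFrameThreads(8, false) == 3);
        assert(defaultFrameThreads(8, true)  == 8);
        return 0;
    }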
x265_3.6.tar.gz/source/common/vec/vec-primitives.cpp -> x265_4.0.tar.gz/source/common/vec/vec-primitives.cpp
Changed
10
1
2
void setupIntrinsicDCT_sse41(EncoderPrimitives&);
3
4
/* Use primitives for the best available vector architecture */
5
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
6
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
7
{
8
#ifdef HAVE_SSE3
9
if (cpuMask & X265_CPU_SSE3)
10
x265_3.6.tar.gz/source/common/wavefront.cpp -> x265_4.0.tar.gz/source/common/wavefront.cpp
Changed
22
1
2
x265_free((void*)m_externalDependencyBitmap);
3
}
4
5
+void WaveFront::setLayerId(int layer)
6
+{
7
+ m_sLayerId = layer;
8
+}
9
+
10
void WaveFront::clearEnabledRowMask()
11
{
12
memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
13
14
if (ATOMIC_AND(&m_internalDependencyBitmapw, ~bit) & bit)
15
{
16
/* we cleared the bit, we get to process the row */
17
- processRow(w * 32 + id, threadId);
18
+ processRow(w * 32 + id, threadId, m_sLayerId);
19
m_helpWanted = true;
20
return; /* check for a higher priority task */
21
}
22
x265_3.6.tar.gz/source/common/wavefront.h -> x265_4.0.tar.gz/source/common/wavefront.h
Changed
21
1
2
3
int m_numRows;
4
5
+ int m_sLayerId;
6
+
7
protected:
8
uint32_t *m_row_to_idx;
9
uint32_t *m_idx_to_row;
10
11
12
// Start or resume encode processing of this row, must be implemented by
13
// derived classes.
14
- virtual void processRow(int row, int threadId) = 0;
15
+ virtual void processRow(int row, int threadId, int layer) = 0;
16
+
17
+ void setLayerId(int layer);
18
};
19
} // end namespace X265_NS
20
21
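wavefront.h/cpp above extend the WPP contract: each WaveFront now carries a layer id and processRow() receives it. A stand-alone stub showing only the new signature (not the real FrameEncoder; names here are illustrative):

    #include <cstdio>

    class RowProcessor
    {
    public:
        virtual ~RowProcessor() {}
        // Mirrors the changed virtual above: row, worker thread, encode layer.
        virtual void processRow(int row, int threadId, int layer) = 0;
    };

    class LoggingProcessor : public RowProcessor
    {
    public:
        void processRow(int row, int threadId, int layer) override
        {
            std::printf("thread %d: row %d of layer %d\n", threadId, row, layer);
        }
    };

    int main()
    {
        LoggingProcessor p;
        p.processRow(0, 0, 1);   // e.g. first CTU row of an auxiliary layer
        return 0;
    }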
x265_3.6.tar.gz/source/encoder/analysis.cpp -> x265_4.0.tar.gz/source/encoder/analysis.cpp
Changed
201
1
2
}
3
ProfileCUScope(ctu, totalCTUTime, totalCTUs);
4
5
- if (m_slice->m_sliceType == I_SLICE)
6
+#if ENABLE_SCC_EXT
7
+ memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs));
8
+ memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv));
9
+ m_ibc.m_numBV16s = 0; m_ibc.m_numBVs = 0;
10
+#endif
11
+ if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx[0] == 1) && m_slice->m_refPOCList[0][0] == m_slice->m_poc))
12
{
13
x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
14
if (m_param->analysisLoadReuseLevel > 1)
15
16
memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
17
memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
18
}
19
+#if ENABLE_SCC_EXT
20
+ compressIntraCU(ctu, cuGeom, qp, &m_ibc);
21
+#else
22
compressIntraCU(ctu, cuGeom, qp);
23
+#endif
24
}
25
else
26
{
27
28
{
29
/* In RD Level 0/1, copy source pixels into the reconstructed block so
30
* they are available for intra predictions */
31
- m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
32
+ m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic[0], ctu.m_cuAddr, 0);
33
34
compressInterCU_rd0_4(ctu, cuGeom, qp);
35
36
37
else if (m_param->rdLevel <= 4)
38
compressInterCU_rd0_4(ctu, cuGeom, qp);
39
else
40
+#if ENABLE_SCC_EXT
41
+ compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc);
42
+#else
43
compressInterCU_rd5_6(ctu, cuGeom, qp);
44
+#endif
45
}
46
47
if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
48
49
50
/* Copy best data to encData CTU and recon */
51
md.bestMode->cu.copyToPic(depth);
52
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
53
+ md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic0, parentCTU.m_cuAddr, cuGeom.absPartIdx);
54
}
55
56
+#if ENABLE_SCC_EXT
57
+uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
58
+#else
59
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
60
+#endif
61
{
62
uint32_t depth = cuGeom.depth;
63
ModeDepth& md = m_modeDepth[depth];
64
md.bestMode = NULL;
65
66
+ MV iMVCandList[4][10];
67
+ memset(iMVCandList, 0, sizeof(MV) * 4 * 10);
68
+
69
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
70
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
71
72
73
checkBestMode(md.pred[PRED_INTRA_NxN], depth);
74
}
75
76
+#if ENABLE_SCC_EXT
77
+ bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false;
78
+ if (m_param->bEnableSCC)
79
+ {
80
+ md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
81
+ checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);
82
+
83
+ md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
84
+ checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
85
+ checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);
86
+
87
+ if (intraBlockCopyFastSearch)
88
+ {
89
+ if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
90
+ {
91
+ md.predPRED_IBC_Nx2N.cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
92
+ checkIntraBC_rd5_6(md.predPRED_IBC_Nx2N, cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_Nx2N + 8));
93
+ checkBestMode(md.predPRED_IBC_Nx2N, depth);
94
+
95
+ md.predPRED_IBC_2NxN.cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
96
+ checkIntraBC_rd5_6(md.predPRED_IBC_2NxN, cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_2NxN + 8));
97
+ checkBestMode(md.predPRED_IBC_2NxN, depth);
98
+ }
99
+ }
100
+ else
101
+ {
102
+ md.predPRED_IBC_2NxN.cu.initSubCU(parentCTU, cuGeom, qp);
103
+ checkIntraBC_rd5_6(md.predPRED_IBC_2NxN, cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_2NxN + 8));
104
+ checkBestMode(md.predPRED_IBC_2NxN, depth);
105
+
106
+ md.predPRED_IBC_Nx2N.cu.initSubCU(parentCTU, cuGeom, qp);
107
+ checkIntraBC_rd5_6(md.predPRED_IBC_Nx2N, cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_Nx2N + 8));
108
+ checkBestMode(md.predPRED_IBC_Nx2N, depth);
109
+ }
110
+ }
111
+#endif
112
+
113
if (m_bTryLossless)
114
tryLossless(cuGeom);
115
116
117
addSplitFlagCost(*md.bestMode, cuGeom.depth);
118
}
119
120
+#if ENABLE_SCC_EXT
121
+ // If Intra BC keep last coded Mv
122
+ if (md.bestMode && md.bestMode->cu.isInter(0))
123
+ {
124
+ MVField mvField;
125
+ const CUData* cu = &md.bestMode->cu;
126
+ md.bestMode->cu.getMvField(cu, 0, 0, mvField);
127
+ int iRefIdxFirst = mvField.refIdx;
128
+ md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
129
+ int iRefIdxLast = mvField.refIdx;
130
+ bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
131
+ bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;
132
+
133
+ if (isIntraBCFirst || isIntraBCLast)
134
+ {
135
+ if (cu->m_partSize0 == SIZE_2Nx2N)
136
+ {
137
+ md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
138
+ if (mvField.mv != cu->m_lastIntraBCMv0)
139
+ {
140
+ md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0;
141
+ md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv;
142
+ }
143
+ }
144
+ else if (cu->m_partSize0 == SIZE_2NxN || cu->m_partSize0 == SIZE_Nx2N)
145
+ {
146
+ // mixed PU, only one partition is IntraBC coded
147
+ if (isIntraBCFirst != isIntraBCLast)
148
+ {
149
+ if (isIntraBCFirst)
150
+ {
151
+ // Part 0
152
+ md.bestMode->cu.getMvField(cu, 0, 0, mvField);
153
+ if (mvField.mv != cu->m_lastIntraBCMv0)
154
+ {
155
+ md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0;
156
+ md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv;
157
+ }
158
+ }
159
+ else if (isIntraBCLast)
160
+ {
161
+ // Part 1
162
+ md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
163
+ if (mvField.mv != cu->m_lastIntraBCMv0)
164
+ {
165
+ md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0;
166
+ md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv;
167
+ }
168
+ }
169
+ }
170
+ else // normal IntraBC CU
171
+ {
172
+ // Part 0
173
+ md.bestMode->cu.getMvField(cu, 0, 0, mvField);
174
+ if (mvField.mv != cu->m_lastIntraBCMv0)
175
+ {
176
+ md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0;
177
+ md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv;
178
+ }
179
+ // Part 1
180
+ md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
181
+ if (mvField.mv != cu->m_lastIntraBCMv0)
182
+ {
183
+ md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0;
184
+ md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv;
185
+ }
186
+ }
187
+ }
188
+ else
189
+ {
190
+ // NxN
191
+ for (int part = 0; part < 4; part++)
192
+ {
193
+ md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
194
+ if (mvField.mv != cu->m_lastIntraBCMv0)
195
+ {
196
+ md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0;
197
+ md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv;
198
+ }
199
+ }
200
+ }
201
x265_3.6.tar.gz/source/encoder/analysis.h -> x265_4.0.tar.gz/source/encoder/analysis.h
Changed
70
1
2
PRED_nRx2N,
3
PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */
4
PRED_LOSSLESS, /* lossless encode of best mode */
5
+#if ENABLE_SCC_EXT
6
+ PRED_IBC_2Nx2N,
7
+ PRED_IBC_Nx2N,
8
+ PRED_IBC_2NxN,
9
+ PRED_MIXED_IBC_NX2N,
10
+ PRED_MIXED_IBC_2NXN,
11
+ PRED_MERGE_IBC,
12
+#endif
13
MAX_PRED_TYPES
14
};
15
16
17
bool m_modeFlag[2];
18
bool m_checkMergeAndSkipOnly[2];
19
20
+ IBC m_ibc;
21
Analysis();
22
23
bool create(ThreadLocalData* tld);
24
25
26
Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
27
int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
28
+
29
protected:
30
/* Analysis data for save/load mode, writes/reads data based on absPartIdx */
31
x265_analysis_inter_data* m_reuseInterDataCTU;
32
33
void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
34
35
/* full analysis for an I-slice CU */
36
+#if ENABLE_SCC_EXT
37
+ uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL);
38
+#else
39
uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
40
+#endif
41
42
/* full analysis for a P or B slice CU */
43
uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
44
SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
45
+#if ENABLE_SCC_EXT
46
+ SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL);
47
+#else
48
SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
49
+#endif
50
51
void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
52
53
54
55
/* measure inter options */
56
void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
57
- void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
58
+ void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2], MV* iMVCandList = NULL);
59
60
void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
61
62
+#if ENABLE_SCC_EXT
63
+ void checkRDCostIntraBCMerge2Nx2N(Mode& merge, const CUGeom& cuGeom);
64
+ void checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList = NULL);
65
+#endif
66
+
67
/* encode current bestMode losslessly, pick best RD cost */
68
void tryLossless(const CUGeom& cuGeom);
69
70
x265_3.6.tar.gz/source/encoder/api.cpp -> x265_4.0.tar.gz/source/encoder/api.cpp
Changed
201
1
2
* This program is also available under a commercial proprietary license.
3
* For more information, contact us at license @ x265.com.
4
*****************************************************************************/
5
-
6
#include "common.h"
7
#include "bitstream.h"
8
#include "param.h"
9
10
// will detect and set profile/tier/level in VPS
11
determineLevel(*param, encoder->m_vps);
12
13
- if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc == Profile::NONE)
14
+ if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc[0] == Profile::NONE)
15
{
16
x265_log(param, X265_LOG_INFO, "non-conformant bitstreams not allowed (--allow-non-conformance)\n");
17
goto fail;
18
19
VPS saveVPS;
20
memcpy(&saveVPS.ptl, &encoder->m_vps.ptl, sizeof(saveVPS.ptl));
21
determineLevel(*encoder->m_latestParam, encoder->m_vps);
22
- if (saveVPS.ptl.profileIdc != encoder->m_vps.ptl.profileIdc || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc
23
+ if (saveVPS.ptl.profileIdc0 != encoder->m_vps.ptl.profileIdc0 || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc
24
|| saveVPS.ptl.tierFlag != encoder->m_vps.ptl.tierFlag)
25
{
26
x265_log(encoder->m_param, X265_LOG_WARNING, "Profile/Level/Tier has changed from %d/%d/%s to %d/%d/%s.Cannot reconfigure rate-control.\n",
27
- saveVPS.ptl.profileIdc, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc,
28
+ saveVPS.ptl.profileIdc0, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc0,
29
encoder->m_vps.ptl.levelIdc, encoder->m_vps.ptl.tierFlag ? "High" : "Main");
30
x265_copy_params(encoder->m_latestParam, &save);
31
memcpy(&encoder->m_vps.ptl, &saveVPS.ptl, sizeof(saveVPS.ptl));
32
33
return 0;
34
}
35
36
-int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out)
37
+int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out)
38
{
39
if (!enc)
40
return -1;
41
42
*pi_nal = 0;
43
44
if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
45
- x265_csvlog_frame(encoder->m_param, pic_out);
46
+ {
47
+ for (int layer = 0; layer < encoder->m_param->numLayers; layer++)
48
+ x265_csvlog_frame(encoder->m_param, pic_out[layer]);
49
+ }
50
51
if (numEncoded < 0)
52
encoder->m_aborted = true;
53
54
if (enc)
55
{
56
Encoder *encoder = static_cast<Encoder*>(enc);
57
- x265_stats stats;
58
- encoder->fetchStats(&stats, sizeof(stats));
59
+ x265_stats stats[MAX_LAYERS];
60
int padx = encoder->m_sps.conformanceWindow.rightOffset;
61
int pady = encoder->m_sps.conformanceWindow.bottomOffset;
62
- x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv);
63
+ for (int layer = 0; layer < encoder->m_param->numLayers; layer++)
64
+ {
65
+ encoder->fetchStats(stats, sizeof(stats[layer]), layer);
66
+ x265_csvlog_encode(encoder->m_param, &stats[0], padx, pady, argc, argv);
67
+ }
68
}
69
}
70
71
72
if (!enc)
73
return -1;
74
Encoder *encoder = static_cast<Encoder*>(enc);
75
- if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut))
76
+ if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut, 0))
77
return 0;
78
return -1;
79
}
80
81
{
82
if (param->csvLogLevel)
83
{
84
- fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
85
+ fprintf(csvfp, "Layer , Encode Order, Type, POC, QP, Bits, Scenecut, ");
86
if (!!param->bEnableTemporalSubLayers)
87
fprintf(csvfp, "Temporal Sub Layer ID, ");
88
if (param->csvLogLevel >= 2)
89
90
return;
91
92
const x265_frame_stats* frameStats = &pic->frameData;
93
- fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
94
+ fprintf(param->csvfpt, "%d, %d, %c-SLICE, %4d, %2.2lf, %10d, %d,", pic->layerID, frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
95
frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
96
if (!!param->bEnableTemporalSubLayers)
97
fprintf(param->csvfpt, "%d,", frameStats->tLayer);
98
99
return ret;
100
}
101
102
+static enum VmafOutputFormat log_fmt_map(const char *log_fmt)
103
+{
104
+ if (log_fmt) {
105
+ if (!strcmp(log_fmt, "xml"))
106
+ return VMAF_OUTPUT_FORMAT_XML;
107
+ if (!strcmp(log_fmt, "json"))
108
+ return VMAF_OUTPUT_FORMAT_JSON;
109
+ if (!strcmp(log_fmt, "csv"))
110
+ return VMAF_OUTPUT_FORMAT_CSV;
111
+ if (!strcmp(log_fmt, "sub"))
112
+ return VMAF_OUTPUT_FORMAT_SUB;
113
+ }
114
+
115
+ return VMAF_OUTPUT_FORMAT_NONE;
116
+}
117
+
118
+static enum VmafPoolingMethod pool_method_map(const char *pool_method)
119
+{
120
+ if (pool_method) {
121
+ if (!strcmp(pool_method, "min"))
122
+ return VMAF_POOL_METHOD_MIN;
123
+ if (!strcmp(pool_method, "mean"))
124
+ return VMAF_POOL_METHOD_MEAN;
125
+ if (!strcmp(pool_method, "harmonic_mean"))
126
+ return VMAF_POOL_METHOD_HARMONIC_MEAN;
127
+ }
128
+ return VMAF_POOL_METHOD_MEAN;
129
+}
130
+
131
+static enum VmafPixelFormat pix_fmt_map(const char *fmt)
132
+{
133
+ if (fmt) {
134
+ if (!strcmp(fmt, "yuv420p") || !strcmp(fmt, "yuv420p10le") || !strcmp(fmt, "yuv420p12le") || !strcmp(fmt, "yuv420p16le"))
135
+ return VMAF_PIX_FMT_YUV420P;
136
+ if (!strcmp(fmt, "yuv422p") || !strcmp(fmt, "yuv422p10le"))
137
+ return VMAF_PIX_FMT_YUV422P;
138
+ if (!strcmp(fmt, "yuv444p") || !strcmp(fmt, "yuv444p10le"))
139
+ return VMAF_PIX_FMT_YUV444P;
140
+ }
141
+ return VMAF_PIX_FMT_UNKNOWN;
142
+}
143
+
144
+static void copy_picture(float *src, VmafPicture *dst, unsigned width, unsigned height, int src_stride, unsigned bpc)
145
+{
146
+ const int bytes_per_value = bpc > 8 ? 2 : 1;
147
+ const int dst_stride = dst->stride0 / bytes_per_value;
148
+ const unsigned b_shift = (bpc > 8) ? (bpc - 8) : 0;
149
+
150
+ uint8_t *dst_data = static_cast<uint8_t*>(dst->data0);
151
+
152
+ for (unsigned i = 0; i < height; i++) {
153
+ if (bpc > 8) {
154
+ uint16_t *dst_row = reinterpret_cast<uint16_t*>(dst_data);
155
+ for (unsigned j = 0; j < width; j++) {
156
+ dst_row[j] = static_cast<uint16_t>(src[j] * (1 << b_shift));
157
+ }
158
+ } else {
159
+ for (unsigned j = 0; j < width; j++) {
160
+ dst_data[j] = static_cast<uint8_t>(src[j]);
161
+ }
162
+ }
163
+ src += src_stride / sizeof(float);
164
+ dst_data += dst_stride * bytes_per_value;
165
+ }
166
+}
167
+
168
+int load_feature(VmafContext *vmaf, const char *feature_name, VmafFeatureDictionary *d) {
169
+ int err = vmaf_use_feature(vmaf, feature_name, d);
170
+ if (err) {
171
+ printf("problem loading feature extractor: %s\n", feature_name);
172
+ }
173
+ return err;
174
+}
175
+
176
+int compute_vmaf(double* vmaf_score, char* fmt, int width, int height, int bitdepth, int(*read_frame)(float *ref_data, float *main_data, float *temp_data, int stride_byte, void *user_data),
177
+ void *user_data, char *model_path, char *log_path, char *log_fmt, int disable_clip, int disable_avx, int enable_transform, int phone_model, int do_psnr, int do_ssim, int do_ms_ssim,
178
+ char *pool_method, int n_thread, int n_subsample)
179
+{
180
+ int err = 0;
181
+
182
+ VmafConfiguration cfg = {
183
+ .log_level = VMAF_LOG_LEVEL_INFO,
184
+ .n_threads = n_thread,
185
+ .n_subsample = n_subsample,
186
+ .cpumask = disable_avx ? -1 : 0,
187
+ .gpumask = 0,
188
+ };
189
+
190
+ VmafContext *vmaf;
191
+ err = vmaf_init(&vmaf, cfg);
192
+ if (err) {
193
+ printf("problem initializing VMAF context\n");
194
+ return -1;
195
+ }
196
+
197
+ uint64_t flags = VMAF_MODEL_FLAGS_DEFAULT;
198
+ if (disable_clip)
199
+ flags |= VMAF_MODEL_FLAG_DISABLE_CLIP;
200
+ if (enable_transform || phone_model)
201
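The api.cpp VMAF glue above converts the encoder's float planes into libvmaf pictures; for content above 8 bits each sample is scaled up by (bpc - 8) bits into a 16-bit container. A per-sample sketch mirroring that conversion (the function name here is illustrative, not part of copy_picture()):

    #include <cassert>
    #include <cstdint>

    static uint16_t toVmafSample(float v, unsigned bpc)
    {
        const unsigned shift = (bpc > 8) ? (bpc - 8) : 0;
        return (bpc > 8) ? (uint16_t)(v * (1 << shift)) : (uint8_t)v;
    }

    int main()
    {
        assert(toVmafSample(100.0f, 8)  == 100);
        assert(toVmafSample(100.0f, 10) == 400);   // 100 << 2 for 10-bit
        return 0;
    }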
x265_3.6.tar.gz/source/encoder/dpb.cpp -> x265_4.0.tar.gz/source/encoder/dpb.cpp
Changed
201
1
2
FrameData* next = m_frameDataFreeList->m_freeListNext;
3
m_frameDataFreeList->destroy();
4
5
- m_frameDataFreeList->m_reconPic->destroy();
6
- delete m_frameDataFreeList->m_reconPic;
7
+ m_frameDataFreeList->m_reconPic[0]->destroy();
8
+ delete m_frameDataFreeList->m_reconPic[0];
9
10
delete m_frameDataFreeList;
11
m_frameDataFreeList = next;
12
13
if (curFrame->m_param->bEnableTemporalFilter)
14
isMCSTFReferenced =!!(curFrame->m_refPicCnt[1]);
15
16
- if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
17
+ if (curFrame->m_valid && !curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
18
{
19
curFrame->m_bChromaExtended = false;
20
21
22
23
// iterator is invalidated by remove, restart scan
24
m_picList.remove(*curFrame);
25
+#if ENABLE_MULTIVIEW
26
+ if (curFrame->m_param->numViews > 1 && !curFrame->m_viewId && m_picList.getPOC(curFrame->m_poc, 1) && curFrame == m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.getPOC(curFrame->m_poc, curFrame->m_viewId))
27
+ {
28
+ m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.removeSubDPB(*curFrame);
29
+ }
30
+#endif
31
iterFrame = m_picList.first();
32
33
m_freeList.pushBack(*curFrame);
34
35
curFrame->m_prevCtuInfoChange = NULL;
36
}
37
curFrame->m_encData = NULL;
38
- curFrame->m_reconPic = NULL;
39
+ for (int i = 0; i < !!curFrame->m_param->bEnableSCC + 1; i++)
40
+ curFrame->m_reconPic[i] = NULL;
41
}
42
}
43
}
44
45
m_lastIDR = pocCurr;
46
slice->m_lastIDR = m_lastIDR;
47
slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
48
+#if ENABLE_SCC_EXT
49
+ if (slice->m_param->bEnableSCC) slice->m_origSliceType = slice->m_sliceType;
50
+ if (slice->m_param->bEnableSCC && IS_X265_TYPE_I(type))
51
+ slice->m_sliceType = P_SLICE;
52
+#endif
53
54
if (type == X265_TYPE_B)
55
{
56
57
58
m_picList.pushFront(*newFrame);
59
60
- if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
61
+ int layer = slice->m_param->numViews > 1 ? newFrame->m_viewId : (slice->m_param->numScalableLayers > 1) ? newFrame->m_sLayerId : 0;
62
+ if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag(layer))
63
{
64
switch (slice->m_nalUnitType)
65
{
66
67
}
68
}
69
// Do decoding refresh marking if any
70
- decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
71
+ decodingRefreshMarking(pocCurr, slice->m_nalUnitType, layer);
72
73
- computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer);
74
+ uint32_t maxDecBuffer = (slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer >= 8 && slice->m_param->bEnableSCC) ? 7 : slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer;
75
+ computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, maxDecBuffer, layer);
76
bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
77
// Mark pictures in m_piclist as unreferenced if they are not included in RPS
78
- applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
79
+ applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic, layer);
80
81
82
if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
83
84
|| slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
85
)
86
{
87
- if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
88
+ if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer, layer) || (slice->m_sps->maxTempSubLayers == 1))
89
{
90
- if (getTemporalLayerNonReferenceFlag())
91
+ if (getTemporalLayerNonReferenceFlag(layer))
92
{
93
slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
94
}
95
96
slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
97
}
98
}
99
- else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
100
+ else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer, layer))
101
{
102
bool isSTSA = true;
103
int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
104
105
}
106
if (isSTSA == true)
107
{
108
- if (getTemporalLayerNonReferenceFlag())
109
+ if (getTemporalLayerNonReferenceFlag(layer))
110
{
111
slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
112
}
113
114
}
115
}
116
117
+#if ENABLE_MULTIVIEW
118
+ if (newFrame->m_viewId)
119
+ slice->createInterLayerReferencePictureSet(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1);
120
+#endif
121
+ int numRef = slice->m_param->bEnableSCC ? slice->m_rps.numberOfNegativePictures + 1 : slice->m_rps.numberOfNegativePictures;
122
if (slice->m_sliceType != I_SLICE)
123
- slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
124
+ slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, numRef + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size());
125
+ else
126
+ slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, numRef); // Ensuring L0 contains just the -ve POC
127
+#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT
128
+ if(slice->m_param->numViews > 1 || !!slice->m_param->bEnableSCC)
129
+ slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 3 : 2, slice->m_rps.numberOfPositivePictures + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size());
130
else
131
- slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
132
- slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
133
- slice->setRefPicList(m_picList);
134
+#endif
135
+ slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
136
+ slice->setRefPicList(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1, layer);
137
138
X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx1, "B slice without L1 references (non-fatal)\n");
139
140
141
/* TODO: the lookahead should be able to tell which reference picture
142
* had the least motion residual. We should be able to use that here to
143
* select a colocation reference list and index */
144
- slice->m_colFromL0Flag = false;
145
+
146
+ bool bLowDelay = true;
147
+ int iCurrPOC = slice->m_poc;
148
+ int iRefIdx = 0;
149
+
150
+ for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx[0] && bLowDelay; iRefIdx++)
151
+ {
152
+ if (slice->m_refPOCList[0][iRefIdx] > iCurrPOC)
153
+ {
154
+ bLowDelay = false;
155
+ }
156
+ }
157
+ for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx[1] && bLowDelay; iRefIdx++)
158
+ {
159
+ if (slice->m_refPOCList[1][iRefIdx] > iCurrPOC)
160
+ {
161
+ bLowDelay = false;
162
+ }
163
+ }
164
+
165
+ slice->m_bCheckLDC = bLowDelay;
166
+ slice->m_colFromL0Flag = bLowDelay;
167
slice->m_colRefIdx = 0;
168
- slice->m_bCheckLDC = false;
169
}
170
else
171
{
172
173
slice->m_colRefIdx = 0;
174
}
175
176
+ slice->m_bTemporalMvp = slice->m_sps->bTemporalMVPEnabled;
177
+#if ENABLE_SCC_EXT
178
+ bool bGPBcheck = false;
179
+ if (slice->m_sliceType == B_SLICE)
180
+ {
181
+ if (slice->m_param->bEnableSCC)
182
+ {
183
+ if (slice->m_numRefIdx0 - 1 == slice->m_numRefIdx1)
184
+ {
185
+ bGPBcheck = true;
186
+ for (int i = 0; i < slice->m_numRefIdx1; i++)
187
+ {
188
+ if (slice->m_refPOCList[1][i] != slice->m_refPOCList[0][i])
189
+ {
190
+ bGPBcheck = false;
191
+ break;
192
+ }
193
+ }
194
+ }
195
+ }
196
+ else if (slice->m_numRefIdx0 == slice->m_numRefIdx1)
197
+ {
198
+ bGPBcheck = true;
199
+ int i;
200
+ for (i = 0; i < slice->m_numRefIdx1; i++)
201
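Note on the dpb.cpp hunk above: the new bLowDelay scan derives m_bCheckLDC and m_colFromL0Flag from the reference lists. A minimal stand-alone sketch of that test, with hypothetical names rather than the diff's own types:

    #include <vector>

    // A slice is "low delay" when no reference in either list follows the current
    // picture in output order, i.e. no reference POC exceeds the current POC.
    static bool isLowDelaySlice(int curPoc,
                                const std::vector<int>& refPocListL0,
                                const std::vector<int>& refPocListL1)
    {
        for (int refPoc : refPocListL0)
            if (refPoc > curPoc)
                return false;      // a future reference breaks the low-delay property
        for (int refPoc : refPocListL1)
            if (refPoc > curPoc)
                return false;
        return true;               // feeds m_bCheckLDC / m_colFromL0Flag in the hunk above
    }
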
x265_3.6.tar.gz/source/encoder/dpb.h -> x265_4.0.tar.gz/source/encoder/dpb.h
Changed
21
1
2
3
protected:
4
5
- void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
6
+ void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer, int sLayerId);
7
8
- void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
9
- bool getTemporalLayerNonReferenceFlag();
10
- void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
11
- bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
12
- bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
13
+ void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture, int sLayerId);
14
+ bool getTemporalLayerNonReferenceFlag(int sLayerId);
15
+ void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType, int sLayerId);
16
+ bool isTemporalLayerSwitchingPoint(int curPoc, int tempId, int sLayerId);
17
+ bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId, int sLayerId);
18
19
NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
20
};
21
x265_3.6.tar.gz/source/encoder/encoder.cpp -> x265_4.0.tar.gz/source/encoder/encoder.cpp
Changed
201
1
2
m_lookahead = NULL;
3
m_rateControl = NULL;
4
m_dpb = NULL;
5
- m_exportedPic = NULL;
6
m_numDelayedPic = 0;
7
m_outputCount = 0;
8
m_param = NULL;
9
10
m_rpsInSpsCount = 0;
11
m_cB = 1.0;
12
m_cR = 1.0;
13
+ for (int i = 0; i < MAX_LAYERS; i++)
14
+ m_exportedPic[i] = NULL;
15
for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
16
m_frameEncoder[i] = NULL;
17
for (uint32_t i = 0; i < DUP_BUFFER; i++)
18
19
}
20
}
21
22
-int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut)
23
+int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer)
24
{
25
- Frame *FramePtr = m_dpb->m_picList.getCurFrame();
26
+ Frame *FramePtr = m_dpb->m_picList.getCurFrame(sLayer);
27
if (FramePtr != NULL)
28
{
29
*slicetype = FramePtr->m_lowres.sliceType;
30
31
{
32
if (!(IS_X265_TYPE_I(sliceType)))
33
{
34
- Frame *framePtr = m_dpb->m_picList.getPOC(poc);
35
+ Frame *framePtr = m_dpb->m_picList.getPOC(poc, 0);
36
if (framePtr != NULL)
37
{
38
for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx0; j++) // check only for --ref=n number of frames.
39
{
40
- if (framePtr->m_encData->m_slice->m_refFrameList0j && framePtr->m_encData->m_slice->m_refFrameList0j->m_reconPic != NULL)
41
+ if (framePtr->m_encData->m_slice->m_refFrameList0j && framePtr->m_encData->m_slice->m_refFrameList0j->m_reconPic0 != NULL)
42
{
43
int l0POC = framePtr->m_encData->m_slice->m_refFrameList0j->m_poc;
44
pocL0j = l0POC;
45
- Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC);
46
- while (l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.get() == 0)
47
- l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
48
- l0j = l0Fp->m_reconPic;
49
+ Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC, 0);
50
+#if ENABLE_SCC_EXT
51
+ if (l0POC != poc)
52
+#endif
53
+ {
54
+ while (l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.get() == 0)
55
+ l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
56
+ }
57
+ l0j = l0Fp->m_reconPic0;
58
}
59
}
60
for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx1; j++) // check only for --ref=n number of frames.
61
{
62
- if (framePtr->m_encData->m_slice->m_refFrameList1j && framePtr->m_encData->m_slice->m_refFrameList1j->m_reconPic != NULL)
63
+ if (framePtr->m_encData->m_slice->m_refFrameList1j && framePtr->m_encData->m_slice->m_refFrameList1j->m_reconPic0 != NULL)
64
{
65
int l1POC = framePtr->m_encData->m_slice->m_refFrameList1j->m_poc;
66
pocL1j = l1POC;
67
- Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC);
68
+ Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC, 0);
69
while (l1Fp->m_reconRowFlagl1Fp->m_numRows - 1.get() == 0)
70
l1Fp->m_reconRowFlagl1Fp->m_numRows - 1.waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
71
- l1j = l1Fp->m_reconPic;
72
+ l1j = l1Fp->m_reconPic0;
73
}
74
}
75
}
76
77
uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
78
uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
79
80
- Frame* curFrame = m_dpb->m_picList.getPOC(poc);
81
+ Frame* curFrame = m_dpb->m_picList.getPOC(poc, 0);
82
if (curFrame != NULL)
83
{
84
curFrame->m_analysisData = (*analysis_data);
85
86
X265_FREE(m_rdCost);
87
X265_FREE(m_trainingCount);
88
}
89
- if (m_exportedPic)
90
+ for (int layer = 0; layer < m_param->numLayers; layer++)
91
{
92
- ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
93
- m_exportedPic = NULL;
94
+ if (m_exportedPic[layer])
95
+ {
96
+ ATOMIC_DEC(&m_exportedPic[layer]->m_countRefEncoders);
97
+ m_exportedPic[layer] = NULL;
98
+ }
99
}
100
101
if (m_param->bEnableFrameDuplication)
102
103
memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
104
dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
105
dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
106
+#if ENABLE_ALPHA
107
+ if(m_param->bEnableAlpha)
108
+ dest->planes3 = (char*)dest->planes2 + src->stride2 * (src->height >> x265_cli_cspssrc->colorSpace.height2);
109
+#endif
110
}
111
112
bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
113
114
* returns 0 if no frames are currently available for output
115
* 1 if frame was output, m_nalList contains access unit
116
* negative on malloc error or abort */
117
-int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
118
+int Encoder::encode(const x265_picture* pic_in, x265_picture** pic_out)
119
{
120
#if CHECKED_BUILD || _DEBUG
121
if (g_checkFailures)
122
123
if (m_aborted)
124
return -1;
125
126
- const x265_picture* inputPic = NULL;
127
+ const x265_picture* inputPic[MAX_VIEWS] = { NULL };
128
static int written = 0, read = 0;
129
bool dontRead = false;
130
bool dropflag = false;
131
132
- if (m_exportedPic)
133
+ if (*m_exportedPic)
134
{
135
if (!m_param->bUseAnalysisFile && m_param->analysisSave)
136
- x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
137
-
138
- ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
139
+ x265_free_analysis_data(m_param, &m_exportedPic[0]->m_analysisData);
140
141
- m_exportedPic = NULL;
142
+ for (int i = 0; i < m_param->numLayers; i++)
143
+ {
144
+ ATOMIC_DEC(&m_exportedPic[i]->m_countRefEncoders);
145
+ m_exportedPic[i] = NULL;
146
+ }
147
m_dpb->recycleUnreferenced();
148
149
if (m_param->bEnableTemporalFilter)
150
151
152
if (read < written)
153
{
154
- inputPic = m_dupBuffer[0]->dupPic;
156
+ inputPic[0] = m_dupBuffer[0]->dupPic;
156
read++;
157
}
158
}
159
else
160
- inputPic = pic_in;
161
+ {
162
+ for (int view = 0; view < m_param->numViews; view++)
163
+ inputPic[view] = pic_in + view;
164
+ }
165
166
- Frame *inFrame;
167
- x265_param *p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
168
- if (m_dpb->m_freeList.empty())
169
- {
170
- inFrame = new Frame;
171
- inFrame->m_encodeStartTime = x265_mdate();
172
- if (inFrame->create(p, inputPic->quantOffsets))
173
- {
174
- /* the first PicYuv created is asked to generate the CU and block unit offset
175
- * arrays which are then shared with all subsequent PicYuv (orig and recon)
176
- * allocated by this top level encoder */
177
- if (m_sps.cuOffsetY)
178
- {
179
- inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
180
- inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
181
- if (m_param->internalCsp != X265_CSP_I400)
182
- {
183
- inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
184
- inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
185
- }
186
- }
187
- else
188
+ x265_param* p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
189
+ Frame* inFrame[MAX_LAYERS];
190
+ for (int layer = 0; layer < m_param->numLayers; layer++)
191
+ {
192
+ if (m_dpb->m_freeList.empty())
193
+ {
194
+ inFrame[layer] = new Frame;
195
+ inFrame[layer]->m_encodeStartTime = x265_mdate();
196
+#if ENABLE_MULTIVIEW
197
+ inFrame[layer]->m_viewId = m_param->numViews > 1 ? layer : 0;
198
+#endif
199
+#if ENABLE_ALPHA
200
+ inFrame[layer]->m_sLayerId = m_param->numScalableLayers > 1 ? layer : 0;
201
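Note on the encoder.cpp hunk above: the single m_exportedPic pointer becomes a per-layer array, and each layer's previously exported frame has its encoder refcount dropped before the next output is fetched. A hedged sketch of that bookkeeping pattern; kMaxLayers and FrameRef are placeholders, not x265 names:

    #include <array>
    #include <atomic>

    constexpr int kMaxLayers = 2;                    // placeholder bound for the sketch

    struct FrameRef { std::atomic<int> countRefEncoders {1}; };

    struct ExportedPictures
    {
        std::array<FrameRef*, kMaxLayers> slot {};   // one exported frame per layer

        // Called before fetching the next output: drop the refcount that was held
        // while the caller still owned the previously exported picture.
        void releasePrevious(int numLayers)
        {
            for (int layer = 0; layer < numLayers; layer++)
            {
                if (slot[layer])
                {
                    slot[layer]->countRefEncoders--;   // ATOMIC_DEC() in the real code
                    slot[layer] = nullptr;
                }
            }
        }
    };
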
x265_3.6.tar.gz/source/encoder/encoder.h -> x265_4.0.tar.gz/source/encoder/encoder.h
Changed
61
1
2
ThreadPool* m_threadPool;
3
FrameEncoder* m_frameEncoderX265_MAX_FRAME_THREADS;
4
DPB* m_dpb;
5
- Frame* m_exportedPic;
6
+ Frame* m_exportedPic[MAX_LAYERS];
7
FILE* m_analysisFileIn;
8
FILE* m_analysisFileOut;
9
FILE* m_naluFile;
10
11
12
bool m_externalFlush;
13
/* Collect statistics globally */
14
- EncStats m_analyzeAll;
15
- EncStats m_analyzeI;
16
- EncStats m_analyzeP;
17
- EncStats m_analyzeB;
18
+ EncStats m_analyzeAll[MAX_LAYERS];
19
+ EncStats m_analyzeI[MAX_LAYERS];
20
+ EncStats m_analyzeP[MAX_LAYERS];
21
+ EncStats m_analyzeB[MAX_LAYERS];
22
VPS m_vps;
23
SPS m_sps;
24
PPS m_pps;
25
26
void stopJobs();
27
void destroy();
28
29
- int encode(const x265_picture* pic, x265_picture *pic_out);
30
+ int encode(const x265_picture* pic, x265_picture **pic_out);
31
32
int reconfigureParam(x265_param* encParam, x265_param* param);
33
34
35
36
void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc);
37
38
- int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut);
39
+ int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer);
40
41
int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, int* pocL0, int* pocL1);
42
43
44
45
void getEndNalUnits(NALList& list, Bitstream& bs);
46
47
- void fetchStats(x265_stats* stats, size_t statsSizeBytes);
48
+ void fetchStats(x265_stats* stats, size_t statsSizeBytes, int layer = 0);
49
50
void printSummary();
51
52
53
54
void copyDistortionData(x265_analysis_data* analysis, FrameData &curEncData);
55
56
- void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
57
+ void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc, int layer);
58
59
int validateAnalysisData(x265_analysis_validate* param, int readWriteFlag);
60
61
x265_3.6.tar.gz/source/encoder/entropy.cpp -> x265_4.0.tar.gz/source/encoder/entropy.cpp
Changed
201
1
2
X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState0) * MAX_OFF_CTX_MOD, "context state table is too small\n");
3
}
4
5
-void Entropy::codeVPS(const VPS& vps)
6
+void Entropy::codeVPS(const VPS& vps, const SPS& sps)
7
{
8
+ int maxLayers = (vps.m_numLayers > 1 || vps.m_numViews > 1) + 1;
9
WRITE_CODE(0, 4, "vps_video_parameter_set_id");
10
WRITE_CODE(3, 2, "vps_reserved_three_2bits");
11
- WRITE_CODE(0, 6, "vps_reserved_zero_6bits");
12
+ WRITE_CODE(maxLayers - 1, 6, "vps_reserved_zero_6bits");
13
WRITE_CODE(vps.maxTempSubLayers - 1, 3, "vps_max_sub_layers_minus1");
14
WRITE_FLAG(vps.maxTempSubLayers == 1, "vps_temporal_id_nesting_flag");
15
WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits");
16
17
WRITE_UVLC(vps.maxLatencyIncreasei + 1, "vps_max_latency_increase_plus1i");
18
}
19
20
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
21
+ if (vps.m_numLayers > 1 || vps.m_numViews > 1)
22
+ {
23
+ WRITE_CODE(maxLayers - 1, 6, "vps_max_nuh_reserved_zero_layer_id");
24
+ WRITE_UVLC(vps.m_vpsNumLayerSetsMinus1, "vps_num_layer_sets_minus1");
25
+ for (int i = 1; i <= vps.m_vpsNumLayerSetsMinus1; i++)
26
+ {
27
+#if ENABLE_MULTIVIEW
28
+ if (vps.m_numViews > 1)
29
+ {
30
+ for (int j = 0; j < vps.m_numViews; j++)
31
+ {
32
+ WRITE_FLAG(1, "layer_id_included_flagopsIdxi");
33
+ }
34
+ }
35
+#endif
36
+#if ENABLE_ALPHA
37
+ if (vps.m_numLayers > 1)
38
+ {
39
+ for (int j = 0; j < vps.m_numLayers; j++)
40
+ {
41
+ WRITE_FLAG(1, "layer_id_included_flagopsIdxi");
42
+ }
43
+ }
44
+#endif
45
+ }
46
+ }
47
+ else
48
+ {
49
+ WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
50
+ WRITE_UVLC(0, "vps_max_op_sets_minus1");
51
+ }
52
+#else
53
WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
54
- WRITE_UVLC(0, "vps_max_op_sets_minus1");
55
+ WRITE_UVLC(0, "vps_max_op_sets_minus1");
56
+#endif
57
+
58
WRITE_FLAG(0, "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */
59
- WRITE_FLAG(0, "vps_extension_flag");
60
+
61
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
62
+ if (vps.m_numLayers > 1 || vps.m_numViews > 1)
63
+ {
64
+ WRITE_FLAG(vps.vps_extension_flag, "vps_extension_flag");
65
+
66
+ if (vps.vps_extension_flag)
67
+ {
68
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
69
+ {
70
+ WRITE_FLAG(1, "vps_extension_alignment_bit_equal_to_one");
71
+ }
72
+
73
+ WRITE_CODE(vps.ptl.levelIdc, 8, "general_level_idc");
74
+ if (vps.maxTempSubLayers > 1)
75
+ {
76
+ for (int i = 0; i < vps.maxTempSubLayers - 1; i++)
77
+ {
78
+ WRITE_FLAG(0, "sub_layer_profile_present_flagi");
79
+ WRITE_FLAG(0, "sub_layer_level_present_flagi");
80
+ }
81
+ for (int i = vps.maxTempSubLayers - 1; i < 8; i++)
82
+ WRITE_CODE(0, 2, "reserved_zero_2bits");
83
+ }
84
+
85
+ WRITE_FLAG(vps.splitting_flag, "splitting flag");
86
+ for (int i = 0; i < MAX_VPS_NUM_SCALABILITY_TYPES; i++)
87
+ {
88
+ WRITE_FLAG(vps.m_scalabilityMaski, "scalability_maski");
89
+ }
90
+ for (int i = 0; i < vps.scalabilityTypes - vps.splitting_flag; i++)
91
+ {
92
+ WRITE_CODE(vps.m_dimensionIdLeni - 1, 3, "dimension_id_len_minus1i");
93
+ }
94
+ WRITE_FLAG(vps.m_nuhLayerIdPresentFlag, "vps_nuh_layer_id_present_flag");
95
+ for (int i = 1; i < maxLayers; i++)
96
+ {
97
+ if (vps.m_nuhLayerIdPresentFlag)
98
+ WRITE_CODE(vps.m_layerIdInNuhi, 6, "layer_id_in_nuhi");
99
+
100
+ if (!vps.splitting_flag)
101
+ {
102
+ for (int j = 0; j < vps.scalabilityTypes; j++)
103
+ {
104
+ uint8_t bits = vps.m_dimensionIdLenj;
105
+ WRITE_CODE(vps.m_dimensionIdij, bits, "dimension_idij");
106
+ }
107
+ }
108
+ }
109
+ WRITE_CODE(vps.m_viewIdLen, 4, "view_id_len");
110
+
111
+#if ENABLE_ALPHA
112
+ if (vps.m_numLayers > 1)
113
+ {
114
+ WRITE_FLAG(0, "direct_dependency_flag10");
115
+ WRITE_UVLC(0, "num_add_layer_sets");
116
+ WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag");
117
+ WRITE_FLAG(0, "max_tid_ref_present_flag");
118
+ WRITE_FLAG(0, "default_ref_layers_active_flag");
119
+ WRITE_UVLC(2, "vps_num_profile_tier_level_minus1");
120
+ WRITE_FLAG(1, "vps_profile_present_flag");
121
+ codeProfileTier(vps.ptl, vps.maxTempSubLayers, 1);
122
+
123
+ WRITE_UVLC(0, "num_add_olss");
124
+ WRITE_CODE(0, 2, "default_output_layer_idc");
125
+ WRITE_CODE(1, 2, "profile_tier_level_idx i j ");
126
+ WRITE_CODE(2, 2, "profile_tier_level_idx i j ");
127
+
128
+ WRITE_UVLC(0, "vps_num_rep_formats_minus1");
129
+
130
+ WRITE_CODE(sps.picWidthInLumaSamples, 16, "pic_width_vps_in_luma_samples");
131
+ WRITE_CODE(sps.picHeightInLumaSamples, 16, "pic_height_vps_in_luma_samples");
132
+ WRITE_FLAG(1, "chroma_and_bit_depth_vps_present_flag");
133
+
134
+ WRITE_CODE(sps.chromaFormatIdc, 2, "chroma_format_vps_idc");
135
+
136
+ if (sps.chromaFormatIdc == X265_CSP_I444)
137
+ WRITE_FLAG(0, "separate_colour_plane_vps_flag");
138
+
139
+ WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_luma_minus8");
140
+ WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_chroma_minus8");
141
+
142
+ const Window& conf = sps.conformanceWindow;
143
+ WRITE_FLAG(conf.bEnabled, "conformance_window_vps_flag");
144
+ if (conf.bEnabled)
145
+ {
146
+ int hShift = CHROMA_H_SHIFT(sps.chromaFormatIdc), vShift = CHROMA_V_SHIFT(sps.chromaFormatIdc);
147
+ WRITE_UVLC(conf.leftOffset >> hShift, "conf_win_vps_left_offset");
148
+ WRITE_UVLC(conf.rightOffset >> hShift, "conf_win_vps_right_offset");
149
+ WRITE_UVLC(conf.topOffset >> vShift, "conf_win_vps_top_offset");
150
+ WRITE_UVLC(conf.bottomOffset >> vShift, "conf_win_vps_bottom_offset");
151
+ }
152
+
153
+ WRITE_FLAG(1, "max_one_active_ref_layer_flag");
154
+ WRITE_FLAG(0, "vps_poc_lsb_aligned_flag");
155
+ WRITE_FLAG(1, "poc_lsb_not_present_flag");
156
+
157
+ for (int i = 1; i < vps.m_vpsNumLayerSetsMinus1 + 1; i++)
158
+ {
159
+ WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_flag_info_present_flag");
160
+ for (int j = 0; j < vps.maxTempSubLayers ; j++)
161
+ {
162
+ if(j > 0)
163
+ WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_dpb_info_present_flag");
164
+
165
+ for(int k = 0; k < vps.m_numLayersInIdListi; k++)
166
+ WRITE_UVLC(vps.maxDecPicBufferingj - 1, "vps_max_dec_pic_buffering_minus1i");
167
+
168
+ WRITE_UVLC(vps.numReorderPics0, "vps_num_reorder_picsi");
169
+ WRITE_UVLC(vps.maxLatencyIncrease0 + 1, "vps_max_latency_increase_plus1i");
170
+ }
171
+ }
172
+
173
+ WRITE_UVLC(0, "direct_dep_type_len_minus2");
174
+
175
+ WRITE_FLAG(0, "default_direct_dependency_flag");
176
+ WRITE_UVLC(0, "vps_non_vui_extension_length");
177
+ WRITE_FLAG(0, "vps_vui_present_flag");
178
+ WRITE_FLAG(0, "vps_extension2_flag");
179
+ }
180
+#endif
181
+
182
+#if ENABLE_MULTIVIEW
183
+ if (vps.m_numViews > 1)
184
+ {
185
+ for (uint8_t i = 0; i < vps.m_numViews; i++)
186
+ WRITE_CODE(i, vps.m_viewIdLen, "view_id_vali");
187
+
188
+ for (int i = 1; i < vps.m_numViews; i++)
189
+ {
190
+ for (int j = 0; j < i; j++)
191
+ {
192
+ if (j == 0)
193
+ WRITE_FLAG(1, "direct_dependency_flag10");
194
+ else
195
+ WRITE_FLAG(0, "direct_dependency_flag10");
196
+ }
197
+ }
198
+ WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag");
199
+ WRITE_FLAG(0, "max_tid_ref_present_flag");
200
+ WRITE_FLAG(1, "default_ref_layers_active_flag");
201
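Note on the codeVPS() hunk above: most of the new VPS extension syntax is written with WRITE_UVLC, i.e. unsigned Exp-Golomb ue(v) coding. A small illustrative sketch of that encoding (bits returned as a string purely for clarity; this is not the encoder's bitstream class):

    #include <string>

    // ue(v): write value+1 with (bitlength-1) leading zero bits, then the value+1 bits.
    static std::string ueGolomb(unsigned value)
    {
        unsigned codeNum = value + 1;
        int len = 0;
        for (unsigned v = codeNum; v; v >>= 1)
            len++;                          // bit length of codeNum
        std::string bits(len - 1, '0');     // prefix of (len - 1) zeros
        for (int i = len - 1; i >= 0; i--)
            bits += ((codeNum >> i) & 1) ? '1' : '0';
        return bits;                        // ueGolomb(0) == "1", ueGolomb(2) == "011"
    }
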
x265_3.6.tar.gz/source/encoder/entropy.h -> x265_4.0.tar.gz/source/encoder/entropy.h
Changed
30
1
2
void loadIntraDirModeLuma(const Entropy& src);
3
void copyState(const Entropy& other);
4
5
- void codeVPS(const VPS& vps);
6
- void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
7
- void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 );
8
- void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo);
9
+ void codeVPS(const VPS& vps, const SPS& sps);
10
+ void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl, int layer = 0);
11
+ void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26, int layer = 0);
12
+ void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo, int layer = 0);
13
void codeAUD(const Slice& slice);
14
void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
15
16
- void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
17
+ void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp, int layer = 0);
18
void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
19
void codeShortTermRefPicSet(const RPS& rps, int idx);
20
void finishSlice() { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
21
22
void writeEpExGolomb(uint32_t symbol, uint32_t count);
23
void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice);
24
25
- void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers);
26
+ void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers, int layer = 0);
27
void codeScalingList(const ScalingList&);
28
void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId);
29
30
x265_3.6.tar.gz/source/encoder/frameencoder.cpp -> x265_4.0.tar.gz/source/encoder/frameencoder.cpp
Changed
201
1
2
3
FrameEncoder::FrameEncoder()
4
{
5
- m_prevOutputTime = x265_mdate();
6
m_reconfigure = false;
7
m_isFrameEncoder = true;
8
m_threadActive = true;
9
- m_slicetypeWaitTime = 0;
10
m_activeWorkerCount = 0;
11
m_completionCount = 0;
12
m_outStreams = NULL;
13
14
m_rows = NULL;
15
m_top = NULL;
16
m_param = NULL;
17
- m_frame = NULL;
18
m_cuGeoms = NULL;
19
m_ctuGeomMap = NULL;
20
m_localTldIdx = 0;
21
memset(&m_rce, 0, sizeof(RateControlEntry));
22
+ for (int layer = 0; layer < MAX_LAYERS; layer++)
23
+ {
24
+ m_prevOutputTime[layer] = x265_mdate();
25
+ m_slicetypeWaitTime[layer] = 0;
26
+ m_frame[layer] = NULL;
27
+ }
28
}
29
30
void FrameEncoder::destroy()
31
32
X265_FREE(m_ctuGeomMap);
33
X265_FREE(m_substreamSizes);
34
X265_FREE(m_nr);
35
+ X265_FREE(m_retFrameBuffer);
36
37
m_frameFilter.destroy();
38
39
40
ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefListi, m_param);
41
}
42
43
+ m_retFrameBuffer = X265_MALLOC(Frame*, m_param->numLayers);
44
+ for (int layer = 0; layer < m_param->numLayers; layer++)
45
+ m_retFrameBuffer[layer] = NULL;
46
return ok;
47
}
48
49
50
return true;
51
}
52
53
-bool FrameEncoder::startCompressFrame(Frame* curFrame)
54
+bool FrameEncoder::startCompressFrame(Frame* curFrame[MAX_LAYERS])
55
{
56
- m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
57
- m_frame = curFrame;
58
- m_sliceType = curFrame->m_lowres.sliceType;
59
- curFrame->m_encData->m_frameEncoderID = m_jpId;
60
- curFrame->m_encData->m_jobProvider = this;
61
- curFrame->m_encData->m_slice->m_mref = m_mref;
62
+ for (int layer = 0; layer < m_param->numLayers; layer++)
63
+ {
64
+ m_slicetypeWaitTime[layer] = x265_mdate() - m_prevOutputTime[layer];
65
+ m_frame[layer] = curFrame[layer];
66
+ curFrame[layer]->m_encData->m_frameEncoderID = m_jpId;
67
+ curFrame[layer]->m_encData->m_jobProvider = this;
68
+ curFrame[layer]->m_encData->m_slice->m_mref = m_mref;
69
+ }
70
+ m_sliceType = curFrame[0]->m_lowres.sliceType;
71
72
if (!m_cuGeoms)
73
{
74
75
{
76
if (m_param->bCTUInfo)
77
{
78
- while (!m_frame->m_ctuInfo)
79
- m_frame->m_copied.wait();
80
+ while (!m_frame0->m_ctuInfo)
81
+ m_frame0->m_copied.wait();
82
}
83
- if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
84
+ if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame0->m_lowres.sliceType)))
85
{
86
- while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
87
- m_frame->m_copyMVType.wait();
88
+ while (((m_frame0->m_analysisData.interData == NULL && m_frame0->m_analysisData.intraData == NULL) || (uint32_t)m_frame0->m_poc != m_frame0->m_analysisData.poc))
89
+ m_frame0->m_copyMVType.wait();
90
}
91
- compressFrame();
92
+
93
+ for (int layer = 0; layer < m_param->numLayers; layer++)
94
+ compressFrame(layer);
95
m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
96
m_enable.wait();
97
}
98
99
100
void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */)
101
{
102
- Frame* frame = master.m_frame;
103
+ Frame* frame = master.m_framemaster.m_sLayerId;
104
weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
105
}
106
107
108
memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
109
}
110
111
- bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
112
+ bool isIDR = m_frame0->m_lowres.sliceType == X265_TYPE_IDR;
113
return (payloadChange || isIDR);
114
}
115
116
-void FrameEncoder::writeTrailingSEIMessages()
117
+void FrameEncoder::writeTrailingSEIMessages(int layer)
118
{
119
- Slice* slice = m_frame->m_encData->m_slice;
120
+ Slice* slice = m_framelayer->m_encData->m_slice;
121
int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
122
int32_t payloadSize = 0;
123
124
125
}
126
127
m_seiReconPictureDigest.setSize(payloadSize);
128
- m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
129
+ m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false, layer);
130
}
131
132
-void FrameEncoder::compressFrame()
133
+void FrameEncoder::compressFrame(int layer)
134
{
135
ProfileScopeEvent(frameThread);
136
137
- m_startCompressTime = x265_mdate();
138
+ m_startCompressTimelayer = x265_mdate();
139
m_totalActiveWorkerCount = 0;
140
m_activeWorkerCountSamples = 0;
141
- m_totalWorkerElapsedTime = 0;
142
- m_totalNoWorkerTime = 0;
143
+ m_totalWorkerElapsedTimelayer = 0;
144
+ m_totalNoWorkerTimelayer = 0;
145
m_countRowBlocks = 0;
146
- m_allRowsAvailableTime = 0;
147
- m_stallStartTime = 0;
148
+ m_allRowsAvailableTimelayer = 0;
149
+ m_stallStartTimelayer = 0;
150
151
m_completionCount = 0;
152
memset((void*)m_bAllRowsStop, 0, sizeof(bool) * m_param->maxSlices);
153
154
m_rowSliceTotalBits0 = 0;
155
m_rowSliceTotalBits1 = 0;
156
157
- m_SSDY = m_SSDU = m_SSDV = 0;
158
- m_ssim = 0;
159
- m_ssimCnt = 0;
160
- memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
161
+ m_SSDYlayer = m_SSDUlayer = m_SSDVlayer = 0;
162
+ m_ssimlayer = 0;
163
+ m_ssimCntlayer = 0;
164
+ memset(&(m_framelayer->m_encData->m_frameStats), 0, sizeof(m_framelayer->m_encData->m_frameStats));
165
+ m_sLayerId = layer;
166
167
if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
168
{
169
- int height = m_frame->m_fencPic->m_picHeight;
170
- int width = m_frame->m_fencPic->m_picWidth;
171
- intptr_t stride = m_frame->m_fencPic->m_stride;
172
+ int height = m_framelayer->m_fencPic->m_picHeight;
173
+ int width = m_framelayer->m_fencPic->m_picWidth;
174
+ intptr_t stride = m_framelayer->m_fencPic->m_stride;
175
176
- if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1))
177
+ if (!computeEdge(m_framelayer->m_edgeBitPic, m_framelayer->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1))
178
{
179
x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
180
}
181
182
/* Emit access unit delimiter unless this is the first frame and the user is
183
* not repeating headers (since AUD is supposed to be the first NAL in the access
184
* unit) */
185
- Slice* slice = m_frame->m_encData->m_slice;
186
+ Slice* slice = m_framelayer->m_encData->m_slice;
187
188
- if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
189
+ if (m_param->bEnableEndOfSequence && m_framelayer->m_lowres.sliceType == X265_TYPE_IDR && m_framelayer->m_poc)
190
{
191
m_bs.resetBits();
192
m_nalList.serialize(NAL_UNIT_EOS, m_bs);
193
}
194
195
- if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
196
+ if (m_param->bEnableAccessUnitDelimiters && (m_framelayer->m_poc || m_param->bRepeatHeaders))
197
{
198
m_bs.resetBits();
199
m_entropyCoder.setBitstream(&m_bs);
200
201
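Note on the frameencoder.cpp hunk above: the timing and statistics fields become per-layer arrays, so each layer tracks its own wait and output timestamps. A hedged sketch of that pattern with a std::chrono stand-in for x265_mdate(); all names here are placeholders:

    #include <chrono>
    #include <cstdint>

    // Microsecond clock standing in for x265_mdate() in this sketch.
    static int64_t nowMicroseconds()
    {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    constexpr int kMaxLayers = 2;   // placeholder bound

    struct LayerTimers
    {
        int64_t prevOutputTime[kMaxLayers] = {};
        int64_t slicetypeWaitTime[kMaxLayers] = {};

        // When a decided frame is handed to the frame encoder, record how long each
        // layer waited since its previous output was retrieved.
        void onFrameHanded(int numLayers)
        {
            for (int layer = 0; layer < numLayers; layer++)
                slicetypeWaitTime[layer] = nowMicroseconds() - prevOutputTime[layer];
        }

        // When the API thread retrieves a layer's output, reset that layer's baseline.
        void onOutputRetrieved(int layer) { prevOutputTime[layer] = nowMicroseconds(); }
    };
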
x265_3.6.tar.gz/source/encoder/frameencoder.h -> x265_4.0.tar.gz/source/encoder/frameencoder.h
Changed
116
1
2
void destroy();
3
4
/* triggers encode of a new frame by the worker thread */
5
- bool startCompressFrame(Frame* curFrame);
6
+ bool startCompressFrame(Frame* curFrame[MAX_LAYERS]);
7
8
/* blocks until worker thread is done, returns access unit */
9
- Frame *getEncodedPicture(NALList& list);
10
+ Frame **getEncodedPicture(NALList& list);
11
12
- void initDecodedPictureHashSEI(int row, int cuAddr, int height);
13
+ void initDecodedPictureHashSEI(int row, int cuAddr, int height, int layer);
14
15
Event m_enable;
16
Event m_done;
17
18
RateControlEntry m_rce;
19
SEIDecodedPictureHash m_seiReconPictureDigest;
20
21
- uint64_t m_SSDY;
22
- uint64_t m_SSDU;
23
- uint64_t m_SSDV;
24
- double m_ssim;
25
- uint64_t m_accessUnitBits;
26
- uint32_t m_ssimCnt;
27
+ uint64_t m_SSDY[MAX_LAYERS];
28
+ uint64_t m_SSDU[MAX_LAYERS];
29
+ uint64_t m_SSDV[MAX_LAYERS];
30
+ double m_ssim[MAX_LAYERS];
31
+ uint64_t m_accessUnitBits[MAX_LAYERS];
32
+ uint32_t m_ssimCnt[MAX_LAYERS];
33
34
volatile int m_activeWorkerCount; // count of workers currently encoding or filtering CTUs
35
volatile int m_totalActiveWorkerCount; // sum of m_activeWorkerCount sampled at end of each CTU
36
volatile int m_activeWorkerCountSamples; // count of times m_activeWorkerCount was sampled (think vbv restarts)
37
volatile int m_countRowBlocks; // count of workers forced to abandon a row because of top dependency
38
- int64_t m_startCompressTime; // timestamp when frame encoder is given a frame
39
- int64_t m_row0WaitTime; // timestamp when row 0 is allowed to start
40
- int64_t m_allRowsAvailableTime; // timestamp when all reference dependencies are resolved
41
- int64_t m_endCompressTime; // timestamp after all CTUs are compressed
42
- int64_t m_endFrameTime; // timestamp after RCEnd, NR updates, etc
43
- int64_t m_stallStartTime; // timestamp when worker count becomes 0
44
- int64_t m_prevOutputTime; // timestamp when prev frame was retrieved by API thread
45
- int64_t m_slicetypeWaitTime; // total elapsed time waiting for decided frame
46
- int64_t m_totalWorkerElapsedTime; // total elapsed time spent by worker threads processing CTUs
47
- int64_t m_totalNoWorkerTime; // total elapsed time without any active worker threads
48
+ int64_t m_startCompressTimeMAX_LAYERS; // timestamp when frame encoder is given a frame
49
+ int64_t m_row0WaitTimeMAX_LAYERS; // timestamp when row 0 is allowed to start
50
+ int64_t m_allRowsAvailableTimeMAX_LAYERS; // timestamp when all reference dependencies are resolved
51
+ int64_t m_endCompressTimeMAX_LAYERS; // timestamp after all CTUs are compressed
52
+ int64_t m_endFrameTimeMAX_LAYERS; // timestamp after RCEnd, NR updates, etc
53
+ int64_t m_stallStartTimeMAX_LAYERS; // timestamp when worker count becomes 0
54
+ int64_t m_prevOutputTimeMAX_LAYERS; // timestamp when prev frame was retrieved by API thread
55
+ int64_t m_slicetypeWaitTimeMAX_LAYERS; // total elapsed time waiting for decided frame
56
+ int64_t m_totalWorkerElapsedTimeMAX_LAYERS; // total elapsed time spent by worker threads processing CTUs
57
+ int64_t m_totalNoWorkerTimeMAX_LAYERS; // total elapsed time without any active worker threads
58
#if DETAILED_CU_STATS
59
CUStats m_cuStats;
60
#endif
61
62
Encoder* m_top;
63
x265_param* m_param;
64
- Frame* m_frame;
65
+ Frame* m_frame[MAX_LAYERS];
66
+ Frame** m_retFrameBuffer;
67
NoiseReduction* m_nr;
68
ThreadLocalData* m_tld; /* for --no-wpp */
69
Bitstream* m_outStreams;
70
71
TemporalFilter* m_frameEncTF;
72
TemporalFilterRefPicInfo m_mcstfRefListMAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
73
74
+ int m_sLayerId;
75
+
76
class WeightAnalysis : public BondedTaskGroup
77
{
78
public:
79
80
bool initializeGeoms();
81
82
/* analyze / compress frame, can be run in parallel within reference constraints */
83
- void compressFrame();
84
+ void compressFrame(int layer);
85
86
/* called by compressFrame to generate final per-row bitstreams */
87
- void encodeSlice(uint32_t sliceAddr);
88
+ void encodeSlice(uint32_t sliceAddr, int layer);
89
90
void threadMain();
91
int collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
92
void noiseReductionUpdate();
93
- void writeTrailingSEIMessages();
94
+ void writeTrailingSEIMessages(int layer);
95
bool writeToneMapInfo(x265_sei_payload *payload);
96
97
/* Called by WaveFront::findJob() */
98
- virtual void processRow(int row, int threadId);
99
- virtual void processRowEncoder(int row, ThreadLocalData& tld);
100
+ virtual void processRow(int row, int threadId, int layer);
101
+ virtual void processRowEncoder(int row, ThreadLocalData& tld, int layer);
102
103
void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
104
void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); }
105
106
#if ENABLE_LIBVMAF
107
void vmafFrameLevelScore();
108
#endif
109
- void collectDynDataFrame();
110
- void computeAvgTrainingData();
111
+ void collectDynDataFrame(int layer);
112
+ void computeAvgTrainingData(int layer);
113
void collectDynDataRow(CUData& ctu, FrameStats* rowStats);
114
void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
115
};
116
x265_3.6.tar.gz/source/encoder/framefilter.cpp -> x265_4.0.tar.gz/source/encoder/framefilter.cpp
Changed
137
1
2
const int size = cu->m_log2CUSizeabsPartIdx - 2;
3
const uint32_t cuAddr = cu->m_cuAddr;
4
5
- PicYuv* reconPic = frame.m_reconPic;
6
+ PicYuv* reconPic = frame.m_reconPic0;
7
PicYuv* fencPic = frame.m_fencPic;
8
9
pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
10
11
12
uint32_t cuAddr = m_rowAddr + col;
13
const CUData* ctu = m_encData->getPicCTU(cuAddr);
14
- assert(m_frameFilter->m_frame->m_reconPic == m_encData->m_reconPic);
15
+ assert(m_frameFilter->m_frame->m_reconPic0 == m_encData->m_reconPic0);
16
origCUSampleRestoration(ctu, cuGeomsctuGeomMapcuAddr, *m_frameFilter->m_frame);
17
}
18
}
19
20
if ((col != 0) & (col != m_frameFilter->m_numCols - 1) & (m_row != 0) & (m_row != m_frameFilter->m_numRows - 1))
21
return;
22
23
- PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;
24
+ PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic0;
25
const uint32_t lineStartCUAddr = m_rowAddr + col;
26
const int realH = getCUHeight();
27
const int realW = m_frameFilter->getCUWidth(col);
28
29
SAOParam* saoParam = m_encData->m_saoParam;
30
const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
31
const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
32
- PicYuv* reconPic = m_encData->m_reconPic;
33
+ PicYuv* reconPic = m_encData->m_reconPic0;
34
const int colStart = m_lastCol.get();
35
const int numCols = m_frameFilter->m_numCols;
36
// TODO: Waiting previous row finish or simple clip on it?
37
38
}
39
}
40
41
-void FrameFilter::processRow(int row)
42
+void FrameFilter::processRow(int row, int layer)
43
{
44
ProfileScopeEvent(filterCTURow);
45
46
47
48
if (!m_param->bEnableLoopFilter && !m_useSao)
49
{
50
- processPostRow(row);
51
+ processPostRow(row, layer);
52
return;
53
}
54
FrameData& encData = *m_frame->m_encData;
55
56
57
// this row of CTUs has been encoded
58
if (!ctu->m_bFirstRowInSlice)
59
- processPostRow(row - 1);
60
+ processPostRow(row - 1, layer);
61
62
// NOTE: slices parallelism will be execute out-of-order
63
int numRowFinished = 0;
64
65
}
66
67
if (ctu->m_bLastRowInSlice)
68
- processPostRow(row);
69
+ processPostRow(row, layer);
70
}
71
72
-void FrameFilter::processPostRow(int row)
73
+void FrameFilter::processPostRow(int row, int layer)
74
{
75
- PicYuv *reconPic = m_frame->m_reconPic;
76
+ PicYuv *reconPic = m_frame->m_reconPic0;
77
const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
78
const uint32_t lineStartCUAddr = row * numCols;
79
80
81
uint32_t height = m_parallelFilterrow.getCUHeight();
82
83
uint64_t ssdY = m_frameEncoder->m_top->computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height, m_param);
84
- m_frameEncoder->m_SSDY += ssdY;
85
+ m_frameEncoder->m_SSDY[layer] += ssdY;
86
87
if (m_param->internalCsp != X265_CSP_I400)
88
{
89
90
uint64_t ssdU = m_frameEncoder->m_top->computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height, m_param);
91
uint64_t ssdV = m_frameEncoder->m_top->computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height, m_param);
92
93
- m_frameEncoder->m_SSDU += ssdU;
94
- m_frameEncoder->m_SSDV += ssdV;
95
+ m_frameEncoder->m_SSDU[layer] += ssdU;
96
+ m_frameEncoder->m_SSDV[layer] += ssdV;
97
}
98
}
99
100
101
/* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
102
* to avoid alignment of ssim blocks with DCT blocks. */
103
minPixY += bStart ? 2 : -6;
104
- m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
105
+ m_frameEncoder->m_ssim[layer] += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
106
m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
107
- m_frameEncoder->m_ssimCnt += ssim_cnt;
108
+ m_frameEncoder->m_ssimCnt[layer] += ssim_cnt;
109
}
110
111
if (m_param->maxSlices == 1)
112
{
113
uint32_t height = m_parallelFilterrow.getCUHeight();
114
- m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height);
115
+ m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height, layer);
116
} // end of (m_param->maxSlices == 1)
117
118
if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
119
120
}
121
}
122
123
- int stride = (int)m_frame->m_reconPic->m_stride;
124
+ int stride = (int)m_frame->m_reconPic0->m_stride;
125
int padX = m_param->maxCUSize + 32;
126
int padY = m_param->maxCUSize + 16;
127
int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight;
128
129
130
for (int y = startRow; y < height; y++)
131
{
132
- pixel *pix = m_frame->m_reconPic->m_picOrg0 + y * stride - padX;
133
+ pixel *pix = m_frame->m_reconPic0->m_picOrg0 + y * stride - padX;
134
uint32_t *sum32x32 = m_frame->m_encData->m_meIntegral0 + (y + 1) * stride - padX;
135
uint32_t *sum32x24 = m_frame->m_encData->m_meIntegral1 + (y + 1) * stride - padX;
136
uint32_t *sum32x8 = m_frame->m_encData->m_meIntegral2 + (y + 1) * stride - padX;
137
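Note on the framefilter.cpp hunk above: the per-row SSIM accumulation now indexes m_ssim/m_ssimCnt by layer; calculateSSIM() itself still sums the standard SSIM measure over small blocks. For reference, a plain (not bit-exact) sketch of SSIM for one 8-bit block:

    #include <cstdint>

    // Plain SSIM for one block of 8-bit samples; C1/C2 use k1=0.01, k2=0.03, L=255.
    static double ssimBlock(const uint8_t* ref, intptr_t refStride,
                            const uint8_t* rec, intptr_t recStride, int w, int h)
    {
        double sumX = 0, sumY = 0, sumXX = 0, sumYY = 0, sumXY = 0;
        const double n = double(w) * h;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
            {
                double a = ref[y * refStride + x], b = rec[y * recStride + x];
                sumX += a; sumY += b;
                sumXX += a * a; sumYY += b * b; sumXY += a * b;
            }
        double muX = sumX / n, muY = sumY / n;
        double varX = sumXX / n - muX * muX;
        double varY = sumYY / n - muY * muY;
        double cov  = sumXY / n - muX * muY;
        const double C1 = (0.01 * 255) * (0.01 * 255);
        const double C2 = (0.03 * 255) * (0.03 * 255);
        return ((2 * muX * muY + C1) * (2 * cov + C2)) /
               ((muX * muX + muY * muY + C1) * (varX + varY + C2));
    }
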
x265_3.6.tar.gz/source/encoder/framefilter.h -> x265_4.0.tar.gz/source/encoder/framefilter.h
Changed
12
1
2
3
void start(Frame *pic, Entropy& initState);
4
5
- void processRow(int row);
6
- void processPostRow(int row);
7
+ void processRow(int row, int layer);
8
+ void processPostRow(int row, int layer);
9
void computeMEIntegral(int row);
10
};
11
}
12
x265_3.6.tar.gz/source/encoder/level.cpp -> x265_4.0.tar.gz/source/encoder/level.cpp
Changed
201
1
2
{ MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 },
3
};
4
5
+#if ENABLE_SCC_EXT
6
+enum SCCProfileName
7
+{
8
+ NONE = 0,
9
+ // The following are SCC profiles, which would map to the MAINSCC profile idc.
10
+ // The enumeration indicates the bit-depth constraint in the bottom 2 digits
11
+ // the chroma format in the next digit
12
+ // the intra constraint in the next digit
13
+ // If it is an SCC profile, the next digit is '2'.
14
+ // If it is a high-throughput profile, the top digit is '2'; otherwise the top digit is '1'.
15
+ SCC_MAIN = 121108,
16
+ SCC_MAIN_10 = 121110,
17
+ SCC_MAIN_444 = 121308,
18
+ SCC_MAIN_444_10 = 121310,
19
+};
20
+
21
+static const SCCProfileName validSCCProfileNames[1][4/* bit depth constraint 8=0, 10=1, 12=2, 14=3*/][4/*chroma format*/] =
22
+{
23
+ {
24
+ { NONE, SCC_MAIN, NONE, SCC_MAIN_444 }, // 8-bit intra for 400, 420, 422 and 444
25
+ { NONE, SCC_MAIN_10, NONE, SCC_MAIN_444_10 }, // 10-bit intra for 400, 420, 422 and 444
26
+ { NONE, NONE, NONE, NONE }, // 12-bit intra for 400, 420, 422 and 444
27
+ { NONE, NONE, NONE, NONE } // 16-bit intra for 400, 420, 422 and 444
28
+ },
29
+};
30
+#endif
31
+
32
+static inline int _confirm(x265_param* param, bool bflag, const char* message)
33
+{
34
+ if (!bflag)
35
+ return 0;
36
+
37
+ x265_log(param, X265_LOG_ERROR, "%s\n", message);
38
+ return 1;
39
+}
40
+
41
/* determine minimum decoder level required to decode the described video */
42
void determineLevel(const x265_param ¶m, VPS& vps)
43
{
44
45
if (param.internalBitDepth <= 8)
46
{
47
if (vps.ptl.onePictureOnlyConstraintFlag)
48
- vps.ptl.profileIdc = Profile::MAINSTILLPICTURE;
49
+ vps.ptl.profileIdc0 = Profile::MAINSTILLPICTURE;
50
else if (vps.ptl.intraConstraintFlag)
51
- vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */
52
+ vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main Intra */
53
else
54
- vps.ptl.profileIdc = Profile::MAIN;
55
+ vps.ptl.profileIdc0 = Profile::MAIN;
56
+
57
+#if ENABLE_ALPHA
58
+ if (param.numScalableLayers == 2)
59
+ vps.ptl.profileIdc1 = Profile::SCALABLEMAIN;
60
+#endif
61
}
62
else if (param.internalBitDepth <= 10)
63
{
64
/* note there is no 10bit still picture profile */
65
if (vps.ptl.intraConstraintFlag)
66
- vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */
67
+ vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main10 Intra */
68
else
69
- vps.ptl.profileIdc = Profile::MAIN10;
70
+ vps.ptl.profileIdc0 = Profile::MAIN10;
71
+
72
+#if ENABLE_ALPHA
73
+ if (param.numScalableLayers == 2)
74
+ vps.ptl.profileIdc1 = Profile::SCALABLEMAIN10;
75
+#endif
76
}
77
}
78
else
79
- vps.ptl.profileIdc = Profile::MAINREXT;
80
+ vps.ptl.profileIdc0 = Profile::MAINREXT;
81
+
82
+#if ENABLE_MULTIVIEW
83
+ if (param.numViews == 2)
84
+ vps.ptl.profileIdc1 = Profile::MULTIVIEWMAIN;
85
+#endif
86
+
87
+#if ENABLE_SCC_EXT
88
+ if (param.bEnableSCC)
89
+ vps.ptl.profileIdc0 = Profile::MAINSCC;
90
91
/* determine which profiles are compatible with this stream */
92
+ if (vps.ptl.profileIdc0 == Profile::MAINSCC)
93
+ {
94
+ vps.ptl.onePictureOnlyConstraintFlag = false;
95
+ vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag;
96
+ }
97
+#endif
98
99
memset(vps.ptl.profileCompatibilityFlag, 0, sizeof(vps.ptl.profileCompatibilityFlag));
100
- vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc = true;
101
- if (vps.ptl.profileIdc == Profile::MAIN10 && param.internalBitDepth == 8)
102
+ vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc0 = true;
103
+ if (vps.ptl.profileIdc0 == Profile::MAIN10 && param.internalBitDepth == 8)
104
vps.ptl.profileCompatibilityFlagProfile::MAIN = true;
105
- else if (vps.ptl.profileIdc == Profile::MAIN)
106
+ else if (vps.ptl.profileIdc0 == Profile::MAIN)
107
vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true;
108
- else if (vps.ptl.profileIdc == Profile::MAINSTILLPICTURE)
109
+ else if (vps.ptl.profileIdc0 == Profile::MAINSTILLPICTURE)
110
{
111
vps.ptl.profileCompatibilityFlagProfile::MAIN = true;
112
vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true;
113
}
114
- else if (vps.ptl.profileIdc == Profile::MAINREXT)
115
+ else if (vps.ptl.profileIdc0 == Profile::MAINREXT)
116
vps.ptl.profileCompatibilityFlagProfile::MAINREXT = true;
117
+#if ENABLE_SCC_EXT
118
+ else if (vps.ptl.profileIdc0 == Profile::MAINSCC)
119
+ vps.ptl.profileCompatibilityFlagProfile::MAINSCC = true;
120
+#endif
121
122
uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
123
uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
124
uint32_t bitrate = param.rc.vbvMaxBitrate ? param.rc.vbvMaxBitrate : param.rc.bitrate;
125
126
- const uint32_t MaxDpbPicBuf = 6;
127
+ const uint32_t MaxDpbPicBuf = param.bEnableSCC ? 7 : 6;
128
vps.ptl.levelIdc = Level::NONE;
129
vps.ptl.tierFlag = Level::MAIN;
130
131
132
if (levelsi.levelEnum >= Level::LEVEL5 && param.maxCUSize < 32)
133
{
134
x265_log(¶m, X265_LOG_WARNING, "level %s detected, but CTU size 16 is non-compliant\n", levelsi.name);
135
- vps.ptl.profileIdc = Profile::NONE;
136
+ vps.ptl.profileIdc0 = Profile::NONE;
137
vps.ptl.levelIdc = Level::NONE;
138
vps.ptl.tierFlag = Level::MAIN;
139
x265_log(¶m, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
140
141
if (numPocTotalCurr > 10)
142
{
143
x265_log(¶m, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levelsi.name);
144
- vps.ptl.profileIdc = Profile::NONE;
145
+ vps.ptl.profileIdc0 = Profile::NONE;
146
vps.ptl.levelIdc = Level::NONE;
147
vps.ptl.tierFlag = Level::MAIN;
148
x265_log(¶m, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
149
150
break;
151
}
152
153
- static const char *profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt" };
154
+#if ENABLE_SCC_EXT
155
+ x265_param m_param = param;
156
+#define CHECK(expr, msg) check_failed |= _confirm(&m_param, expr, msg)
157
+ int check_failed = 0; /* abort if there is a fatal configuration problem */
158
+
159
+ if (vps.ptl.profileIdc0 == Profile::MAINSCC)
160
+ {
161
+ CHECK(vps.ptl.lowerBitRateConstraintFlag == false && vps.ptl.intraConstraintFlag == false, "The lowerBitRateConstraint flag cannot be false when intraConstraintFlag is false");
162
+ CHECK(param.bEnableSCC && !(vps.ptl.profileIdc0 == Profile::MAINSCC), "UseIntraBlockCopy must not be enabled unless the SCC profile is being used.");
163
+ CHECK(vps.ptl.intraConstraintFlag, "intra constraint flag must be 0 for SCC profiles");
164
+ CHECK(vps.ptl.onePictureOnlyConstraintFlag, "one-picture-only constraint flag shall be 0 for SCC profiles");
165
+ const uint32_t bitDepthIdx = (vps.ptl.bitDepthConstraint == 8 ? 0 : (vps.ptl.bitDepthConstraint == 10 ? 1 : (vps.ptl.bitDepthConstraint == 12 ? 2 : (vps.ptl.bitDepthConstraint == 16 ? 3 : 4))));
166
+ const uint32_t chromaFormatIdx = uint32_t(vps.ptl.chromaFormatConstraint);
167
+ const bool bValidProfile = (bitDepthIdx > 2 || chromaFormatIdx > 3) ? false : (validSCCProfileNames[0][bitDepthIdx][chromaFormatIdx] != NONE);
168
+ CHECK(!bValidProfile, "Invalid intra constraint flag, bit depth constraint flag and chroma format constraint flag combination for a RExt profile");
169
+ }
170
+#endif
171
+
172
+ static const char* profiles[] = { "None", "Main", "Main 10", "Main Still Picture", "RExt", "", "", "", "", "Main Scc" };
173
static const char *tiers = { "Main", "High" };
174
175
char profbuf64;
176
- strcpy(profbuf, profiles[vps.ptl.profileIdc]);
177
+ strcpy(profbuf, profiles[vps.ptl.profileIdc[0]]);
178
179
bool bStillPicture = false;
180
- if (vps.ptl.profileIdc == Profile::MAINREXT)
181
+ if (vps.ptl.profileIdc0 == Profile::MAINREXT)
182
{
183
if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag)
184
{
185
186
if (vps.ptl.intraConstraintFlag && !bStillPicture)
187
strcat(profbuf, " Intra");
188
}
189
+
190
+#if ENABLE_SCC_EXT
191
+ if (vps.ptl.profileIdc0 == Profile::MAINSCC)
192
+ {
193
+ if (param.internalCsp == X265_CSP_I420)
194
+ {
195
+ if (vps.ptl.bitDepthConstraint <= 8)
196
+ strcpy(profbuf, "Main Scc");
197
+ else if (vps.ptl.bitDepthConstraint <= 10)
198
+ strcpy(profbuf, "Main 10 Scc");
199
+ }
200
+ else if (param.internalCsp == X265_CSP_I444)
201
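Note on the level.cpp hunk above: the new _confirm() helper and the CHECK macro log each violated SCC constraint and accumulate a failure flag rather than aborting on the first error. A hedged stand-alone sketch of that pattern; the function names here are placeholders:

    #include <cstdio>

    // Log-and-accumulate check: returns 1 (and logs) when the bad condition holds.
    static int confirmFailed(bool badCondition, const char* message)
    {
        if (!badCondition)
            return 0;
        fprintf(stderr, "x265 [error]: %s\n", message);   // stands in for x265_log()
        return 1;
    }

    static bool sccConstraintsOk(bool lowerBitRateConstraint, bool intraConstraint,
                                 bool onePictureOnlyConstraint)
    {
        int failed = 0;
        failed |= confirmFailed(!lowerBitRateConstraint && !intraConstraint,
                                "lowerBitRateConstraint cannot be false when intraConstraintFlag is false");
        failed |= confirmFailed(intraConstraint,
                                "intra constraint flag must be 0 for SCC profiles");
        failed |= confirmFailed(onePictureOnlyConstraint,
                                "one-picture-only constraint flag shall be 0 for SCC profiles");
        return failed == 0;
    }
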
x265_3.6.tar.gz/source/encoder/motion.cpp -> x265_4.0.tar.gz/source/encoder/motion.cpp
Changed
23
1
2
int merange,
3
MV & outQMv,
4
uint32_t maxSlices,
5
+ bool m_vertRestriction,
6
pixel * srcReferencePlane)
7
{
8
ALIGN_VAR_16(int, costs[16]);
9
10
11
// measure SAD cost at clipped QPEL MVP
12
MV pmv = qmvp.clipped(qmvmin, qmvmax);
13
+ if (m_vertRestriction)
14
+ {
15
+ if (pmv.y > mvmax.y << 2)
16
+ {
17
+ pmv.y = (mvmax.y << 2);
18
+ }
19
+ }
20
MV bestpre = pmv;
21
int bprecost;
22
23
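Note on the motion.cpp hunk above: the new m_vertRestriction clamp compares a quarter-pel predictor against mvmax.y, which the << 2 shift suggests is kept in full-pel units. A minimal sketch of that conversion and clamp with a hypothetical MV type:

    struct QPelMV { int x, y; };   // motion vector in quarter-pel units (hypothetical type)

    // Clamp the vertical component of a quarter-pel predictor to a full-pel bound.
    static void clampVertical(QPelMV& pmv, int mvmaxYFullPel)
    {
        int maxQPel = mvmaxYFullPel << 2;   // full-pel -> quarter-pel
        if (pmv.y > maxQPel)
            pmv.y = maxQPel;
    }
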
x265_3.6.tar.gz/source/encoder/motion.h -> x265_4.0.tar.gz/source/encoder/motion.h
Changed
10
1
2
}
3
4
void refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, const MV& qmvp, MV& outQMv);
5
- int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, pixel *srcReferencePlane = 0);
6
+ int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, bool m_vertRestriction, pixel *srcReferencePlane = 0);
7
8
int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
9
10
x265_3.6.tar.gz/source/encoder/nal.cpp -> x265_4.0.tar.gz/source/encoder/nal.cpp
Changed
19
1
2
other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
}
4
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId, uint8_t temporalID)
7
{
8
static const char startCodePrefix = { 0, 0, 0, 1 };
9
10
11
* nuh_reserved_zero_6bits 6-bits
12
* nuh_temporal_id_plus1 3-bits */
13
out[bytes++] = (uint8_t)nalUnitType << 1;
14
- out[bytes++] = temporalID;
15
+ out[bytes++] = (layerId << 3) | (temporalID);
16
17
/* 7.4.1 ...
18
* Within the NAL unit, the following three-byte sequences shall not occur at
19
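Note on the nal.cpp hunk above: the second byte of the two-byte HEVC NAL unit header now carries nuh_layer_id as well as nuh_temporal_id_plus1. A small sketch of the header packing as defined by the spec, assuming layerId < 32 so its top bit in the first byte stays 0:

    #include <cstdint>

    // HEVC NAL unit header (16 bits): forbidden_zero_bit(1) | nal_unit_type(6) |
    // nuh_layer_id(6) | nuh_temporal_id_plus1(3).
    static void packNalHeader(uint8_t out[2], uint32_t nalUnitType,
                              uint32_t layerId, uint32_t temporalIdPlus1)
    {
        out[0] = (uint8_t)((nalUnitType & 0x3F) << 1);                        // layer-id MSB assumed 0
        out[1] = (uint8_t)(((layerId & 0x1F) << 3) | (temporalIdPlus1 & 0x7));
    }
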
x265_3.6.tar.gz/source/encoder/nal.h -> x265_4.0.tar.gz/source/encoder/nal.h
Changed
22
1
2
class NALList
3
{
4
public:
5
+#if ENABLE_MULTIVIEW || ENABLE_ALPHA
6
+ static const int MAX_NAL_UNITS = 32;
7
+#else
8
static const int MAX_NAL_UNITS = 16;
9
+#endif
10
11
public:
12
13
14
15
void takeContents(NALList& other);
16
17
- void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
18
+ void serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId = 0, uint8_t temporalID = 1);
19
20
uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
21
};
22
x265_3.6.tar.gz/source/encoder/ratecontrol.cpp -> x265_4.0.tar.gz/source/encoder/ratecontrol.cpp
Changed
21
1
2
FrameData& curEncData = *curFrame->m_encData;
3
m_curSlice = curEncData.m_slice;
4
m_sliceType = m_curSlice->m_sliceType;
5
+#if ENABLE_SCC_EXT
6
+ if(m_param->bEnableSCC)
7
+ m_sliceType = m_curSlice->m_origSliceType;
8
+#endif
9
rce->sliceType = m_sliceType;
10
if (!m_2pass)
11
rce->keptAsRef = IS_REFERENCED(curFrame);
12
13
14
int mincr = enc->m_vps.ptl.minCrForLevel;
15
/* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */
16
- if (enc->m_vps.ptl.profileIdc > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE)
17
+ if (enc->m_vps.ptl.profileIdc0 > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE)
18
rce->frameSizeMaximum = 1e9;
19
else
20
{
21
x265_3.6.tar.gz/source/encoder/sao.cpp -> x265_4.0.tar.gz/source/encoder/sao.cpp
Changed
201
1
2
return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
3
}
4
5
-/* get the sign of input variable (TODO: this is a dup, make common) */
6
-inline int8_t signOf(int x)
7
-{
8
- return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
9
-}
10
-
11
inline int signOf2(const int a, const int b)
12
{
13
// NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
14
15
// CTU-based SAO process without slice granularity
16
void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
17
{
18
- PicYuv* reconPic = m_frame->m_reconPic;
19
+ PicYuv* reconPic = m_frame->m_reconPic0;
20
pixel* rec = reconPic->getPlaneAddr(plane, addr);
21
intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
22
uint32_t picWidth = m_param->sourceWidth;
23
24
{
25
for (int y = 0; y < ctuHeight; y++, rec += stride)
26
{
27
- int signLeft = signOf(rec[startX] - tmpL[y]);
28
+ int signLeft = x265_signOf(rec[startX] - tmpL[y]);
29
for (int x = startX; x < endX; x++)
30
{
31
- int signRight = signOf(rec[x] - rec[x + 1]);
32
+ int signRight = x265_signOf(rec[x] - rec[x + 1]);
33
int edgeType = signRight + signLeft + 2;
34
signLeft = -signRight;
35
36
37
{
38
for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
39
{
40
- signLeft10 = signOf(recstartX - tmpLy);
41
- signLeft11 = signOf(recstride + startX - tmpLy + 1);
42
+ signLeft10 = x265_signOf(recstartX - tmpLy);
43
+ signLeft11 = x265_signOf(recstride + startX - tmpLy + 1);
44
45
if (!lpelx)
46
{
47
48
if (ctuWidth & 15)
49
{
50
for (int x = 0; x < ctuWidth; x++)
51
- upBuff1x = signOf(recx - tmpUx);
52
+ upBuff1x = x265_signOf(recx - tmpUx);
53
54
for (int y = startY; y < endY; y++, rec += stride)
55
{
56
for (int x = 0; x < ctuWidth; x++)
57
{
58
- int8_t signDown = signOf(recx - recx + stride);
59
+ int8_t signDown = x265_signOf(recx - recx + stride);
60
int edgeType = signDown + upBuff1x + 2;
61
upBuff1x = -signDown;
62
63
64
else
65
{
66
for (int x = startX; x < endX; x++)
67
- upBuff1x = signOf(recx - tmpUx - 1);
68
+ upBuff1x = x265_signOf(recx - tmpUx - 1);
69
}
70
71
if (ctuWidth & 15)
72
{
73
for (int y = startY; y < endY; y++, rec += stride)
74
{
75
- upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
76
+ upBufft[startX] = x265_signOf(rec[stride + startX] - tmpL[y]);
77
for (int x = startX; x < endX; x++)
78
{
79
- int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
80
+ int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
81
int edgeType = signDown + upBuff1[x] + 2;
82
upBufft[x + 1] = -signDown;
83
rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
84
85
{
86
for (int y = startY; y < endY; y++, rec += stride)
87
{
88
- int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
89
+ int8_t iSignDown2 = x265_signOf(rec[stride + startX] - tmpL[y]);
90
91
primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
92
93
94
if (ctuWidth & 15)
95
{
96
for (int x = startX - 1; x < endX; x++)
97
- upBuff1x = signOf(recx - tmpUx + 1);
98
+ upBuff1x = x265_signOf(recx - tmpUx + 1);
99
100
for (int y = startY; y < endY; y++, rec += stride)
101
{
102
int x = startX;
103
- int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
104
+ int8_t signDown = x265_signOf(rec[x] - tmpL[y + 1]);
105
int edgeType = signDown + upBuff1[x] + 2;
106
upBuff1[x - 1] = -signDown;
107
rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
108
109
for (x = startX + 1; x < endX; x++)
110
{
111
- signDown = signOf(rec[x] - rec[x + stride - 1]);
112
+ signDown = x265_signOf(rec[x] - rec[x + stride - 1]);
113
edgeType = signDown + upBuff1[x] + 2;
114
upBuff1[x - 1] = -signDown;
115
rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
116
}
117
118
- upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
119
+ upBuff1[endX - 1] = x265_signOf(rec[endX - 1 + stride] - rec[endX]);
120
}
121
}
122
else
123
124
int8_t firstSign, lastSign;
125
126
if (lpelx)
127
- firstSign = signOf(rec[-1] - tmpU[0]);
128
+ firstSign = x265_signOf(rec[-1] - tmpU[0]);
129
if (rpelx == picWidth)
130
lastSign = upBuff1[ctuWidth - 1];
131
132
133
for (int y = startY; y < endY; y++, rec += stride)
134
{
135
int x = startX;
136
- int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
137
+ int8_t signDown = x265_signOf(rec[x] - tmpL[y + 1]);
138
int edgeType = signDown + upBuff1[x] + 2;
139
upBuff1[x - 1] = -signDown;
140
rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
141
142
primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX);
143
144
- upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
145
+ upBuff1[endX - 1] = x265_signOf(rec[endX - 1 + stride] - rec[endX]);
146
}
147
}
148
149
150
/* Process SAO unit */
151
void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
152
{
153
- PicYuv* reconPic = m_frame->m_reconPic;
154
+ PicYuv* reconPic = m_frame->m_reconPic[0];
155
intptr_t stride = reconPic->m_stride;
156
int ctuWidth = m_param->maxCUSize;
157
int ctuHeight = m_param->maxCUSize;
158
159
/* Process SAO unit (Chroma only) */
160
void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX)
161
{
162
- PicYuv* reconPic = m_frame->m_reconPic;
163
+ PicYuv* reconPic = m_frame->m_reconPic[0];
164
intptr_t stride = reconPic->m_strideC;
165
int ctuWidth = m_param->maxCUSize;
166
int ctuHeight = m_param->maxCUSize;
167
168
void SAO::calcSaoStatsCTU(int addr, int plane)
169
{
170
Slice* slice = m_frame->m_encData->m_slice;
171
- const PicYuv* reconPic = m_frame->m_reconPic;
172
+ const PicYuv* reconPic = m_frame->m_reconPic[0];
173
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
174
const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
175
const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);
176
177
178
int x, y;
179
const CUData* cu = frame->m_encData->getPicCTU(addr);
180
- const PicYuv* reconPic = m_frame->m_reconPic;
181
+ const PicYuv* reconPic = m_frame->m_reconPic[0];
182
const pixel* fenc;
183
const pixel* rec;
184
intptr_t stride = reconPic->m_stride;
185
186
for (y = 0; y < ctuHeight; y++)
187
{
188
x = (y < startY ? startX : firstX);
189
- int signLeft = signOf(rec[x] - rec[x - 1]);
190
+ int signLeft = x265_signOf(rec[x] - rec[x - 1]);
191
for (; x < endX; x++)
192
{
193
- int signRight = signOf(rec[x] - rec[x + 1]);
194
+ int signRight = x265_signOf(rec[x] - rec[x + 1]);
195
int edgeType = signRight + signLeft + 2;
196
signLeft = -signRight;
197
198
199
}
200
201
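Note on the signOf() cleanup in the sao.cpp hunk above: the per-file duplicate was removed and call sites now use a shared x265_signOf() helper. As a reading aid only, here is a minimal sketch of the branchless sign computation the removed duplicate performed (the header that hosts the shared helper is an assumption; it is not shown in this diff):

/* returns -1 for negative x, 1 for positive x, 0 for zero */
static inline int8_t x265_signOf(int x)
{
    return (int8_t)((x >> 31) | ((int)(((uint32_t)-x) >> 31)));
}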
x265_3.6.tar.gz/source/encoder/search.cpp -> x265_4.0.tar.gz/source/encoder/search.cpp
Changed
201
1
2
m_param = &param;
3
m_bFrameParallel = param.frameNumThreads > 1;
4
m_numLayers = g_log2Size[param.maxCUSize] - 2;
5
+#if ENABLE_SCC_EXT
6
+ m_ibcEnabled = param.bEnableSCC;
7
+#endif
8
9
m_rdCost.setPsyRdScale(param.psyRd);
10
m_rdCost.setSsimRd(param.bSsimRd);
11
12
CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
13
CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE);
14
15
+#if ENABLE_SCC_EXT
16
+ m_numBVs = 0;
17
+ m_numBV16s = 0;
18
+#endif
19
+
20
return ok;
21
22
fail:
23
24
}
25
26
// set reconstruction for next intra prediction blocks if full TU prediction won
27
- PicYuv* reconPic = m_frame->m_reconPic;
28
+ PicYuv* reconPic = m_frame->m_reconPic[0];
29
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
30
intptr_t picStride = reconPic->m_stride;
31
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
32
33
}
34
35
// set reconstruction for next intra prediction blocks
36
- PicYuv* reconPic = m_frame->m_reconPic;
37
+ PicYuv* reconPic = m_frame->m_reconPic[0];
38
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
39
intptr_t picStride = reconPic->m_stride;
40
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
41
42
uint32_t sizeIdx = log2TrSize - 2;
43
primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
44
45
- PicYuv* reconPic = m_frame->m_reconPic;
46
+ PicYuv* reconPic = m_frame->m_reconPic[0];
47
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
48
intptr_t picStride = reconPic->m_stride;
49
50
51
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
52
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
53
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
54
- PicYuv* reconPic = m_frame->m_reconPic;
55
+ PicYuv* reconPic = m_frame->m_reconPic[0];
56
pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
57
intptr_t picStride = reconPic->m_strideC;
58
59
60
cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
61
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
62
63
- PicYuv* reconPic = m_frame->m_reconPic;
64
+ PicYuv* reconPic = m_frame->m_reconPic[0];
65
pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
66
intptr_t picStride = reconPic->m_strideC;
67
primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
68
69
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
70
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
71
coeff_t* coeffC = cu.m_trCoeff[ttype] + coeffOffsetC;
72
- PicYuv* reconPic = m_frame->m_reconPic;
73
+ PicYuv* reconPic = m_frame->m_reconPic[0];
74
pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
75
intptr_t picStride = reconPic->m_strideC;
76
77
78
79
updateModeCost(intraMode);
80
checkDQP(intraMode, cuGeom);
81
+
82
+#if ENABLE_SCC_EXT
83
+ if (m_param->bEnableSCC)
84
+ intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
85
+#endif
86
}
87
88
/* Note that this function does not save the best intra prediction, it must
89
90
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
91
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
92
* that the contexts should be tracked through each PU */
93
- PicYuv* reconPic = m_frame->m_reconPic;
94
+ PicYuv* reconPic = m_frame->m_reconPic[0];
95
pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
96
uint32_t dststride = reconPic->m_stride;
97
const pixel* src = reconYuv->getLumaAddr(absPartIdx);
98
99
if (!tuIterator.isLastSection())
100
{
101
uint32_t zorder = cuGeom.absPartIdx + absPartIdxC;
102
- PicYuv* reconPic = m_frame->m_reconPic;
103
+ PicYuv* reconPic = m_frame->m_reconPic[0];
104
uint32_t dststride = reconPic->m_strideC;
105
const pixel* src;
106
pixel* dst;
107
108
MVField candMvField[MRG_MAX_NUM_CANDS][2];
109
uint8_t candDir[MRG_MAX_NUM_CANDS];
110
uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
111
-
112
+#if ENABLE_SCC_EXT
113
+ restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
114
+#else
115
if (cu.isBipredRestriction())
116
{
117
/* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
118
119
}
120
}
121
}
122
+#endif
123
124
Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
125
126
127
continue;
128
}
129
130
+#if ENABLE_SCC_EXT
131
+ if ((candDir[mergeCand] == 1 || candDir[mergeCand] == 3) && (m_slice->m_refPOCList[0][candMvField[mergeCand][0].refIdx] == m_slice->m_poc))
132
+ {
133
+ continue;
134
+ }
135
+#endif
136
cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
137
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
138
cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
139
140
continue;
141
}
142
cu.clipMv(mvCand);
143
- predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
144
+#if ENABLE_SCC_EXT
145
+ if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx[0] - 1)
146
+ predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameList[list][ref]->m_reconPic[1], mvCand);
147
+ else
148
+#endif
149
+ predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
150
costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
151
}
152
153
154
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
155
{
156
uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
157
- bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
158
+ int numIdx = m_slice->m_numRefIdx[list];
159
+#if ENABLE_SCC_EXT
160
+ if (!list && m_ibcEnabled)
161
+ numIdx--;
162
+#endif
163
+ bits += getTUBits(ref, numIdx);
164
165
MotionData* bestME = interMode.bestME[part];
166
167
// 12 mv candidates including lowresMV
168
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
169
- int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
170
+ int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, 0, pu.puAbsPartIdx);
171
172
const MV* amvp = interMode.amvpCand[list][ref];
173
int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
174
175
if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
176
{
177
MV lmv = getLowresMV(interMode.cu, pu, list, ref);
178
- if (lmv.notZero())
179
+ int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
180
+ if (lmv.notZero() && !layer)
181
mvcnumMvc++ = lmv;
182
if (m_param->bEnableHME)
183
mvp_lowres = lmv;
184
}
185
186
+ m_vertRestriction = interMode.cu.m_slice->m_refPOCList[list][ref] == interMode.cu.m_slice->m_poc;
187
setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
188
189
- int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
190
+ int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
191
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
192
193
if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
194
{
195
MV outmv_lowres;
196
setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
197
- int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
198
+ int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
199
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
200
if (lowresMvCost < satdCost)
201
x265_3.6.tar.gz/source/encoder/search.h -> x265_4.0.tar.gz/source/encoder/search.h
Changed
53
1
2
int32_t m_sliceMaxY;
3
int32_t m_sliceMinY;
4
5
+ bool m_vertRestriction;
6
+
7
+#if ENABLE_SCC_EXT
8
+ int m_ibcEnabled;
9
+ int m_numBVs;
10
+ int m_numBV16s;
11
+ MV m_BVs[64];
12
+ uint32_t m_lastCandCost;
13
+#endif
14
+
15
#if DETAILED_CU_STATS
16
/* Accumulate CU statistics separately for each frame encoder */
17
CUStats m_stats[X265_MAX_FRAME_THREADS];
18
19
void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
20
21
// estimation inter prediction (non-skip)
22
- void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
23
+ void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2], MV* iMVCandList = NULL);
24
void searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc);
25
// encode residual and compute rd-cost for inter mode
26
void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
27
28
29
MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
30
31
+#if ENABLE_SCC_EXT
32
+ bool predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
33
+ void intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
34
+ void setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB);
35
+ void intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
36
+ MV& mv, uint32_t& cost, int roiwidth, int roiheight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
37
+ bool isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset);
38
+ bool isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* pcCU,
39
+ int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize);
40
+ void intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* cMVCand);
41
+ void updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc);
42
+ int intraBCSearchMVChromaRefine(Mode& intraBCMode, const CUGeom& cuGeom, int roiWidth, int roiHeight, int cuPelX, int cuPelY, uint32_t* sadBestCand, MV* cMVCand,
43
+ uint32_t partOffset, int puIdx);
44
+ static uint32_t mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel);
45
+ uint32_t getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height);
46
+ bool predMixedIntraBCInterSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMVCandList);
47
+ void restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours, uint32_t numValidMergeCand);
48
+#endif
49
+
50
class PME : public BondedTaskGroup
51
{
52
public:
53
x265_3.6.tar.gz/source/encoder/sei.cpp -> x265_4.0.tar.gz/source/encoder/sei.cpp
Changed
19
1
2
3
/* marshal a single SEI message sei, storing the marshalled representation
4
* in bitstream bs */
5
-void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
6
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layer)
7
{
8
if (!isNested)
9
bs.resetBits();
10
11
{
12
if (nalUnitType != NAL_UNIT_UNSPECIFIED)
13
bs.writeByteAlignment();
14
- list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
15
+ list.serialize(nalUnitType, bs, layer, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
16
}
17
}
18
19
x265_3.6.tar.gz/source/encoder/sei.h -> x265_4.0.tar.gz/source/encoder/sei.h
Changed
201
1
2
public:
3
/* SEI users call writeSEImessages() to marshal an SEI to a bitstream.
4
* The writeSEImessages() method calls writeSEI() which encodes the header */
5
- void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested);
6
+ void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layerId = 0);
7
void setSize(uint32_t size);
8
static char* base64Decode(char encodedString[], int base64EncodeLength);
9
virtual ~SEI() {}
10
11
}
12
};
13
14
+#if ENABLE_ALPHA
15
+class SEIAlphaChannelInfo : public SEI
16
+{
17
+public:
18
+ SEIAlphaChannelInfo()
19
+ {
20
+ m_payloadType = ALPHA_CHANNEL_INFO;
21
+ m_payloadSize = 0;
22
+ }
23
+
24
+ bool alpha_channel_cancel_flag;
25
+ void writeSEI(const SPS&)
26
+ {
27
+ WRITE_CODE(alpha_channel_cancel_flag, 1, "alpha_channel_cancel_flag");
28
+ if (!alpha_channel_cancel_flag)
29
+ {
30
+ WRITE_CODE(0, 3, "alpha_channel_use_idc");
31
+ WRITE_CODE(0, 3, "alpha_channel_bit_depth_minus8");
32
+ WRITE_CODE(0, 9, "alpha_transparent_value");
33
+ WRITE_CODE(255, 9, "alpha_opaque_value");
34
+ WRITE_CODE(0, 1, "alpha_channel_incr_flag");
35
+ WRITE_CODE(0, 1, "alpha_channel_clip_flag");
36
+ }
37
+ if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
38
+ {
39
+ WRITE_FLAG(1, "payload_bit_equal_to_one");
40
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
41
+ {
42
+ WRITE_FLAG(0, "payload_bit_equal_to_zero");
43
+ }
44
+ }
45
+ }
46
+};
47
+#endif
48
+
49
+#if ENABLE_MULTIVIEW
50
+class SEIThreeDimensionalReferenceDisplaysInfo : public SEI
51
+{
52
+public:
53
+ SEIThreeDimensionalReferenceDisplaysInfo()
54
+ {
55
+ m_payloadType = THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO;
56
+ m_payloadSize = 0;
57
+ }
58
+
59
+ int m_numRefDisplaysMinus1 = 0;
60
+ bool m_refViewingDistanceFlag = false;
61
+ bool m_additionalShiftPresentFlag = false;
62
+ void writeSEI(const SPS&)
63
+ {
64
+ WRITE_UVLC(31, "prec_ref_display_width");
65
+ WRITE_FLAG(m_refViewingDistanceFlag, "ref_viewing_distance_flag");
66
+ if (m_refViewingDistanceFlag)
67
+ {
68
+ WRITE_UVLC(0, "prec_ref_viewing_dist");
69
+ }
70
+ WRITE_UVLC(0, "num_ref_displays_minus1");
71
+ for (int i = 0; i <= m_numRefDisplaysMinus1; i++)
72
+ {
73
+ WRITE_UVLC(0, "left_view_id");
74
+ WRITE_UVLC(1, "right_view_id");
75
+ WRITE_CODE(0, 6, "exponent_ref_display_width");
76
+ WRITE_CODE(0, 2, "mantissa_ref_display_width");
77
+ if (m_refViewingDistanceFlag)
78
+ {
79
+ WRITE_CODE(0, 6, "exponent_ref_viewing_distance");
80
+ WRITE_CODE(0, 1, "mantissa_ref_viewing_distance");
81
+ }
82
+ WRITE_FLAG(m_additionalShiftPresentFlag, "additional_shift_present_flag");
83
+ if (m_additionalShiftPresentFlag)
84
+ {
85
+ WRITE_CODE(0, 10, "num_sample_shift_plus512");
86
+ }
87
+ }
88
+ WRITE_FLAG(0, "three_dimensional_reference_displays_extension_flag");
89
+
90
+ if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
91
+ {
92
+ WRITE_FLAG(1, "payload_bit_equal_to_one");
93
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
94
+ {
95
+ WRITE_FLAG(0, "payload_bit_equal_to_zero");
96
+ }
97
+ }
98
+ }
99
+
100
+};
101
+
102
+class SEIMultiviewSceneInfo : public SEI
103
+{
104
+public:
105
+ SEIMultiviewSceneInfo()
106
+ {
107
+ m_payloadType = MULTIVIEW_SCENE_INFO;
108
+ m_payloadSize = 0;
109
+ }
110
+ void writeSEI(const SPS&)
111
+ {
112
+ WRITE_SVLC(-333, "min_disparity");
113
+ WRITE_UVLC(2047, "max_disparity_range");
114
+
115
+ if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
116
+ {
117
+ WRITE_FLAG(1, "payload_bit_equal_to_one");
118
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
119
+ {
120
+ WRITE_FLAG(0, "payload_bit_equal_to_zero");
121
+ }
122
+ }
123
+ }
124
+};
125
+
126
+class SEIMultiviewAcquisitionInfo : public SEI
127
+{
128
+public:
129
+ SEIMultiviewAcquisitionInfo()
130
+ {
131
+ m_payloadType = MULTIVIEW_ACQUISITION_INFO;
132
+ m_payloadSize = 0;
133
+ }
134
+
135
+ int sign_r[3][3] = { {0,1,0},{1,0,0},{0,1,1} };
136
+ int exponent_r[3][3] = { {10,20,11},{10,5,11},{2,20,11} };
137
+ int mantissa_r[3][3] = { {4,9,1},{0,3,4},{3,3,7} };
138
+ int sign_t[1][3] = { 0,1,0 };
139
+ int exponent_t[1][3] = { 0,10,5 };
140
+ int mantissa_t[1][3] = { 1,8,9 };
141
+ int lenght_mantissa_r[3][3] = { {10,20,11},{10,5,11},{2,20,11} };
142
+ int length_mantissa_t[1][3] = { 1,10,5 };
143
+ bool m_intrinsicParamFlag = true;
144
+ bool m_extrinsicParamFlag = true;
145
+ bool m_intrinsicParamsEqualFlag = true;
146
+ void writeSEI(const SPS& sps)
147
+ {
148
+ WRITE_FLAG(m_intrinsicParamFlag, "intrinsic_param_flag");
149
+ WRITE_FLAG(m_extrinsicParamFlag, "extrinsic_param_flag");
150
+ if (m_intrinsicParamFlag)
151
+ {
152
+ WRITE_FLAG(m_intrinsicParamsEqualFlag, "intrinsic_params_equal_flag");
153
+ WRITE_UVLC(31, "prec_focal_length");
154
+ WRITE_UVLC(31, "prec_principal_point");
155
+ WRITE_UVLC(31, "prec_skew_factor");
156
+
157
+ for (int i = 0; i <= (m_intrinsicParamsEqualFlag ? 0 : sps.maxViews - 1); i++)
158
+ {
159
+ WRITE_FLAG(0, "sign_focal_length_x");
160
+ WRITE_CODE(0, 6, "exponent_focal_length_x");
161
+ WRITE_CODE(0, 1, "mantissa_focal_length_x");
162
+ WRITE_FLAG(0, "sign_focal_length_y");
163
+ WRITE_CODE(0, 6, "exponent_focal_length_y");
164
+ WRITE_CODE(0, 1, "mantissa_focal_length_y");
165
+ WRITE_FLAG(0, "sign_principal_point_x");
166
+ WRITE_CODE(0, 6, "exponent_principal_point_x");
167
+ WRITE_CODE(0, 1, "mantissa_principal_point_x");
168
+ WRITE_FLAG(0, "sign_principal_point_y");
169
+ WRITE_CODE(0, 6, "exponent_principal_point_y");
170
+ WRITE_CODE(0, 1, "mantissa_principal_point_y");
171
+ WRITE_FLAG(0, "sign_skew_factor");
172
+ WRITE_CODE(0, 6, "exponent_skew_factor");
173
+ WRITE_CODE(0, 1, "mantissa_skew_factor");
174
+ }
175
+ }
176
+
177
+ if (m_extrinsicParamFlag)
178
+ {
179
+ WRITE_UVLC(31, "prec_rotation_param");
180
+ WRITE_UVLC(31, "prec_translation_param");
181
+ for (int i = 0; i <= 0; i++)
182
+ {
183
+ for (int j = 0; j <= 2; j++) /* row */
184
+ {
185
+ for (int k = 0; k <= 2; k++) /* column */
186
+ {
187
+ WRITE_FLAG(sign_r[j][k], "sign_r");
188
+ WRITE_CODE(exponent_r[j][k], 6, "exponent_r");
189
+ WRITE_CODE(mantissa_r[j][k], lenght_mantissa_r[j][k], "mantissa_r");
190
+ }
191
+ WRITE_FLAG(sign_t[i][j], "sign_t");
192
+ WRITE_CODE(exponent_t[i][j], 6, "exponent_t");
193
+ WRITE_CODE(mantissa_t[i][j], length_mantissa_t[i][j], "mantissa_t");
194
+ }
195
+ }
196
+ }
197
+ if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
198
+ {
199
+ WRITE_FLAG(1, "payload_bit_equal_to_one");
200
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
201
x265_3.6.tar.gz/source/encoder/slicetype.cpp -> x265_4.0.tar.gz/source/encoder/slicetype.cpp
Changed
22
1
2
int l0poc = slice->m_rps.numberOfNegativePictures ? slice->m_refPOCList[0][0] : -1;
3
int l1poc = slice->m_refPOCList[1][0];
4
5
- switch (slice->m_sliceType)
6
+ switch (slice->m_origSliceType)
7
{
8
case I_SLICE:
9
frames[p0] = &curFrame->m_lowres;
10
11
/* ME will never return a cost larger than the cost @MVP, so we do not
12
* have to check that ME cost is more than the estimated merge cost */
13
if(!hme)
14
- fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices);
15
+ fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0);
16
else
17
- fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
18
+ fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0, fref->lowerResPlane[0]);
19
if (skipCost < 64 && skipCost < fencCost && bBidir)
20
{
21
fencCost = skipCost;
22
x265_3.6.tar.gz/source/encoder/weightPrediction.cpp -> x265_4.0.tar.gz/source/encoder/weightPrediction.cpp
Changed
17
1
2
lumaDenom = weights[0].log2WeightDenom;
3
chromaDenom = weights[1].log2WeightDenom;
4
5
+ int numIdx = slice.m_numRefIdx[list];
6
+#if ENABLE_SCC_EXT
7
+ if (!list && param.bEnableSCC)
8
+ numIdx--;
9
+#endif
10
+
11
/* reset weight states */
12
- for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
13
+ for (int ref = 1; ref < numIdx; ref++)
14
{
15
SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
16
SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
17
x265_3.6.tar.gz/source/input/input.cpp -> x265_4.0.tar.gz/source/input/input.cpp
Changed
17
1
2
3
using namespace X265_NS;
4
5
-InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m)
6
+InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m, bool alpha, int format)
7
{
8
const char * s = strrchr(info.filename, '.');
9
10
if (bForceY4m || (s && !strcmp(s, ".y4m")))
11
- return new Y4MInput(info);
12
+ return new Y4MInput(info, alpha, format);
13
else
14
- return new YUVInput(info);
15
+ return new YUVInput(info, alpha, format);
16
}
17
x265_3.6.tar.gz/source/input/input.h -> x265_4.0.tar.gz/source/input/input.h
Changed
10
1
2
3
InputFile() {}
4
5
- static InputFile* open(InputFileInfo& info, bool bForceY4m);
6
+ static InputFile* open(InputFileInfo& info, bool bForceY4m, bool alpha, int format);
7
8
virtual void startReader() = 0;
9
10
x265_3.6.tar.gz/source/input/y4m.cpp -> x265_4.0.tar.gz/source/input/y4m.cpp
Changed
57
1
2
using namespace X265_NS;
3
using namespace std;
4
static const char header[] = {'F','R','A','M','E'};
5
-Y4MInput::Y4MInput(InputFileInfo& info)
6
+Y4MInput::Y4MInput(InputFileInfo& info, bool alpha, int format)
7
{
8
for (int i = 0; i < QUEUE_SIZE; i++)
9
bufi = NULL;
10
11
threadActive = false;
12
colorSpace = info.csp;
13
+ alphaAvailable = alpha;
14
sarWidth = info.sarWidth;
15
sarHeight = info.sarHeight;
16
width = info.width;
17
18
ifs = x265_fopen(info.filename, "rb");
19
if (ifs && !ferror(ifs) && parseHeader())
20
{
21
+ if (format == 1) width /= 2;
22
+ if (format == 2) height /= 2;
23
int pixelbytes = depth > 8 ? 2 : 1;
24
- for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
25
+ for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
26
{
27
- int stride = (width >> x265_cli_csps[colorSpace].width[i]) * pixelbytes;
28
- framesize += (stride * (height >> x265_cli_csps[colorSpace].height[i]));
29
+ int stride = ((width * (format == 1 ? 2 : 1)) >> x265_cli_csps[colorSpace].width[i]) * pixelbytes;
30
+ framesize += (stride * ((height * (format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[i]));
31
}
32
33
threadActive = true;
34
35
pic.height = height;
36
pic.width = width;
37
pic.colorSpace = colorSpace;
38
- pic.stride[0] = width * pixelbytes;
39
+ pic.stride[0] = width * pixelbytes * (pic.format == 1 ? 2 : 1);
40
pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
41
pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
42
pic.planes[0] = buf[read % QUEUE_SIZE];
43
- pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height;
44
- pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]);
45
+ pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * (height * (pic.format == 2 ? 2 : 1));
46
+ pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[1]);
47
+#if ENABLE_ALPHA
48
+ if (alphaAvailable)
49
+ {
50
+ pic.stride[3] = pic.stride[0] >> x265_cli_csps[colorSpace].width[3];
51
+ pic.planes[3] = (char*)pic.planes[2] + pic.stride[2] * (height >> x265_cli_csps[colorSpace].height[2]);
52
+ }
53
+#endif
54
readCount.incr();
55
return true;
56
}
57
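For orientation, the reader changes above halve the per-view dimensions for packed multiview input (format 1 = side-by-side, 2 = over-under), then size strides and planes from the full packed picture and optionally append an alpha plane. A standalone sketch of the resulting frame-size arithmetic, assuming a 4:2:0 layout with a full-resolution alpha plane (this is an illustration of the math, not the library code):

#include <cstdint>

static uint32_t packedFrameBytes(uint32_t width, uint32_t height, int depth,
                                 int format, bool alpha)
{
    if (format == 1) width /= 2;   // per-view width for side-by-side packing
    if (format == 2) height /= 2;  // per-view height for over-under packing

    const uint32_t pixelBytes = depth > 8 ? 2 : 1;
    const uint32_t fullW = width * (format == 1 ? 2 : 1);   // packed picture width
    const uint32_t fullH = height * (format == 2 ? 2 : 1);  // packed picture height

    uint32_t bytes = fullW * fullH * pixelBytes;            // Y plane
    bytes += 2 * (fullW / 2) * (fullH / 2) * pixelBytes;    // Cb + Cr (4:2:0)
    if (alpha)
        bytes += fullW * fullH * pixelBytes;                // optional alpha plane
    return bytes;
}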
x265_3.6.tar.gz/source/input/y4m.h -> x265_4.0.tar.gz/source/input/y4m.h
Changed
19
1
2
3
int colorSpace;
4
5
+ bool alphaAvailable;
6
+
7
bool threadActive;
8
9
ThreadSafeInteger readCount;
10
11
12
public:
13
14
- Y4MInput(InputFileInfo& info);
15
+ Y4MInput(InputFileInfo& info, bool alpha, int format);
16
17
virtual ~Y4MInput();
18
void release();
19
x265_3.6.tar.gz/source/input/yuv.cpp -> x265_4.0.tar.gz/source/input/yuv.cpp
Changed
53
1
2
using namespace X265_NS;
3
using namespace std;
4
5
-YUVInput::YUVInput(InputFileInfo& info)
6
+YUVInput::YUVInput(InputFileInfo& info, bool alpha, int format)
7
{
8
for (int i = 0; i < QUEUE_SIZE; i++)
9
buf[i] = NULL;
10
11
width = info.width;
12
height = info.height;
13
colorSpace = info.csp;
14
+ alphaAvailable = alpha;
15
threadActive = false;
16
ifs = NULL;
17
18
uint32_t pixelbytes = depth > 8 ? 2 : 1;
19
framesize = 0;
20
- for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
21
+ for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
22
{
23
- uint32_t w = width >> x265_cli_csps[colorSpace].width[i];
24
- uint32_t h = height >> x265_cli_csps[colorSpace].height[i];
25
+ int32_t w = (width * (format == 1 ? 2 : 1)) >> x265_cli_csps[colorSpace].width[i];
26
+ uint32_t h = (height * (format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[i];
27
framesize += w * h * pixelbytes;
28
}
29
30
31
pic.framesize = framesize;
32
pic.height = height;
33
pic.width = width;
34
- pic.stride[0] = width * pixelbytes;
35
+ pic.stride[0] = width * pixelbytes * (pic.format == 1 ? 2 : 1);
36
pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
37
pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
38
pic.planes[0] = buf[read % QUEUE_SIZE];
39
- pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height;
40
- pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]);
41
+ pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * (height * (pic.format == 2 ? 2 : 1));
42
+ pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[1]);
43
+#if ENABLE_ALPHA
44
+ if (alphaAvailable)
45
+ {
46
+ pic.stride[3] = pic.stride[0] >> x265_cli_csps[colorSpace].width[3];
47
+ pic.planes[3] = (char*)pic.planes[2] + pic.stride[2] * (height >> x265_cli_csps[colorSpace].height[2]);
48
+ }
49
+#endif
50
readCount.incr();
51
return true;
52
}
53
x265_3.6.tar.gz/source/input/yuv.h -> x265_4.0.tar.gz/source/input/yuv.h
Changed
19
1
2
3
uint32_t framesize;
4
5
+ bool alphaAvailable;
6
+
7
bool threadActive;
8
9
ThreadSafeInteger readCount;
10
11
12
public:
13
14
- YUVInput(InputFileInfo& info);
15
+ YUVInput(InputFileInfo& info, bool alpha, int format);
16
17
virtual ~YUVInput();
18
void release();
19
x265_3.6.tar.gz/source/test/ipfilterharness.cpp -> x265_4.0.tar.gz/source/test/ipfilterharness.cpp
Changed
103
1
2
{
3
int index = i % TEST_CASES;
4
5
- for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
6
+ for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
7
{
8
rand_srcStride = rand() % 100 + 2;
9
rand_dstStride = rand() % 100 + 64;
10
11
{
12
int index = i % TEST_CASES;
13
14
- for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
15
+ for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
16
{
17
rand_srcStride = rand() % 100;
18
rand_dstStride = rand() % 100 + 64;
19
20
{
21
int index = i % TEST_CASES;
22
23
- for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
24
+ for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
25
{
26
// 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
27
for (int isRowExt = 0; isRowExt < 2; isRowExt++)
28
29
{
30
int index = i % TEST_CASES;
31
32
- for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
33
+ for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
34
{
35
rand_srcStride = rand() % 100;
36
rand_dstStride = rand() % 100 + 64;
37
38
{
39
int index = i % TEST_CASES;
40
41
- for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
42
+ for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
43
{
44
rand_srcStride = rand() % 100;
45
rand_dstStride = rand() % 100 + 64;
46
47
{
48
int index = i % TEST_CASES;
49
50
- for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
51
+ for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
52
{
53
rand_srcStride = rand() % 100;
54
rand_dstStride = rand() % 100 + 64;
55
56
{
57
int index = i % TEST_CASES;
58
59
- for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
60
+ for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
61
{
62
rand_srcStride = rand() % 100;
63
rand_dstStride = rand() % 100 + 64;
64
65
{
66
int index = i % TEST_CASES;
67
68
- for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
69
+ for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
70
{
71
// 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
72
for (int isRowExt = 0; isRowExt < 2; isRowExt++)
73
74
{
75
int index = i % TEST_CASES;
76
77
- for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
78
+ for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
79
{
80
rand_srcStride = rand() % 100;
81
rand_dstStride = rand() % 100 + 64;
82
83
{
84
int index = i % TEST_CASES;
85
86
- for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
87
+ for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
88
{
89
rand_srcStride = rand() % 100;
90
rand_dstStride = rand() % 100 + 64;
91
92
{
93
int index = i % TEST_CASES;
94
95
- for (int coeffIdxX = 0; coeffIdxX < 4; coeffIdxX++)
96
+ for (int coeffIdxX = 1; coeffIdxX < 4; coeffIdxX++)
97
{
98
- for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++)
99
+ for (int coeffIdxY = 1; coeffIdxY < 4; coeffIdxY++)
100
{
101
rand_srcStride = rand() % 100;
102
rand_dstStride = rand() % 100 + 64;
103
x265_3.6.tar.gz/source/test/mbdstharness.cpp -> x265_4.0.tar.gz/source/test/mbdstharness.cpp
Changed
18
1
2
uint32_t optReturnValue = 0;
3
uint32_t refReturnValue = 0;
4
5
- int bits = rand() % 32;
6
- int valueToAdd = rand() % (1 << bits);
7
+ int log2TrSize = rand() % 4 + 2;
8
+ const int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
9
+ const int per = qp / 6;
10
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
11
+
12
+ /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
13
+ int bits = QUANT_SHIFT + per + transformShift;
14
+ int valueToAdd = (1 << (bits - 1));
15
int cmp_size = sizeof(short) * height * width;
16
int numCoeff = height * width;
17
18
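The mbdstharness change above derives the quantizer shift and rounding offset the same way the encoder does instead of drawing random values. A quick numeric check, assuming the usual HEVC constants (QUANT_SHIFT = 14, MAX_TR_DYNAMIC_RANGE = 15) and an 8-bit build:

int log2TrSize = 4;                       // 16x16 transform
int qp = 30;
int per = qp / 6;                         // 5
int transformShift = 15 - 8 - log2TrSize; // MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize = 3
int bits = 14 + per + transformShift;     // QUANT_SHIFT + per + transformShift = 22
int valueToAdd = 1 << (bits - 1);         // rounding offset = 2097152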
x265_3.6.tar.gz/source/test/pixelharness.cpp -> x265_4.0.tar.gz/source/test/pixelharness.cpp
Changed
33
1
2
ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
3
checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
4
5
- if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
6
- || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
7
+ if ( memcmp(stats_ref, stats_vec, sizeof(stats_ref))
8
|| memcmp(count_ref, count_vec, sizeof(count_ref)))
9
return false;
10
11
12
ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
13
checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
14
15
- // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
16
- if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
17
- || memcmp(_upBufft_ref, _upBufft_vec, sizeof(_upBufft_ref))
18
- || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
19
+ if ( memcmp(stats_ref, stats_vec, sizeof(stats_ref))
20
|| memcmp(count_ref, count_vec, sizeof(count_ref)))
21
return false;
22
23
24
ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
25
checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
26
27
- if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
28
- || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
29
+ if ( memcmp(stats_ref, stats_vec, sizeof(stats_ref))
30
|| memcmp(count_ref, count_vec, sizeof(count_ref)))
31
return false;
32
33
x265_3.6.tar.gz/source/test/testbench.cpp -> x265_4.0.tar.gz/source/test/testbench.cpp
Changed
54
1
2
3
struct test_arch_t
4
{
5
- char name[12];
6
+ char name[13];
7
int flag;
8
} test_arch[] =
9
{
10
+#if X265_ARCH_X86
11
{ "SSE2", X265_CPU_SSE2 },
12
{ "SSE3", X265_CPU_SSE3 },
13
{ "SSSE3", X265_CPU_SSSE3 },
14
15
{ "AVX2", X265_CPU_AVX2 },
16
{ "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
17
{ "AVX512", X265_CPU_AVX512 },
18
+#else
19
{ "ARMv6", X265_CPU_ARMV6 },
20
{ "NEON", X265_CPU_NEON },
21
{ "SVE2", X265_CPU_SVE2 },
22
{ "SVE", X265_CPU_SVE },
23
+ { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
24
+ { "Neon_I8MM", X265_CPU_NEON_I8MM },
25
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
26
+#endif
27
{ "", 0 },
28
};
29
30
31
else
32
continue;
33
34
-#if X265_ARCH_X86
35
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
36
EncoderPrimitives vecprim;
37
memset(&vecprim, 0, sizeof(vecprim));
38
- setupInstrinsicPrimitives(vecprim, test_archi.flag);
39
+ setupIntrinsicPrimitives(vecprim, test_archi.flag);
40
setupAliasPrimitives(vecprim);
41
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
42
{
43
44
45
EncoderPrimitives optprim;
46
memset(&optprim, 0, sizeof(optprim));
47
-#if X265_ARCH_X86
48
- setupInstrinsicPrimitives(optprim, cpuid);
49
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
50
+ setupIntrinsicPrimitives(optprim, cpuid);
51
#endif
52
53
setupAssemblyPrimitives(optprim, cpuid);
54
x265_3.6.tar.gz/source/test/testharness.h -> x265_4.0.tar.gz/source/test/testharness.h
Changed
9
1
2
// TO-DO: replace clock() function with appropriate ARM cpu instructions
3
a = clock();
4
#elif X265_ARCH_ARM64
5
+ asm volatile("isb" : : : "memory");
6
asm volatile("mrs %0, cntvct_el0" : "=r"(a));
7
#endif
8
return a;
9
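The added isb in the testharness hunk above matters for benchmarking: without an instruction barrier, the virtual-counter read can be reordered around the code being timed. A self-contained AArch64 reader equivalent to what the harness now does (GCC/Clang inline asm; sketch only):

#include <cstdint>

static inline uint64_t readVirtualCounter()
{
    uint64_t t;
    asm volatile("isb" : : : "memory");           // serialize the instruction stream first
    asm volatile("mrs %0, cntvct_el0" : "=r"(t)); // then sample the virtual counter-timer
    return t;
}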
x265_3.6.tar.gz/source/x265.h -> x265_4.0.tar.gz/source/x265.h
Changed
187
1
2
MASTERING_DISPLAY_INFO = 137,
3
CONTENT_LIGHT_LEVEL_INFO = 144,
4
ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
5
+ ALPHA_CHANNEL_INFO = 165,
6
+ THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO = 176,
7
+ MULTIVIEW_SCENE_INFO = 178,
8
+ MULTIVIEW_ACQUISITION_INFO = 179,
9
+ MULTIVIEW_VIEW_POSITION = 180
10
} SEIPayloadType;
11
12
typedef struct x265_sei_payload
13
14
15
/* Must be specified on input pictures, the number of planes is determined
16
* by the colorSpace value */
17
- void* planes[3];
18
+ void* planes[4];
19
20
/* Stride is the number of bytes between row starts */
21
- int stride[3];
22
+ int stride[4];
23
24
/* Must be specified on input pictures. x265_picture_init() will set it to
25
* the encoder's internal bit depth, but this field must describe the depth
26
27
uint32_t picStruct;
28
29
int width;
30
+
31
+ int layerID;
32
+ int format;
33
} x265_picture;
34
35
typedef enum
36
37
#define X265_CPU_SLOW_PALIGNR (1 << 25) /* such as on the AMD Bobcat */
38
39
/* ARM */
40
-#define X265_CPU_ARMV6 0x0000001
41
-#define X265_CPU_NEON 0x0000002 /* ARM NEON */
42
-#define X265_CPU_SVE2 0x0000008 /* ARM SVE2 */
43
-#define X265_CPU_SVE 0x0000010 /* ARM SVE2 */
44
-#define X265_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
45
+#define X265_CPU_ARMV6 (1 << 0)
46
+#define X265_CPU_NEON (1 << 1) /* ARM NEON */
47
+#define X265_CPU_FAST_NEON_MRC (1 << 2) /* Transfer from NEON to ARM register is fast (Cortex-A9) */
48
+#define X265_CPU_SVE2 (1 << 3) /* AArch64 SVE2 */
49
+#define X265_CPU_SVE (1 << 4) /* AArch64 SVE */
50
+#define X265_CPU_NEON_DOTPROD (1 << 5) /* AArch64 Neon DotProd */
51
+#define X265_CPU_NEON_I8MM (1 << 6) /* AArch64 Neon I8MM */
52
53
/* IBM Power8 */
54
#define X265_CPU_ALTIVEC 0x0000001
55
56
#define X265_MAX_GOP_LENGTH 16
57
#define MAX_T_LAYERS 7
58
59
+#if ENABLE_MULTIVIEW
60
+#define MAX_VIEWS 2
61
+#define MAX_VPS_NUM_SCALABILITY_TYPES 16
62
+#define MAX_VPS_LAYER_ID_PLUS1 MAX_VIEWS
63
+#define MULTIVIEW_SCALABILITY_IDX 1
64
+#else
65
+#define MAX_VIEWS 1
66
+#endif
67
+
68
+#if ENABLE_ALPHA
69
+#define MAX_SCALABLE_LAYERS 2
70
+#define MAX_VPS_NUM_SCALABILITY_TYPES 16
71
+#define MAX_VPS_LAYER_ID_PLUS1 MAX_SCALABLE_LAYERS
72
+#else
73
+#define MAX_SCALABLE_LAYERS 1
74
+#endif
75
+
76
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
77
+#define MAX_LAYERS 2
78
+#else
79
+#define MAX_LAYERS 1
80
+#endif
81
+
82
+#if ENABLE_SCC_EXT
83
+/* SCC Extension Options */
84
+#define SCC_EXT_IDX 3
85
+#define NUM_EXTENSION_FLAGS 8
86
+#define SCM_S0067_NUM_CANDIDATES 64
87
+#define CHROMA_REFINEMENT_CANDIDATES 8
88
+#define SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU 2 ///< Do full horizontal/vertical search for Nx2N
89
+#define SCM_S0067_MAX_CAND_SIZE 32 ///< 32 or 64, 16 by default
90
+#define NUM_RECON_VERSION 2
91
+#else
92
+#define NUM_RECON_VERSION 1
93
+#endif
94
+
95
#define X265_IPRATIO_STRENGTH 1.43
96
97
typedef struct x265_cli_csp
98
{
99
int planes;
100
- int width[3];
101
- int height[3];
102
+ int width[4];
103
+ int height[4];
104
} x265_cli_csp;
105
106
static const x265_cli_csp x265_cli_csps[] =
107
108
char *pool;
109
int thread;
110
int subsample;
111
- int enable_conf_interval;
112
}x265_vmaf_commondata;
113
114
-static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
115
+static x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.json", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1} };
116
117
typedef struct x265_temporal_layer {
118
int poc_offset; /* POC offset */
119
120
121
/*SBRC*/
122
int bEnableSBRC;
123
+ int mcstfFrameRange;
124
+
125
+ /*Alpha channel encoding*/
126
+ int bEnableAlpha;
127
+ int numScalableLayers;
128
+
129
+ /*Multi View Encoding*/
130
+ int numViews;
131
+ int format;
132
+
133
+ int numLayers;
134
+
135
+ /*Screen Content Coding*/
136
+ int bEnableSCC;
137
} x265_param;
138
139
/* x265_param_alloc:
140
141
"main444-12", "main444-12-intra",
142
143
"main444-16-intra", "main444-16-stillpicture", /* Not Supported! */
144
+
145
+#if ENABLE_SCC_EXT
146
+ "main-scc", "main10-scc", "main444-scc", "main444-10-scc", /* Screen content coding */
147
+#endif
148
0
149
};
150
151
152
* the payloads of all output NALs are guaranteed to be sequential in memory.
153
* To flush the encoder and retrieve delayed output pictures, pass pic_in as NULL.
154
* Once flushing has begun, all subsequent calls must pass pic_in as NULL. */
155
-int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out);
156
+int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out);
157
158
/* x265_encoder_reconfig:
159
* various parameters from x265_param are copied.
160
161
162
/* x265_calculate_vmaf_framelevelscore:
163
* returns VMAF score for each frame in a given input video. */
164
-double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
165
+double x265_calculate_vmaf_framelevelscore(x265_param*, x265_vmaf_framedata*);
166
/* x265_vmaf_encoder_log:
167
* write a line to the configured CSV file. If a CSV filename was not
168
* configured, or file open failed, this function will perform no write.
169
170
int (*encoder_reconfig)(x265_encoder*, x265_param*);
171
int (*encoder_reconfig_zone)(x265_encoder*, x265_zone*);
172
int (*encoder_headers)(x265_encoder*, x265_nal**, uint32_t*);
173
- int (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture*);
174
+ int (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture**);
175
void (*encoder_get_stats)(x265_encoder*, x265_stats*, uint32_t);
176
void (*encoder_log)(x265_encoder*, int, char**);
177
void (*encoder_close)(x265_encoder*);
178
179
int (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
180
#if ENABLE_LIBVMAF
181
double (*calculate_vmafscore)(x265_param *, x265_vmaf_data *);
182
- double (*calculate_vmaf_framelevelscore)(x265_vmaf_framedata *);
183
+ double (*calculate_vmaf_framelevelscore)(x265_param *, x265_vmaf_framedata *);
184
void (*vmaf_encoder_log)(x265_encoder*, int, char**, x265_param *, x265_vmaf_data *);
185
#endif
186
int (*zone_param_parse)(x265_param*, const char*, const char*);
187
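A usage note on the public API change in this x265.h hunk: x265_encoder_encode() now takes x265_picture** for its output picture, one entry per encoded layer. A minimal call sketch, under the assumption that the caller pre-allocates one output picture per layer (encoder and pic_in are set up elsewhere; error handling omitted):

x265_picture* pic_out[MAX_LAYERS];
for (int i = 0; i < MAX_LAYERS; i++)
    pic_out[i] = x265_picture_alloc();   // one output/recon picture per layer (assumed caller-owned)

x265_nal* nals = NULL;
uint32_t numNals = 0;
int ret = x265_encoder_encode(encoder, &nals, &numNals, pic_in, pic_out);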
x265_3.6.tar.gz/source/x265cli.cpp -> x265_4.0.tar.gz/source/x265cli.cpp
Changed
201
1
2
H0(" --no-frame-dup Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
3
H0(" --dup-threshold <integer> PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
4
H0(" --no-mcstf Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
5
+#if ENABLE_ALPHA
6
+ H0(" --alpha Enable alpha channel support. Default %d\n", param->bEnableAlpha);
7
+#endif
8
+#if ENABLE_MULTIVIEW
9
+ H0(" --num-views Number of Views for Multiview Encoding. Default %d\n", param->numViews);
10
+ H0(" --format Format of the input video 0 : normal, 1 : side-by-side, 2 : over-under Default %d\n", param->format);
11
+ H0(" --multiview-config Configuration file for Multiview Encoding\n");
12
+#endif
13
+#if ENABLE_SCC_EXT
14
+ H0(" --scc <integer> Enable screen content coding. 0: Diabled, 1:Intrablockcopy fast search with 1x2 CTUs search range, 2: Intrablockcopy Full search. Default %d\n", param->bEnableSCC);
15
+#endif
16
#ifdef SVT_HEVC
17
H0(" --nosvt Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
18
H0(" --no-svt-hme Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
19
20
free(argString);
21
}
22
23
- if (input)
24
- input->release();
25
- input = NULL;
26
- if (recon)
27
- recon->release();
28
- recon = NULL;
29
+ for (int i = 0; i < MAX_VIEWS; i++)
30
+ {
31
+ if (inputi)
32
+ inputi->release();
33
+ inputi = NULL;
34
+ }
35
+ for (int i = 0; i < MAX_LAYERS; i++)
36
+ {
37
+ if (reconi)
38
+ reconi->release();
39
+ reconi = NULL;
40
+ }
41
if (qpfile)
42
fclose(qpfile);
43
qpfile = NULL;
44
45
int inputBitDepth = 8;
46
int outputBitDepth = 0;
47
int reconFileBitDepth = 0;
48
- const char *inputfn = NULL;
49
- const char *reconfn = NULL;
50
+ char* inputfn[MAX_VIEWS] = { NULL };
51
+ for (int view = 0; view < MAX_VIEWS; view++)
52
+ {
53
+ inputfn[view] = X265_MALLOC(char, sizeof(char) * 1024);
54
+ }
55
+ const char* reconfn[MAX_LAYERS] = { NULL };
56
const char *outputfn = NULL;
57
const char *preset = NULL;
58
const char *tune = NULL;
59
60
OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
61
OPT("no-progress") this->bProgress = false;
62
OPT("output") outputfn = optarg;
63
- OPT("input") inputfn = optarg;
64
- OPT("recon") reconfn = optarg;
65
+ OPT("input") strcpy(inputfn0 , optarg);
66
+ OPT("recon") reconfn0 = optarg;
67
OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError);
68
OPT("dither") this->bDither = true;
69
OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
70
71
if (!this->scenecutAwareQpConfig)
72
x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
73
}
74
+#if ENABLE_MULTIVIEW
75
+ OPT("multiview-config")
76
+ {
77
+ this->multiViewConfig = x265_fopen(optarg, "rb");
78
+ if (!this->multiViewConfig)
79
+ x265_log_file(param, X265_LOG_ERROR, "%s Multiview config file not found or error in opening config file\n", optarg);
80
+ }
81
+#endif
82
OPT("zonefile")
83
{
84
this->zoneFile = x265_fopen(optarg, "rb");
85
86
}
87
}
88
89
- if (optind < argc && !inputfn)
90
- inputfn = argv[optind++];
91
+#if !ENABLE_MULTIVIEW
92
+ if (optind < argc && !inputfn[0])
93
+ inputfn[0] = argv[optind++];
94
+#endif
95
if (optind < argc && !outputfn)
96
outputfn = argvoptind++;
97
if (optind < argc)
98
99
showHelp(param);
100
}
101
102
- if (!inputfn || !outputfn)
103
+#if ENABLE_MULTIVIEW
104
+ if (this->multiViewConfig)
105
+ {
106
+ if (!this->parseMultiViewConfig(inputfn))
107
+ {
108
+ x265_log(NULL, X265_LOG_ERROR, "Unable to parse multiview config file \n");
109
+ fclose(this->multiViewConfig);
110
+ this->multiViewConfig = NULL;
111
+ }
112
+ }
113
+#endif
114
+ param->numLayers = param->numViews > 1 ? param->numViews : (param->numScalableLayers > 1) ? param->numScalableLayers : 1;
115
+ if (!outputfn)
116
{
117
x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
118
+ for (int view = 0; view < param->numViews; view++)
119
+ {
120
+ if (!inputfn[view])
121
+ {
122
+ x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
123
+ return true;
124
+ }
125
+ }
126
return true;
127
}
128
129
130
svtParam->encoderBitDepth = inputBitDepth;
131
}
132
#endif
133
-
134
- InputFileInfo info;
135
- info.filename = inputfn;
136
- info.depth = inputBitDepth;
137
- info.csp = param->internalCsp;
138
- info.width = param->sourceWidth;
139
- info.height = param->sourceHeight;
140
- info.fpsNum = param->fpsNum;
141
- info.fpsDenom = param->fpsDenom;
142
- info.sarWidth = param->vui.sarWidth;
143
- info.sarHeight = param->vui.sarHeight;
144
- info.skipFrames = seek;
145
- info.frameCount = 0;
146
- getParamAspectRatio(param, info.sarWidth, info.sarHeight);
147
-
148
-
149
- this->input = InputFile::open(info, this->bForceY4m);
150
- if (!this->input || this->input->isFail())
151
+ InputFileInfo info[MAX_VIEWS];
152
+ for (int i = 0; i < param->numViews - !!param->format; i++)
153
{
154
- x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
155
- return true;
156
- }
157
+ info[i].filename = inputfn[i];
158
+ info[i].depth = inputBitDepth;
159
+ info[i].csp = param->internalCsp;
160
+ info[i].width = param->sourceWidth;
161
+ info[i].height = param->sourceHeight;
162
+ info[i].fpsNum = param->fpsNum;
163
+ info[i].fpsDenom = param->fpsDenom;
164
+ info[i].sarWidth = param->vui.sarWidth;
165
+ info[i].sarHeight = param->vui.sarHeight;
166
+ info[i].skipFrames = seek;
167
+ info[i].frameCount = 0;
168
+ getParamAspectRatio(param, info[i].sarWidth, info[i].sarHeight);
169
+
170
+ this->input[i] = InputFile::open(info[i], this->bForceY4m, param->numScalableLayers > 1, param->format);
171
+ if (!this->input[i] || this->input[i]->isFail())
172
+ {
173
+ x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn[i]);
174
+ return true;
175
+ }
176
177
- if (info.depth < 8 || info.depth > 16)
178
- {
179
- x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
180
- return true;
181
+ if (info[i].depth < 8 || info[i].depth > 16)
182
+ {
183
+ x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
184
+ return true;
185
+ }
186
}
187
188
+ //TODO:Validate info params of both the views to equal values
189
/* Unconditionally accept height/width/csp/bitDepth from file info */
190
- param->sourceWidth = info.width;
191
- param->sourceHeight = info.height;
192
- param->internalCsp = info.csp;
193
- param->sourceBitDepth = info.depth;
194
+ param->sourceWidth = info[0].width;
195
+ param->sourceHeight = info[0].height;
196
+ param->internalCsp = info[0].csp;
197
+ param->sourceBitDepth = info[0].depth;
198
199
/* Accept fps and sar from file info if not specified by user */
200
if (param->fpsDenom == 0 || param->fpsNum == 0)
201
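Taken together with the new help text earlier in this x265cli.cpp hunk, the added switches are exercised from the command line roughly as follows (illustrative invocations only; file names are placeholders and the build must be configured with the matching ENABLE_ALPHA / ENABLE_MULTIVIEW / ENABLE_SCC_EXT options):

x265 --input in.y4m --alpha --output out.hevc
x265 --input in.y4m --scc 1 --output out.hevc
x265 --num-views 2 --format 1 --multiview-config multiview_config.txt --output out.hevc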
x265_3.6.tar.gz/source/x265cli.h -> x265_4.0.tar.gz/source/x265cli.h
Changed
69
1
2
{ "dup-threshold", required_argument, NULL, 0 },
3
{ "mcstf", no_argument, NULL, 0 },
4
{ "no-mcstf", no_argument, NULL, 0 },
5
+#if ENABLE_ALPHA
6
+ { "alpha", no_argument, NULL, 0 },
7
+#endif
8
+#if ENABLE_MULTIVIEW
9
+ { "num-views", required_argument, NULL, 0 },
10
+ { "multiview-config", required_argument, NULL, 0 },
11
+ { "format", required_argument, NULL, 0 },
12
+#endif
13
+#if ENABLE_SCC_EXT
14
+ { "scc", required_argument, NULL, 0 },
15
+#endif
16
#ifdef SVT_HEVC
17
{ "svt", no_argument, NULL, 0 },
18
{ "no-svt", no_argument, NULL, 0 },
19
20
21
struct CLIOptions
22
{
23
- InputFile* input;
24
- ReconFile* recon;
25
+ InputFile* input[MAX_VIEWS];
26
+ ReconFile* recon[MAX_LAYERS];
27
OutputFile* output;
28
FILE* qpfile;
29
FILE* zoneFile;
30
FILE* dolbyVisionRpu; /* File containing Dolby Vision BL RPU metadata */
31
FILE* scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
32
+#if ENABLE_MULTIVIEW
33
+ FILE* multiViewConfig; /* File containing multi-view related CLI options */
34
+#endif
35
const char* reconPlayCmd;
36
const x265_api* api;
37
x265_param* param;
38
39
static const int UPDATE_INTERVAL = 250000;
40
CLIOptions()
41
{
42
- input = NULL;
43
- recon = NULL;
44
+ for (int i = 0; i < MAX_VIEWS; i++)
45
+ input[i] = NULL;
46
+ for (int i = 0; i < MAX_LAYERS; i++)
47
+ recon[i] = NULL;
48
output = NULL;
49
qpfile = NULL;
50
zoneFile = NULL;
51
dolbyVisionRpu = NULL;
52
scenecutAwareQpConfig = NULL;
53
+#if ENABLE_MULTIVIEW
54
+ multiViewConfig = NULL;
55
+#endif
56
reconPlayCmd = NULL;
57
api = NULL;
58
param = NULL;
59
60
int rpuParser(x265_picture * pic);
61
bool parseScenecutAwareQpConfig();
62
bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
63
+#if ENABLE_MULTIVIEW
64
+ bool parseMultiViewConfig(char** fn);
65
+#endif
66
};
67
#ifdef __cplusplus
68
}
69
x265_3.6.tar.gz/x265Version.txt -> x265_4.0.tar.gz/x265Version.txt
Changed
8
1
2
#Attribute: Values
3
-repositorychangeset: aa7f602f7
4
+repositorychangeset: 6318f22
5
releasetagdistance: 1
6
-releasetag: 3.6
7
+releasetag: 4.0
8