Changes of Revision 20

x265.changes Changed
40
 
1
@@ -1,4 +1,38 @@
2
 -------------------------------------------------------------------
3
+Mon Sep 30 12:34:56 UTC 2024 - olaf@aepfle.de
4
+
5
+- Update to version 4.0
6
+  New features:
7
+  * Alpha Channel feature.
8
+  * Screen Content Coding (SCC).
9
+  * MV-HEVC feature.
10
+  Enhancements to existing features:
11
+  * Added support for VMAF v3.x.
12
+  API changes
13
+  * Add command line parameter for Alpha Channel feature :option:`--alpha`.
14
+  * Add command line parameter for SCC feature :option:`--scc 1`.
15
+  * Add command line parameters for the MV-HEVC feature
16
+    :option:`--multiview-config "multiview_config.txt"`.
17
+  Optimizations
18
+  * Arm SIMD optimizations: Several time-consuming scalar C
19
+    functions now have SIMD implementations on Arm platforms.
20
+    Existing Arm SIMD implementations have also been optimized.
21
+    These optimizations result in up to 57% faster encoding
22
+    compared to release 3.6.
23
+  * Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6
24
+    I8MM, and Armv9 SVE2 instruction set extensions. The following
25
+    algorithms now have optimized SIMD implementations: SAD, SSE,
26
+    DCT, SAO, convolution, quantization, intra_planar,
27
+    intraFilter, intrapred DC and IDCT16x16.
28
+  Bug fixes
29
+  * Fix broken y4m pipe input.
30
+  * Fix SCC crash on multipass encode.
31
+  * Fix mcstf when :option:`--bframes` value was less than 5.
32
+  * Fix lowpass DCT for high bit depth.
33
+  * Fix issue in default code flow and memory leak.
35
+
36
+-------------------------------------------------------------------
37
 Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
38
 
39
 - Update to version 3.6
40
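
For reference, the new 4.0 command-line options listed in the changelog above could be exercised roughly as follows; the file names and option combinations are illustrative only and are not part of this request:

  # Alpha Channel encode (assumes source material carrying an alpha plane)
  x265 --input input_with_alpha.y4m --alpha --output out_alpha.hevc
  # Screen Content Coding
  x265 --input screen_capture.y4m --scc 1 --output out_scc.hevc
  # MV-HEVC encode driven by a multiview configuration file
  x265 --multiview-config "multiview_config.txt" --output out_multiview.hevc
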
x265.spec Changed
73
 
1
@@ -17,12 +17,12 @@
2
 #
3
 
4
 
5
-%define sover   209
6
+%define sover   212
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{sover}
9
-%define uver    3_6
10
+%define uver    4_0
11
 Name:           x265
12
-Version:        3.6
13
+Version:        4.0
14
 Release:        0
15
 Summary:        A free h265/HEVC encoder - encoder binary
16
 License:        GPL-2.0-or-later
17
@@ -30,11 +30,20 @@
18
 URL:            https://bitbucket.org/multicoreware/x265_git
19
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
20
 Patch1:         x265.pkgconfig.patch
21
-Patch2:         x265-fix_enable512.patch
22
 Patch3:         0001-Fix-arm-flags.patch
23
 Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
24
-BuildRequires:  cmake >= 2.8.8
25
+BuildRequires:  cmake
26
+%if 0%{?suse_version} > 1500
27
 BuildRequires:  gcc-c++
28
+%else
29
+%if 0%{?sle_version} > 150500
30
+BuildRequires:  gcc13
31
+BuildRequires:  gcc13-c++
32
+%else
33
+BuildRequires:  gcc10
34
+BuildRequires:  gcc10-c++
35
+%endif
36
+%endif
37
 BuildRequires:  nasm >= 2.13
38
 BuildRequires:  pkgconfig
39
 %ifarch x86_64
40
@@ -73,16 +82,27 @@
41
 streams.
42
 
43
 %prep
44
-%setup -q -n %{name}_%{version}
45
-%autopatch -p1
46
+%autosetup -p1 -n %{name}_%{version}
47
 
48
+%build
49
+test -x "$(type -p gcc)"    && CC="$_"
50
+test -x "$(type -p g++)"    && CXX="$_"
51
+test -x "$(type -p gcc-10)" && CC="$_"
52
+test -x "$(type -p g++-10)" && CXX="$_"
53
+test -x "$(type -p gcc-13)" && CC="$_"
54
+test -x "$(type -p g++-13)" && CXX="$_"
55
+export CC="$(readlink -f ${CC})"
56
+export CXX="$(readlink -f ${CXX})"
57
+CFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
58
+CXXFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
59
 # set the version by hand
60
-sed -i "/^include(Version)/d" source/CMakeLists.txt
61
+sed -i~ "/^include(Version)/d" source/CMakeLists.txt
62
+diff -u "$_"~ "$_" && exit 1
63
 # force version number in the soname
64
-sed -i 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
65
+sed -i~ 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
66
        source/CMakeLists.txt
67
+diff -u "$_"~ "$_" && exit 1
68
 
69
-%build
70
 SOURCE_DIR="$PWD"/source
71
 COMMON_FLAGS="-DENABLE_TESTS=OFF -DENABLE_PIC=ON -Wno-dev"
72
 HIGH_BIT_DEPTH_FLAGS="-DENABLE_CLI=OFF -DENABLE_SHARED=OFF -DEXPORT_C_API=OFF -DHIGH_BIT_DEPTH=ON"
73
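
A note on the compiler-selection idiom introduced in the new %build section above: each `test -x "$(type -p ...)"` probe relies on bash's $_ (the last argument of the previous command, here the expanded compiler path), and later probes overwrite earlier assignments, so the newest compiler actually installed wins before readlink -f canonicalizes the path. A minimal standalone sketch of the same idea, with example compiler names:

  #!/bin/bash
  # Prefer the newest g++ found; "$_" captures the path probed by the preceding test.
  test -x "$(type -p g++)"    && CXX="$_"
  test -x "$(type -p g++-10)" && CXX="$_"
  test -x "$(type -p g++-13)" && CXX="$_"
  export CXX="$(readlink -f "${CXX}")"
  echo "using CXX=${CXX}"
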
0001-Fix-arm-flags.patch Changed
74
 
1
@@ -6,11 +6,9 @@
2
  source/CMakeLists.txt | 7 ++-----
3
  1 file changed, 2 insertions(+), 5 deletions(-)
4
 
5
-diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
6
-index ab5ddfe..eb9b19b 100755
7
 --- a/source/CMakeLists.txt
8
 +++ b/source/CMakeLists.txt
9
-@@ -253,10 +253,7 @@ if(GCC)
10
+@@ -257,10 +257,7 @@
11
      elseif(ARM)
12
          find_package(Neon)
13
          if(CPU_HAS_NEON)
14
@@ -20,20 +18,42 @@
15
 -            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
16
          endif()
17
      endif()
18
-   if(ARM64 OR CROSS_COMPILE_ARM64)
19
-@@ -265,13 +262,13 @@ if(GCC)
20
-         find_package(SVE2)
21
-         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
22
-             message(STATUS "Found SVE2")
23
--          set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
24
-+          set(ARM_ARGS -fPIC -flax-vector-conversions)
25
-             add_definitions(-DHAVE_SVE2)
26
-             add_definitions(-DHAVE_SVE)
27
-             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
28
-         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
29
-             message(STATUS "Found SVE")
30
--          set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
31
-+          set(ARM_ARGS -fPIC -flax-vector-conversions)
32
-             add_definitions(-DHAVE_SVE)
33
-             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
34
-         elseif(CPU_HAS_NEON)
35
+     if(ARM64)
36
+--- a/source/cmake/FindNEON_DOTPROD.cmake
37
++++ b/source/cmake/FindNEON_DOTPROD.cmake
38
+@@ -17,5 +17,5 @@
39
+ endif()
40
+ 
41
+ if(has_dot_product)
42
+-    set(CPU_HAS_NEON_DOTPROD 1)
43
++    set(CPU_HAS_NEON_DOTPROD 0)
44
+ endif()
45
+--- a/source/cmake/FindNEON_I8MM.cmake
46
++++ b/source/cmake/FindNEON_I8MM.cmake
47
+@@ -17,5 +17,5 @@
48
+ endif()
49
+ 
50
+ if(has_i8mm)
51
+-    set(CPU_HAS_NEON_I8MM 1)
52
++    set(CPU_HAS_NEON_I8MM 0)
53
+ endif()
54
+--- a/source/cmake/FindSVE.cmake
55
++++ b/source/cmake/FindSVE.cmake
56
+@@ -17,5 +17,5 @@
57
+ endif()
58
+ 
59
+ if(sve_version)
60
+-    set(CPU_HAS_SVE 1)
61
++    set(CPU_HAS_SVE 0)
62
+ endif()
63
+--- a/source/cmake/FindSVE2.cmake
64
++++ b/source/cmake/FindSVE2.cmake
65
+@@ -17,6 +17,6 @@
66
+ endif()
67
+ 
68
+ if(sve2_version)
69
+-    set(CPU_HAS_SVE 1)
70
+-    set(CPU_HAS_SVE2 1)
71
++    set(CPU_HAS_SVE 0)
72
++    set(CPU_HAS_SVE2 0)
73
+ endif()
74
0004-Do-not-build-with-assembly-support-on-arm.patch Changed
22
 
1
@@ -6,11 +6,9 @@
2
  source/CMakeLists.txt | 9 ---------
3
  1 file changed, 9 deletions(-)
4
 
5
-diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
6
-index 672cc2d..f112330 100755
7
 --- a/source/CMakeLists.txt
8
 +++ b/source/CMakeLists.txt
9
-@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
10
+@@ -72,15 +72,6 @@
11
          add_definitions(-DPPC64=1)
12
          message(STATUS "Detected POWER PPC64 target processor")
13
      endif()
14
@@ -24,5 +22,5 @@
15
 -    set(ARM 1)
16
 -    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
17
  elseif(ARM64MATCH GREATER "-1")
18
-     #if(CROSS_COMPILE_ARM64)
19
-         #message(STATUS "Cross compiling for ARM64 arch")
20
+     message(STATUS "Detected ARM64 target processor")
21
+     set(ARM64 1)
22
x265-fix_enable512.patch Deleted
28
 
1
@@ -1,26 +0,0 @@
2
---- a/source/common/cpu.cpp
3
-+++ b/source/common/cpu.cpp
4
-@@ -110,6 +110,11 @@ const cpu_name_t cpu_names =
5
-     { "", 0 },
6
- };
7
- 
8
-+bool detect512()
9
-+{
10
-+    return(enable512);
11
-+}
12
-+
13
- #if X265_ARCH_X86
14
- 
15
- extern "C" {
16
-@@ -123,11 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr);
17
- #pragma warning(disable: 4309) // truncation of constant value
18
- #endif
19
- 
20
--bool detect512()
21
--{
22
--    return(enable512);
23
--}
24
--
25
- uint32_t cpu_detect(bool benableavx512 )
26
- {
27
- 
28
baselibs.conf Changed
4
 
1
@@ -1,1 +1,1 @@
2
-libx265-209
3
+libx265-212
4
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S Deleted
201
 
1
@@ -1,1436 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// This file contains the macros written using NEON instruction set
26
-// that are also used by the SVE2 functions
27
-
28
-// Macros below follow these conventions:
29
-// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
-// - constants in registers: v24, v25, v26, v27, v31
31
-// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
-// - _32b macros output a result in v17.4s
33
-// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
-
35
-#include "asm.S"
36
-
37
-.arch           armv8-a
38
-
39
-#ifdef __APPLE__
40
-.section __RODATA,__rodata
41
-#else
42
-.section .rodata
43
-#endif
44
-
45
-.align 4
46
-
47
-.macro vextin8 v
48
-    ldp             d6, d7, x11, #16
49
-.if \v == 0
50
-    // qpel_filter_0 only uses values in v3
51
-    ext             v3.8b, v6.8b, v7.8b, #4
52
-.else
53
-.if \v != 3
54
-    ext             v0.8b, v6.8b, v7.8b, #1
55
-.endif
56
-    ext             v1.8b, v6.8b, v7.8b, #2
57
-    ext             v2.8b, v6.8b, v7.8b, #3
58
-    ext             v3.8b, v6.8b, v7.8b, #4
59
-    ext             v4.8b, v6.8b, v7.8b, #5
60
-    ext             v5.8b, v6.8b, v7.8b, #6
61
-    ext             v6.8b, v6.8b, v7.8b, #7
62
-.endif
63
-.endm
64
-
65
-.macro vextin8_64 v
66
-    ldp             q6, q7, x11, #32
67
-.if \v == 0
68
-    // qpel_filter_0 only uses values in v3
69
-    ext             v3.16b, v6.16b, v7.16b, #4
70
-.else
71
-.if \v != 3
72
-    // qpel_filter_3 does not use values in v0
73
-    ext             v0.16b, v6.16b, v7.16b, #1
74
-.endif
75
-    ext             v1.16b, v6.16b, v7.16b, #2
76
-    ext             v2.16b, v6.16b, v7.16b, #3
77
-    ext             v3.16b, v6.16b, v7.16b, #4
78
-    ext             v4.16b, v6.16b, v7.16b, #5
79
-    ext             v5.16b, v6.16b, v7.16b, #6
80
-.if \v == 1
81
-    ext             v6.16b, v6.16b, v7.16b, #7
82
-    // qpel_filter_1 does not use v7
83
-.else
84
-    ext             v16.16b, v6.16b, v7.16b, #7
85
-    ext             v7.16b, v6.16b, v7.16b, #8
86
-    mov             v6.16b, v16.16b
87
-.endif
88
-.endif
89
-.endm
90
-
91
-.macro vextin8_chroma v
92
-    ldp             d6, d7, x11, #16
93
-.if \v == 0
94
-    // qpel_filter_chroma_0 only uses values in v1
95
-    ext             v1.8b, v6.8b, v7.8b, #2
96
-.else
97
-    ext             v0.8b, v6.8b, v7.8b, #1
98
-    ext             v1.8b, v6.8b, v7.8b, #2
99
-    ext             v2.8b, v6.8b, v7.8b, #3
100
-    ext             v3.8b, v6.8b, v7.8b, #4
101
-.endif
102
-.endm
103
-
104
-.macro vextin8_chroma_64 v
105
-    ldp             q16, q17, x11, #32
106
-.if \v == 0
107
-    // qpel_filter_chroma_0 only uses values in v1
108
-    ext             v1.16b, v16.16b, v17.16b, #2
109
-.else
110
-    ext             v0.16b, v16.16b, v17.16b, #1
111
-    ext             v1.16b, v16.16b, v17.16b, #2
112
-    ext             v2.16b, v16.16b, v17.16b, #3
113
-    ext             v3.16b, v16.16b, v17.16b, #4
114
-.endif
115
-.endm
116
-
117
-.macro qpel_load_32b v
118
-.if \v == 0
119
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
120
-    ld1             {v3.8b}, x6, x1
121
-.elseif \v == 1 || \v == 2 || \v == 3
122
-.if \v != 3                           // not used in qpel_filter_3
123
-    ld1             {v0.8b}, x6, x1
124
-.else
125
-    add             x6, x6, x1
126
-.endif
127
-    ld1             {v1.8b}, x6, x1
128
-    ld1             {v2.8b}, x6, x1
129
-    ld1             {v3.8b}, x6, x1
130
-    ld1             {v4.8b}, x6, x1
131
-    ld1             {v5.8b}, x6, x1
132
-.if \v != 1                           // not used in qpel_filter_1
133
-    ld1             {v6.8b}, x6, x1
134
-    ld1             {v7.8b}, x6
135
-.else
136
-    ld1             {v6.8b}, x6
137
-.endif
138
-.endif
139
-.endm
140
-
141
-.macro qpel_load_64b v
142
-.if \v == 0
143
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
144
-    ld1             {v3.16b}, x6, x1
145
-.elseif \v == 1 || \v == 2 || \v == 3
146
-.if \v != 3                           // not used in qpel_filter_3
147
-    ld1             {v0.16b}, x6, x1
148
-.else
149
-    add             x6, x6, x1
150
-.endif
151
-    ld1             {v1.16b}, x6, x1
152
-    ld1             {v2.16b}, x6, x1
153
-    ld1             {v3.16b}, x6, x1
154
-    ld1             {v4.16b}, x6, x1
155
-    ld1             {v5.16b}, x6, x1
156
-.if \v != 1                           // not used in qpel_filter_1
157
-    ld1             {v6.16b}, x6, x1
158
-    ld1             {v7.16b}, x6
159
-.else
160
-    ld1             {v6.16b}, x6
161
-.endif
162
-.endif
163
-.endm
164
-
165
-.macro qpel_chroma_load_32b v
166
-.if \v == 0
167
-    // qpel_filter_chroma_0 only uses values in v1
168
-    add             x6, x6, x1
169
-    ldr             d1, x6
170
-.else
171
-    ld1             {v0.8b}, x6, x1
172
-    ld1             {v1.8b}, x6, x1
173
-    ld1             {v2.8b}, x6, x1
174
-    ld1             {v3.8b}, x6
175
-.endif
176
-.endm
177
-
178
-.macro qpel_chroma_load_64b v
179
-.if \v == 0
180
-    // qpel_filter_chroma_0 only uses values in v1
181
-    add             x6, x6, x1
182
-    ldr             q1, x6
183
-.else
184
-    ld1             {v0.16b}, x6, x1
185
-    ld1             {v1.16b}, x6, x1
186
-    ld1             {v2.16b}, x6, x1
187
-    ld1             {v3.16b}, x6
188
-.endif
189
-.endm
190
-
191
-//          a, b,   c,  d,  e,   f, g,  h
192
-// .hword   0, 0,   0, 64,  0,   0, 0,  0
193
-.macro qpel_start_0
194
-    movi            v24.16b, #64
195
-.endm
196
-
197
-.macro qpel_filter_0_32b
198
-    umull           v17.8h, v3.8b, v24.8b    // 64*d
199
-.endm
200
-
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S Deleted
201
 
1
@@ -1,1282 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// Functions in this file:
26
-// ***** luma_vpp *****
27
-// ***** luma_vps *****
28
-// ***** luma_vsp *****
29
-// ***** luma_vss *****
30
-// ***** luma_hpp *****
31
-// ***** luma_hps *****
32
-// ***** chroma_vpp *****
33
-// ***** chroma_vps *****
34
-// ***** chroma_vsp *****
35
-// ***** chroma_vss *****
36
-// ***** chroma_hpp *****
37
-// ***** chroma_hps *****
38
-
39
-#include "asm-sve.S"
40
-#include "ipfilter-common.S"
41
-
42
-.arch armv8-a+sve2
43
-
44
-#ifdef __APPLE__
45
-.section __RODATA,__rodata
46
-#else
47
-.section .rodata
48
-#endif
49
-
50
-.align 4
51
-
52
-.text
53
-
54
-.macro qpel_load_32b_sve2 v
55
-.if \v == 0
56
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
57
-    ld1b            {z3.h}, p0/z, x6
58
-    add             x6, x6, x1
59
-.elseif \v == 1 || \v == 2 || \v == 3
60
-.if \v != 3                           // not used in qpel_filter_3
61
-    ld1b            {z0.h}, p0/z, x6
62
-    add             x6, x6, x1
63
-.else
64
-    add             x6, x6, x1
65
-.endif
66
-    ld1b            {z1.h}, p0/z, x6
67
-    add             x6, x6, x1
68
-    ld1b            {z2.h}, p0/z, x6
69
-    add             x6, x6, x1
70
-    ld1b            {z3.h}, p0/z, x6
71
-    add             x6, x6, x1
72
-    ld1b            {z4.h}, p0/z, x6
73
-    add             x6, x6, x1
74
-    ld1b            {z5.h}, p0/z, x6
75
-    add             x6, x6, x1
76
-.if \v != 1                           // not used in qpel_filter_1
77
-    ld1b            {z6.h}, p0/z, x6
78
-    add             x6, x6, x1
79
-    ld1b            {z7.h}, p0/z, x6
80
-.else
81
-    ld1b            {z6.h}, p0/z, x6
82
-.endif
83
-.endif
84
-.endm
85
-
86
-.macro qpel_load_64b_sve2_gt_16 v
87
-.if \v == 0
88
-    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
89
-    ld1b            {z3.h}, p2/z, x6
90
-    add             x6, x6, x1
91
-.elseif \v == 1 || \v == 2 || \v == 3
92
-.if \v != 3                           // not used in qpel_filter_3
93
-    ld1b            {z0.h}, p2/z, x6
94
-    add             x6, x6, x1
95
-.else
96
-    add             x6, x6, x1
97
-.endif
98
-    ld1b            {z1.h}, p2/z, x6
99
-    add             x6, x6, x1
100
-    ld1b            {z2.h}, p2/z, x6
101
-    add             x6, x6, x1
102
-    ld1b            {z3.h}, p2/z, x6
103
-    add             x6, x6, x1
104
-    ld1b            {z4.h}, p2/z, x6
105
-    add             x6, x6, x1
106
-    ld1b            {z5.h}, p2/z, x6
107
-    add             x6, x6, x1
108
-.if \v != 1                           // not used in qpel_filter_1
109
-    ld1b            {z6.h}, p2/z, x6
110
-    add             x6, x6, x1
111
-    ld1b            {z7.h}, p2/z, x6
112
-.else
113
-    ld1b            {z6.h}, p2/z, x6
114
-.endif
115
-.endif
116
-.endm
117
-
118
-.macro qpel_chroma_load_32b_sve2 v
119
-.if \v == 0
120
-    // qpel_filter_chroma_0 only uses values in v1
121
-    add             x6, x6, x1
122
-    ld1b            {z1.h}, p0/z, x6
123
-.else
124
-    ld1b            {z0.h}, p0/z, x6
125
-    add             x6, x6, x1
126
-    ld1b            {z1.h}, p0/z, x6
127
-    add             x6, x6, x1
128
-    ld1b            {z2.h}, p0/z, x6
129
-    add             x6, x6, x1
130
-    ld1b            {z3.h}, p0/z, x6
131
-.endif
132
-.endm
133
-
134
-.macro qpel_start_sve2_0
135
-    mov             z24.h, #64
136
-.endm
137
-
138
-.macro qpel_filter_sve2_0_32b
139
-    mul             z17.h, z3.h, z24.h    // 64*d
140
-.endm
141
-
142
-.macro qpel_filter_sve2_0_64b
143
-    qpel_filter_sve2_0_32b
144
-    mul             z18.h, z11.h, z24.h
145
-.endm
146
-
147
-.macro qpel_start_sve2_1
148
-    mov             z24.h, #58
149
-    mov             z25.h, #10
150
-    mov             z26.h, #17
151
-    mov             z27.h, #5
152
-.endm
153
-
154
-.macro qpel_filter_sve2_1_32b
155
-    mul             z19.h, z2.h, z25.h  // c*10
156
-    mul             z17.h, z3.h, z24.h  // d*58
157
-    mul             z21.h, z4.h, z26.h  // e*17
158
-    mul             z23.h, z5.h, z27.h  // f*5
159
-    sub             z17.h, z17.h, z19.h // d*58 - c*10
160
-    lsl             z18.h, z1.h, #2      // b*4
161
-    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
-    sub             z21.h, z6.h, z0.h   // g - a
163
-    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
-    sub             z21.h, z21.h, z23.h // g - a - f*5
165
-    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
-.endm
167
-
168
-.macro qpel_filter_sve2_1_64b
169
-    qpel_filter_sve2_1_32b
170
-    mul             z20.h, z10.h, z25.h  // c*10
171
-    mul             z18.h, z11.h, z24.h  // d*58
172
-    mul             z21.h, z12.h, z26.h  // e*17
173
-    mul             z23.h, z13.h, z27.h  // f*5
174
-    sub             z18.h, z18.h, z20.h   // d*58 - c*10
175
-    lsl             z28.h, z30.h, #2       // b*4
176
-    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
177
-    sub             z21.h, z14.h, z29.h   // g - a
178
-    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
179
-    sub             z21.h, z21.h, z23.h   // g - a - f*5
180
-    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
-.endm
182
-
183
-.macro qpel_start_sve2_2
184
-    mov             z24.h, #11
185
-    mov             z25.h, #40
186
-.endm
187
-
188
-.macro qpel_filter_sve2_2_32b
189
-    add             z17.h, z3.h, z4.h     // d + e
190
-    add             z19.h, z2.h, z5.h     // c + f
191
-    add             z23.h, z1.h, z6.h     // b + g
192
-    add             z21.h, z0.h, z7.h     // a + h
193
-    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
194
-    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
195
-    lsl             z23.h, z23.h, #2       // (b + g) * 4
196
-    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
197
-    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
198
-    sub             z17.h, z17.h, z19.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
-.endm
200
-
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S Deleted
201
 
1
@@ -1,1054 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2021 MulticoreWare, Inc
4
- *
5
- * Authors: Sebastian Pop <spop@amazon.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// Functions in this file:
26
-// ***** luma_vpp *****
27
-// ***** luma_vps *****
28
-// ***** luma_vsp *****
29
-// ***** luma_vss *****
30
-// ***** luma_hpp *****
31
-// ***** luma_hps *****
32
-// ***** chroma_vpp *****
33
-// ***** chroma_vps *****
34
-// ***** chroma_vsp *****
35
-// ***** chroma_vss *****
36
-// ***** chroma_hpp *****
37
-// ***** chroma_hps *****
38
-
39
-#include "asm.S"
40
-#include "ipfilter-common.S"
41
-
42
-#ifdef __APPLE__
43
-.section __RODATA,__rodata
44
-#else
45
-.section .rodata
46
-#endif
47
-
48
-.align 4
49
-
50
-.text
51
-
52
-// ***** luma_vpp *****
53
-// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
-.macro LUMA_VPP_4xN h
55
-function x265_interp_8tap_vert_pp_4x\h\()_neon
56
-    movrel          x10, g_luma_s16
57
-    sub             x0, x0, x1
58
-    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
59
-    lsl             x4, x4, #4
60
-    ldr             q0, x10, x4              // q0 = luma interpolate coeff
61
-    dup             v24.8h, v0.h0
62
-    dup             v25.8h, v0.h1
63
-    trn1            v24.2d, v24.2d, v25.2d
64
-    dup             v26.8h, v0.h2
65
-    dup             v27.8h, v0.h3
66
-    trn1            v26.2d, v26.2d, v27.2d
67
-    dup             v28.8h, v0.h4
68
-    dup             v29.8h, v0.h5
69
-    trn1            v28.2d, v28.2d, v29.2d
70
-    dup             v30.8h, v0.h6
71
-    dup             v31.8h, v0.h7
72
-    trn1            v30.2d, v30.2d, v31.2d
73
-
74
-    // prepare to load 8 lines
75
-    ld1             {v0.s}0, x0, x1
76
-    ld1             {v0.s}1, x0, x1
77
-    ushll           v0.8h, v0.8b, #0
78
-    ld1             {v1.s}0, x0, x1
79
-    ld1             {v1.s}1, x0, x1
80
-    ushll           v1.8h, v1.8b, #0
81
-    ld1             {v2.s}0, x0, x1
82
-    ld1             {v2.s}1, x0, x1
83
-    ushll           v2.8h, v2.8b, #0
84
-    ld1             {v3.s}0, x0, x1
85
-    ld1             {v3.s}1, x0, x1
86
-    ushll           v3.8h, v3.8b, #0
87
-
88
-    mov             x9, #\h
89
-.loop_4x\h:
90
-    ld1             {v4.s}0, x0, x1
91
-    ld1             {v4.s}1, x0, x1
92
-    ushll           v4.8h, v4.8b, #0
93
-
94
-    // row0-1
95
-    mul             v16.8h, v0.8h, v24.8h
96
-    ext             v21.16b, v0.16b, v1.16b, #8
97
-    mul             v17.8h, v21.8h, v24.8h
98
-    mov             v0.16b, v1.16b
99
-
100
-    // row2-3
101
-    mla             v16.8h, v1.8h, v26.8h
102
-    ext             v21.16b, v1.16b, v2.16b, #8
103
-    mla             v17.8h, v21.8h, v26.8h
104
-    mov             v1.16b, v2.16b
105
-
106
-    // row4-5
107
-    mla             v16.8h, v2.8h, v28.8h
108
-    ext             v21.16b, v2.16b, v3.16b, #8
109
-    mla             v17.8h, v21.8h, v28.8h
110
-    mov             v2.16b, v3.16b
111
-
112
-    // row6-7
113
-    mla             v16.8h, v3.8h, v30.8h
114
-    ext             v21.16b, v3.16b, v4.16b, #8
115
-    mla             v17.8h, v21.8h, v30.8h
116
-    mov             v3.16b, v4.16b
117
-
118
-    // sum row0-7
119
-    trn1            v20.2d, v16.2d, v17.2d
120
-    trn2            v21.2d, v16.2d, v17.2d
121
-    add             v16.8h, v20.8h, v21.8h
122
-
123
-    sqrshrun        v16.8b,  v16.8h,  #6
124
-    st1             {v16.s}0, x2, x3
125
-    st1             {v16.s}1, x2, x3
126
-
127
-    sub             x9, x9, #2
128
-    cbnz            x9, .loop_4x\h
129
-    ret
130
-endfunc
131
-.endm
132
-
133
-LUMA_VPP_4xN 4
134
-LUMA_VPP_4xN 8
135
-LUMA_VPP_4xN 16
136
-
137
-// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
-.macro LUMA_VPP w, h
139
-function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
-    cmp             x4, #0
141
-    b.eq            0f
142
-    cmp             x4, #1
143
-    b.eq            1f
144
-    cmp             x4, #2
145
-    b.eq            2f
146
-    cmp             x4, #3
147
-    b.eq            3f
148
-0:
149
-    FILTER_LUMA_VPP \w, \h, 0
150
-1:
151
-    FILTER_LUMA_VPP \w, \h, 1
152
-2:
153
-    FILTER_LUMA_VPP \w, \h, 2
154
-3:
155
-    FILTER_LUMA_VPP \w, \h, 3
156
-endfunc
157
-.endm
158
-
159
-LUMA_VPP 8, 4
160
-LUMA_VPP 8, 8
161
-LUMA_VPP 8, 16
162
-LUMA_VPP 8, 32
163
-LUMA_VPP 12, 16
164
-LUMA_VPP 16, 4
165
-LUMA_VPP 16, 8
166
-LUMA_VPP 16, 16
167
-LUMA_VPP 16, 32
168
-LUMA_VPP 16, 64
169
-LUMA_VPP 16, 12
170
-LUMA_VPP 24, 32
171
-LUMA_VPP 32, 8
172
-LUMA_VPP 32, 16
173
-LUMA_VPP 32, 32
174
-LUMA_VPP 32, 64
175
-LUMA_VPP 32, 24
176
-LUMA_VPP 48, 64
177
-LUMA_VPP 64, 16
178
-LUMA_VPP 64, 32
179
-LUMA_VPP 64, 64
180
-LUMA_VPP 64, 48
181
-
182
-// ***** luma_vps *****
183
-// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
-.macro LUMA_VPS_4xN h
185
-function x265_interp_8tap_vert_ps_4x\h\()_neon
186
-    lsl             x3, x3, #1
187
-    lsl             x5, x4, #6
188
-    lsl             x4, x1, #2
189
-    sub             x4, x4, x1
190
-    sub             x0, x0, x4
191
-
192
-    mov             w6, #8192
193
-    dup             v28.4s, w6
194
-    mov             x4, #\h
195
-    movrel          x12, g_lumaFilter
196
-    add             x12, x12, x5
197
-    ld1r            {v16.2d}, x12, #8
198
-    ld1r            {v17.2d}, x12, #8
199
-    ld1r            {v18.2d}, x12, #8
200
-    ld1r            {v19.2d}, x12, #8
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S Deleted
201
 
1
@@ -1,514 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-// This file contains the macros written using NEON instruction set
26
-// that are also used by the SVE2 functions
27
-
28
-#include "asm.S"
29
-
30
-.arch           armv8-a
31
-
32
-#ifdef __APPLE__
33
-.section __RODATA,__rodata
34
-#else
35
-.section .rodata
36
-#endif
37
-
38
-.align 4
39
-
40
-.macro SAD_START_4 f
41
-    ld1             {v0.s}0, x0, x1
42
-    ld1             {v0.s}1, x0, x1
43
-    ld1             {v1.s}0, x2, x3
44
-    ld1             {v1.s}1, x2, x3
45
-    \f              v16.8h, v0.8b, v1.8b
46
-.endm
47
-
48
-.macro SAD_4 h
49
-.rept \h / 2 - 1
50
-    SAD_START_4 uabal
51
-.endr
52
-.endm
53
-
54
-.macro SAD_START_8 f
55
-    ld1             {v0.8b}, x0, x1
56
-    ld1             {v1.8b}, x2, x3
57
-    ld1             {v2.8b}, x0, x1
58
-    ld1             {v3.8b}, x2, x3
59
-    \f              v16.8h, v0.8b, v1.8b
60
-    \f              v17.8h, v2.8b, v3.8b
61
-.endm
62
-
63
-.macro SAD_8 h
64
-.rept \h / 2 - 1
65
-    SAD_START_8 uabal
66
-.endr
67
-.endm
68
-
69
-.macro SAD_START_16 f
70
-    ld1             {v0.16b}, x0, x1
71
-    ld1             {v1.16b}, x2, x3
72
-    ld1             {v2.16b}, x0, x1
73
-    ld1             {v3.16b}, x2, x3
74
-    \f              v16.8h, v0.8b, v1.8b
75
-    \f\()2          v17.8h, v0.16b, v1.16b
76
-    uabal           v16.8h, v2.8b, v3.8b
77
-    uabal2          v17.8h, v2.16b, v3.16b
78
-.endm
79
-
80
-.macro SAD_16 h
81
-.rept \h / 2 - 1
82
-    SAD_START_16 uabal
83
-.endr
84
-.endm
85
-
86
-.macro SAD_START_32
87
-    movi            v16.16b, #0
88
-    movi            v17.16b, #0
89
-    movi            v18.16b, #0
90
-    movi            v19.16b, #0
91
-.endm
92
-
93
-.macro SAD_32
94
-    ld1             {v0.16b-v1.16b}, x0, x1
95
-    ld1             {v2.16b-v3.16b}, x2, x3
96
-    ld1             {v4.16b-v5.16b}, x0, x1
97
-    ld1             {v6.16b-v7.16b}, x2, x3
98
-    uabal           v16.8h, v0.8b, v2.8b
99
-    uabal2          v17.8h, v0.16b, v2.16b
100
-    uabal           v18.8h, v1.8b, v3.8b
101
-    uabal2          v19.8h, v1.16b, v3.16b
102
-    uabal           v16.8h, v4.8b, v6.8b
103
-    uabal2          v17.8h, v4.16b, v6.16b
104
-    uabal           v18.8h, v5.8b, v7.8b
105
-    uabal2          v19.8h, v5.16b, v7.16b
106
-.endm
107
-
108
-.macro SAD_END_32
109
-    add             v16.8h, v16.8h, v17.8h
110
-    add             v17.8h, v18.8h, v19.8h
111
-    add             v16.8h, v16.8h, v17.8h
112
-    uaddlv          s0, v16.8h
113
-    fmov            w0, s0
114
-    ret
115
-.endm
116
-
117
-.macro SAD_START_64
118
-    movi            v16.16b, #0
119
-    movi            v17.16b, #0
120
-    movi            v18.16b, #0
121
-    movi            v19.16b, #0
122
-    movi            v20.16b, #0
123
-    movi            v21.16b, #0
124
-    movi            v22.16b, #0
125
-    movi            v23.16b, #0
126
-.endm
127
-
128
-.macro SAD_64
129
-    ld1             {v0.16b-v3.16b}, x0, x1
130
-    ld1             {v4.16b-v7.16b}, x2, x3
131
-    ld1             {v24.16b-v27.16b}, x0, x1
132
-    ld1             {v28.16b-v31.16b}, x2, x3
133
-    uabal           v16.8h, v0.8b, v4.8b
134
-    uabal2          v17.8h, v0.16b, v4.16b
135
-    uabal           v18.8h, v1.8b, v5.8b
136
-    uabal2          v19.8h, v1.16b, v5.16b
137
-    uabal           v20.8h, v2.8b, v6.8b
138
-    uabal2          v21.8h, v2.16b, v6.16b
139
-    uabal           v22.8h, v3.8b, v7.8b
140
-    uabal2          v23.8h, v3.16b, v7.16b
141
-
142
-    uabal           v16.8h, v24.8b, v28.8b
143
-    uabal2          v17.8h, v24.16b, v28.16b
144
-    uabal           v18.8h, v25.8b, v29.8b
145
-    uabal2          v19.8h, v25.16b, v29.16b
146
-    uabal           v20.8h, v26.8b, v30.8b
147
-    uabal2          v21.8h, v26.16b, v30.16b
148
-    uabal           v22.8h, v27.8b, v31.8b
149
-    uabal2          v23.8h, v27.16b, v31.16b
150
-.endm
151
-
152
-.macro SAD_END_64
153
-    add             v16.8h, v16.8h, v17.8h
154
-    add             v17.8h, v18.8h, v19.8h
155
-    add             v16.8h, v16.8h, v17.8h
156
-    uaddlp          v16.4s, v16.8h
157
-    add             v18.8h, v20.8h, v21.8h
158
-    add             v19.8h, v22.8h, v23.8h
159
-    add             v17.8h, v18.8h, v19.8h
160
-    uaddlp          v17.4s, v17.8h
161
-    add             v16.4s, v16.4s, v17.4s
162
-    uaddlv          d0, v16.4s
163
-    fmov            x0, d0
164
-    ret
165
-.endm
166
-
167
-.macro SAD_START_12
168
-    movrel          x12, sad12_mask
169
-    ld1             {v31.16b}, x12
170
-    movi            v16.16b, #0
171
-    movi            v17.16b, #0
172
-.endm
173
-
174
-.macro SAD_12
175
-    ld1             {v0.16b}, x0, x1
176
-    and             v0.16b, v0.16b, v31.16b
177
-    ld1             {v1.16b}, x2, x3
178
-    and             v1.16b, v1.16b, v31.16b
179
-    ld1             {v2.16b}, x0, x1
180
-    and             v2.16b, v2.16b, v31.16b
181
-    ld1             {v3.16b}, x2, x3
182
-    and             v3.16b, v3.16b, v31.16b
183
-    uabal           v16.8h, v0.8b, v1.8b
184
-    uabal2          v17.8h, v0.16b, v1.16b
185
-    uabal           v16.8h, v2.8b, v3.8b
186
-    uabal2          v17.8h, v2.16b, v3.16b
187
-.endm
188
-
189
-.macro SAD_END_12
190
-    add             v16.8h, v16.8h, v17.8h
191
-    uaddlv          s0, v16.8h
192
-    fmov            w0, s0
193
-    ret
194
-.endm
195
-
196
-.macro SAD_START_24
197
-    movi            v16.16b, #0
198
-    movi            v17.16b, #0
199
-    movi            v18.16b, #0
200
-    sub             x1, x1, #16
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S Deleted
201
 
1
@@ -1,511 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm-sve.S"
26
-#include "sad-a-common.S"
27
-
28
-.arch armv8-a+sve2
29
-
30
-#ifdef __APPLE__
31
-.section __RODATA,__rodata
32
-#else
33
-.section .rodata
34
-#endif
35
-
36
-.align 4
37
-
38
-.text
39
-
40
-.macro SAD_SVE2_16 h
41
-    mov             z16.d, #0
42
-    ptrue           p0.h, vl16
43
-.rept \h
44
-    ld1b            {z0.h}, p0/z, x0
45
-    ld1b            {z2.h}, p0/z, x2
46
-    add             x0, x0, x1
47
-    add             x2, x2, x3
48
-    uaba            z16.h, z0.h, z2.h
49
-.endr
50
-    uaddv           d0, p0, z16.h
51
-    fmov            w0, s0
52
-    ret
53
-.endm
54
-
55
-.macro SAD_SVE2_32 h
56
-    ptrue           p0.b, vl32
57
-.rept \h
58
-    ld1b            {z0.b}, p0/z, x0
59
-    ld1b            {z4.b}, p0/z, x2
60
-    add             x0, x0, x1
61
-    add             x2, x2, x3
62
-    uabalb          z16.h, z0.b, z4.b
63
-    uabalt          z16.h, z0.b, z4.b
64
-.endr
65
-    uaddv           d0, p0, z16.h
66
-    fmov            w0, s0
67
-    ret
68
-.endm
69
-
70
-.macro SAD_SVE2_64 h
71
-    cmp             x9, #48
72
-    bgt             .vl_gt_48_pixel_sad_64x\h
73
-    mov             z16.d, #0
74
-    mov             z17.d, #0
75
-    mov             z18.d, #0
76
-    mov             z19.d, #0
77
-    ptrue           p0.b, vl32
78
-.rept \h
79
-    ld1b            {z0.b}, p0/z, x0
80
-    ld1b            {z1.b}, p0/z, x0, #1, mul vl
81
-    ld1b            {z4.b}, p0/z, x2
82
-    ld1b            {z5.b}, p0/z, x2, #1, mul vl
83
-    add             x0, x0, x1
84
-    add             x2, x2, x3
85
-    uabalb          z16.h, z0.b, z4.b
86
-    uabalt          z17.h, z0.b, z4.b
87
-    uabalb          z18.h, z1.b, z5.b
88
-    uabalt          z19.h, z1.b, z5.b
89
-.endr
90
-    add             z16.h, z16.h, z17.h
91
-    add             z17.h, z18.h, z19.h
92
-    add             z16.h, z16.h, z17.h
93
-    uadalp          z24.s, p0/m, z16.h
94
-    uaddv           d5, p0, z24.s
95
-    fmov            x0, d5
96
-    ret
97
-.vl_gt_48_pixel_sad_64x\h\():
98
-    mov             z16.d, #0
99
-    mov             z17.d, #0
100
-    mov             z24.d, #0
101
-    ptrue           p0.b, vl64
102
-.rept \h
103
-    ld1b            {z0.b}, p0/z, x0
104
-    ld1b            {z4.b}, p0/z, x2
105
-    add             x0, x0, x1
106
-    add             x2, x2, x3
107
-    uabalb          z16.h, z0.b, z4.b
108
-    uabalt          z17.h, z0.b, z4.b
109
-.endr
110
-    add             z16.h, z16.h, z17.h
111
-    uadalp          z24.s, p0/m, z16.h
112
-    uaddv           d5, p0, z24.s
113
-    fmov            x0, d5
114
-    ret
115
-.endm
116
-
117
-.macro SAD_SVE2_24 h
118
-    mov             z16.d, #0
119
-    mov             x10, #24
120
-    mov             x11, #0
121
-    whilelt         p0.b, x11, x10
122
-.rept \h
123
-    ld1b            {z0.b}, p0/z, x0
124
-    ld1b            {z8.b}, p0/z, x2
125
-    add             x0, x0, x1
126
-    add             x2, x2, x3
127
-    uabalb          z16.h, z0.b, z8.b
128
-    uabalt          z16.h, z0.b, z8.b
129
-.endr
130
-    uaddv           d5, p0, z16.h
131
-    fmov            w0, s5
132
-    ret
133
-.endm
134
-
135
-.macro SAD_SVE2_48 h
136
-    cmp             x9, #48
137
-    bgt             .vl_gt_48_pixel_sad_48x\h
138
-    mov             z16.d, #0
139
-    mov             z17.d, #0
140
-    mov             z18.d, #0
141
-    mov             z19.d, #0
142
-    ptrue           p0.b, vl32
143
-    ptrue           p1.b, vl16
144
-.rept \h
145
-    ld1b            {z0.b}, p0/z, x0
146
-    ld1b            {z1.b}, p1/z, x0, #1, mul vl
147
-    ld1b            {z8.b}, p0/z, x2
148
-    ld1b            {z9.b}, p1/z, x2, #1, mul vl
149
-    add             x0, x0, x1
150
-    add             x2, x2, x3
151
-    uabalb          z16.h, z0.b, z8.b
152
-    uabalt          z17.h, z0.b, z8.b
153
-    uabalb          z18.h, z1.b, z9.b
154
-    uabalt          z19.h, z1.b, z9.b
155
-.endr
156
-    add             z16.h, z16.h, z17.h
157
-    add             z17.h, z18.h, z19.h
158
-    add             z16.h, z16.h, z17.h
159
-    uaddv           d5, p0, z16.h
160
-    fmov            w0, s5
161
-    ret
162
-.vl_gt_48_pixel_sad_48x\h\():
163
-    mov             z16.d, #0
164
-    mov             z17.d, #0
165
-    mov             x10, #48
166
-    mov             x11, #0
167
-    whilelt         p0.b, x11, x10
168
-.rept \h
169
-    ld1b            {z0.b}, p0/z, x0
170
-    ld1b            {z8.b}, p0/z, x2
171
-    add             x0, x0, x1
172
-    add             x2, x2, x3
173
-    uabalb          z16.h, z0.b, z8.b
174
-    uabalt          z17.h, z0.b, z8.b
175
-.endr
176
-    add             z16.h, z16.h, z17.h
177
-    uaddv           d5, p0, z16.h
178
-    fmov            w0, s5
179
-    ret
180
-.endm
181
-
182
-// Fully unrolled.
183
-.macro SAD_FUNC_SVE2 w, h
184
-function PFX(pixel_sad_\w\()x\h\()_sve2)
185
-    rdvl            x9, #1
186
-    cmp             x9, #16
187
-    bgt             .vl_gt_16_pixel_sad_\w\()x\h
188
-    SAD_START_\w uabdl
189
-    SAD_\w \h
190
-.if \w > 4
191
-    add             v16.8h, v16.8h, v17.8h
192
-.endif
193
-    uaddlv          s0, v16.8h
194
-    fmov            w0, s0
195
-    ret
196
-.vl_gt_16_pixel_sad_\w\()x\h\():
197
-.if \w == 4 || \w == 8 || \w == 12
198
-    SAD_START_\w uabdl
199
-    SAD_\w \h
200
-.if \w > 4
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S Deleted
80
 
1
@@ -1,78 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2022-2023 MulticoreWare, Inc
4
- *
5
- * Authors: David Chen <david.chen@myais.com.cn>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm-sve.S"
26
-
27
-.arch armv8-a+sve
28
-
29
-#ifdef __APPLE__
30
-.section __RODATA,__rodata
31
-#else
32
-.section .rodata
33
-#endif
34
-
35
-.align 4
36
-
37
-.text
38
-
39
-function PFX(pixel_sse_pp_4x4_sve)
40
-    ptrue           p0.s, vl4
41
-    ld1b            {z0.s}, p0/z, x0
42
-    ld1b            {z17.s}, p0/z, x2
43
-    add             x0, x0, x1
44
-    add             x2, x2, x3
45
-    sub             z0.s, p0/m, z0.s, z17.s
46
-    mul             z0.s, p0/m, z0.s, z0.s
47
-.rept 3
48
-    ld1b            {z16.s}, p0/z, x0
49
-    ld1b            {z17.s}, p0/z, x2
50
-    add             x0, x0, x1
51
-    add             x2, x2, x3
52
-    sub             z16.s, p0/m, z16.s, z17.s
53
-    mla             z0.s, p0/m, z16.s, z16.s
54
-.endr
55
-    uaddv           d0, p0, z0.s
56
-    fmov            w0, s0
57
-    ret
58
-endfunc
59
-
60
-function PFX(pixel_sse_pp_4x8_sve)
61
-    ptrue           p0.s, vl4
62
-    ld1b            {z0.s}, p0/z, x0
63
-    ld1b            {z17.s}, p0/z, x2
64
-    add             x0, x0, x1
65
-    add             x2, x2, x3
66
-    sub             z0.s, p0/m, z0.s, z17.s
67
-    mul             z0.s, p0/m, z0.s, z0.s
68
-.rept 7
69
-    ld1b            {z16.s}, p0/z, x0
70
-    ld1b            {z17.s}, p0/z, x2
71
-    add             x0, x0, x1
72
-    add             x2, x2, x3
73
-    sub             z16.s, p0/m, z16.s, z17.s
74
-    mla             z0.s, p0/m, z16.s, z16.s
75
-.endr
76
-    uaddv           d0, p0, z0.s
77
-    fmov            w0, s0
78
-    ret
79
-endfunc
80
x265_4.0.tar.gz/.readthedocs.yaml Added
29
 
1
@@ -0,0 +1,27 @@
2
+# Read the Docs configuration file for Sphinx projects
3
+# .readthedocs.yaml
4
+
5
+# Project Information
6
+# Required
7
+version: 2
8
+
9
+build:
10
+  os: "ubuntu-20.04"
11
+  tools:
12
+    python: "3.10"
13
+
14
+# Use a requirements file for pip dependencies
15
+python:
16
+  install:
17
+    - requirements: doc/requirements.txt
18
+    
19
+# Build documentation in the "docs/" directory with Sphinx
20
+sphinx:
21
+  builder: html
22
+  configuration: doc/reST/conf.py
23
+  fail_on_warning: false
24
+
25
+# Optionally build your docs in additional formats such as PDF and ePub
26
+# formats:
27
+#   - pdf
28
+#   - epub
29
x265_3.6.tar.gz/build/README.txt -> x265_4.0.tar.gz/build/README.txt Changed
58
 
1
@@ -94,22 +94,42 @@
2
 
3
 = Build Instructions for cross-compilation for Arm AArch64 Targets=
4
 
5
-When the target platform is based on Arm AArch64 architecture, the x265 can be
6
-built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
7
-enviroment variables should be set to point to the cross compilers of the
8
-appropriate gcc. For example:
9
+Cross compilation of x265 for AArch64 targets is possible on x86 platforms by
10
+passing a toolchain file when running CMake to configure the project:
11
 
12
-1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
13
-2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
14
+* cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-toolchain-file>
15
 
16
-The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
17
-Then, the normal building process can be followed.
18
+Toolchain files for AArch64 cross-compilation exist in the /build directory.
19
+These specify a default cross-compiler to use; however this can be overridden
20
+by setting the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER CMake variables when
21
+running CMake to configure the project. For example:
22
 
23
-Moreover, if the target platform supports SVE or SVE2 instruction set, the
24
-CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
25
-to true, respectively. For example:
26
+* cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++
27
 
28
-1. export CROSS_COMPILE_SVE2=true
29
-2. export CROSS_COMPILE_SVE=true
30
+If the target platform supports Armv8.4 Neon DotProd instructions, the
31
+CROSS_COMPILE_NEON_DOTPROD CMake option should be set to ON:
32
 
33
-Then, the normal building process can be followed.
34
+* cmake -DCROSS_COMPILE_NEON_DOTPROD=ON  <other configuration options...>
35
+
36
+If the target platform supports Armv8.6 Neon I8MM instructions, the
37
+CROSS_COMPILE_NEON_I8MM CMake option should be set to ON:
38
+
39
+* cmake -DCROSS_COMPILE_NEON_I8MM=ON  <other configuration options...>
40
+
41
+If the target platform supports SVE or SVE2, CROSS_COMPILE_SVE or
42
+CROSS_COMPILE_SVE2 CMake options should be set to ON, respectively.
43
+For example, when running CMake to configure the project:
44
+
45
+1. cmake -DCROSS_COMPILE_SVE=ON  <other configuration options...>
46
+2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
47
+
48
+Note: when the CROSS_COMPILE_SVE option is set to ON the build configuration will
49
+also compile for Neon DotProd and I8MM, as we impose the constraint that SVE implies
50
+both Neon DotProd and I8MM.
51
+
52
+Similarly when the CROSS_COMPILE_SVE2 option is set to ON the build configuration
53
+will also compile for Neon I8MM, as we impose the constraint that SVE2 implies Neon
54
+I8MM. SVE2 already implies that Neon DotProd is implemented since SVE2 is an Armv9.0
55
+feature which implies Armv8.5, and Neon DotProd is mandatory from Armv8.4.
56
+
57
+Then, the normal build process can be followed.
58
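
To illustrate the workflow described by the updated README above, a possible cross-compile sequence from an x86 host might look like the following; the build directory layout and the choice of CROSS_COMPILE_SVE2 are examples only:

  # configure with the AArch64 GCC toolchain file shipped in build/aarch64-linux/
  cmake -DCMAKE_TOOLCHAIN_FILE=../build/aarch64-linux/crosscompile.cmake \
        -DCROSS_COMPILE_SVE2=ON -DENABLE_TESTS=OFF ../source
  make -j"$(nproc)"
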
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-darwin/crosscompile.cmake Changed
26
 
1
@@ -7,17 +7,14 @@
2
 set(CMAKE_SYSTEM_NAME Darwin)
3
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
4
 
5
-# specify the cross compiler
6
-set(CMAKE_C_COMPILER gcc-12)
7
-set(CMAKE_CXX_COMPILER g++-12)
8
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
9
+if(NOT DEFINED CMAKE_C_COMPILER)
10
+    set(CMAKE_C_COMPILER gcc)
11
+endif()
12
+if(NOT DEFINED CMAKE_CXX_COMPILER)
13
+    set(CMAKE_CXX_COMPILER g++)
14
+endif()
15
 
16
 # specify the target environment
17
 SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
18
 
19
-# specify whether SVE/SVE2 is supported by the target platform
20
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
21
-    set(CROSS_COMPILE_SVE2 1)
22
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
23
-    set(CROSS_COMPILE_SVE 1)
24
-endif()
25
-
26
x265_4.0.tar.gz/build/aarch64-linux-clang Added
2
 
1
+(directory)
2
x265_4.0.tar.gz/build/aarch64-linux-clang/crosscompile.cmake Added
27
 
1
@@ -0,0 +1,25 @@
2
+# CMake toolchain file for cross compiling x265 for AArch64, using Clang.
3
+
4
+set(CROSS_COMPILE_ARM64 1)
5
+set(CMAKE_SYSTEM_NAME Linux)
6
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
7
+
8
+set(TARGET_TRIPLE aarch64-linux-gnu)
9
+
10
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
11
+if(NOT DEFINED CMAKE_C_COMPILER)
12
+    set(CMAKE_C_COMPILER clang)
13
+endif()
14
+if(NOT DEFINED CMAKE_CXX_COMPILER)
15
+    set(CMAKE_CXX_COMPILER clang++)
16
+endif()
17
+
18
+# specify compiler target
19
+set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
20
+set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
21
+
22
+# specify assembler target
23
+list(APPEND ASM_FLAGS "--target=${TARGET_TRIPLE}")
24
+
25
+# specify the target environment
26
+SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
27
x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-linux/crosscompile.cmake Changed
30
 
1
@@ -7,25 +7,14 @@
2
 set(CMAKE_SYSTEM_NAME Linux)
3
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
4
 
5
-# specify the cross compiler
6
-if(DEFINED ENV{CMAKE_C_COMPILER})
7
-    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
8
-else()
9
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
10
+if(NOT DEFINED CMAKE_C_COMPILER)
11
     set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
12
 endif()
13
-if(DEFINED ENV{CMAKE_CXX_COMPILER})
14
-    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
15
-else()
16
+if(NOT DEFINED CMAKE_CXX_COMPILER)
17
     set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
18
 endif()
19
 
20
 # specify the target environment
21
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
22
 
23
-# specify whether SVE/SVE2 is supported by the target platform
24
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
25
-    set(CROSS_COMPILE_SVE2 1)
26
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
27
-    set(CROSS_COMPILE_SVE 1)
28
-endif()
29
-
30
x265_4.0.tar.gz/build/vc17-x86 Added
2
 
1
+(directory)
2
x265_4.0.tar.gz/build/vc17-x86/build-all.bat Added
25
 
1
@@ -0,0 +1,23 @@
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+  if /i "%%i"=="productPath" (
7
+        set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+if "%VS170COMNTOOLS%" == "" (
13
+  msg "%username%" "Visual Studio 17 not detected"
14
+  exit 1
15
+)
16
+if not exist x265.sln (
17
+  call make-solutions.bat
18
+)
19
+if exist x265.sln (
20
+  call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
21
+  MSBuild /property:Configuration="Release" x265.sln
22
+  MSBuild /property:Configuration="Debug" x265.sln
23
+  MSBuild /property:Configuration="RelWithDebInfo" x265.sln
24
+)
25
x265_4.0.tar.gz/build/vc17-x86/make-solutions.bat Added
8
 
1
@@ -0,0 +1,6 @@
2
+@echo off
3
+::
4
+:: run this batch file to create a Visual Studio solution file for this project.
5
+:: See the cmake documentation for other generator targets
6
+::
7
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
8
x265_4.0.tar.gz/build/vc17-x86_64 Added
2
 
1
+(directory)
2
x265_4.0.tar.gz/build/vc17-x86_64/build-all.bat Added
25
 
1
@@ -0,0 +1,23 @@
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+  if /i "%%i"=="productPath" (
7
+        set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+if "%VS170COMNTOOLS%" == "" (
13
+  msg "%username%" "Visual Studio 17 not detected"
14
+  exit 1
15
+)
16
+if not exist x265.sln (
17
+  call make-solutions.bat
18
+)
19
+if exist x265.sln (
20
+  call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
21
+  MSBuild /property:Configuration="Release" x265.sln
22
+  MSBuild /property:Configuration="Debug" x265.sln
23
+  MSBuild /property:Configuration="RelWithDebInfo" x265.sln
24
+)
25
x265_4.0.tar.gz/build/vc17-x86_64/make-solutions.bat Added
8
 
1
@@ -0,0 +1,6 @@
2
+@echo off
3
+::
4
+:: run this batch file to create a Visual Studio solution file for this project.
5
+:: See the cmake documentation for other generator targets
6
+::
7
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
8
x265_4.0.tar.gz/build/vc17-x86_64/multilib.bat Added
50
 
1
@@ -0,0 +1,47 @@
2
+@echo off
3
+setlocal enabledelayedexpansion
4
+if "%VS170COMNTOOLS%" == "" (
5
+for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do (
6
+  if /i "%%i"=="productPath" (
7
+        set VS170COMNTOOLS=%%j
8
+)
9
+)
10
+)
11
+setx VS170COMNTOOLS "!VS170COMNTOOLS!"
12
+call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat"
13
+@mkdir 12bit
14
+@mkdir 10bit
15
+@mkdir 8bit
16
+
17
+@cd 12bit
18
+cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
19
+if exist x265.sln (
20
+  MSBuild /property:Configuration="Release" x265.sln
21
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
22
+)
23
+
24
+@cd ..\10bit
25
+cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
26
+if exist x265.sln (
27
+  MSBuild /property:Configuration="Release" x265.sln
28
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
29
+)
30
+
31
+@cd ..\8bit
32
+if not exist x265-static-main10.lib (
33
+  msg "%username%" "10bit build failed"
34
+  exit 1
35
+)
36
+if not exist x265-static-main12.lib (
37
+  msg "%username%" "12bit build failed"
38
+  exit 1
39
+)
40
+cmake -G "Visual Studio 17 2022" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
41
+if exist x265.sln (
42
+  MSBuild /property:Configuration="Release" x265.sln
43
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
44
+  move Release\x265-static.lib x265-static-main.lib
45
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
46
+)
47
+
48
+pause
49
\ No newline at end of file
50
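The 8-, 10- and 12-bit static libraries built by this script are combined into a single Release\x265-static.lib. As a minimal sketch of how an application would pick one of the linked bit depths at run time through the public x265.h API (the param_alloc/param_free members of x265_api are taken from x265.h; treat their presence as an assumption of this sketch)::

    #include <cstdio>
    #include "x265.h"

    int main()
    {
        // Ask for the 10-bit encoder entry points; the combined multilib build
        // forwards the request to the linked main10 library. Passing 0 selects
        // the native bit depth of the 8-bit front end.
        const x265_api *api = x265_api_get(10);
        if (!api)
        {
            std::fprintf(stderr, "10-bit libx265 is not part of this build\n");
            return 1;
        }
        x265_param *param = api->param_alloc();  // use the selected library's allocator
        api->param_free(param);
        return 0;
    }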
x265_3.6.tar.gz/doc/reST/api.rst -> x265_4.0.tar.gz/doc/reST/api.rst Changed
31
 
1
@@ -419,21 +419,21 @@
2
    void x265_cleanup(void);
3
 
4
 VMAF (Video Multi-Method Assessment Fusion)
5
-==========================================
6
+===========================================
7
 
8
 If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame
9
 and aggregate VMAF score for the given input and dump the scores in csv file.
10
-The user also need to specify the :option:`--recon` in command line to get the VMAF scores.
11
+The user also needs to specify :option:`--recon` on the command line to get the VMAF scores.::
12
  
13
     /* x265_calculate_vmafScore:
14
-     *    returns VMAF score for the input video.
15
-     *    This api must be called only after encoding was done. */
16
-    double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
17
+    *       returns VMAF score for the input video.
18
+    *       This API must be called only after encoding was done. */
19
+   double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
20
 
21
     /* x265_calculate_vmaf_framelevelscore:
22
-     *    returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
23
-    double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
24
-    
25
+    *       returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
26
+   double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
27
+
28
 .. Note::
29
 
30
     When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
31
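Both functions are only valid once the encode has finished; a minimal sketch of the call, assuming the caller has populated an x265_vmaf_data object as required by an ENABLE_LIBVMAF=ON build::

    #include "x265.h"

    // Must be called only after encoding is complete (see the note above).
    double report_vmaf(x265_param *param, x265_vmaf_data *vmafData)
    {
        return x265_calculate_vmafscore(param, vmafData);
    }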
x265_3.6.tar.gz/doc/reST/cli.rst -> x265_4.0.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -822,7 +822,7 @@
2
    metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
3
    and/or :option:`--amp` are enabled, this feature will use motion cost 
4
    heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the 
5
-   best choice. This can significantly improve performance when :option:`rect`
6
+   best choice. This can significantly improve performance when :option:`--rect`
7
    and/or :option:`--amp` are enabled at minimal compression efficiency loss.
8
 
9
 .. option:: --rect, --no-rect
10
@@ -983,7 +983,7 @@
11
     Store/normalize ctu distortion in analysis-save/load.
12
     0 - Disabled.
13
     1 - Save ctu distortion to the analysis file specified during :option:`--analysis-save`.
14
-        Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`.
15
+    - Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`.
16
     Default 0.
17
 
18
 .. option:: --scale-factor
19
@@ -1056,27 +1056,13 @@
20
 
21
 .. option:: --rdoq-level <0|1|2>, --no-rdoq-level
22
 
23
-   Specify the amount of rate-distortion analysis to use within
24
-   quantization::
25
+   Specify the amount of rate-distortion analysis to use within quantization::
26
 
27
-   At level 0 rate-distortion cost is not considered in quant
28
-   
29
-   At level 1 rate-distortion cost is used to find optimal rounding
30
-   values for each level (and allows psy-rdoq to be effective). It
31
-   trades-off the signaling cost of the coefficient vs its post-inverse
32
-   quant distortion from the pre-quant coefficient. When
33
-   :option:`--psy-rdoq` is enabled, this formula is biased in favor of
34
-   more energy in the residual (larger coefficient absolute levels)
35
-   
36
-   At level 2 rate-distortion cost is used to make decimate decisions
37
-   on each 4x4 coding group, including the cost of signaling the group
38
-   within the group bitmap. If the total distortion of not signaling
39
-   the entire coding group is less than the rate cost, the block is
40
-   decimated. Next, it applies rate-distortion cost analysis to the
41
-   last non-zero coefficient, which can result in many (or all) of the
42
-   coding groups being decimated. Psy-rdoq is less effective at
43
-   preserving energy when RDOQ is at level 2, since it only has
44
-   influence over the level distortion costs.
45
+           At level 0 rate-distortion cost is not considered in quant.
46
+
47
+           At level 1 rate-distortion cost is used to find optimal rounding values for each level (and allows psy-rdoq to be effective). It trades-off the signaling cost of the coefficient vs its post-inverse quant distortion from the pre-quant coefficient. When :option:`--psy-rdoq` is enabled, this formula is biased in favor of more energy in the residual (larger coefficient absolute levels).
48
+
49
+           At level 2 rate-distortion cost is used to make decimate decisions on each 4x4 coding group, including the cost of signaling the group within the group bitmap. If the total distortion of not signaling the entire coding group is less than the rate cost, the block is decimated. Next, it applies rate-distortion cost analysis to the last non-zero coefficient, which can result in many (or all) of the coding groups being decimated. Psy-rdoq is less effective at preserving energy when RDOQ is at level 2, since it only has influence over the level distortion costs.
50
 
51
 .. option:: --tu-intra-depth <1..4>
52
 
53
@@ -1221,19 +1207,16 @@
54
 
55
 .. option:: --me <integer|string>
56
 
57
-   Motion search method. Generally, the higher the number the harder
58
-   the ME method will try to find an optimal match. Diamond search is
59
-   the simplest. Hexagon search is a little better. Uneven
60
-   Multi-Hexagon is an adaption of the search method used by x264 for
61
-   slower presets. Star is a three-step search adapted from the HM
62
-   encoder: a star-pattern search followed by an optional radix scan
63
-   followed by an optional star-search refinement. Full is an
64
-   exhaustive search; an order of magnitude slower than all other
65
-   searches but not much better than umh or star. SEA is similar to
66
-   x264's ESA implementation and a speed optimization of full search.
67
-    It is a three-step motion search where the DC calculation is
68
-    followed by ADS calculation followed by SAD of the passed motion
69
-    vector candidates.
70
+   Motion search method. Generally, the higher the number the harder the ME method
71
+   will try to find an optimal match. Diamond search is the simplest. Hexagon search
72
+   is a little better. Uneven Multi-Hexagon is an adaption of the search method used
73
+   by x264 for slower presets. Star is a three-step search adapted from the HM encoder: a
74
+   star-pattern search followed by an optional radix scan followed by an optional
75
+   star-search refinement. Full is an exhaustive search; an order of magnitude slower
76
+   than all other searches but not much better than umh or star. SEA is similar to x264's
77
+   ESA implementation and a speed optimization of full search. It is a three-step motion
78
+   search where the DC calculation is followed by ADS calculation followed by SAD of the
79
+   passed motion vector candidates.
80
 
81
    0. dia
82
    1. hex **(default)**
83
@@ -1331,7 +1314,14 @@
84
    
85
 .. option:: --mcstf, --no-mcstf
86
 
87
-    Enable Motion Compensated Temporal filtering.
88
+   Motion-compensated spatio-temporal filtering (MCSTF) improves the compression
89
+   efficiency of videos that contain a high level of noise. It introduces a
90
+   temporal filter before encoding and this filter is applied only to the I- and P-frames.
91
+   It utilizes previously generated motion vectors across different video content
92
+   resolutions to find the best temporal correspondence for low-pass filtering. Here,
93
+   motion estimation is applied between the central picture and each future or past
94
+   picture, thereby generating multiple motion-compensated predictions, which are then
95
+   combined by using adaptive filtering to produce a final noise-reduced picture.
96
    Default: disabled
97
 
98
 Spatial/intra options
99
@@ -1486,7 +1476,7 @@
100
    whereas for the :option:`--scenecut`, inserts RADL at every scenecut.
101
    Recommended value is 2-3. Default 0 (disabled).
102
    
103
-   **Range of values: Between 0 and `--bframes`
104
+   **Range of values:** Between 0 and `--bframes`
105
 
106
 .. option:: --ctu-info <0, 1, 2, 4, 6>
107
 
108
@@ -1550,9 +1540,7 @@
109
    as *lslices*
110
 
111
    **Values:** 0 - disabled. 1 is the same as 0. Max 16.
112
-   Default: 8 for ultrafast, superfast, faster, fast, medium
113
-            4 for slow, slower
114
-            disabled for veryslow, slower
115
+   Default: 8 for ultrafast, superfast, faster, fast, medium; 4 for slow, slower; disabled for veryslow, slower.
116
 
117
 .. option:: --lookahead-threads <integer>
118
 
119
@@ -1602,14 +1590,17 @@
120
 
121
    Values:
122
    0 - flush the encoder only when all the input pictures are over.
123
-   1 - flush all the frames even when the input is not over. 
124
-       slicetype decision may change with this option.
125
+   1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.
126
    2 - flush the slicetype decided frames only.   
127
 
128
 .. option:: --fades, --no-fades
129
 
130
    Detect and handle fade-in regions. Default disabled.
131
 
132
+.. option:: --cra-nal
133
+
134
+   Force the NAL type to CRA for all frames except the first frame; works only when :option:`--keyint` is 1.
135
+
136
 Quality, rate control and rate distortion options
137
 =================================================
138
 
139
@@ -1744,9 +1735,7 @@
140
    0. disabled
141
    1. AQ enabled 
142
    2. AQ enabled with auto-variance **(default)**
143
-   3. AQ enabled with auto-variance and bias to dark scenes. This is 
144
-   recommended for 8-bit encodes or low-bitrate 10-bit encodes, to 
145
-   prevent color banding/blocking. 
146
+   3. AQ enabled with auto-variance and bias to dark scenes. This is recommended for 8-bit encodes or low-bitrate 10-bit encodes, to prevent color banding/blocking.
147
    4. AQ enabled with auto-variance and edge information.
148
 
149
 .. option:: --aq-strength <float>
150
@@ -1759,11 +1748,13 @@
151
    Default 1.0.
152
    **Range of values:** 0.0 to 3.0
153
 
154
-.. option:: --sbrc --no-sbrc
155
+.. option:: --sbrc, --no-sbrc
156
+
157
+   Enable or disable segment-based rate control. SBRC controls the overflow with
158
+   segment sizes, and it is based on the Capped CRF mode. Segment duration depends on
159
+   the keyframe interval specified. If unspecified, the default keyframe interval will
160
+   be used. Default: disabled. **Experimental Feature**
161
 
162
-   To enable and disable segment based rate control.Segment duration depends on the
163
-   keyframe interval specified.If unspecified,default keyframe interval will be used.
164
-   Default: disabled.
165
 
166
 .. option:: --hevc-aq
167
 
168
@@ -1849,7 +1840,7 @@
169
    and also redundant steps are skipped.
170
    In pass 1 analysis information like motion vector, depth, reference and prediction
171
    modes of the final best CTU partition is stored for each CTU.
172
-   Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`analysis-load`
173
+   Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`--analysis-load`
174
    is enabled and both will be disabled when enabled together. This feature requires :option:`--pmode`/:option:`--pme`
175
    to be disabled and hence pmode/pme will be disabled when enabled at the same time.
176
 
177
@@ -2014,26 +2005,29 @@
178
    When :option:`--scenecut-aware-qp` is:
179
 
180
    * 1 (Forward masking):
181
-   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
182
-   or 
183
-   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
184
-                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
185
-                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
186
+
187
+           --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
188
+
189
+           or
190
+
191
+           --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
192
+
193
    * 2 (Backward masking):
194
-   --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
195
-   or 
196
-   --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
197
-                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
198
-                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
199
+
200
+           --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
201
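Most switches documented in this file can also be set through the library API with x265_param_parse(); a minimal sketch covering a few of the options touched by this update (the option names are the CLI switches with the leading "--" dropped; the exact spellings accepted by the parser are an assumption of this sketch)::

    #include <cstdio>
    #include "x265.h"

    int configure(x265_param *param)
    {
        int err = 0;
        err |= x265_param_parse(param, "rdoq-level", "2");  // full RDOQ incl. coding-group decimation
        err |= x265_param_parse(param, "me", "star");       // HM-style three-step star search
        err |= x265_param_parse(param, "mcstf", "1");       // temporal filter on I- and P-frames
        if (err)
            std::fprintf(stderr, "an option was rejected by this build\n");
        return err;
    }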
x265_3.6.tar.gz/doc/reST/conf.py -> x265_4.0.tar.gz/doc/reST/conf.py Changed
10
 
1
@@ -14,7 +14,7 @@
2
 copyright = u'2014 MulticoreWare Inc'
3
 
4
 # -- Options for HTML output ---------------------------------------------------
5
-html_theme = "default"
6
+html_theme = "sphinx_rtd_theme"
7
 
8
 # One entry per manual page. List of tuples
9
 # (source start file, name, description, authors, manual section).
10
x265_3.6.tar.gz/doc/reST/presets.rst -> x265_4.0.tar.gz/doc/reST/presets.rst Changed
38
 
1
@@ -21,16 +21,17 @@
2
 The presets adjust encoder parameters as shown in the following table.
3
 Any parameters below that are specified in your command-line will be 
4
 changed from the value specified by the preset.
5
-   0. ultrafast
6
-   1. superfast
7
-   2. veryfast
8
-   3. faster
9
-   4. fast
10
-   5. medium **(default)**
11
-   6. slow
12
-   7. slower
13
-   8. veryslow
14
-   9. placebo
15
+
16
+    0. ultrafast
17
+    1. superfast
18
+    2. veryfast
19
+    3. faster
20
+    4. fast
21
+    5. medium **(default)**
22
+    6. slow
23
+    7. slower
24
+    8. veryslow
25
+    9. placebo
26
 
27
 +-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
28
 | preset          |  0  |  1  |  2  |   3 |   4 |   5 |   6  |   7  |   8  |  9   |
29
@@ -152,7 +153,7 @@
30
     * :option:`--sao` 0
31
     * :option:`--psy-rd` 4.0
32
     * :option:`--psy-rdoq` 10.0
33
-    * :option:`--recursion-skip` 0
34
+    * :option:`--rskip` 0
35
     
36
 It also enables a specialised ratecontrol algorithm :option:`--rc-grain` 
37
 that strictly minimises QP fluctuations across frames, while still allowing 
38
x265_3.6.tar.gz/doc/reST/releasenotes.rst -> x265_4.0.tar.gz/doc/reST/releasenotes.rst Changed
116
 
1
@@ -2,6 +2,44 @@
2
 Release Notes
3
 *************
4
 
5
+Version 4.0
6
+===========
7
+
8
+Release date - 13th September, 2024.
9
+
10
+New feature
11
+-----------
12
+1. Alpha Channel feature.
13
+2. Screen Content Coding (SCC).
14
+3. MV-HEVC feature.
15
+
16
+Enhancements to existing features
17
+---------------------------------
18
+1. Added support for the VMAF v3.x.
19
+
20
+API changes
21
+-----------
22
+1. Add command line parameter for Alpha Channel feature :option:`--alpha`.
23
+2. Add command line parameter for SCC feature :option:`--scc 1`.
24
+3. Add command line parameters for the MV-HEVC feature :option:`--multiview-config "multiview_config.txt"`.
25
+
26
+Optimizations
27
+---------------------
28
+1. Arm SIMD optimizations: Several time-consuming scalar C functions now have SIMD implementations on Arm platforms. Existing Arm SIMD implementations have also been optimized. These optimizations result in up to 57% faster encoding compared to release 3.6.
29
+2. Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6 I8MM, and Armv9 SVE2 instruction set extensions. The following algorithms now have optimized SIMD implementations: SAD, SSE, DCT, SAO, convolution, quantization, intra_planar, intraFilter, intrapred DC and IDCT16x16.
30
+
31
+Bug fixes
32
+---------
33
+1. Fix for broken y4m pipe input.
34
+2. Fix SCC crash on multipass encode.
35
+3. Fix mcstf when :option:`--bframes` value was less than 5.
36
+4. Fix lowpass DCT for high bit depth.
37
+5. Added build support for Visual Studio 17.
38
+6. Fix issue in default code flow and memory leak.
39
+7. Framethreads tuning for Windows ARM devices.
40
+8. Fix scc crash on multipass encode.
41
+
42
+
43
 Version 3.6
44
 ===========
45
 
46
@@ -9,44 +47,44 @@
47
 
48
 New feature
49
 -----------
50
-1. Segment based Ratecontrol (SBRC) feature
51
-2. Motion-Compensated Spatio-Temporal Filtering
52
-3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
53
-4. Histogram-Based Scene Change Detection
54
-5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
55
-6. Add temporal layer implementation(Hierarchical B-frame implementation)
56
- 
57
+1. Segment based Ratecontrol (SBRC) feature.
58
+2. Motion-Compensated Spatio-Temporal Filtering.
59
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization).
60
+4. Histogram-Based Scene Change Detection.
61
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis (FGS).
62
+6. Add temporal layer implementation (Hierarchical B-frame implementation).
63
+
64
 Enhancements to existing features
65
 ---------------------------------
66
-1. Added Dolby Vision 8.4 Profile Support
67
+1. Added Dolby Vision 8.4 Profile Support.
68
 
69
 
70
 API changes
71
 -----------
72
-1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
73
-2. Add command line parameter for mcstf feature: "--no-mctf".
74
-3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
75
-4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
76
-5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
77
-6. cli: add new option --cra-nal (Force nal type to CRA to all frames expect for the first frame, works only with keyint 1)
78
+1. Add command line parameter for SBRC feature :option:`--sbrc`.
79
+2. Add command line parameter for mcstf feature :option:`--mcstf`.
80
+3. Add command line parameters for the scene cut aware qp feature :option:`--scenecut-aware-qp` and :option:`--masking-strength`.
81
+4. Add command line parameters for Histogram-Based Scene Change Detection :option:`--hist-scenecut`.
82
+5. Add command line parameters for film grain characteristics as a SEI message to the bitstream :option:`--film-grain`.
83
+6. cli: add new option :option:`--cra-nal` (force the NAL type to CRA for all frames except the first frame; works only when :option:`--keyint` is 1).
84
 
85
 Optimizations
86
 ---------------------
87
-ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
88
-SVE/SVE2 optimizations
89
+1. ARM64 NEON optimizations: Several time-consuming C functions have been optimized for the target platform, aarch64. The overall performance increased by around 20%.
90
+2. SVE/SVE2 optimizations.
91
 
92
 
93
 Bug fixes
94
 ---------
95
-1. Linux bug to utilize all the cores
96
-2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
97
-3. 32bit and 64bit builds generation for ARM
98
-4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
99
-5. Add x86 ASM implementation for subsampling luma 
100
-6. Fix for abrladder segfault with load reuse level 1 
101
-7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame 
102
-8. Add MacOS aarch64 build support 
103
-9. Fix boundary condition issue for Gaussian filter
104
+1. Linux bug to utilize all the cores.
105
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize.
106
+3. 32bit and 64bit builds generation for ARM.
107
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc.).
108
+5. Add x86 ASM implementation for subsampling luma.
109
+6. Fix for abrladder segfault with load reuse level 1.
110
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frames. 
111
+8. Add MacOS aarch64 build support.
112
+9. Fix boundary condition issue for Gaussian filter.
113
 
114
 
115
 Version 3.5
116
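The new 4.0 features are driven by the CLI parameters listed under "API changes"; a hedged sketch of switching them on through x265_param_parse(), assuming a build configured with -DENABLE_ALPHA=ON, -DENABLE_MULTIVIEW=ON and -DENABLE_SCC_EXT=ON (the option spellings mirror the CLI switches and are an assumption of this sketch)::

    #include "x265.h"

    int enable_v40_features(x265_param *param)
    {
        int err = 0;
        err |= x265_param_parse(param, "alpha", "1");
        err |= x265_param_parse(param, "scc", "1");
        err |= x265_param_parse(param, "multiview-config", "multiview_config.txt");
        return err;  // non-zero if any option is unknown to this build
    }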
x265_3.6.tar.gz/doc/reST/svthevc.rst -> x265_4.0.tar.gz/doc/reST/svthevc.rst Changed
19
 
1
@@ -3,7 +3,7 @@
2
 
3
 .. _SvtHevc:
4
 
5
-x265 has support for open source HEVC encoder `SVT-HEVC <https://01.org/svt>`_ 
6
+x265 has support for open source HEVC encoder `SVT-HEVC <https://www.intel.com/content/www/us/en/developer/articles/technical/scalable-video-technology.html>`_
7
 and can generate SVT-HEVC compliant bitstreams. SVT-HEVC encoder can be enabled at run time 
8
 using :option:`--svt`. Since SVT-HEVC params/CLI are not exposed outside, it has to be 
9
 configured only via x265 CLI options. The API's of SVT-HEVC are accessed through x265's API 
10
@@ -22,7 +22,7 @@
11
 
12
 **SVT-HEVC**
13
 
14
-1. Clone `SVT-HEVC <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file)
15
+1. Clone `SVT-HEVC-repo <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file)
16
 2. Once build is successful, binaries can be found inside the *Bin* folder at its root directory ("/home/app/SVT-HEVC/Bin/Release/")
17
 
18
 **x265**
19
x265_3.6.tar.gz/doc/reST/x265.rst -> x265_4.0.tar.gz/doc/reST/x265.rst Changed
7
 
1
@@ -1,3 +1,5 @@
2
+:orphan:
3
+
4
 x265 CLI Documentation
5
 ######################
6
 
7
x265_4.0.tar.gz/doc/requirements.txt Added
5
 
1
@@ -0,0 +1,3 @@
2
+sphinx
3
+sphinx-rtd-theme
4
+# Add other dependencies here
5
x265_3.6.tar.gz/source/CMakeLists.txt -> x265_4.0.tar.gz/source/CMakeLists.txt Changed
201
 
1
@@ -22,6 +22,8 @@
2
 include(CheckFunctionExists)
3
 include(CheckSymbolExists)
4
 include(CheckCXXCompilerFlag)
5
+include(CheckCSourceCompiles)
6
+include(CheckCXXSourceCompiles)
7
 
8
 option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF)
9
 option(FPROFILE_USE "Compile executable using generated usage data" OFF)
10
@@ -29,7 +31,7 @@
11
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
12
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
13
 # X265_BUILD must be incremented each time the public API is changed
14
-set(X265_BUILD 209)
15
+set(X265_BUILD 212)
16
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
17
                "${PROJECT_BINARY_DIR}/x265.def")
18
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
19
@@ -80,14 +82,16 @@
20
     set(ARM 1)
21
     add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
22
 elseif(ARM64MATCH GREATER "-1")
23
-    #if(CROSS_COMPILE_ARM64)
24
-        #message(STATUS "Cross compiling for ARM64 arch")
25
-    #else()
26
-        #set(CROSS_COMPILE_ARM64 0)
27
-    #endif()
28
     message(STATUS "Detected ARM64 target processor")
29
     set(ARM64 1)
30
-    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
31
+
32
+    option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF)
33
+
34
+    # Options for cross compiling AArch64 optional extensions
35
+    option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF)
36
+    option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF)
37
+    option(CROSS_COMPILE_NEON_DOTPROD "Cross Compile for Neon DotProd Target" OFF)
38
+    option(CROSS_COMPILE_NEON_I8MM "Cross Compile for Neon I8MM Target" OFF)
39
 else()
40
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
41
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
42
@@ -259,28 +263,106 @@
43
             set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
44
         endif()
45
     endif()
46
-   if(ARM64 OR CROSS_COMPILE_ARM64)
47
-        find_package(Neon)
48
-        find_package(SVE)
49
-        find_package(SVE2)
50
-        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
51
-            message(STATUS "Found SVE2")
52
-           set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
53
-            add_definitions(-DHAVE_SVE2)
54
-            add_definitions(-DHAVE_SVE)
55
-            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
56
-        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
57
-            message(STATUS "Found SVE")
58
-           set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
59
-            add_definitions(-DHAVE_SVE)
60
-            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
61
-        elseif(CPU_HAS_NEON)
62
-            message(STATUS "Found NEON")
63
-            set(ARM_ARGS -fPIC -flax-vector-conversions)
64
-            add_definitions(-DHAVE_NEON)
65
+    if(ARM64)
66
+        message(STATUS "Found Neon")
67
+        set(CPU_HAS_NEON 1)
68
+        add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON=1)
69
+
70
+        if(CROSS_COMPILE_ARM64)
71
+            # Handle cross-compilation options.
72
+            if(CROSS_COMPILE_NEON_DOTPROD)
73
+                set(CPU_HAS_NEON_DOTPROD 1)
74
+            endif()
75
+            if(CROSS_COMPILE_NEON_I8MM)
76
+                set(CPU_HAS_NEON_I8MM 1)
77
+                # Impose the constraint that Neon I8MM implies Neon DotProd.
78
+                set(CPU_HAS_NEON_DOTPROD 1)
79
+            endif()
80
+            if(CROSS_COMPILE_SVE)
81
+                set(CPU_HAS_SVE 1)
82
+                # Impose the constraint that SVE implies Neon DotProd and I8MM.
83
+                set(CPU_HAS_NEON_DOTPROD 1)
84
+                set(CPU_HAS_NEON_I8MM 1)
85
+            endif()
86
+            if(CROSS_COMPILE_SVE2)
87
+                set(CPU_HAS_SVE2 1)
88
+                # SVE2 implies SVE and Neon DotProd.
89
+                set(CPU_HAS_SVE 1)
90
+                set(CPU_HAS_NEON_DOTPROD 1)
91
+                # Impose the constraint that SVE2 implies Neon I8MM.
92
+                set(CPU_HAS_NEON_I8MM 1)
93
+            endif()
94
         else()
95
-            set(ARM_ARGS -fPIC -flax-vector-conversions)
96
-        endif()        
97
+            if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
98
+                find_package(NEON_DOTPROD)
99
+                find_package(NEON_I8MM)
100
+                find_package(SVE)
101
+                find_package(SVE2)
102
+            else()
103
+                message(STATUS "Compile time feature detection unsupported on this platform")
104
+            endif()
105
+        endif()
106
+
107
+        if(CPU_HAS_NEON_DOTPROD)
108
+            # Neon DotProd is mandatory from Armv8.4.
109
+            message(STATUS "Found Neon DotProd")
110
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod)
111
+            add_definitions(-DHAVE_NEON_DOTPROD=1)
112
+        endif()
113
+        if(CPU_HAS_NEON_I8MM)
114
+            # Neon I8MM is mandatory from Armv8.6.
115
+            message(STATUS "Found Neon I8MM")
116
+            # Impose the constraint that Neon I8MM implies Neon DotProd.
117
+            if(NOT CPU_HAS_NEON_DOTPROD)
118
+                message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)")
119
+            endif()
120
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm)
121
+            add_definitions(-DHAVE_NEON_I8MM=1)
122
+        endif()
123
+        if(CPU_HAS_SVE)
124
+            message(STATUS "Found SVE")
125
+            # Impose the constraint that SVE implies Neon I8MM.
126
+            if(NOT CPU_HAS_NEON_I8MM)
127
+                message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)")
128
+            endif()
129
+            set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve)
130
+            add_definitions(-DHAVE_SVE=1)
131
+        endif()
132
+        if(CPU_HAS_SVE2)
133
+            message(STATUS "Found SVE2")
134
+            # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod
135
+            set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2)
136
+            add_definitions(-DHAVE_SVE2=1)
137
+        endif()
138
+        set(ARM_ARGS ${ARM_ARGS} -fPIC)
139
+        # Do not allow implicit vector type conversions in Clang builds (this
140
+        # is already the default in GCC builds).
141
+        check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE)
142
+        if(CC_HAS_FLAX_VEC_CONV_NONE)
143
+            set(ARM_ARGS ${ARM_ARGS} -flax-vector-conversions=none)
144
+        endif()
145
+        if(CPU_HAS_SVE)
146
+            set(SVE_HEADER_TEST "
147
+#ifndef __ARM_NEON_SVE_BRIDGE
148
+#error 1
149
+#endif
150
+#include <arm_sve.h>
151
+#include <arm_neon_sve_bridge.h>
152
+int main() { return 0; }")
153
+            set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
154
+            # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas
155
+            # ARM_ARGS is defined and used elsewhere as a ;-list.
156
+            foreach(ARM_ARG ${ARM_ARGS})
157
+                string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}")
158
+            endforeach()
159
+            check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED)
160
+            check_cxx_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_CXX_TEST_COMPILED)
161
+            set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
162
+            if(SVE_HEADER_C_TEST_COMPILED AND SVE_HEADER_CXX_TEST_COMPILED)
163
+                add_definitions(-DHAVE_SVE_BRIDGE=1)
164
+                set(HAVE_SVE_BRIDGE 1)
165
+            endif()
166
+        endif()
167
     endif()
168
    if(ENABLE_PIC)
169
    list(APPEND ARM_ARGS -DPIC)
170
@@ -334,9 +416,11 @@
171
     if (CC_HAS_FAST_MATH)
172
         add_definitions(-ffast-math)
173
     endif()
174
-    check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN) 
175
-    if (CC_HAS_STACK_REALIGN)
176
-        add_definitions(-mstackrealign)
177
+    if (NOT (ARM64 OR CROSS_COMPILE_ARM64))
178
+        check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN)
179
+        if (CC_HAS_STACK_REALIGN)
180
+            add_definitions(-mstackrealign)
181
+        endif()
182
     endif()
183
     # Disable exceptions. Reduce executable size, increase compability.
184
     check_cxx_compiler_flag(-fno-exceptions CC_HAS_FNO_EXCEPTIONS_FLAG)
185
@@ -558,6 +642,21 @@
186
     add_definitions(-DDETAILED_CU_STATS)
187
 endif(DETAILED_CU_STATS)
188
 
189
+option(ENABLE_ALPHA "Enable alpha encoding in x265" OFF)
190
+if(ENABLE_ALPHA)
191
+    add_definitions(-DENABLE_ALPHA)
192
+endif()
193
+
194
+option(ENABLE_MULTIVIEW "Enable Multi-view encoding in HEVC" OFF)
195
+if(ENABLE_MULTIVIEW)
196
+    add_definitions(-DENABLE_MULTIVIEW)
197
+endif()
198
+
199
+option(ENABLE_SCC_EXT "Enable screen content coding extension in HEVC" OFF)
200
+if(ENABLE_SCC_EXT)
201
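The HAVE_SVE_BRIDGE definition introduced above gates code that moves data between Neon and SVE registers through the ACLE Neon-SVE bridge; as an illustrative sketch of the pattern the compile test checks for (not code from the x265 sources, and assuming the SVE -march flags set earlier in this file)::

    #include <arm_neon.h>
    #if defined(HAVE_SVE_BRIDGE)
    #include <arm_sve.h>
    #include <arm_neon_sve_bridge.h>

    // Move a 128-bit Neon vector into the low lanes of an SVE register,
    // operate on it with an SVE intrinsic, and move the result back.
    static inline int16x8_t add_one_sve(int16x8_t v)
    {
        svint16_t sv = svset_neonq_s16(svundef_s16(), v);
        sv = svadd_n_s16_x(svptrue_b16(), sv, 1);
        return svget_neonq_s16(sv);
    }
    #endif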
x265_3.6.tar.gz/source/abrEncApp.cpp -> x265_4.0.tar.gz/source/abrEncApp.cpp Changed
201
 
1
@@ -63,6 +63,7 @@
2
             m_passEnc[i]->init(ret);
3
         }
4
 
5
+        m_numInputViews = m_passEnc[0]->m_param->numViews;
6
         if (!allocBuffers())
7
         {
8
             x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
9
@@ -76,7 +77,11 @@
10
 
11
     bool AbrEncoder::allocBuffers()
12
     {
13
+#if ENABLE_MULTIVIEW
14
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, MAX_VIEWS);
15
+#else
16
         m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
17
+#endif
18
         m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
19
 
20
         m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
21
@@ -89,21 +94,48 @@
22
         m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
23
         m_readFlag = X265_MALLOC(int*, m_numEncodes);
24
 
25
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
26
+#if ENABLE_MULTIVIEW
27
+        if (m_passEnc[0]->m_param->numViews > 1)
28
         {
29
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
30
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
31
+            for (uint8_t pass = 0; pass < m_numInputViews; pass++)
32
             {
33
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
34
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
35
+                m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
36
+                for (uint32_t idx = 0; idx < m_queueSize; idx++)
37
+                {
38
+                    m_inputPicBuffer[pass][idx] = x265_picture_alloc();
39
+                    x265_picture_init(m_passEnc[0]->m_param, m_inputPicBuffer[pass][idx]);
40
+                }
41
+                if (pass == 0)
42
+                {
43
+                    CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
44
+                    m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
45
+                    m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
46
+                    m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
47
+                    m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
48
+                }
49
             }
50
+        }
51
+        else
52
+        {
53
+#endif
54
+            for (uint8_t pass = 0; pass < m_numEncodes; pass++)
55
+            {
56
+                m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
57
+                for (uint32_t idx = 0; idx < m_queueSize; idx++)
58
+                {
59
+                    m_inputPicBuffer[pass][idx] = x265_picture_alloc();
60
+                    x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
61
+                }
62
 
63
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
64
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
65
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
66
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
67
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
68
+                CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
69
+                m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
70
+                m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
71
+                m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
72
+                m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
73
+            }
74
+#if ENABLE_MULTIVIEW
75
         }
76
+#endif
77
         return true;
78
     fail:
79
         return false;
80
@@ -112,15 +144,37 @@
81
     void AbrEncoder::destroy()
82
     {
83
         x265_cleanup(); /* Free library singletons */
84
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
85
+#if ENABLE_MULTIVIEW
86
+        for (uint8_t pass = 0; pass < m_numInputViews; pass++)
87
         {
88
             for (uint32_t index = 0; index < m_queueSize; index++)
89
             {
90
                 X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
91
                 x265_picture_free(m_inputPicBuffer[pass][index]);
92
             }
93
+            X265_FREE(m_inputPicBuffer[pass]);
94
 
95
+            if (pass == 0)
96
+            {
97
+                X265_FREE(m_analysisBuffer[pass]);
98
+                X265_FREE(m_readFlag[pass]);
99
+                delete[] m_picIdxReadCnt[pass];
100
+                delete[] m_analysisWrite[pass];
101
+                delete[] m_analysisRead[pass];
102
+                m_passEnc[pass]->destroy();
103
+                delete m_passEnc[pass];
104
+            }
105
+        }
106
+#else
107
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
108
+        {
109
+            for (uint32_t index = 0; index < m_queueSize; index++)
110
+            {
111
+                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
112
+                x265_picture_free(m_inputPicBuffer[pass][index]);
113
+            }
114
             X265_FREE(m_inputPicBuffer[pass]);
115
+
116
             X265_FREE(m_analysisBuffer[pass]);
117
             X265_FREE(m_readFlag[pass]);
118
             delete[] m_picIdxReadCnt[pass];
119
@@ -129,6 +183,7 @@
120
             m_passEnc[pass]->destroy();
121
             delete m_passEnc[pass];
122
         }
123
+#endif
124
         X265_FREE(m_inputPicBuffer);
125
         X265_FREE(m_analysisBuffer);
126
         X265_FREE(m_readFlag);
127
@@ -150,8 +205,11 @@
128
         m_id = id;
129
         m_cliopt = cliopt;
130
         m_parent = parent;
131
-        if(!(m_cliopt.enableScaler && m_id))
132
-            m_input = m_cliopt.input;
133
+        if (!(m_cliopt.enableScaler && m_id))
134
+        {
135
+            for (int view = 0; view < m_cliopt.param->numViews; view++)
136
+                m_input[view] = m_cliopt.input[view];
137
+        }
138
         m_param = cliopt.param;
139
         m_inputOver = false;
140
         m_lastIdx = -1;
141
@@ -206,6 +264,7 @@
142
         {
143
             x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
144
             m_ret = 2;
145
+            m_reader = NULL;
146
             return -1;
147
         }
148
 
149
@@ -402,7 +461,7 @@
150
     }
151
 
152
 
153
-    bool PassEncoder::readPicture(x265_picture *dstPic)
154
+    bool PassEncoder::readPicture(x265_picture* dstPic, int view)
155
     {
156
         /*Check and wait if there any input frames to read*/
157
         int ipread = m_parent->m_picReadCntm_id.get();
158
@@ -480,7 +539,7 @@
159
             }
160
 
161
 
162
-            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
163
+            x265_picture* srcPic = (m_param->numViews > 1) ? (x265_picture*)(m_parent->m_inputPicBuffer[view][readPos]) : (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
164
 
165
             x265_picture *pic = (x265_picture*)(dstPic);
166
             pic->colorSpace = srcPic->colorSpace;
167
@@ -499,6 +558,8 @@
168
             pic->planes[0] = srcPic->planes[0];
169
             pic->planes[1] = srcPic->planes[1];
170
             pic->planes[2] = srcPic->planes[2];
171
+            pic->planes[3] = srcPic->planes[3];
172
+            pic->format = srcPic->format;
173
             if (isAbrLoad)
174
                 pic->analysisData = *analysisData;
175
             return true;
176
@@ -529,11 +590,17 @@
177
                 x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
178
                     strerror(errno), profileName);
179
 
180
-            x265_picture pic_orig, pic_out;
181
-            x265_picture *pic_in = &pic_orig;
182
+            x265_picture pic_orig[MAX_VIEWS];
183
+            x265_picture *pic_in[MAX_VIEWS];
184
+            for (int view = 0; view < m_param->numViews; view++)
185
+                pic_in[view] = &pic_orig[view];
186
             /* Allocate recon picture if analysis save/load is enabled */
187
             std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
188
-            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
189
+            x265_picture* pic_recon[MAX_LAYERS];
190
+            x265_picture pic_out[MAX_LAYERS];
191
+
192
+            for (int i = 0; i < m_param->numLayers; i++)
193
+                pic_recon[i] = (m_cliopt.recon[i] || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out[i] : NULL;
194
             uint32_t inFrameCount = 0;
195
             uint32_t outFrameCount = 0;
196
             x265_nal *p_nal;
197
@@ -544,7 +611,7 @@
198
             uint8_t *rpuPayload = NULL;
199
             int inputPicNum = 1;
200
             x265_picture picField1, picField2;
201
x265_3.6.tar.gz/source/abrEncApp.h -> x265_4.0.tar.gz/source/abrEncApp.h Changed
36
 
1
@@ -42,6 +42,7 @@
2
     {
3
     public:
4
         uint8_t           m_numEncodes;
5
+        uint8_t           m_numInputViews; // Number of inputs for multiview-extension
6
         PassEncoder        **m_passEnc;
7
         uint32_t           m_queueSize;
8
         ThreadSafeInteger  m_numActiveEncodes;
9
@@ -86,7 +87,7 @@
10
         x265_picture **m_outputRecon;
11
 
12
         CLIOptions m_cliopt;
13
-        InputFile* m_input;
14
+        InputFile* m_input[MAX_VIEWS];
15
         const char* m_reconPlayCmd;
16
         FILE*    m_qpfile;
17
         FILE*    m_zoneFile;
18
@@ -102,7 +103,7 @@
19
         void startThreads();
20
         void copyInfo(x265_analysis_data *src);
21
 
22
-        bool readPicture(x265_picture*);
23
+        bool readPicture(x265_picture*, int view);
24
         void destroy();
25
 
26
     private:
27
@@ -142,7 +143,7 @@
28
     public:
29
         PassEncoder *m_parentEnc;
30
         int m_id;
31
-        InputFile* m_input;
32
+        InputFile* m_input[MAX_VIEWS];
33
         int m_threadActive;
34
 
35
         Reader(int id, PassEncoder *parentEnc);
36
x265_4.0.tar.gz/source/cmake/FindNEON_DOTPROD.cmake Added
23
 
1
@@ -0,0 +1,21 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check if Armv8.4 Neon DotProd is supported by the Arm CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.arm.FEAT_DotProd: 1"
8
+                    OUTPUT_VARIABLE has_dot_product
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep asimddp
15
+                    OUTPUT_VARIABLE has_dot_product
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(has_dot_product)
21
+    set(CPU_HAS_NEON_DOTPROD 1)
22
+endif()
23
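When this test sets CPU_HAS_NEON_DOTPROD, sources such as sad-neon-dotprod.S are compiled with -march=armv8.2-a+dotprod; purely as an illustration of the idea (not code from x265), a DotProd-based SAD of one 16-pixel row can be written with intrinsics as::

    #include <stdint.h>
    #include <arm_neon.h>

    // vabdq_u8 forms per-byte absolute differences; vdotq_u32 accumulates them
    // four at a time into 32-bit lanes, which are then reduced horizontally.
    static inline uint32_t sad16_dotprod(const uint8_t *a, const uint8_t *b)
    {
        uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));
        uint32x4_t acc  = vdotq_u32(vdupq_n_u32(0), diff, vdupq_n_u8(1));
        return vaddvq_u32(acc);
    }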
x265_4.0.tar.gz/source/cmake/FindNEON_I8MM.cmake Added
23
 
1
@@ -0,0 +1,21 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check if Armv8.6 Neon I8MM is supported by the Arm CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.arm.FEAT_I8MM: 1"
8
+                    OUTPUT_VARIABLE has_i8mm
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep i8mm
15
+                    OUTPUT_VARIABLE has_i8mm
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(has_i8mm)
21
+    set(CPU_HAS_NEON_I8MM 1)
22
+endif()
23
x265_3.6.tar.gz/source/common/CMakeLists.txt -> x265_4.0.tar.gz/source/common/CMakeLists.txt Changed
64
 
1
@@ -103,22 +103,57 @@
2
         add_definitions(-DAUTO_VECTORIZE=1)
3
     endif()
4
 
5
-    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
6
+    set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp  mem-neon.h)
7
+    set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
8
+    set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
9
+    set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
10
+    set(C_SRCS_SVE2 sao-prim-sve2.cpp)
11
     enable_language(ASM)
12
 
13
     # add ARM assembly/intrinsic files here
14
-    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
15
-    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
16
-    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
17
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
18
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
19
+    set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
20
+    set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
21
     set(VEC_PRIMITIVES)
22
 
23
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
24
     set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
25
     set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
26
-    foreach(SRC ${C_SRCS})
27
+    set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
28
+    foreach(SRC ${C_SRCS_NEON})
29
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
30
     endforeach()
31
+
32
+    if(CPU_HAS_NEON_I8MM)
33
+        foreach(SRC ${C_SRCS_NEON_I8MM})
34
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
35
+        endforeach()
36
+    endif()
37
+
38
+    if(CPU_HAS_NEON_DOTPROD)
39
+        foreach(SRC ${C_SRCS_NEON_DOTPROD})
40
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
41
+        endforeach()
42
+    endif()
43
+
44
+    if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
45
+        foreach(SRC ${C_SRCS_SVE})
46
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
47
+        endforeach()
48
+    endif()
49
+
50
+    if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE)
51
+        foreach(SRC ${C_SRCS_SVE2})
52
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
53
+        endforeach()
54
+    endif()
55
+
56
     source_group(Assembly FILES ${ASM_PRIMITIVES})
57
+
58
+    if(AARCH64_WARNINGS_AS_ERRORS)
59
+        set_source_files_properties(${ASM_PRIMITIVES} PROPERTIES COMPILE_FLAGS -Werror)
60
+    endif()
61
 endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
62
 
63
 if(POWER)
64
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.cpp Changed
201
 
1
@@ -3,7 +3,6 @@
2
 #include "arm64-utils.h"
3
 #include <arm_neon.h>
4
 
5
-#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
6
 namespace X265_NS
7
 {
8
 
9
@@ -11,53 +10,58 @@
10
 
11
 void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
12
 {
13
-    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
14
-    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
15
-
16
-    a0 = *(uint8x8_t *)(src + 0 * sstride);
17
-    a1 = *(uint8x8_t *)(src + 1 * sstride);
18
-    a2 = *(uint8x8_t *)(src + 2 * sstride);
19
-    a3 = *(uint8x8_t *)(src + 3 * sstride);
20
-    a4 = *(uint8x8_t *)(src + 4 * sstride);
21
-    a5 = *(uint8x8_t *)(src + 5 * sstride);
22
-    a6 = *(uint8x8_t *)(src + 6 * sstride);
23
-    a7 = *(uint8x8_t *)(src + 7 * sstride);
24
-
25
-    b0 = vtrn1_u32(a0, a4);
26
-    b1 = vtrn1_u32(a1, a5);
27
-    b2 = vtrn1_u32(a2, a6);
28
-    b3 = vtrn1_u32(a3, a7);
29
-    b4 = vtrn2_u32(a0, a4);
30
-    b5 = vtrn2_u32(a1, a5);
31
-    b6 = vtrn2_u32(a2, a6);
32
-    b7 = vtrn2_u32(a3, a7);
33
-
34
-    a0 = vtrn1_u16(b0, b2);
35
-    a1 = vtrn1_u16(b1, b3);
36
-    a2 = vtrn2_u16(b0, b2);
37
-    a3 = vtrn2_u16(b1, b3);
38
-    a4 = vtrn1_u16(b4, b6);
39
-    a5 = vtrn1_u16(b5, b7);
40
-    a6 = vtrn2_u16(b4, b6);
41
-    a7 = vtrn2_u16(b5, b7);
42
-
43
-    b0 = vtrn1_u8(a0, a1);
44
-    b1 = vtrn2_u8(a0, a1);
45
-    b2 = vtrn1_u8(a2, a3);
46
-    b3 = vtrn2_u8(a2, a3);
47
-    b4 = vtrn1_u8(a4, a5);
48
-    b5 = vtrn2_u8(a4, a5);
49
-    b6 = vtrn1_u8(a6, a7);
50
-    b7 = vtrn2_u8(a6, a7);
51
-
52
-    *(uint8x8_t *)(dst + 0 * dstride) = b0;
53
-    *(uint8x8_t *)(dst + 1 * dstride) = b1;
54
-    *(uint8x8_t *)(dst + 2 * dstride) = b2;
55
-    *(uint8x8_t *)(dst + 3 * dstride) = b3;
56
-    *(uint8x8_t *)(dst + 4 * dstride) = b4;
57
-    *(uint8x8_t *)(dst + 5 * dstride) = b5;
58
-    *(uint8x8_t *)(dst + 6 * dstride) = b6;
59
-    *(uint8x8_t *)(dst + 7 * dstride) = b7;
60
+    uint8x8_t a0 = vld1_u8(src + 0 * sstride);
61
+    uint8x8_t a1 = vld1_u8(src + 1 * sstride);
62
+    uint8x8_t a2 = vld1_u8(src + 2 * sstride);
63
+    uint8x8_t a3 = vld1_u8(src + 3 * sstride);
64
+    uint8x8_t a4 = vld1_u8(src + 4 * sstride);
65
+    uint8x8_t a5 = vld1_u8(src + 5 * sstride);
66
+    uint8x8_t a6 = vld1_u8(src + 6 * sstride);
67
+    uint8x8_t a7 = vld1_u8(src + 7 * sstride);
68
+
69
+    uint32x2_t b0 = vtrn1_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
70
+    uint32x2_t b1 = vtrn1_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
71
+    uint32x2_t b2 = vtrn1_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
72
+    uint32x2_t b3 = vtrn1_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
73
+    uint32x2_t b4 = vtrn2_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
74
+    uint32x2_t b5 = vtrn2_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
75
+    uint32x2_t b6 = vtrn2_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
76
+    uint32x2_t b7 = vtrn2_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
77
+
78
+    uint16x4_t c0 = vtrn1_u16(vreinterpret_u16_u32(b0),
79
+                              vreinterpret_u16_u32(b2));
80
+    uint16x4_t c1 = vtrn1_u16(vreinterpret_u16_u32(b1),
81
+                              vreinterpret_u16_u32(b3));
82
+    uint16x4_t c2 = vtrn2_u16(vreinterpret_u16_u32(b0),
83
+                              vreinterpret_u16_u32(b2));
84
+    uint16x4_t c3 = vtrn2_u16(vreinterpret_u16_u32(b1),
85
+                              vreinterpret_u16_u32(b3));
86
+    uint16x4_t c4 = vtrn1_u16(vreinterpret_u16_u32(b4),
87
+                              vreinterpret_u16_u32(b6));
88
+    uint16x4_t c5 = vtrn1_u16(vreinterpret_u16_u32(b5),
89
+                              vreinterpret_u16_u32(b7));
90
+    uint16x4_t c6 = vtrn2_u16(vreinterpret_u16_u32(b4),
91
+                              vreinterpret_u16_u32(b6));
92
+    uint16x4_t c7 = vtrn2_u16(vreinterpret_u16_u32(b5),
93
+                              vreinterpret_u16_u32(b7));
94
+
95
+    uint8x8_t d0 = vtrn1_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
96
+    uint8x8_t d1 = vtrn2_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
97
+    uint8x8_t d2 = vtrn1_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
98
+    uint8x8_t d3 = vtrn2_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
99
+    uint8x8_t d4 = vtrn1_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
100
+    uint8x8_t d5 = vtrn2_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
101
+    uint8x8_t d6 = vtrn1_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
102
+    uint8x8_t d7 = vtrn2_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
103
+
104
+    vst1_u8(dst + 0 * dstride, d0);
105
+    vst1_u8(dst + 1 * dstride, d1);
106
+    vst1_u8(dst + 2 * dstride, d2);
107
+    vst1_u8(dst + 3 * dstride, d3);
108
+    vst1_u8(dst + 4 * dstride, d4);
109
+    vst1_u8(dst + 5 * dstride, d5);
110
+    vst1_u8(dst + 6 * dstride, d6);
111
+    vst1_u8(dst + 7 * dstride, d7);
112
 }
113
 
114
 
115
@@ -67,97 +71,171 @@
116
 
117
 void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
118
 {
119
-    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
120
-    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
121
-    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
122
-    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
123
-
124
-    a0 = *(uint16x8_t *)(src + 0 * sstride);
125
-    a1 = *(uint16x8_t *)(src + 1 * sstride);
126
-    a2 = *(uint16x8_t *)(src + 2 * sstride);
127
-    a3 = *(uint16x8_t *)(src + 3 * sstride);
128
-    a4 = *(uint16x8_t *)(src + 4 * sstride);
129
-    a5 = *(uint16x8_t *)(src + 5 * sstride);
130
-    a6 = *(uint16x8_t *)(src + 6 * sstride);
131
-    a7 = *(uint16x8_t *)(src + 7 * sstride);
132
-    a8 = *(uint16x8_t *)(src + 8 * sstride);
133
-    a9 = *(uint16x8_t *)(src + 9 * sstride);
134
-    aA = *(uint16x8_t *)(src + 10 * sstride);
135
-    aB = *(uint16x8_t *)(src + 11 * sstride);
136
-    aC = *(uint16x8_t *)(src + 12 * sstride);
137
-    aD = *(uint16x8_t *)(src + 13 * sstride);
138
-    aE = *(uint16x8_t *)(src + 14 * sstride);
139
-    aF = *(uint16x8_t *)(src + 15 * sstride);
140
-
141
-    b0 = vtrn1q_u64(a0, a8);
142
-    b1 = vtrn1q_u64(a1, a9);
143
-    b2 = vtrn1q_u64(a2, aA);
144
-    b3 = vtrn1q_u64(a3, aB);
145
-    b4 = vtrn1q_u64(a4, aC);
146
-    b5 = vtrn1q_u64(a5, aD);
147
-    b6 = vtrn1q_u64(a6, aE);
148
-    b7 = vtrn1q_u64(a7, aF);
149
-    b8 = vtrn2q_u64(a0, a8);
150
-    b9 = vtrn2q_u64(a1, a9);
151
-    bA = vtrn2q_u64(a2, aA);
152
-    bB = vtrn2q_u64(a3, aB);
153
-    bC = vtrn2q_u64(a4, aC);
154
-    bD = vtrn2q_u64(a5, aD);
155
-    bE = vtrn2q_u64(a6, aE);
156
-    bF = vtrn2q_u64(a7, aF);
157
-
158
-    c0 = vtrn1q_u32(b0, b4);
159
-    c1 = vtrn1q_u32(b1, b5);
160
-    c2 = vtrn1q_u32(b2, b6);
161
-    c3 = vtrn1q_u32(b3, b7);
162
-    c4 = vtrn2q_u32(b0, b4);
163
-    c5 = vtrn2q_u32(b1, b5);
164
-    c6 = vtrn2q_u32(b2, b6);
165
-    c7 = vtrn2q_u32(b3, b7);
166
-    c8 = vtrn1q_u32(b8, bC);
167
-    c9 = vtrn1q_u32(b9, bD);
168
-    cA = vtrn1q_u32(bA, bE);
169
-    cB = vtrn1q_u32(bB, bF);
170
-    cC = vtrn2q_u32(b8, bC);
171
-    cD = vtrn2q_u32(b9, bD);
172
-    cE = vtrn2q_u32(bA, bE);
173
-    cF = vtrn2q_u32(bB, bF);
174
-
175
-    d0 = vtrn1q_u16(c0, c2);
176
-    d1 = vtrn1q_u16(c1, c3);
177
-    d2 = vtrn2q_u16(c0, c2);
178
-    d3 = vtrn2q_u16(c1, c3);
179
-    d4 = vtrn1q_u16(c4, c6);
180
-    d5 = vtrn1q_u16(c5, c7);
181
-    d6 = vtrn2q_u16(c4, c6);
182
-    d7 = vtrn2q_u16(c5, c7);
183
-    d8 = vtrn1q_u16(c8, cA);
184
-    d9 = vtrn1q_u16(c9, cB);
185
-    dA = vtrn2q_u16(c8, cA);
186
-    dB = vtrn2q_u16(c9, cB);
187
-    dC = vtrn1q_u16(cC, cE);
188
-    dD = vtrn1q_u16(cD, cF);
189
-    dE = vtrn2q_u16(cC, cE);
190
-    dF = vtrn2q_u16(cD, cF);
191
-
192
-    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);
193
-    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
194
-    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
195
-    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
196
-    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
197
-    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
198
-    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
199
-    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
200
-    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
201
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.h Changed
9
 
1
@@ -1,6 +1,7 @@
2
 #ifndef __ARM64_UTILS_H__
3
 #define __ARM64_UTILS_H__
4
 
5
+#include <stdint.h>
6
 
7
 namespace X265_NS
8
 {
9
x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_4.0.tar.gz/source/common/aarch64/asm-primitives.cpp Changed
201
 
1
@@ -39,15 +39,9 @@
2
     p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
3
     p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
4
     p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
5
-#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
6
-    p.cuBLOCK_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
7
-    p.cuBLOCK_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
8
-    p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
9
-    p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
10
 #define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
11
     p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
12
 #define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
13
-#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)
14
 #define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
15
 
16
 #define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
17
@@ -76,50 +70,6 @@
18
     p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
19
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
20
     p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
21
-#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
22
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
23
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
24
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu)
25
-#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
26
-    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
27
-    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
28
-    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
29
-    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
30
-    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
31
-    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
32
-    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
33
-    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
34
-    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
35
-    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
36
-    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
37
-    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
38
-    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
39
-    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
40
-    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
41
-    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
42
-    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
43
-    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
44
-    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
45
-    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
46
-    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
47
-    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
48
-#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
49
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
50
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
51
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
52
-    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
53
-    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
54
-    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
55
-    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
56
-    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
57
-    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
58
-    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
59
-    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
60
-    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
61
-    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
62
-    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
63
-    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
64
-    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
65
 #define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
66
     p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
67
     p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
68
@@ -130,20 +80,6 @@
69
     p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve); \
70
     p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
71
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
72
-#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
73
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
74
-    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
75
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
76
-    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
77
-    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
78
-    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
79
-    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
80
-    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
81
-    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
82
-    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
83
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
84
-    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
85
-    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
86
 #define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
87
     p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
88
     p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
89
@@ -157,10 +93,6 @@
90
     p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
91
     p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
92
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
93
-#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
94
-    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
95
-    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
96
-    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
97
 #define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
98
     p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
99
     p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
100
@@ -184,22 +116,6 @@
101
     p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
102
     p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
103
     p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
104
-#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
105
-    p.puLUMA_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
106
-    p.puLUMA_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
107
-    p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
108
-    p.puLUMA_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
109
-    p.puLUMA_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
110
-    p.puLUMA_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
111
-    p.puLUMA_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
112
-    p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
113
-    p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
114
-    p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
115
-    p.puLUMA_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
116
-    p.puLUMA_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
117
-    p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
118
-    p.puLUMA_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
119
-    p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
120
 #define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
121
     p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
122
     p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
123
@@ -211,17 +127,29 @@
124
     p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
125
     p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
126
     p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
127
+#define LUMA_PU_TYPED_MULTIPLE_16(prim, fncdef, fname, cpu)      \
128
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
129
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
130
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
131
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu);  \
132
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
133
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
134
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
135
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
136
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
137
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu);  \
138
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
139
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu);  \
140
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
141
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
142
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
143
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
144
 #define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
145
-#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
146
-#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
147
-#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
148
 #define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
149
-#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
150
 #define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
151
-#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
152
 #define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
153
-#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
154
 #define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
155
+#define LUMA_PU_MULTIPLE_16(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_16(prim, , fname, cpu)
156
 
157
 
158
 #define ALL_LUMA_PU_T(prim, fname) \
159
@@ -276,37 +204,9 @@
160
     p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
161
     p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
162
     p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
163
-#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname)               \
164
-    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
165
-    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## neon); \
166
-    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
167
-    p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim   = fncdef PFX(fname ## _6x8_ ## neon); \
168
-    p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
169
-    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
170
-    p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## neon); \
171
-    p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
172
-    p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## neon); \
173
-    p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
174
-    p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
175
-    p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
176
-    p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim   = fncdef PFX(fname ## _2x4_ ## neon); \
177
-    p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
178
-    p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
179
-    p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
180
-    p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
181
-    p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim   = fncdef PFX(fname ## _8x6_ ## neon); \
182
-    p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim   = fncdef PFX(fname ## _8x2_ ## neon); \
183
-    p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim   = fncdef PFX(fname ## _2x8_ ## neon); \
184
-    p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
185
-    p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon)
186
 #define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname)               \
187
     p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
188
     p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve)
189
-#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname)               \
190
-    p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
191
-    p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim   = fncdef PFX(fname ## _4x2_ ## neon); \
192
-    p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
193
-    p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
194
 #define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu)               \
195
     p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
196
     p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
197
@@ -328,23 +228,6 @@
198
     p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
199
     p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
200
     p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu)
201
x265_3.6.tar.gz/source/common/aarch64/asm.S -> x265_4.0.tar.gz/source/common/aarch64/asm.S Changed
40
 
1
@@ -72,6 +72,16 @@
2
 
3
 #define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
4
 
5
+// Alignment of stack arguments of size less than 8 bytes.
6
+#ifdef __APPLE__
7
+#define STACK_ARG_ALIGNMENT 4
8
+#else
9
+#define STACK_ARG_ALIGNMENT 8
10
+#endif
11
+
12
+// Get offset from SP of stack argument at index `idx`.
13
+#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT)
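+// For illustration, reading the third stack argument (idx 2) into an
+// arbitrary register would be:
+//     ldr             w9, [sp, #STACK_ARG_OFFSET(2)]
+// which resolves to [sp, #8] on Apple targets and [sp, #16] elsewhere.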
14
+
15
 #ifdef __APPLE__
16
 .macro endfunc
17
 ELF .size \name, . - \name
18
@@ -184,4 +194,19 @@
19
     vtrn            \t3, \t4, \s3, \s4
20
 .endm
21
 
22
-#endif
23
\ No newline at end of file
24
+
25
+.macro push_vec_regs
26
+    stp             d8, d9, [sp, #-16]!
27
+    stp             d10, d11, [sp, #-16]!
28
+    stp             d12, d13, [sp, #-16]!
29
+    stp             d14, d15, [sp, #-16]!
30
+.endm
31
+
32
+.macro pop_vec_regs
33
+    ldp             d14, d15, [sp], #16
34
+    ldp             d12, d13, [sp], #16
35
+    ldp             d10, d11, [sp], #16
36
+    ldp             d8, d9, [sp], #16
37
+.endm
38
+
39
+#endif
40
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8-sve.S Changed
201
 
1
@@ -112,7 +112,7 @@
2
     lsl             x3, x3, #1
3
     movrel          x11, xtn_xtn2_table
4
     ld1             {v31.16b}, x11
5
-.loop_csp32_sve:
6
+.Loop_csp32_sve:
7
     sub             w12, w12, #1
8
 .rept 4
9
     ld1             {v0.8h-v3.8h}, x2, x3
10
@@ -124,7 +124,7 @@
11
     st1             {v0.16b-v1.16b}, x0, x1
12
     st1             {v2.16b-v3.16b}, x0, x1
13
 .endr
14
-    cbnz            w12, .loop_csp32_sve
15
+    cbnz            w12, .Loop_csp32_sve
16
     ret
17
 .vl_gt_16_blockcopy_sp_32_32:
18
     cmp             x9, #48
19
@@ -199,7 +199,7 @@
20
     bgt             .vl_gt_16_blockcopy_ps_32_32
21
     lsl             x1, x1, #1
22
     mov             w12, #4
23
-.loop_cps32_sve:
24
+.Loop_cps32_sve:
25
     sub             w12, w12, #1
26
 .rept 4
27
     ld1             {v16.16b-v17.16b}, x2, x3
28
@@ -215,7 +215,7 @@
29
     st1             {v0.8h-v3.8h}, x0, x1
30
     st1             {v4.8h-v7.8h}, x0, x1
31
 .endr
32
-    cbnz            w12, .loop_cps32_sve
33
+    cbnz            w12, .Loop_cps32_sve
34
     ret
35
 .vl_gt_16_blockcopy_ps_32_32:
36
     cmp             x9, #48
37
@@ -248,7 +248,7 @@
38
     lsl             x1, x1, #1
39
     sub             x1, x1, #64
40
     mov             w12, #16
41
-.loop_cps64_sve:
42
+.Loop_cps64_sve:
43
     sub             w12, w12, #1
44
 .rept 4
45
     ld1             {v16.16b-v19.16b}, x2, x3
46
@@ -263,7 +263,7 @@
47
     st1             {v0.8h-v3.8h}, x0, #64
48
     st1             {v4.8h-v7.8h}, x0, x1
49
 .endr
50
-    cbnz            w12, .loop_cps64_sve
51
+    cbnz            w12, .Loop_cps64_sve
52
     ret
53
 .vl_gt_16_blockcopy_ps_64_64:
54
     cmp             x9, #48
55
@@ -338,13 +338,13 @@
56
     lsl             x1, x1, #1
57
     lsl             x3, x3, #1
58
     mov             w12, #4
59
-.loop_css32_sve:
60
+.Loop_css32_sve:
61
     sub             w12, w12, #1
62
 .rept 8
63
     ld1             {v0.8h-v3.8h}, x2, x3
64
     st1             {v0.8h-v3.8h}, x0, x1
65
 .endr
66
-    cbnz            w12, .loop_css32_sve
67
+    cbnz            w12, .Loop_css32_sve
68
     ret
69
 .vl_gt_16_blockcopy_ss_32_32:
70
     cmp             x9, #48
71
@@ -379,7 +379,7 @@
72
     lsl             x3, x3, #1
73
     sub             x3, x3, #64
74
     mov             w12, #8
75
-.loop_css64_sve:
76
+.Loop_css64_sve:
77
     sub             w12, w12, #1
78
 .rept 8
79
     ld1             {v0.8h-v3.8h}, x2, #64
80
@@ -387,7 +387,7 @@
81
     st1             {v0.8h-v3.8h}, x0, #64
82
     st1             {v4.8h-v7.8h}, x0, x1
83
 .endr
84
-    cbnz            w12, .loop_css64_sve
85
+    cbnz            w12, .Loop_css64_sve
86
     ret
87
 .vl_gt_16_blockcopy_ss_64_64:
88
     cmp             x9, #48
89
@@ -474,13 +474,13 @@
90
     lsl             x1, x1, #1
91
     lsl             x3, x3, #1
92
     mov             w12, #8
93
-.loop_css32x64_sve:
94
+.Loop_css32x64_sve:
95
     sub             w12, w12, #1
96
 .rept 8
97
     ld1             {v0.8h-v3.8h}, x2, x3
98
     st1             {v0.8h-v3.8h}, x0, x1
99
 .endr
100
-    cbnz            w12, .loop_css32x64_sve
101
+    cbnz            w12, .Loop_css32x64_sve
102
     ret
103
 .vl_gt_16_blockcopy_ss_32_64:
104
     cmp             x9, #48
105
@@ -570,7 +570,7 @@
106
     bgt             .vl_gt_16_blockcopy_ps_32_64
107
     lsl             x1, x1, #1
108
     mov             w12, #8
109
-.loop_cps32x64_sve:
110
+.Loop_cps32x64_sve:
111
     sub             w12, w12, #1
112
 .rept 4
113
     ld1             {v16.16b-v17.16b}, x2, x3
114
@@ -586,7 +586,7 @@
115
     st1             {v0.8h-v3.8h}, x0, x1
116
     st1             {v4.8h-v7.8h}, x0, x1
117
 .endr
118
-    cbnz            w12, .loop_cps32x64_sve
119
+    cbnz            w12, .Loop_cps32x64_sve
120
     ret
121
 .vl_gt_16_blockcopy_ps_32_64:
122
     cmp             x9, #48
123
@@ -730,13 +730,13 @@
124
     rdvl            x9, #1
125
     cmp             x9, #16
126
     bgt             .vl_gt_16_blockcopy_pp_32xN_\h
127
-.loop_sve_32x\h\():
128
+.Loop_sve_32x\h\():
129
     sub             w12, w12, #1
130
 .rept 8
131
     ld1             {v0.16b-v1.16b}, x2, x3
132
     st1             {v0.16b-v1.16b}, x0, x1
133
 .endr
134
-    cbnz            w12, .loop_sve_32x\h
135
+    cbnz            w12, .Loop_sve_32x\h
136
     ret
137
 .vl_gt_16_blockcopy_pp_32xN_\h:
138
     ptrue           p0.b, vl32
139
@@ -765,13 +765,13 @@
140
     rdvl            x9, #1
141
     cmp             x9, #16
142
     bgt             .vl_gt_16_blockcopy_pp_64xN_\h
143
-.loop_sve_64x\h\():
144
+.Loop_sve_64x\h\():
145
     sub             w12, w12, #1
146
 .rept 4
147
     ld1             {v0.16b-v3.16b}, x2, x3
148
     st1             {v0.16b-v3.16b}, x0, x1
149
 .endr
150
-    cbnz            w12, .loop_sve_64x\h
151
+    cbnz            w12, .Loop_sve_64x\h
152
     ret
153
 .vl_gt_16_blockcopy_pp_64xN_\h:
154
     cmp             x9, #48
155
@@ -856,7 +856,7 @@
156
     bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
157
     cpy2Dto1D_shl_start_sve
158
     mov             w12, #4
159
-.loop_cpy2Dto1D_shl_16_sve:
160
+.Loop_cpy2Dto1D_shl_16_sve:
161
     sub             w12, w12, #1
162
 .rept 4
163
     ld1             {v2.16b-v3.16b}, x1, x2
164
@@ -864,7 +864,7 @@
165
     sshl            v3.8h, v3.8h, v0.8h
166
     st1             {v2.16b-v3.16b}, x0, #32
167
 .endr
168
-    cbnz            w12, .loop_cpy2Dto1D_shl_16_sve
169
+    cbnz            w12, .Loop_cpy2Dto1D_shl_16_sve
170
     ret
171
 .vl_gt_16_cpy2Dto1D_shl_16x16:
172
     ptrue           p0.h, vl16
173
@@ -885,7 +885,7 @@
174
     bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
175
     cpy2Dto1D_shl_start_sve
176
     mov             w12, #16
177
-.loop_cpy2Dto1D_shl_32_sve:
178
+.Loop_cpy2Dto1D_shl_32_sve:
179
     sub             w12, w12, #1
180
 .rept 2
181
     ld1             {v2.16b-v5.16b}, x1, x2
182
@@ -895,7 +895,7 @@
183
     sshl            v5.8h, v5.8h, v0.8h
184
     st1             {v2.16b-v5.16b}, x0, #64
185
 .endr
186
-    cbnz            w12, .loop_cpy2Dto1D_shl_32_sve
187
+    cbnz            w12, .Loop_cpy2Dto1D_shl_32_sve
188
     ret
189
 .vl_gt_16_cpy2Dto1D_shl_32x32:
190
     cmp             x9, #48
191
@@ -931,7 +931,7 @@
192
     cpy2Dto1D_shl_start_sve
193
     mov             w12, #32
194
     sub             x2, x2, #64
195
-.loop_cpy2Dto1D_shl_64_sve:
196
+.Loop_cpy2Dto1D_shl_64_sve:
197
     sub             w12, w12, #1
198
 .rept 2
199
     ld1             {v2.16b-v5.16b}, x1, #64
200
@@ -947,7 +947,7 @@
201
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8.S Changed
201
 
1
@@ -86,7 +86,7 @@
2
     lsl             x3, x3, #1
3
     movrel          x11, xtn_xtn2_table
4
     ld1             {v31.16b}, x11
5
-.loop_csp32:
6
+.Loop_csp32:
7
     sub             w12, w12, #1
8
 .rept 4
9
     ld1             {v0.8h-v3.8h}, x2, x3
10
@@ -98,7 +98,7 @@
11
     st1             {v0.16b-v1.16b}, x0, x1
12
     st1             {v2.16b-v3.16b}, x0, x1
13
 .endr
14
-    cbnz            w12, .loop_csp32
15
+    cbnz            w12, .Loop_csp32
16
     ret
17
 endfunc
18
 
19
@@ -108,7 +108,7 @@
20
     sub             x3, x3, #64
21
     movrel          x11, xtn_xtn2_table
22
     ld1             {v31.16b}, x11
23
-.loop_csp64:
24
+.Loop_csp64:
25
     sub             w12, w12, #1
26
 .rept 4
27
     ld1             {v0.8h-v3.8h}, x2, #64
28
@@ -119,7 +119,7 @@
29
     tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
30
     st1             {v0.16b-v3.16b}, x0, x1
31
 .endr
32
-    cbnz            w12, .loop_csp64
33
+    cbnz            w12, .Loop_csp64
34
     ret
35
 endfunc
36
 
37
@@ -168,7 +168,7 @@
38
 function PFX(blockcopy_ps_32x32_neon)
39
     lsl             x1, x1, #1
40
     mov             w12, #4
41
-.loop_cps32:
42
+.Loop_cps32:
43
     sub             w12, w12, #1
44
 .rept 4
45
     ld1             {v16.16b-v17.16b}, x2, x3
46
@@ -184,7 +184,7 @@
47
     st1             {v0.8h-v3.8h}, x0, x1
48
     st1             {v4.8h-v7.8h}, x0, x1
49
 .endr
50
-    cbnz            w12, .loop_cps32
51
+    cbnz            w12, .Loop_cps32
52
     ret
53
 endfunc
54
 
55
@@ -192,7 +192,7 @@
56
     lsl             x1, x1, #1
57
     sub             x1, x1, #64
58
     mov             w12, #16
59
-.loop_cps64:
60
+.Loop_cps64:
61
     sub             w12, w12, #1
62
 .rept 4
63
     ld1             {v16.16b-v19.16b}, x2, x3
64
@@ -207,7 +207,7 @@
65
     st1             {v0.8h-v3.8h}, x0, #64
66
     st1             {v4.8h-v7.8h}, x0, x1
67
 .endr
68
-    cbnz            w12, .loop_cps64
69
+    cbnz            w12, .Loop_cps64
70
     ret
71
 endfunc
72
 
73
@@ -252,13 +252,13 @@
74
     lsl             x1, x1, #1
75
     lsl             x3, x3, #1
76
     mov             w12, #4
77
-.loop_css32:
78
+.Loop_css32:
79
     sub             w12, w12, #1
80
 .rept 8
81
     ld1             {v0.8h-v3.8h}, x2, x3
82
     st1             {v0.8h-v3.8h}, x0, x1
83
 .endr
84
-    cbnz            w12, .loop_css32
85
+    cbnz            w12, .Loop_css32
86
     ret
87
 endfunc
88
 
89
@@ -268,7 +268,7 @@
90
     lsl             x3, x3, #1
91
     sub             x3, x3, #64
92
     mov             w12, #8
93
-.loop_css64:
94
+.Loop_css64:
95
     sub             w12, w12, #1
96
 .rept 8
97
     ld1             {v0.8h-v3.8h}, x2, #64
98
@@ -276,7 +276,7 @@
99
     st1             {v0.8h-v3.8h}, x0, #64
100
     st1             {v4.8h-v7.8h}, x0, x1
101
 .endr
102
-    cbnz            w12, .loop_css64
103
+    cbnz            w12, .Loop_css64
104
     ret
105
 endfunc
106
 
107
@@ -321,13 +321,13 @@
108
     lsl             x1, x1, #1
109
     lsl             x3, x3, #1
110
     mov             w12, #8
111
-.loop_css32x64:
112
+.Loop_css32x64:
113
     sub             w12, w12, #1
114
 .rept 8
115
     ld1             {v0.8h-v3.8h}, x2, x3
116
     st1             {v0.8h-v3.8h}, x0, x1
117
 .endr
118
-    cbnz            w12, .loop_css32x64
119
+    cbnz            w12, .Loop_css32x64
120
     ret
121
 endfunc
122
 
123
@@ -376,7 +376,7 @@
124
 function PFX(blockcopy_ps_32x64_neon)
125
     lsl             x1, x1, #1
126
     mov             w12, #8
127
-.loop_cps32x64:
128
+.Loop_cps32x64:
129
     sub             w12, w12, #1
130
 .rept 4
131
     ld1             {v16.16b-v17.16b}, x2, x3
132
@@ -392,7 +392,7 @@
133
     st1             {v0.8h-v3.8h}, x0, x1
134
     st1             {v4.8h-v7.8h}, x0, x1
135
 .endr
136
-    cbnz            w12, .loop_cps32x64
137
+    cbnz            w12, .Loop_cps32x64
138
     ret
139
 endfunc
140
 
141
@@ -443,7 +443,7 @@
142
     lsl             x3, x3, #1
143
     movrel          x11, xtn_xtn2_table
144
     ld1             {v31.16b}, x11
145
-.loop_csp32x64:
146
+.Loop_csp32x64:
147
     sub             w12, w12, #1
148
 .rept 4
149
     ld1             {v0.8h-v3.8h}, x2, x3
150
@@ -455,7 +455,7 @@
151
     st1             {v0.16b-v1.16b}, x0, x1
152
     st1             {v2.16b-v3.16b}, x0, x1
153
 .endr
154
-    cbnz            w12, .loop_csp32x64
155
+    cbnz            w12, .Loop_csp32x64
156
     ret
157
 endfunc
158
 
159
@@ -595,13 +595,13 @@
160
 
161
 function PFX(blockcopy_pp_8x64_neon)
162
     mov             w12, #4
163
-.loop_pp_8x64:
164
+.Loop_pp_8x64:
165
     sub             w12, w12, #1
166
 .rept 16
167
     ld1             {v0.4h}, x2, x3
168
     st1             {v0.4h}, x0, x1
169
 .endr
170
-    cbnz            w12, .loop_pp_8x64
171
+    cbnz            w12, .Loop_pp_8x64
172
     ret
173
 endfunc
174
 
175
@@ -623,13 +623,13 @@
176
 .macro blockcopy_pp_16xN1_neon h
177
 function PFX(blockcopy_pp_16x\h\()_neon)
178
     mov             w12, #\h / 8
179
-.loop_16x\h\():
180
+.Loop_16x\h\():
181
 .rept 8
182
     ld1             {v0.8h}, x2, x3
183
     st1             {v0.8h}, x0, x1
184
 .endr
185
     sub             w12, w12, #1
186
-    cbnz            w12, .loop_16x\h
187
+    cbnz            w12, .Loop_16x\h
188
     ret
189
 endfunc
190
 .endm
191
@@ -651,38 +651,38 @@
192
 function PFX(blockcopy_pp_12x32_neon)
193
     sub             x1, x1, #8
194
     mov             w12, #4
195
-.loop_pp_12x32:
196
+.Loop_pp_12x32:
197
     sub             w12, w12, #1
198
 .rept 8
199
     ld1             {v0.16b}, x2, x3
200
     str             d0, x0, #8
201
x265_4.0.tar.gz/source/common/aarch64/dct-prim-sve.cpp Added
201
 
1
@@ -0,0 +1,491 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *          Jonathan Wright <jonathan.wright@arm.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include "dct-prim.h"
27
+#include "neon-sve-bridge.h"
28
+#include <arm_neon.h>
29
+
30
+
31
+namespace
32
+{
33
+using namespace X265_NS;
34
+
35
+// First four elements (duplicated) of rows 1, 3, 5 and 7 in g_t8 (8x8 DCT
36
+// matrix.)
37
+const int16_t t8_odd[4][8] =
38
+{
39
+    { 89,  75,  50,  18, 89,  75,  50,  18 },
40
+    { 75, -18, -89, -50, 75, -18, -89, -50 },
41
+    { 50, -89,  18,  75, 50, -89,  18,  75 },
42
+    { 18, -50,  75, -89, 18, -50,  75, -89 },
43
+};
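+// Duplicating the four coefficients across both halves of each row lets a
+// single 64-bit dot product (x265_sdotq_s16) accumulate the odd-part result
+// for two output lines at a time.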
44
+
45
+template<int shift>
46
+static inline void partialButterfly8_sve(const int16_t *src, int16_t *dst)
47
+{
48
+    const int line = 8;
49
+
50
+    int16x8_t O[line / 2];
51
+    int32x4_t EE[line / 2];
52
+    int32x4_t EO[line / 2];
53
+
54
+    for (int i = 0; i < line; i += 2)
55
+    {
56
+        int16x8_t s_lo = vcombine_s16(vld1_s16(src + i * line),
57
+                                      vld1_s16(src + (i + 1) * line));
58
+        int16x8_t s_hi = vcombine_s16(
59
+            vrev64_s16(vld1_s16(src + i * line + 4)),
60
+            vrev64_s16(vld1_s16(src + (i + 1) * line + 4)));
61
+
62
+        int32x4_t E0 = vaddl_s16(vget_low_s16(s_lo), vget_low_s16(s_hi));
63
+        int32x4_t E1 = vaddl_s16(vget_high_s16(s_lo), vget_high_s16(s_hi));
64
+
65
+        O[i / 2] = vsubq_s16(s_lo, s_hi);
66
+
67
+        int32x4_t t0 = vreinterpretq_s32_s64(
68
+            vzip1q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1)));
69
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
70
+            vzip2q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1))));
71
+
72
+        EE[i / 2] = vaddq_s32(t0, t1);
73
+        EO[i / 2] = vsubq_s32(t0, t1);
74
+    }
75
+
76
+    int16_t *d = dst;
77
+
78
+    int32x4_t c0 = vld1q_s32(t8_even[0]);
79
+    int32x4_t c2 = vld1q_s32(t8_even[1]);
80
+    int32x4_t c4 = vld1q_s32(t8_even[2]);
81
+    int32x4_t c6 = vld1q_s32(t8_even[3]);
82
+    int16x8_t c1 = vld1q_s16(t8_odd[0]);
83
+    int16x8_t c3 = vld1q_s16(t8_odd[1]);
84
+    int16x8_t c5 = vld1q_s16(t8_odd[2]);
85
+    int16x8_t c7 = vld1q_s16(t8_odd[3]);
86
+
87
+    for (int j = 0; j < line; j += 4)
88
+    {
89
+        // O
90
+        int64x2_t t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c1);
91
+        int64x2_t t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c1);
92
+        int32x4_t t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
93
+        int16x4_t res1 = vrshrn_n_s32(t0123, shift);
94
+        vst1_s16(d + 1 * line, res1);
95
+
96
+        t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c3);
97
+        t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c3);
98
+        t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
99
+        int16x4_t res3 = vrshrn_n_s32(t0123, shift);
100
+        vst1_s16(d + 3 * line, res3);
101
+
102
+        t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c5);
103
+        t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c5);
104
+        t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
105
+        int16x4_t res5 = vrshrn_n_s32(t0123, shift);
106
+        vst1_s16(d + 5 * line, res5);
107
+
108
+        t01 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 0], c7);
109
+        t23 = x265_sdotq_s16(vdupq_n_s64(0), O[j / 2 + 1], c7);
110
+        t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23));
111
+        int16x4_t res7 = vrshrn_n_s32(t0123, shift);
112
+        vst1_s16(d + 7 * line, res7);
113
+
114
+        // EE and EO
115
+        int32x4_t t0 = vpaddq_s32(EE[j / 2 + 0], EE[j / 2 + 1]);
116
+        int32x4_t t1 = vmulq_s32(c0, t0);
117
+        int16x4_t res0 = vrshrn_n_s32(t1, shift);
118
+        vst1_s16(d + 0 * line, res0);
119
+
120
+        int32x4_t t2 = vmulq_s32(c2, EO[j / 2 + 0]);
121
+        int32x4_t t3 = vmulq_s32(c2, EO[j / 2 + 1]);
122
+        int16x4_t res2 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift);
123
+        vst1_s16(d + 2 * line, res2);
124
+
125
+        int32x4_t t4 = vmulq_s32(c4, EE[j / 2 + 0]);
126
+        int32x4_t t5 = vmulq_s32(c4, EE[j / 2 + 1]);
127
+        int16x4_t res4 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift);
128
+        vst1_s16(d + 4 * line, res4);
129
+
130
+        int32x4_t t6 = vmulq_s32(c6, EO[j / 2 + 0]);
131
+        int32x4_t t7 = vmulq_s32(c6, EO[j / 2 + 1]);
132
+        int16x4_t res6 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift);
133
+        vst1_s16(d + 6 * line, res6);
134
+
135
+        d += 4;
136
+    }
137
+}
138
+
139
+template<int shift>
140
+static inline void partialButterfly16_sve(const int16_t *src, int16_t *dst)
141
+{
142
+    const int line = 16;
143
+
144
+    int16x8_t O[line];
145
+    int16x8_t EO[line / 2];
146
+    int32x4_t EEE[line];
147
+    int32x4_t EEO[line];
148
+
149
+    for (int i = 0; i < line; i += 2)
150
+    {
151
+        int16x8_t s0_lo = vld1q_s16(src + i * line);
152
+        int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
153
+
154
+        int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
155
+        int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
156
+
157
+        int32x4_t E0[2];
158
+        E0[0] = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
159
+        E0[1] = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
160
+
161
+        int32x4_t E1[2];
162
+        E1[0] = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
163
+        E1[1] = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
164
+
165
+        O[i + 0] = vsubq_s16(s0_lo, s0_hi);
166
+        O[i + 1] = vsubq_s16(s1_lo, s1_hi);
167
+
168
+        int16x4_t EO_lo = vmovn_s32(vsubq_s32(E0[0], rev32(E0[1])));
169
+        int16x4_t EO_hi = vmovn_s32(vsubq_s32(E1[0], rev32(E1[1])));
170
+        EO[i / 2] = vcombine_s16(EO_lo, EO_hi);
171
+
172
+        int32x4_t EE0 = vaddq_s32(E0[0], rev32(E0[1]));
173
+        int32x4_t EE1 = vaddq_s32(E1[0], rev32(E1[1]));
174
+
175
+        int32x4_t t0 = vreinterpretq_s32_s64(
176
+            vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
177
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
178
+            vzip2q_s64(vreinterpretq_s64_s32(EE0),
179
+                       vreinterpretq_s64_s32(EE1))));
180
+
181
+        EEE[i / 2] = vaddq_s32(t0, t1);
182
+        EEO[i / 2] = vsubq_s32(t0, t1);
183
+    }
184
+
185
+    for (int i = 0; i < line; i += 4)
186
+    {
187
+        for (int k = 1; k < 16; k += 2)
188
+        {
189
+            int16x8_t c0_c4 = vld1q_s16(&g_t16[k][0]);
190
+
191
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 0]);
192
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 1]);
193
+            int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 2]);
194
+            int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 3]);
195
+
196
+            int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
197
+            int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3));
198
+            int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift);
199
+            vst1_s16(dst + k * line, res);
200
+        }
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.cpp Changed
201
 
1
@@ -5,36 +5,35 @@
2
 
3
 #include <arm_neon.h>
4
 
5
+#define X265_PRAGMA(text)       _Pragma(#text)
6
+#if defined(__clang__)
7
+#define X265_PRAGMA_UNROLL(n)   X265_PRAGMA(unroll(n))
8
+#elif defined(__GNUC__)
9
+#define X265_PRAGMA_UNROLL(n)   X265_PRAGMA(GCC unroll (n))
10
+#else
11
+#define X265_PRAGMA_UNROLL(n)
12
+#endif
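+// Assumed usage: X265_PRAGMA_UNROLL(4) placed before a loop expands to
+// _Pragma("unroll(4)") under Clang and _Pragma("GCC unroll (4)") under GCC,
+// and to nothing for other compilers.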
13
+
14
+extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t srcStride);
15
+extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst, intptr_t dstStride);
16
 
17
 namespace
18
 {
19
 using namespace X265_NS;
20
 
21
-
22
-static int16x8_t rev16(const int16x8_t a)
23
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
24
 {
25
-    static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
26
-    return vqtbx1q_u8(a, a, tbl);
27
-}
28
+    int32x2_t s0, s1, s2, s3;
29
 
30
-static int32x4_t rev32(const int32x4_t a)
31
-{
32
-    static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
33
-    return vqtbx1q_u8(a, a, tbl);
34
-}
35
+    s0 = vtrn1_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2));
36
+    s1 = vtrn1_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3));
37
+    s2 = vtrn2_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2));
38
+    s3 = vtrn2_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3));
39
 
40
-static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
41
-{
42
-    int16x4_t s0, s1, s2, s3;
43
-    s0 = vtrn1_s32(x0, x2);
44
-    s1 = vtrn1_s32(x1, x3);
45
-    s2 = vtrn2_s32(x0, x2);
46
-    s3 = vtrn2_s32(x1, x3);
47
-
48
-    x0 = vtrn1_s16(s0, s1);
49
-    x1 = vtrn2_s16(s0, s1);
50
-    x2 = vtrn1_s16(s2, s3);
51
-    x3 = vtrn2_s16(s2, s3);
52
+    x0 = vtrn1_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1));
53
+    x1 = vtrn2_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1));
54
+    x2 = vtrn1_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3));
55
+    x3 = vtrn2_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3));
56
 }
57
 
58
 
59
@@ -111,13 +110,13 @@
60
     int64x2_t vcost_sum_1 = vdupq_n_s64(0);
61
     for (int y = 0; y < MLS_CG_SIZE; y++)
62
     {
63
-        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
64
+        int16x4_t in = vld1_s16(&m_resiDctCoeff[blkPos]);
65
         int32x4_t mul = vmull_s16(in, in);
66
         int64x2_t cost0, cost1;
67
         cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
68
         cost1 = vshll_high_n_s32(mul, scaleBits);
69
-        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
70
-        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
71
+        vst1q_s64(&costUncoded[blkPos + 0], cost0);
72
+        vst1q_s64(&costUncoded[blkPos + 2], cost1);
73
         vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
74
         vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
75
         blkPos += trSize;
76
@@ -143,8 +142,9 @@
77
     int32x4_t vpsy = vdupq_n_s32(*psyScale);
78
     for (int y = 0; y < MLS_CG_SIZE; y++)
79
     {
80
-        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
81
-        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
82
+        int32x4_t signCoef = vmovl_s16(vld1_s16(&m_resiDctCoeff[blkPos]));
83
+        int32x4_t fencCoef = vmovl_s16(vld1_s16(&m_fencDctCoeff[blkPos]));
84
+        int32x4_t predictedCoef = vsubq_s32(fencCoef, signCoef);
85
         int64x2_t cost0, cost1;
86
         cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
87
         cost1 = vmull_high_s32(signCoef, signCoef);
88
@@ -160,8 +160,8 @@
89
         }
90
         cost0 = vsubq_s64(cost0, neg0);
91
         cost1 = vsubq_s64(cost1, neg1);
92
-        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
93
-        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
94
+        vst1q_s64(&costUncoded[blkPos + 0], cost0);
95
+        vst1q_s64(&costUncoded[blkPos + 2], cost1);
96
         vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
97
         vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
98
 
99
@@ -188,8 +188,9 @@
100
     int i = 0;
101
     for (; (i + 8) <= numCoeff; i += 8)
102
     {
103
-        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
104
-        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
105
+        int16x8_t in = vld1q_s16(&quantCoeff[i]);
106
+        uint16x8_t tst = vtstq_s16(in, in);
107
+        vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst));
108
     }
109
     for (; i < numCoeff; i++)
110
     {
111
@@ -209,9 +210,10 @@
112
         int j = 0;
113
         for (; (j + 8) <= trSize; j += 8)
114
         {
115
-            int16x8_t in = *(int16x8_t *)&residual[j];
116
-            *(int16x8_t *)&coeff[j] = in;
117
-            vcount = vaddq_s16(vcount, vtstq_s16(in, in));
118
+            int16x8_t in = vld1q_s16(&residual[j]);
119
+            vst1q_s16(&coeff[j], in);
120
+            uint16x8_t tst = vtstq_s16(in, in);
121
+            vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst));
122
         }
123
         for (; j < trSize; j++)
124
         {
125
@@ -225,200 +227,396 @@
126
     return numSig - vaddvq_s16(vcount);
127
 }
128
 
129
-
130
-static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line)
131
+template<int shift>
132
+static inline void partialButterfly16_neon(const int16_t *src, int16_t *dst)
133
 {
134
-    int j, k;
135
-    int32x4_t E2, O2;
136
-    int32x4_t EE, EO;
137
-    int32x2_t EEE, EEO;
138
-    const int add = 1 << (shift - 1);
139
-    const int32x4_t _vadd = {add, 0};
140
+    const int line = 16;
141
 
142
-    for (j = 0; j < line; j++)
143
+    int16x8_t Oline;
144
+    int32x4_t EOline;
145
+    int32x4_t EEEline;
146
+    int32x4_t EEOline;
147
+
148
+    for (int i = 0; i < line; i += 2)
149
     {
150
-        int16x8_t in0 = *(int16x8_t *)src;
151
-        int16x8_t in1 = rev16(*(int16x8_t *)&src8);
152
+        int16x8_t s0_lo = vld1q_s16(src + i * line);
153
+        int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
154
 
155
-        E0 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
156
-        O0 = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1));
157
-        E1 = vaddl_high_s16(in0, in1);
158
-        O1 = vsubl_high_s16(in0, in1);
159
+        int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
160
+        int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
161
 
162
-        for (k = 1; k < 16; k += 2)
163
-        {
164
-            int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0);
165
-            int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16k4);
166
+        int32x4_t E02;
167
+        E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
168
+        E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
169
 
170
-            int32x4_t res = _vadd;
171
-            res = vmlaq_s32(res, c0, O0);
172
-            res = vmlaq_s32(res, c1, O1);
173
-            dstk * line = (int16_t)(vaddvq_s32(res) >> shift);
174
-        }
175
+        int32x4_t E12;
176
+        E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
177
+        E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
178
+
179
+        Oi + 0 = vsubq_s16(s0_lo, s0_hi);
180
+        Oi + 1 = vsubq_s16(s1_lo, s1_hi);
181
+
182
+        int32x4_t EE0 = vaddq_s32(E00, rev32(E01));
183
+        int32x4_t EE1 = vaddq_s32(E10, rev32(E11));
184
+        EOi + 0 = vsubq_s32(E00, rev32(E01));
185
+        EOi + 1 = vsubq_s32(E10, rev32(E11));
186
+
187
+        int32x4_t t0 = vreinterpretq_s32_s64(
188
+            vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
189
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(vzip2q_s64(
190
+            vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1))));
191
 
192
-        /* EE and EO */
193
-        EE = vaddq_s32(E0, rev32(E1));
194
-        EO = vsubq_s32(E0, rev32(E1));
195
 
196
-        for (k = 2; k < 16; k += 4)
197
+        EEEi / 2 = vaddq_s32(t0, t1);
198
+        EEOi / 2 = vsubq_s32(t0, t1);
199
+    }
200
+
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.h Changed
53
 
1
@@ -6,11 +6,51 @@
2
 #include "primitives.h"
3
 #include "contexts.h"   // costCoeffNxN_c
4
 #include "threading.h"  // CLZ
5
+#include <arm_neon.h>
6
 
7
 namespace X265_NS
8
 {
9
+// First two columns of the 4x4 dct transform matrix, duplicated to 4x4 to allow
10
+// processing two lines at once.
11
+const int32_t t8_even[4][4] =
12
+{
13
+    { 64,  64, 64,  64 },
14
+    { 83,  36, 83,  36 },
15
+    { 64, -64, 64, -64 },
16
+    { 36, -83, 36, -83 },
17
+};
18
+
19
+const uint8_t rev16_tbl[16] =
20
+{
21
+    14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
22
+};
23
+
24
+const uint8_t rev32_tbl[16] =
25
+{
26
+    12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
27
+};
28
+
29
+static inline int16x8_t rev16(const int16x8_t a)
30
+{
31
+    const uint8x16_t tbl = vld1q_u8(rev16_tbl);
32
+    const int8x16_t a_s8 = vreinterpretq_s8_s16(a);
33
+
34
+    return vreinterpretq_s16_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
35
+}
36
+
37
+static inline int32x4_t rev32(const int32x4_t a)
38
+{
39
+    const uint8x16_t tbl = vld1q_u8(rev32_tbl);
40
+    const int8x16_t a_s8 = vreinterpretq_s8_s32(a);
41
+
42
+    return vreinterpretq_s32_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
43
+}
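+// rev16 reverses the order of the eight 16-bit lanes of a vector (e.g.
+// {0,1,2,3,4,5,6,7} becomes {7,6,5,4,3,2,1,0}); rev32 does the same for the
+// four 32-bit lanes. Both serve to mirror the second half of a DCT input row
+// before the butterfly add/subtract.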
44
+
45
 // x265 private namespace
46
 void setupDCTPrimitives_neon(EncoderPrimitives &p);
47
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
48
+void setupDCTPrimitives_sve(EncoderPrimitives &p);
49
+#endif
50
 };
51
 
52
 
53
x265_4.0.tar.gz/source/common/aarch64/dct.S Added
201
 
1
@@ -0,0 +1,883 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+
28
+#include "asm.S"
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+.set idct16_shift_1, 7
40
+.set idct16_shift_2, 12-(BIT_DEPTH-8)
41
+
42
+.set dct16_shift_1, 3+(BIT_DEPTH-8)
43
+.set dct16_shift_2, 10
44
+
45
+.align 4
46
+// NOTE: Hardcoded due to asm syntax issue, don't reorder!
47
+tbl_const_idct_0:
48
+    .hword 64, 83, 36, 89, 75, 50, 18,  0   // v0
49
+    .hword 90, 87, 80, 70, 57, 43, 25,  9   // v1
50
+//    .hword 0=64, 1=83, 2=36, 3=89, 4=75, 5=50, 6=18, 7=00
51
+//    .hword 0=90, 1=87, 2=80, 3=70, 4=57, 5=43, 6=25, 7= 9
52
+
53
+    .hword 64, 83, 64, 36   // v0
54
+    .hword 64, 36,-64,-83
55
+    .hword 64,-36,-64, 83   // v1
56
+    .hword 64,-83, 64,-36
57
+
58
+    .hword 89, 75, 50, 18   // v2
59
+    .hword 75,-18,-89,-50
60
+    .hword 50,-89, 18, 75   // v3
61
+    .hword 18,-50, 75,-89
62
+
63
+    .hword 90,+87,+80,+70, +57,+43,+25,+ 9   // v4
64
+    .hword 87,+57, +9,-43, -80,-90,-70,-25   // v5
65
+    .hword 80, +9,-70,-87, -25,+57,+90,+43   // v6
66
+    .hword 70,-43,-87, +9, +90,+25,-80,-57   // v7
67
+    .hword 57,-80,-25,+90, - 9,-87,+43,+70   // v8
68
+    .hword 43,-90,+57,+25, -87,+70,+ 9,-80   // v9
69
+    .hword 25,-70,+90,-80, +43,+ 9,-57,+87   // v16
70
+    .hword  9,-25,+43,-57, +70,-80,+87,-90   // v17
71
+
72
+    .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3  // v18
73
+
74
+tbl_const_dct_0:
75
+    // EE
76
+    .hword 64,+64,+64,+64                   // v16
77
+    .hword 83,+36,-36,-83                   // v17
78
+    .hword 64,-64,-64,+64                   // v18
79
+    .hword 36,-83,+83,-36                   // v19
80
+
81
+    // EO
82
+    .hword 89,+75,+50,+18                   // v20
83
+    .hword 75,-18,-89,-50                   // v21
84
+    .hword 50,-89,+18,+75                   // v22
85
+    .hword 18,-50,+75,-89                   // v23
86
+
87
+    // O
88
+    .hword 90,+87,+80,+70,+57,+43,+25, +9   // v24
89
+    .hword 87,+57, +9,-43,-80,-90,-70,-25   // v25
90
+    .hword 80, +9,-70,-87,-25,+57,+90,+43   // v26
91
+    .hword 70,-43,-87, +9,+90,+25,-80,-57   // v27
92
+    .hword 57,-80,-25,+90, -9,-87,+43,+70   // v28
93
+    .hword 43,-90,+57,+25,-87,+70, +9,-80   // v29
94
+    .hword 25,-70,+90,-80,+43, +9,-57,+87   // v30
95
+    .hword  9,-25,+43,-57,+70,-80,+87,-90   // v31
96
+
97
+    .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1  // v0
98
+//    .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9  // v1
99
+
100
+    .word 64, 83, 36, 89, 75, 50, 18,  0    // v0, v1
101
+    .word 90, 87, 80, 70, 57, 43, 25,  9    // v2, v3
102
+
103
+
104
+// ***** idct 16x16 *****
105
+// void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
106
+function PFX(idct16_neon)
107
+// Register map
108
+// x0  = src
109
+// x1  = dst
110
+// x2  = dstStride
111
+// x8  = tbl_const_idct_0
112
+
113
+    stp             d8, d9, sp,#-16!
114
+    sub             sp, sp, #(16*16*2)
115
+
116
+    adr             x8, tbl_const_idct_0
117
+    ldp             q0, q1, x8
118
+
119
+    mov             x5, sp
120
+    mov             w4, #16
121
+
122
+    // Pass1
123
+5:
124
+    ldr             d16, x0, #(0*16*2)
125
+    ldr             d17, x0, #(2*16*2)
126
+    ldr             d18, x0, #(4*16*2)
127
+    ldr             d19, x0, #(6*16*2)
128
+    ldr             d20, x0, #(8*16*2)
129
+    ldr             d21, x0, #(10*16*2)
130
+    ldr             d22, x0, #(12*16*2)
131
+    ldr             d23, x0, #(14*16*2)
132
+
133
+// EEE0 = 64*src0*16+i + 64*src 8*16+i;
134
+// EEE1 = 64*src0*16+i - 64*src 8*16+i;
135
+// EEO0 = 83*src4*16+i + 36*src12*16+i;
136
+// EEO1 = 36*src4*16+i - 83*src12*16+i;
137
+    smull           v24.4s, v16.4h, v0.h0         // EEE0 = 64*0
138
+    smull           v26.4s, v18.4h, v0.h1         // EEO0 = 83*4
139
+    mov             v25.16b, v24.16b                // EEE1 = 64*0
140
+    smull           v27.4s, v18.4h, v0.h2         // EEO1 = 36*4
141
+
142
+// EO0 = 89*src 2*16+i + 75*src 6*16+i + 50*src10*16+i + 18*src14*16+i;
143
+// EO1 = 75*src 2*16+i - 18*src 6*16+i - 89*src10*16+i - 50*src14*16+i;
144
+// EO2 = 50*src 2*16+i - 89*src 6*16+i + 18*src10*16+i + 75*src14*16+i;
145
+// EO3 = 18*src 2*16+i - 50*src 6*16+i + 75*src10*16+i - 89*src14*16+i;
146
+    smull           v28.4s, v17.4h, v0.h3         // EO0 = 89*2
147
+    smull           v29.4s, v17.4h, v0.h4         // EO1 = 75*2
148
+    smull           v30.4s, v17.4h, v0.h5         // EO2 = 50*2
149
+    smull           v31.4s, v17.4h, v0.h6         // EO3 = 18*2
150
+
151
+    smlal           v28.4s, v19.4h, v0.h4         // EO0 = 89*2+75*6
152
+    smlsl           v29.4s, v19.4h, v0.h6         // EO1 = 75*2-18*6
153
+    smlsl           v30.4s, v19.4h, v0.h3         // EO2 = 50*2-89*6
154
+    smlsl           v31.4s, v19.4h, v0.h5         // EO3 = 18*2-50*6
155
+
156
+    ldr             d16, x0, #(1*16*2)
157
+    ldr             d17, x0, #(3*16*2)
158
+    ldr             d18, x0, #(5*16*2)
159
+    ldr             d19, x0, #(7*16*2)
160
+
161
+    orr             v2.8b, v20.8b, v21.8b
162
+    orr             v2.8b, v2.8b, v22.8b
163
+    orr             v2.8b, v2.8b, v23.8b
164
+    orr             v3.8b, v18.8b, v19.8b
165
+    mov             x6, v2.d0
166
+    mov             x7, v3.d0
167
+
168
+// O0 = 90*src 1*16+i + 87*src 3*16+i + 80*src 5*16+i + 70*src 7*16+i + 57*src 9*16+i + 43*src11*16+i + 25*src13*16+i +  9*src15*16+i;
169
+// O1 = 87*src 1*16+i + 57*src 3*16+i +  9*src 5*16+i - 43*src 7*16+i - 80*src 9*16+i - 90*src11*16+i - 70*src13*16+i - 25*src15*16+i;
170
+// O2 = 80*src 1*16+i +  9*src 3*16+i - 70*src 5*16+i - 87*src 7*16+i - 25*src 9*16+i + 57*src11*16+i + 90*src13*16+i + 43*src15*16+i;
171
+// O3 = 70*src 1*16+i - 43*src 3*16+i - 87*src 5*16+i +  9*src 7*16+i + 90*src 9*16+i + 25*src11*16+i - 80*src13*16+i - 57*src15*16+i;
172
+// O4 = 57*src 1*16+i - 80*src 3*16+i - 25*src 5*16+i + 90*src 7*16+i -  9*src 9*16+i - 87*src11*16+i + 43*src13*16+i + 70*src15*16+i;
173
+// O5 = 43*src 1*16+i - 90*src 3*16+i + 57*src 5*16+i + 25*src 7*16+i - 87*src 9*16+i + 70*src11*16+i +  9*src13*16+i - 80*src15*16+i;
174
+// O6 = 25*src 1*16+i - 70*src 3*16+i + 90*src 5*16+i - 80*src 7*16+i + 43*src 9*16+i +  9*src11*16+i - 57*src13*16+i + 87*src15*16+i;
175
+// O7 =  9*src 1*16+i - 25*src 3*16+i + 43*src 5*16+i - 57*src 7*16+i + 70*src 9*16+i - 80*src11*16+i + 87*src13*16+i - 90*src15*16+i;
176
+    smull           v2.4s, v16.4h, v1.h0          // v2 = O0 = 90*1
177
+    smull           v3.4s, v16.4h, v1.h1          // v3 = O1 = 87*1
178
+    smull           v4.4s, v16.4h, v1.h2          // v4 = O2 = 80*1
179
+    smull           v5.4s, v16.4h, v1.h3          // v5 = O3 = 70*1
180
+    smull           v6.4s, v16.4h, v1.h4          // v6 = O4 = 57*1
181
+    smull           v7.4s, v16.4h, v1.h5          // v7 = O5 = 43*1
182
+    smull           v8.4s, v16.4h, v1.h6          // v8 = O6 = 25*1
183
+    smull           v9.4s, v16.4h, v1.h7          // v9 = O7 =  9*1
184
+
185
+    smlal           v2.4s, v17.4h, v1.h1          // v2 = O0 = 90*1+87*3
186
+    smlal           v3.4s, v17.4h, v1.h4          // v3 = O1 = 87*1+57*3
187
+    smlal           v4.4s, v17.4h, v1.h7          // v4 = O2 = 80*1+ 9*3
188
+    smlsl           v5.4s, v17.4h, v1.h5          // v5 = O3 = 70*1-43*3
189
+    smlsl           v6.4s, v17.4h, v1.h2          // v6 = O4 = 57*1-80*3
190
+    smlsl           v7.4s, v17.4h, v1.h0          // v7 = O5 = 43*1-90*3
191
+    smlsl           v8.4s, v17.4h, v1.h3          // v8 = O6 = 25*1-70*3
192
+    smlsl           v9.4s, v17.4h, v1.h6          // v9 = O7 =  9*1-25*3
193
+
194
+    //cmp             x7, #0
195
+    //beq             1f
196
+    cbz             x7, 1f
197
+
198
+    smlal           v2.4s, v18.4h, v1.h2          // v2 = O0 = 90*1+87*3+80*5
199
+    smlal           v3.4s, v18.4h, v1.h7          // v3 = O1 = 87*1+57*3+ 9*5
200
+    smlsl           v4.4s, v18.4h, v1.h3          // v4 = O2 = 80*1+ 9*3-70*5
201
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.cpp Added
201
 
1
@@ -0,0 +1,1131 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "filter-neon-dotprod.h"
26
+
27
+#if !HIGH_BIT_DEPTH
28
+#include "mem-neon.h"
29
+#include <arm_neon.h>
30
+
31
+namespace {
32
+static const uint8_t dotprod_permute_tbl[48] = {
33
+    0, 1,  2,  3, 1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5, 6,
34
+    4, 5,  6,  7, 5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
35
+    8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
36
+};
37
+
38
+static const uint8_t dot_prod_merge_block_tbl[48] = {
39
+    // Shift left and insert new last column in transposed 4x4 block.
40
+    1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
41
+    // Shift left and insert two new columns in transposed 4x4 block.
42
+    2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
43
+    // Shift left and insert three new columns in transposed 4x4 block.
44
+    3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
45
+};
46
+
47
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
48
+                              const int32x4_t constant, const uint8x16x3_t tbl)
49
+{
50
+    // Transform sample range from uint8_t to int8_t for signed dot product.
51
+    int8x16_t samples_s8 =
52
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
53
+
54
+    // Permute input samples for dot product.
55
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
56
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
57
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
58
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
59
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
60
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
61
+
62
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
63
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
64
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
65
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
66
+
67
+    // Narrow and combine.
68
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
69
+                                     vmovn_s32(dotprod_hi));
70
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
71
+}
72
+
73
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
74
+                                int8x16_t *d)
75
+{
76
+    // Transform sample range from uint8_t to int8_t for signed dot product.
77
+    int8x8_t samples_s8[4];
78
+    samples_s8[0] = vreinterpret_s8_u8(vsub_u8(samples[0], vdup_n_u8(128)));
79
+    samples_s8[1] = vreinterpret_s8_u8(vsub_u8(samples[1], vdup_n_u8(128)));
80
+    samples_s8[2] = vreinterpret_s8_u8(vsub_u8(samples[2], vdup_n_u8(128)));
81
+    samples_s8[3] = vreinterpret_s8_u8(vsub_u8(samples[3], vdup_n_u8(128)));
82
+
83
+    // Permute input samples for dot product.
84
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
85
+    d[0] = vqtbl1q_s8(vcombine_s8(samples_s8[0], vdup_n_s8(0)), tbl.val[0]);
86
+    d[1] = vqtbl1q_s8(vcombine_s8(samples_s8[1], vdup_n_s8(0)), tbl.val[0]);
87
+    d[2] = vqtbl1q_s8(vcombine_s8(samples_s8[2], vdup_n_s8(0)), tbl.val[0]);
88
+    d[3] = vqtbl1q_s8(vcombine_s8(samples_s8[3], vdup_n_s8(0)), tbl.val[0]);
89
+}
90
+
91
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
92
+                                    const int32x4_t constant,
93
+                                    const uint8x16x3_t tbl,
94
+                                    int8x16_t &perm_samples_0)
95
+{
96
+    // Transform sample range from uint8_t to int8_t for signed dot product.
97
+    int8x16_t samples_s8 =
98
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
99
+
100
+    // Permute input samples for dot product.
101
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
102
+    // Already in perm_samples_0.
103
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
104
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
105
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
106
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
107
+
108
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
109
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
110
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
111
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
112
+
113
+    // Save for re-use in next iteration.
114
+    perm_samples_0 = perm_samples_2;
115
+
116
+    // Narrow and combine.
117
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
118
+                                     vmovn_s32(dotprod_hi));
119
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
120
+}
121
+
122
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
123
+                              const uint8x16x3_t tbl)
124
+{
125
+    // Transform sample range from uint8_t to int8_t for signed dot product.
126
+    int8x16_t samples_s8 =
127
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
128
+
129
+    // Permute input samples for dot product.
130
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
131
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
132
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
133
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
134
+
135
+    // Correction accounting for sample range transform cancels to 0.
136
+    int32x4_t constant = vdupq_n_s32(0);
137
+    int32x4_t dotprod = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
138
+    dotprod = vdotq_lane_s32(dotprod, perm_samples_1, filter, 1);
139
+
140
+    // Narrow.
141
+    return vmovn_s32(dotprod);
142
+}
143
+
144
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
145
+                              const uint8x16x3_t tbl)
146
+{
147
+    // Transform sample range from uint8_t to int8_t for signed dot product.
148
+    int8x16_t samples_s8 =
149
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
150
+
151
+    // Permute input samples for dot product.
152
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
153
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
154
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
155
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
156
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
157
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
158
+
159
+    // Correction accounting for sample range transform cancels to 0.
160
+    int32x4_t constant = vdupq_n_s32(0);
161
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
162
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
163
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
164
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
165
+
166
+    // Narrow and combine.
167
+    return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
168
+}
169
+
170
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
171
+                                    const uint8x16x3_t tbl,
172
+                                    int8x16_t &perm_samples_0)
173
+{
174
+    // Transform sample range from uint8_t to int8_t for signed dot product.
175
+    int8x16_t samples_s8 =
176
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
177
+
178
+    // Permute input samples for dot product.
179
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
180
+    // Already in perm_samples_0.
181
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
182
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
183
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
184
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
185
+
186
+    // Correction accounting for sample range transform cancels to 0.
187
+    int32x4_t constant = vdupq_n_s32(0);
188
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
189
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
190
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
191
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
192
+
193
+    // Save for re-use in next iteration.
194
+    perm_samples_0 = perm_samples_2;
195
+
196
+    // Narrow and combine.
197
+    return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
198
+}
199
+
200
+uint8x8_t inline filter4_8_pp(uint8x16_t samples, const int8x8_t filter,
201
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.h Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
26
+#define X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
27
+
28
+#if defined(HAVE_NEON_DOTPROD)
29
+
30
+#include "primitives.h"
31
+
32
+namespace X265_NS {
33
+void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p);
34
+}
35
+
36
+#endif // defined(HAVE_NEON_DOTPROD)
37
+
38
+#endif // X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
39
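A hedged illustration (not part of the patch): the "transform sample range from uint8_t to int8_t" step in filter-neon-dotprod.cpp exists because the Armv8.4 DotProd extension only provides same-signedness dot products (SDOT/UDOT), while the interpolation coefficients are signed. Subtracting 128 from every sample and folding the bias into one constant gives the same result, since the HEVC interpolation filters sum to 64:

    #include <stdint.h>

    static inline int32_t filter8_range_shift_scalar(const uint8_t s[8], const int8_t f[8])
    {
        int32_t acc = 128 * 64;               // cancels the -128 applied to each sample
        for (int k = 0; k < 8; k++)
            acc += (s[k] - 128) * f[k];       // what vdotq_lane_s32 accumulates
        return acc;                           // equals sum(s[k] * f[k]); the caller rounds/shifts
    }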
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.cpp Added
201
 
1
@@ -0,0 +1,1412 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#if defined(HAVE_NEON_I8MM)
26
+#include "filter-neon-i8mm.h"
27
+#if !HIGH_BIT_DEPTH
28
+
29
+#include "mem-neon.h"
30
+
31
+#include <arm_neon.h>
32
+
33
+namespace {
34
+static const uint8_t dotprod_permute_tbl[48] = {
35
+    0, 1,  2,  3, 1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5, 6,
36
+    4, 5,  6,  7, 5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
37
+    8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
38
+};
39
+
40
+static const uint8_t matmul_permute_tbl[2][32] = {
41
+    // Permute for luma filter 3.
42
+    { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9,
43
+      4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 },
44
+    // Permute for luma filter 1.
45
+    { 1,  2,  3,  4,  5,  6,  7,  8,  3,  4,  5,  6,  7,  8,  9, 10,
46
+      5,  6,  7,  8,  9, 10, 11, 12,  7,  8,  9, 10, 11, 12, 13, 14 }
47
+};
48
+
49
+static const int8_t matmul_luma_filter[2][16] = {
50
+    { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 },
51
+    { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 }
52
+};
53
+
54
+static const uint8_t dot_prod_merge_block_tbl[48] = {
55
+    // Shift left and insert new last column in transposed 4x4 block.
56
+    1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
57
+    // Shift left and insert two new columns in transposed 4x4 block.
58
+    2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
59
+    // Shift left and insert three new columns in transposed 4x4 block.
60
+    3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
61
+};
62
+
63
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
64
+                              const uint8x16x3_t tbl)
65
+{
66
+    // Permute input samples for dot product.
67
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
68
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
69
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
70
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
71
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
72
+    uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val[2]);
73
+
74
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
75
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
76
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
77
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1);
78
+
79
+    // Narrow and combine.
80
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
81
+                                     vmovn_s32(dotprod_hi));
82
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
83
+}
84
+
85
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
86
+                                uint8x16_t *d)
87
+{
88
+    // Permute input samples for dot product.
89
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
90
+    d[0] = vqtbl1q_u8(vcombine_u8(samples[0], vdup_n_u8(0)), tbl.val[0]);
91
+    d[1] = vqtbl1q_u8(vcombine_u8(samples[1], vdup_n_u8(0)), tbl.val[0]);
92
+    d[2] = vqtbl1q_u8(vcombine_u8(samples[2], vdup_n_u8(0)), tbl.val[0]);
93
+    d[3] = vqtbl1q_u8(vcombine_u8(samples[3], vdup_n_u8(0)), tbl.val[0]);
94
+}
95
+
96
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
97
+                                    const uint8x16x3_t tbl, uint8x16_t &perm_s0)
98
+{
99
+    // Permute input samples for dot product.
100
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
101
+    // Already in perm_s0.
102
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
103
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
104
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
105
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
106
+
107
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
108
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
109
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
110
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
111
+
112
+    // Save for re-use in next iteration.
113
+    perm_s0 = perm_s2;
114
+
115
+    // Narrow and combine.
116
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
117
+                                     vmovn_s32(dotprod_hi));
118
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
119
+}
120
+
121
+uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter,
122
+                                     const uint8x16x2_t tbl)
123
+{
124
+    // Permute input samples for 8x2 by 2x8 matrix multiply.
125
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
126
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
127
+
128
+    int32x4_t matmul_lo = vusmmlaq_s32(vdupq_n_s32(0), perm_s0, filter);
129
+    int32x4_t matmul_hi = vusmmlaq_s32(vdupq_n_s32(0), perm_s1, filter);
130
+
131
+    // Narrow and combine.
132
+    int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi));
133
+    return vqrshrun_n_s16(matmul, IF_FILTER_PREC);
134
+}
135
+
136
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
137
+                              const int16x8_t constant, const uint8x16x3_t tbl)
138
+{
139
+    // Permute input samples for dot product.
140
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
141
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
142
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
143
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
144
+
145
+    int32x4_t dotprod = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
146
+    dotprod = vusdotq_lane_s32(dotprod, perm_s1, filter, 1);
147
+
148
+    // Narrow.
149
+    return vadd_s16(vmovn_s32(dotprod), vget_low_s16(constant));
150
+}
151
+
152
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
153
+                              const int16x8_t constant, const uint8x16x3_t tbl)
154
+{
155
+    // Permute input samples for dot product.
156
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
157
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
158
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
159
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
160
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
161
+    uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val[2]);
162
+
163
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
164
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
165
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
166
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1);
167
+
168
+    // Narrow and combine.
169
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
170
+                                     vmovn_s32(dotprod_hi));
171
+    return vaddq_s16(dotprod, constant);
172
+}
173
+
174
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
175
+                                    const int16x8_t constant,
176
+                                    const uint8x16x3_t tbl, uint8x16_t &perm_s0)
177
+{
178
+    // Permute input samples for dot product.
179
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
180
+    // Already in perm_s0.
181
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
182
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
183
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
184
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
185
+
186
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
187
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
188
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
189
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
190
+
191
+    // Save for re-use in next iteration.
192
+    perm_s0 = perm_s2;
193
+
194
+    // Narrow and combine.
195
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
196
+                                     vmovn_s32(dotprod_hi));
197
+    return vaddq_s16(dotprod, constant);
198
+}
199
+
200
+int16x8_t inline filter8_8_ps_matmul(uint8x16_t samples, const int8x16_t filter,
201
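A hedged illustration (not part of the patch): the *_matmul helpers above rely on the Armv8.6 I8MM USMMLA instruction (vusmmlaq_s32), which reads each 16-byte operand as a 2x8 matrix and accumulates a 2x2 block of dot products, so one instruction evaluates four 8-tap filter sums:

    #include <stdint.h>

    static inline void usmmla_ref(int32_t acc[2][2],
                                  const uint8_t a[2][8],   // two permuted sample rows
                                  const int8_t  b[2][8])   // two shifted copies of the filter
    {
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 8; k++)
                    acc[i][j] += a[i][k] * b[j][k];
    }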
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.h Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_FILTER_NEON_I8MM_H
26
+#define X265_FILTER_NEON_I8MM_H
27
+
28
+#if defined(HAVE_NEON_I8MM)
29
+
30
+#include "primitives.h"
31
+
32
+namespace X265_NS {
33
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p);
34
+}
35
+
36
+#endif // defined(HAVE_NEON_I8MM)
37
+
38
+#endif // X265_FILTER_NEON_I8MM_H
39
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/filter-prim.cpp Changed
201
 
1
@@ -1,37 +1,2114 @@
2
 #if HAVE_NEON
3
 
4
 #include "filter-prim.h"
5
+#include "mem-neon.h"
6
+
7
 #include <arm_neon.h>
8
 
9
-namespace
10
+namespace {
11
+void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f,
12
+                          const int32x4_t c, int32x4_t &d0, int32x4_t &d1)
13
+{
14
+    if (coeffIdx == 4)
15
+    {
16
+        // { -4, 36, 36, -4 }
17
+        int16x8_t t0 = vaddq_s16(s[1], s[2]);
18
+        int16x8_t t1 = vaddq_s16(s[0], s[3]);
19
+        d0 = vmlal_n_s16(c, vget_low_s16(t0), 36);
20
+        d0 = vmlsl_n_s16(d0, vget_low_s16(t1), 4);
21
+
22
+        d1 = vmlal_n_s16(c, vget_high_s16(t0), 36);
23
+        d1 = vmlsl_n_s16(d1, vget_high_s16(t1), 4);
24
+    }
25
+    else
26
+    {
27
+        d0 = vmlal_lane_s16(c, vget_low_s16(s[0]), f, 0);
28
+        d0 = vmlal_lane_s16(d0, vget_low_s16(s[1]), f, 1);
29
+        d0 = vmlal_lane_s16(d0, vget_low_s16(s[2]), f, 2);
30
+        d0 = vmlal_lane_s16(d0, vget_low_s16(s[3]), f, 3);
31
+
32
+        d1 = vmlal_lane_s16(c, vget_high_s16(s[0]), f, 0);
33
+        d1 = vmlal_lane_s16(d1, vget_high_s16(s[1]), f, 1);
34
+        d1 = vmlal_lane_s16(d1, vget_high_s16(s[2]), f, 2);
35
+        d1 = vmlal_lane_s16(d1, vget_high_s16(s[3]), f, 3);
36
+    }
37
+}
38
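// A hedged illustration (not part of the patch): the coeffIdx == 4 branch above exploits
// the symmetry of the half-pel chroma filter { -4, 36, 36, -4 }, pairing taps before
// multiplying so two multiplies replace four:
//
//     static inline int32_t chroma_halfpel_scalar(const int16_t s[4])
//     {
//         return 36 * (s[1] + s[2]) - 4 * (s[0] + s[3]);
//     }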
+
39
+template<int coeffIdx>
40
+void inline filter8_s16x4(const int16x4_t *s, const int32x4_t c, int32x4_t &d)
41
+{
42
+    if (coeffIdx == 1)
43
+    {
44
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
45
+        d = vsubl_s16(s[6], s[0]);
46
+        d = vaddq_s32(d, c);
47
+        d = vmlal_n_s16(d, s[1], 4);
48
+        d = vmlsl_n_s16(d, s[2], 10);
49
+        d = vmlal_n_s16(d, s[3], 58);
50
+        d = vmlal_n_s16(d, s[4], 17);
51
+        d = vmlsl_n_s16(d, s[5], 5);
52
+    }
53
+    else if (coeffIdx == 2)
54
+    {
55
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
56
+        int32x4_t t0 = vaddl_s16(s[3], s[4]);
57
+        int32x4_t t1 = vaddl_s16(s[2], s[5]);
58
+        int32x4_t t2 = vaddl_s16(s[1], s[6]);
59
+        int32x4_t t3 = vaddl_s16(s[0], s[7]);
60
+
61
+        d = vmlaq_n_s32(c, t0, 40);
62
+        d = vmlaq_n_s32(d, t1, -11);
63
+        d = vmlaq_n_s32(d, t2, 4);
64
+        d = vmlaq_n_s32(d, t3, -1);
65
+    }
66
+    else
67
+    {
68
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
69
+        d = vsubl_s16(s[1], s[7]);
70
+        d = vaddq_s32(d, c);
71
+        d = vmlal_n_s16(d, s[6], 4);
72
+        d = vmlsl_n_s16(d, s[5], 10);
73
+        d = vmlal_n_s16(d, s[4], 58);
74
+        d = vmlal_n_s16(d, s[3], 17);
75
+        d = vmlsl_n_s16(d, s[2], 5);
76
+    }
77
+}
78
+
79
+template<int coeffIdx>
80
+void inline filter8_s16x8(const int16x8_t *s, const int32x4_t c, int32x4_t &d0,
81
+                          int32x4_t &d1)
82
+{
83
+    if (coeffIdx == 1)
84
+    {
85
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
86
+        d0 = vsubl_s16(vget_low_s16(s6), vget_low_s16(s0));
87
+        d0 = vaddq_s32(d0, c);
88
+        d0 = vmlal_n_s16(d0, vget_low_s16(s1), 4);
89
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 10);
90
+        d0 = vmlal_n_s16(d0, vget_low_s16(s3), 58);
91
+        d0 = vmlal_n_s16(d0, vget_low_s16(s4), 17);
92
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 5);
93
+
94
+        d1 = vsubl_s16(vget_high_s16(s6), vget_high_s16(s0));
95
+        d1 = vaddq_s32(d1, c);
96
+        d1 = vmlal_n_s16(d1, vget_high_s16(s1), 4);
97
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 10);
98
+        d1 = vmlal_n_s16(d1, vget_high_s16(s3), 58);
99
+        d1 = vmlal_n_s16(d1, vget_high_s16(s4), 17);
100
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 5);
101
+    }
102
+    else if (coeffIdx == 2)
103
+    {
104
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
105
+        int32x4_t t0 = vaddl_s16(vget_low_s16(s[3]), vget_low_s16(s[4]));
106
+        int32x4_t t1 = vaddl_s16(vget_low_s16(s[2]), vget_low_s16(s[5]));
107
+        int32x4_t t2 = vaddl_s16(vget_low_s16(s[1]), vget_low_s16(s[6]));
108
+        int32x4_t t3 = vaddl_s16(vget_low_s16(s[0]), vget_low_s16(s[7]));
109
+
110
+        d0 = vmlaq_n_s32(c, t0, 40);
111
+        d0 = vmlaq_n_s32(d0, t1, -11);
112
+        d0 = vmlaq_n_s32(d0, t2, 4);
113
+        d0 = vmlaq_n_s32(d0, t3, -1);
114
+
115
+        int32x4_t t4 = vaddl_s16(vget_high_s16(s[3]), vget_high_s16(s[4]));
116
+        int32x4_t t5 = vaddl_s16(vget_high_s16(s[2]), vget_high_s16(s[5]));
117
+        int32x4_t t6 = vaddl_s16(vget_high_s16(s[1]), vget_high_s16(s[6]));
118
+        int32x4_t t7 = vaddl_s16(vget_high_s16(s[0]), vget_high_s16(s[7]));
119
+
120
+        d1 = vmlaq_n_s32(c, t4, 40);
121
+        d1 = vmlaq_n_s32(d1, t5, -11);
122
+        d1 = vmlaq_n_s32(d1, t6, 4);
123
+        d1 = vmlaq_n_s32(d1, t7, -1);
124
+    }
125
+    else
126
+    {
127
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
128
+        d0 = vsubl_s16(vget_low_s16(s1), vget_low_s16(s7));
129
+        d0 = vaddq_s32(d0, c);
130
+        d0 = vmlal_n_s16(d0, vget_low_s16(s6), 4);
131
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 10);
132
+        d0 = vmlal_n_s16(d0, vget_low_s16(s4), 58);
133
+        d0 = vmlal_n_s16(d0, vget_low_s16(s3), 17);
134
+        d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 5);
135
+
136
+        d1 = vsubl_s16(vget_high_s16(s1), vget_high_s16(s7));
137
+        d1 = vaddq_s32(d1, c);
138
+        d1 = vmlal_n_s16(d1, vget_high_s16(s6), 4);
139
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 10);
140
+        d1 = vmlal_n_s16(d1, vget_high_s16(s4), 58);
141
+        d1 = vmlal_n_s16(d1, vget_high_s16(s3), 17);
142
+        d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 5);
143
+    }
144
+}
145
+
146
+template<int width, int height>
147
+void interp4_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst,
148
+                          intptr_t dstStride, int coeffIdx)
149
+{
150
+    const int N_TAPS = 4;
151
+    src -= (N_TAPS / 2 - 1) * srcStride;
152
+
153
+    const int16x4_t filter = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]);
154
+
155
+    // Zero constant in order to use filter helper functions (optimised away).
156
+    const int32x4_t c = vdupq_n_s32(0);
157
+
158
+    if (width == 12)
159
+    {
160
+        const int16_t *s = src;
161
+        int16_t *d = dst;
162
+
163
+        int16x8_t in[7];
164
+        load_s16x8xn<3>(s, srcStride, in);
165
+        s += 3 * srcStride;
166
+
167
+        for (int row = 0; (row + 4) <= height; row += 4)
168
+        {
169
+            load_s16x8xn<4>(s, srcStride, in + 3);
170
+
171
+            int32x4_t sum_lo[4];
172
+            int32x4_t sum_hi[4];
173
+            filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], sum_hi[0]);
174
+            filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], sum_hi[1]);
175
+            filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo[2], sum_hi[2]);
176
+            filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo[3], sum_hi[3]);
177
+
178
+            int16x8_t sum[4];
179
+            sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC),
180
+                                  vshrn_n_s32(sum_hi[0], IF_FILTER_PREC));
181
+            sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC),
182
+                                  vshrn_n_s32(sum_hi[1], IF_FILTER_PREC));
183
+            sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC),
184
+                                  vshrn_n_s32(sum_hi[2], IF_FILTER_PREC));
185
+            sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC),
186
+                                  vshrn_n_s32(sum_hi[3], IF_FILTER_PREC));
187
+
188
+            store_s16x8xn<4>(d, dstStride, sum);
189
+
190
+            in[0] = in[4];
191
+            in[1] = in[5];
192
+            in[2] = in[6];
193
+
194
+            s += 4 * srcStride;
195
+            d += 4 * dstStride;
196
+        }
197
+
198
+        src += 8;
199
+        dst += 8;
200
+        s = src;
201
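A hedged illustration (not part of the patch): a scalar reference for what the interp4_vert_ss_neon template above computes, assuming the usual x265 conventions (g_chromaFilter coefficients, IF_FILTER_PREC == 6, plain shift without rounding in the ss path):

    #include <stdint.h>

    static void interp4_vert_ss_ref(const int16_t *src, intptr_t srcStride,
                                    int16_t *dst, intptr_t dstStride,
                                    const int16_t f[4], int width, int height)
    {
        src -= srcStride;                                  // first tap is one row above
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
            {
                int32_t acc = 0;
                for (int k = 0; k < 4; k++)
                    acc += f[k] * src[(y + k) * srcStride + x];
                dst[y * dstStride + x] = (int16_t)(acc >> 6);
            }
    }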
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h -> x265_4.0.tar.gz/source/common/aarch64/fun-decls.h Changed
201
 
1
@@ -69,6 +69,24 @@
2
     ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
3
     ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
4
 
5
+#define FUNCDEF_PU_MULT_16(ret, name, cpu, ...) \
6
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
7
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
8
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
9
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
10
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
11
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
12
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
13
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
14
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
15
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
16
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
17
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
18
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
19
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
20
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
21
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
22
+
23
 #define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
24
     FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
25
     ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
26
@@ -113,23 +131,8 @@
27
     FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
28
     FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
29
     FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
30
-    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
31
-    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
32
-    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
33
-    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
34
-    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
35
-    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
36
-    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
37
     FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
38
     FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
39
-    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
40
-    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
41
-    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
42
-    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
43
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
44
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
45
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
46
-    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
47
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
48
     FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
49
     FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
50
@@ -154,70 +157,74 @@
51
 DECLS(sve);
52
 DECLS(sve2);
53
 
54
+FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
55
+FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
56
+FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
57
+FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
58
 
59
-void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
60
+void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
61
 
62
-uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
63
-uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
64
-uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
65
-uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
66
+uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
67
+uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
68
+uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
69
+uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride));
70
 
71
-void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
72
-void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
73
-void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
74
-void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
75
+void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
76
+void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
77
+void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
78
+void PFX(getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
79
 
80
-void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
81
-void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
82
+void PFX(scale1D_128to64_neon(pixel *dst, const pixel *src));
83
+void PFX(scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride));
84
 
85
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
86
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
87
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
88
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
89
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
90
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
91
-int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
92
-int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
93
-int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
94
-int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
95
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
96
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
97
-int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
98
-int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
99
-int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
100
-int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
101
-int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
102
-int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
103
-int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
104
-int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
105
-int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
106
-int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
107
-int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
108
-int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
109
-int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
110
-int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
111
-int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
112
-int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
113
-int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
114
-int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
115
-int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
116
-int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
117
+int PFX(pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
118
+int PFX(pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
119
+int PFX(pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
120
+int PFX(pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
121
+int PFX(pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
122
+int PFX(pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
123
+int PFX(pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
124
+int PFX(pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
125
+int PFX(pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
126
+int PFX(pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
127
+int PFX(pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
128
+int PFX(pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
129
+int PFX(pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
130
+int PFX(pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
131
+int PFX(pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
132
+int PFX(pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
133
+int PFX(pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
134
+int PFX(pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
135
+int PFX(pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
136
+int PFX(pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
137
+int PFX(pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
138
+int PFX(pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
139
+int PFX(pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
140
+int PFX(pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
141
+int PFX(pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
142
+int PFX(pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
143
+int PFX(pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
144
+int PFX(pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
145
+int PFX(pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
146
+int PFX(pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
147
+int PFX(pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
148
+int PFX(pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
149
 
150
-int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
151
-int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
152
-int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
153
-int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
154
-int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
155
-int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
156
-int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
157
+int PFX(pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
158
+int PFX(pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
159
+int PFX(pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
160
+int PFX(pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
161
+int PFX(pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
162
+int PFX(pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
163
+int PFX(pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
164
 
165
 uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
166
 uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
167
 
168
-void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
169
-void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
170
+void PFX(dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift));
171
+void PFX(dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift));
172
 
173
-void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
174
+void PFX(ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
175
 
176
 int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
177
 int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
178
@@ -226,30 +233,28 @@
179
 int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
180
 uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
181
 
182
-uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
183
-uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
184
-uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
185
-uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
186
-
187
-void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
188
-void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
189
+uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
190
+uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
191
+uint64_t PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride));
192
+uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride));
193
 
194
-void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
195
-void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
196
+void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
197
+void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
198
 
199
-int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
-int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
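A hedged illustration (not part of the patch): the new FUNCDEF_PU_MULT_16 macro above declares one prototype per partition that is at least 16 pixels wide, so for example

    FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);

expands to declarations such as

    int PFX(pixel_sad_16x16_neon_dotprod)(const pixel*, intptr_t, const pixel*, intptr_t);
    int PFX(pixel_sad_64x64_neon_dotprod)(const pixel*, intptr_t, const pixel*, intptr_t);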
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/intrapred-prim.cpp Changed
201
 
1
@@ -2,7 +2,7 @@
2
 #include "primitives.h"
3
 
4
 
5
-#if 1
6
+#if HAVE_NEON
7
 #include "arm64-utils.h"
8
 #include <arm_neon.h>
9
 
10
@@ -12,6 +12,52 @@
11
 {
12
 
13
 
14
+template<int tuSize>
15
+void intraFilter_neon(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
16
+{
17
+    const int tuSize2 = tuSize << 1;
18
+    pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
19
+
20
+    uint16x8_t two_vec = vdupq_n_u16(2);
21
+#if !HIGH_BIT_DEPTH
22
+    {
23
+        for(int i = 0; i < tuSize2 + tuSize2; i+=8)
24
+         {
25
+            uint16x8_t sample1 = vmovl_u8(vld1_u8(&samples[i]));
26
+            uint16x8_t sample2 = vmovl_u8(vld1_u8(&samples[i-1]));
27
+            uint16x8_t sample3 = vmovl_u8(vld1_u8(&samples[i+1]));
28
+
29
+            uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
30
+            uint16x8_t result2 = vaddq_u16(sample3, two_vec);
31
+            uint16x8_t result3 = vaddq_u16(result1,result2);
32
+            vst1_u8(&filtered[i] , vmovn_u16(vshrq_n_u16(result3, 2)));
33
+        }
34
+    }
35
+#else
36
+    {
37
+        for(int i = 0; i < tuSize2 + tuSize2; i+=8)
38
+        {
39
+            uint16x8_t sample1 = vld1q_u16(&samples[i]);
40
+            uint16x8_t sample2 = vld1q_u16(&samples[i-1]);
41
+            uint16x8_t sample3 = vld1q_u16(&samples[i+1]);
42
+
43
+            uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
44
+            uint16x8_t result2 = vaddq_u16(sample3, two_vec);
45
+            uint16x8_t result3 = vaddq_u16(result1,result2);
46
+            vst1q_u16(&filtered[i] , vshrq_n_u16(result3, 2));
47
+        }
48
+    }
49
+#endif
50
+    // filtering top
51
+    filtered[tuSize2] = topLast;
52
+
53
+    // filtering top-left
54
+    filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
55
+
56
+    // filtering left
57
+    filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
58
+    filtered[tuSize2 + tuSize2] = leftLast;
59
+}
60
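// A hedged illustration (not part of the patch): the scalar form of the 1:2:1 reference
// sample smoothing vectorised by intraFilter_neon above (the top-left corner, the first
// left sample and the unfiltered last samples are patched separately after the loop):
//
//     filtered[i] = (samples[i - 1] + 2 * samples[i] + samples[i + 1] + 2) >> 2;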
 
61
 template<int width>
62
 void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
63
@@ -105,30 +151,42 @@
64
             {
65
                 if (width >= 8 && sizeof(pixel) == 1)
66
                 {
67
-                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
68
-                    const int16x8_t f1 = vdupq_n_s16(fraction);
69
+                    // We have to cast to the 'real' type so that this block
70
+                    // will compile for both low and high bitdepth.
71
+                    const uint8_t *ref_u8 = (const uint8_t *)ref + offset;
72
+                    uint8_t *dst_u8 = (uint8_t *)dst;
73
+
74
+                    // f0 and f1 are unsigned (fraction is in range 0, 31).
75
+                    const uint8x8_t f0 = vdup_n_u8(32 - fraction);
76
+                    const uint8x8_t f1 = vdup_n_u8(fraction);
77
                     for (int x = 0; x < width; x += 8)
78
                     {
79
-                        uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
80
-                        uint8x8_t in1 = *(uint8x8_t *)&ref[offset + x + 1];
81
-                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
82
-                        lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
83
-                        lo = vshrq_n_s16(lo, 5);
84
-                        *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
85
+                        uint8x8_t in0 = vld1_u8(ref_u8 + x);
86
+                        uint8x8_t in1 = vld1_u8(ref_u8 + x + 1);
87
+                        uint16x8_t lo = vmlal_u8(vdupq_n_u16(16), in0, f0);
88
+                        lo = vmlal_u8(lo, in1, f1);
89
+                        uint8x8_t res = vshrn_n_u16(lo, 5);
90
+                        vst1_u8(dst_u8 + y * dstStride + x, res);
91
                     }
92
                 }
93
                 else if (width >= 4 && sizeof(pixel) == 2)
94
                 {
95
-                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
96
-                    const int32x4_t f1 = vdupq_n_s32(fraction);
97
+                    // We have to cast to the 'real' type so that this block
98
+                    // will compile for both low and high bitdepth.
99
+                    const uint16_t *ref_u16 = (const uint16_t *)ref + offset;
100
+                    uint16_t *dst_u16 = (uint16_t *)dst;
101
+
102
+                    // f0 and f1 are unsigned (fraction is in range 0, 31).
103
+                    const uint16x4_t f0 = vdup_n_u16(32 - fraction);
104
+                    const uint16x4_t f1 = vdup_n_u16(fraction);
105
                     for (int x = 0; x < width; x += 4)
106
                     {
107
-                        uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
108
-                        uint16x4_t in1 = *(uint16x4_t *)&ref[offset + x + 1];
109
-                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
110
-                        lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
111
-                        lo = vshrq_n_s32(lo, 5);
112
-                        *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
113
+                        uint16x4_t in0 = vld1_u16(ref_u16 + x);
114
+                        uint16x4_t in1 = vld1_u16(ref_u16 + x + 1);
115
+                        uint32x4_t lo = vmlal_u16(vdupq_n_u32(16), in0, f0);
116
+                        lo = vmlal_u16(lo, in1, f1);
117
+                        uint16x4_t res = vshrn_n_u32(lo, 5);
118
+                        vst1_u16(dst_u16 + y * dstStride + x, res);
119
                     }
120
                 }
121
                 else
122
@@ -176,6 +234,7 @@
123
     }
124
 }
125
 
126
+#endif
127
 template<int log2Size>
128
 void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
129
 {
130
@@ -220,14 +279,285 @@
131
         }
132
     }
133
 }
134
+
135
+template<int log2Size>
136
+void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int /*dirMode*/, int /*bFilter*/)
137
+{
138
+    const int blkSize = 1 << log2Size;
139
+
140
+    const pixel* above = srcPix + 1;
141
+    const pixel* left = srcPix + (2 * blkSize + 1);
142
+
143
+    switch (blkSize) {
144
+    case 8:
145
+    {
146
+        const uint16_t log2SizePlusOne = log2Size + 1;
147
+        uint16x8_t blkSizeVec = vdupq_n_u16(blkSize);
148
+        uint16x8_t topRight = vdupq_n_u16(above[blkSize]);
149
+        uint16_t bottomLeft = left[blkSize];
150
+        uint16x8_t oneVec = vdupq_n_u16(1);
151
+        uint16x8_t blkSizeSubOneVec = vdupq_n_u16(blkSize - 1);
152
+
153
+        for (int y = 0; y < blkSize; y++) {
154
+            // (blkSize - 1 - y)
155
+            uint16x8_t vlkSizeYVec = vdupq_n_u16(blkSize - 1 - y);
156
+            // (y + 1) * bottomLeft
157
+            uint16x8_t bottomLeftYVec = vdupq_n_u16((y + 1) * bottomLeft);
158
+            // left[y]
159
+            uint16x8_t leftYVec = vdupq_n_u16(left[y]);
160
+
161
+            for (int x = 0; x < blkSize; x += 8) {
162
+                int idx = y * dstStride + x;
163
+                uint16x8_t xvec = { (uint16_t)(x + 0), (uint16_t)(x + 1),
164
+                                    (uint16_t)(x + 2), (uint16_t)(x + 3),
165
+                                    (uint16_t)(x + 4), (uint16_t)(x + 5),
166
+                                    (uint16_t)(x + 6), (uint16_t)(x + 7) };
167
+
168
+                // (blkSize - 1 - y) * above[x]
169
+                uint16x8_t aboveVec = { (uint16_t)(above[x + 0]),
170
+                                        (uint16_t)(above[x + 1]),
171
+                                        (uint16_t)(above[x + 2]),
172
+                                        (uint16_t)(above[x + 3]),
173
+                                        (uint16_t)(above[x + 4]),
174
+                                        (uint16_t)(above[x + 5]),
175
+                                        (uint16_t)(above[x + 6]),
176
+                                        (uint16_t)(above[x + 7]) };
177
+
178
+                aboveVec = vmulq_u16(aboveVec, vlkSizeYVec);
179
+
180
+                // (blkSize - 1 - x) * left[y]
181
+                uint16x8_t first = vsubq_u16(blkSizeSubOneVec, xvec);
182
+                first = vmulq_u16(first, leftYVec);
183
+
184
+                // (x + 1) * topRight
185
+                uint16x8_t second = vaddq_u16(xvec, oneVec);
186
+                second = vmulq_u16(second, topRight);
187
+
188
+                uint16x8_t resVec = vaddq_u16(first, second);
189
+                resVec = vaddq_u16(resVec, aboveVec);
190
+                resVec = vaddq_u16(resVec, bottomLeftYVec);
191
+                resVec = vaddq_u16(resVec, blkSizeVec);
192
+                resVec = vshrq_n_u16(resVec, log2SizePlusOne);
193
+
194
+                for (int i = 0; i < 8; i++)
195
+                    dst[idx + i] = (pixel)resVec[i];
196
+            }
197
+        }
198
+    }
199
+    break;
200
+    case 4:
201
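For reference, the arithmetic performed per sample by the vmlal_u16/vshrn_n_u32 loop above (and by the scalar code it replaces) is plain fixed-point linear interpolation between two neighbouring reference samples. A minimal scalar sketch follows; the helper name is made up for illustration and is not part of x265:

    #include <stdint.h>

    // Blend ref[x] and ref[x + 1] with weights (32 - fraction) and fraction,
    // adding 16 before the >> 5 so the result is rounded to nearest.
    static inline uint16_t interp_sample(const uint16_t *ref, int x, int fraction)
    {
        uint32_t acc = 16;
        acc += (uint32_t)(32 - fraction) * ref[x];
        acc += (uint32_t)fraction * ref[x + 1];
        return (uint16_t)(acc >> 5);
    }

Keeping the whole accumulation unsigned is what lets the same block compile for both low and high bit depth, as the new comment in the patch notes.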
x265_4.0.tar.gz/source/common/aarch64/intrapred.S Added
173
 
1
@@ -0,0 +1,171 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+
28
+#include "asm.S"
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.align 4
41
+tbl_const_1to8_7to0:
42
+    .byte 1, 2, 3, 4, 5, 6, 7, 8
43
+    .byte 7, 6, 5, 4, 3, 2, 1, 0
44
+    .byte 9, 10, 11, 12, 13, 14, 15, 16
45
+    .byte 15, 14, 13, 12, 11, 10, 9, 8
46
+
47
+// ***** planar_pred *****
48
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
49
+function PFX(intra_pred_planar8_neon)
50
+// Register map
51
+// x0  = dst
52
+// x1  = dstStride
53
+// x2  = *srcPix
54
+// x3  = left[x]
55
+// x4  = tmp
56
+// v0  = above[7:0]
57
+// v1  = left[7:0]
58
+// v2  = topRight = rep(above[blkSize])
59
+// v3  = bottomLeft = rep(left[blkSize])
60
+// v4  = const[8 7 6 5 4 3 2 1]
61
+// v5  = const[7 6 5 4 3 2 1 0]
62
+
63
+//{
64
+//    const int blkSize = 1 << log2Size;
65
+//    const pixel* above = srcPix + 1;
66
+//    const pixel* left  = srcPix + (2 * blkSize + 1);
67
+//    pixel topRight = above[blkSize];
68
+//    pixel bottomLeft = left[blkSize];
69
+//    for (int y = 0; y < blkSize; y++)
70
+//        for (int x = 0; x < blkSize; x++)
71
+//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
72
+//}
73
+
74
+    ldurb           w3, [x2, #(1+8)]                // topRight
75
+    ldurb           w4, [x2, #(2*8+1+8)]            // bottomLeft
76
+    dup             v2.8b, w3                       // v2 = topRight_b
77
+    dup             v3.8h, w4                       // v3 = bottomLeft_h
78
+    ldr             x3, [x2, #(2*8+1)]              // x3 = left[x]_b
79
+    ldr             d0, [x2, #1]                    // v0 = above[x]_b
80
+
81
+    adr             x4, tbl_const_1to8_7to0
82
+    ldr             d4, [x4]                        // v4 = const_b[8 7 6 5 4 3 2 1]
83
+    ldr             d5, [x4, #8]                    // v5 = const_b[7 6 5 4 3 2 1 0]
84
+
85
+    ushll           v6.8h, v0.8b, #3                // v6 = 8 * above[x]
86
+    usubw           v0.8h, v3.8h, v0.8b             // v0 = bottomLeft - above[x]
87
+
88
+    umlal           v6.8h, v4.8b, v2.8b             // v6 = 8 * above[x] + (x + 1) * topRight
89
+
90
+    mov             w4, #8
91
+
92
+1:
93
+    dup             v1.8b, w3
94
+    lsr             x3, x3, #8
95
+    add             v6.8h, v6.8h, v0.8h             // v6 = (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
96
+    mov             v3.16b, v6.16b
97
+    umlal           v3.8h, v5.8b, v1.8b             // v3 = (blkSize - 1 - x) * left[y=0] + (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
98
+    rshrn           v3.8b, v3.8h, #4
99
+    sub             w4, w4, #1
100
+    st1             {v3.8b}, [x0], x1
101
+    cbnz            w4, 1b
102
+
103
+    ret
104
+endfunc
105
+
106
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
107
+function PFX(intra_pred_planar16_neon)
108
+// Register map
109
+// x0  = dst
110
+// x1  = dstStride
111
+// x2  = *srcPix
112
+// x3  = left[x]
113
+// x4  = tmp
114
+// v0  = above[7:0]
115
+// v1  = left[7:0]
116
+// v2  = topRight = rep(above[blkSize])
117
+// v3  = bottomLeft = rep(left[blkSize])
118
+// v4  = const[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
119
+// v5  = const[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
120
+
121
+//{
122
+//    const int blkSize = 1 << log2Size;
123
+//    const pixel* above = srcPix + 1;
124
+//    const pixel* left  = srcPix + (2 * blkSize + 1);
125
+//    pixel topRight = above[blkSize];
126
+//    pixel bottomLeft = left[blkSize];
127
+//    for (int y = 0; y < blkSize; y++)
128
+//        for (int x = 0; x < blkSize; x++)
129
+//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
130
+//}
131
+
132
+    ldurb           w3, [x2, #(1+16)]               // topRight
133
+    ldurb           w4, [x2, #(2*16+1+16)]          // bottomLeft
134
+    ldr             q0, [x2, #(2*16+1)]             // v0 = left[x]_b
135
+    ldr             q1, [x2, #1]                    // v1 = above[x]_b
136
+    dup             v2.16b, w3                      // v2 = topRight_b
137
+    dup             v3.8h, w4                       // v3 = bottomLeft_h
138
+
139
+    adr             x4, tbl_const_1to8_7to0
140
+    ld2             {v4.2d, v5.2d}, [x4]            // v4 = const_b[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
141
+    ext             v5.16b, v5.16b, v5.16b, #8      // v5 = const_b[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
142
+
143
+    ushll           v16.8h, v1.8b, #4               // v16,v17 = 16 * above[x]
144
+    ushll2          v17.8h, v1.16b, #4
145
+    usubw           v6.8h, v3.8h, v1.8b             // v6,v7 = bottomLeft - above[x]
146
+    usubw2          v7.8h, v3.8h, v1.16b
147
+
148
+    umlal           v16.8h, v4.8b, v2.8b            // v16,v17 = 16 * above[x] + (x + 1) * topRight
149
+    umlal2          v17.8h, v4.16b, v2.16b
150
+
151
+    mov             w4, #16
152
+
153
+1:
154
+    dup             v1.16b, v0.b[0]                 // v1 = left[x]_b
155
+    ext             v0.16b, v0.16b, v0.16b, #1
156
+
157
+    add             v16.8h, v16.8h, v6.8h           // v16,v17 = (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
158
+    add             v17.8h, v17.8h, v7.8h
159
+
160
+    mov             v18.16b, v16.16b
161
+    mov             v19.16b, v17.16b
162
+
163
+    umlal           v18.8h, v5.8b, v1.8b             // v3 = (blkSize - 1 - x) * left[y=0] + (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
164
+    umlal2          v19.8h, v5.16b, v1.16b
165
+    rshrn           v18.8b, v18.8h, #5
166
+    rshrn2          v18.16b, v19.8h, #5
167
+    st1             {v18.16b}, [x0], x1
168
+    sub             w4, w4, #1
169
+    cbnz            w4, 1b
170
+
171
+    ret
172
+endfunc
173
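The commented-out reference inside both functions is the standard HEVC planar formula. As a sanity check, a standalone scalar version is sketched below (8-bit pixels assumed, names chosen for illustration, not part of x265); note that the asm folds the "+ blkSize" rounding term and the ">> (log2Size + 1)" shift into the single rshrn #4 / rshrn #5 rounding narrow.

    #include <stdint.h>
    #include <stddef.h>

    // Scalar HEVC planar prediction for an 8-bit blkSize x blkSize block.
    // above/left point at the neighbouring reference samples, as in the asm.
    static void planar_pred_ref(uint8_t *dst, ptrdiff_t dstStride,
                                const uint8_t *above, const uint8_t *left,
                                int log2Size)
    {
        const int blkSize = 1 << log2Size;
        const int topRight = above[blkSize];
        const int bottomLeft = left[blkSize];

        for (int y = 0; y < blkSize; y++)
            for (int x = 0; x < blkSize; x++)
                dst[y * dstStride + x] = (uint8_t)(((blkSize - 1 - x) * left[y] +
                                                    (blkSize - 1 - y) * above[x] +
                                                    (x + 1) * topRight +
                                                    (y + 1) * bottomLeft +
                                                    blkSize) >> (log2Size + 1));
    }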
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/loopfilter-prim.cpp Changed
201
 
1
@@ -1,3 +1,4 @@
2
+#include "common.h"
3
 #include "loopfilter-prim.h"
4
 
5
 #define PIXEL_MIN 0
6
@@ -11,15 +12,10 @@
7
 {
8
 
9
 
10
-/* get the sign of input variable (TODO: this is a dup, make common) */
11
-static inline int8_t signOf(int x)
12
-{
13
-    return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
14
-}
15
-
16
 static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
17
 {
18
-    int16x8_t in = vsubl_u8(in0, in1);
19
+    int16x8_t in = vreinterpretq_s16_u16(vsubl_u8(in0, in1));
20
+
21
     return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
22
 }
23
 
24
@@ -28,12 +24,13 @@
25
     int x = 0;
26
     for (; (x + 8) <= endX; x += 8)
27
     {
28
-        *(int8x8_t *)&dst[x]  = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
29
+        int8x8_t sign = sign_diff_neon(vld1_u8(src1 + x), vld1_u8(src2 + x));
30
+        vst1_s8(dst + x, sign);
31
     }
32
 
33
     for (; x < endX; x++)
34
     {
35
-        dst[x] = signOf(src1[x] - src2[x]);
36
+        dst[x] = x265_signOf(src1[x] - src2[x]);
37
     }
38
 }
39
 
40
@@ -56,21 +53,20 @@
41
             int8x8x2_t shifter;
42
             shifter.val[1][0] = signLeft[0];
43
             static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
44
-            int8x8_t tbl = *(int8x8_t *)offsetEo;
45
+            int8x8_t tbl = vld1_s8(offsetEo);
46
             for (; (x + 8) <= width; x += 8)
47
             {
48
-                uint8x8_t in = *(uint8x8_t *)&rec[x];
49
-                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
50
+                uint8x8_t in = vld1_u8(rec + x);
51
+                vsignRight = sign_diff_neon(in, vld1_u8(rec + x + 1));
52
                 shifter.val[0] = vneg_s8(vsignRight);
53
                 int8x8_t tmp = shifter.val[0];
54
                 int8x8_t edge = vtbl2_s8(shifter, index);
55
                 int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
56
                 shifter.val[1][0] = tmp[7];
57
                 int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
58
-                t1 = vaddw_u8(t1, in);
59
-                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
60
-                t1 = vminq_s16(t1, vdupq_n_s16(255));
61
-                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
62
+                t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
63
+                                                    in));
64
+                vst1_u8(rec + x, vqmovun_s16(t1));
65
             }
66
             signLeft[0] = shifter.val[1][0];
67
         }
68
@@ -93,22 +89,26 @@
69
 
70
     if (width >= 8)
71
     {
72
-        int8x8_t tbl = *(int8x8_t *)offsetEo;
73
+        int8x8_t tbl = vld1_s8(offsetEo);
74
+        const int8x8_t c = vdup_n_s8(2);
75
+
76
         for (; (x + 8) <= width; x += 8)
77
         {
78
-            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
80
-            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
80
+            uint8x8_t in0 = vld1_u8(rec + x);
81
+            uint8x8_t in1 = vld1_u8(rec + x + stride);
82
             int8x8_t vsignDown = sign_diff_neon(in0, in1);
83
-            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
84
-            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
85
+            int8x8_t vsignUp = vld1_s8(upBuff1 + x);
86
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
87
+            vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
88
             int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
89
-            t1 = vaddw_u8(t1, in0);
90
-            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
91
+            t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
92
+                                                in0));
93
+            vst1_u8(rec + x, vqmovun_s16(t1));
94
         }
95
     }
96
     for (; x < width; x++)
97
     {
98
-        signDown = signOf(rec[x] - rec[x + stride]);
99
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
100
         edgeType = signDown + upBuff1[x] + 2;
101
         upBuff1[x] = -signDown;
102
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
103
@@ -126,25 +126,26 @@
104
         int x = 0;
105
         if (width >= 8)
106
         {
107
-            int8x8_t tbl = *(int8x8_t *)offsetEo;
108
+            int8x8_t tbl = vld1_s8(offsetEo);
109
+            const int8x8_t c = vdup_n_s8(2);
110
+
111
             for (; (x + 8) <= width; x += 8)
112
             {
113
-                uint8x8_t in0 = *(uint8x8_t *)&rec[x];
114
-                uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
115
+                uint8x8_t in0 = vld1_u8(rec + x);
116
+                uint8x8_t in1 = vld1_u8(rec + x + stride);
117
                 int8x8_t vsignDown = sign_diff_neon(in0, in1);
118
-                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
119
-                *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
120
+                int8x8_t vsignUp = vld1_s8(upBuff1 + x);
121
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
122
+                vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
123
                 int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
124
-                t1 = vaddw_u8(t1, in0);
125
-                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
126
-                t1 = vminq_s16(t1, vdupq_n_s16(255));
127
-                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
128
-
129
+                t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
130
+                                                    in0));
131
+                vst1_u8(rec + x, vqmovun_s16(t1));
132
             }
133
         }
134
         for (; x < width; x++)
135
         {
136
-            signDown = signOf(rec[x] - rec[x + stride]);
137
+            signDown = x265_signOf(rec[x] - rec[x + stride]);
138
             edgeType = signDown + upBuff1[x] + 2;
139
             upBuff1[x] = -signDown;
140
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
141
@@ -157,11 +158,11 @@
142
 {
143
     int x;
144
 
145
-    if (abs(buff1 - bufft) < 16)
146
+    if (abs(static_cast<int>(buff1 - bufft)) < 16)
147
     {
148
         for (x = 0; x < width; x++)
149
         {
150
-            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
151
+            int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
152
             int edgeType = signDown + buff1[x] + 2;
153
             bufft[x + 1] = -signDown;
154
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
155
@@ -169,24 +170,26 @@
156
     }
157
     else
158
     {
159
-        int8x8_t tbl = *(int8x8_t *)offsetEo;
160
+        int8x8_t tbl = vld1_s8(offsetEo);
161
+        const int8x8_t c = vdup_n_s8(2);
162
+
163
         x = 0;
164
         for (; (x + 8) <= width; x += 8)
165
         {
166
-            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
167
-            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
168
+            uint8x8_t in0 = vld1_u8(rec + x);
169
+            uint8x8_t in1 = vld1_u8(rec + x + stride + 1);
170
             int8x8_t vsignDown = sign_diff_neon(in0, in1);
171
-            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
172
-            *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
173
+            int8x8_t vsignUp = vld1_s8(buff1 + x);
174
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
175
+            vst1_s8(bufft + x + 1, vneg_s8(vsignDown));
176
             int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
177
-            t1 = vaddw_u8(t1, in0);
178
-            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
179
-            t1 = vminq_s16(t1, vdupq_n_s16(255));
180
-            *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
181
+            t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1),
182
+                                                in0));
183
+            vst1_u8(rec + x, vqmovun_s16(t1));
184
         }
185
         for (; x < width; x++)
186
         {
187
-            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
188
+            int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
189
             int edgeType = signDown + buff1[x] + 2;
190
             bufft[x + 1] = -signDown;
191
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
192
@@ -200,26 +203,25 @@
193
 {
194
     int8_t signDown;
195
     int8_t edgeType;
196
-    int8x8_t tbl = *(int8x8_t *)offsetEo;
197
+    int8x8_t tbl = vld1_s8(offsetEo);
198
+    const int8x8_t c = vdup_n_s8(2);
199
 
200
     int x = startX + 1;
201
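A recurring pattern in this file's changes is replacing type-punned pointer loads and stores with the dedicated vld1/vst1 intrinsics, which avoids the misaligned, aliasing vector casts of the 3.6 code and lets the saturating narrow do the clamping. A minimal illustration of the two styles is sketched below; the helper is hypothetical and not x265 code:

    #include <arm_neon.h>
    #include <stdint.h>

    // Add a per-pixel int8 offset to a row of reconstructed pixels with
    // saturation, using explicit NEON loads/stores (tail handling omitted).
    static void add_offset_row(uint8_t *rec, const int8_t *offset, int width)
    {
        for (int x = 0; x + 8 <= width; x += 8)
        {
            // 3.6 style (removed above): uint8x8_t in = *(uint8x8_t *)&rec[x];
            // 4.0 style: dedicated unaligned load/store intrinsics.
            uint8x8_t in = vld1_u8(rec + x);
            int16x8_t sum = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(in)),
                                      vmovl_s8(vld1_s8(offset + x)));
            vst1_u8(rec + x, vqmovun_s16(sum));   // clamp to [0, 255]
        }
    }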
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a-sve2.S Changed
201
 
1
@@ -219,7 +219,7 @@
2
     mov             x11, #0
3
     whilelt         p0.b, x11, x10
4
     mov             w12, #8
5
-.loop_gt_32_pixel_avg_pp_48x64:
6
+.Loop_gt_32_pixel_avg_pp_48x64:
7
     sub             w12, w12, #1
8
 .rept 8
9
     ld1b            {z0.b}, p0/z, x2
10
@@ -230,7 +230,7 @@
11
     st1b            {z0.b}, p0, x0
12
     add             x0, x0, x1
13
 .endr
14
-    cbnz            w12, .loop_gt_32_pixel_avg_pp_48x64
15
+    cbnz            w12, .Loop_gt_32_pixel_avg_pp_48x64
16
     ret
17
 endfunc
18
 
19
@@ -339,7 +339,7 @@
20
     mov             w12, #\h / 2
21
     ptrue           p0.b, vl16
22
     ptrue           p2.h, vl6
23
-.loop_sve2_addavg_6x\h\():
24
+.Loop_sve2_addavg_6x\h\():
25
     sub             w12, w12, #1
26
     ld1b            {z0.b}, p0/z, x0
27
     ld1b            {z1.b}, p0/z, x1
28
@@ -359,7 +359,7 @@
29
     add             x2, x2, x5
30
     st1b            {z2.h}, p2, x2
31
     add             x2, x2, x5
32
-    cbnz            w12, .loop_sve2_addavg_6x\h
33
+    cbnz            w12, .Loop_sve2_addavg_6x\h
34
     ret
35
 endfunc
36
 .endm
37
@@ -398,7 +398,7 @@
38
 function PFX(addAvg_8x\h\()_sve2)
39
     mov             w12, #\h / 2
40
     ptrue           p0.b, vl16
41
-.loop_sve2_addavg_8x\h\():
42
+.Loop_sve2_addavg_8x\h\():
43
     sub             w12, w12, #1
44
     ld1b            {z0.b}, p0/z, x0
45
     ld1b            {z1.b}, p0/z, x1
46
@@ -418,7 +418,7 @@
47
     add             x2, x2, x5
48
     st1b            {z2.h}, p0, x2
49
     add             x2, x2, x5
50
-    cbnz            w12, .loop_sve2_addavg_8x\h
51
+    cbnz            w12, .Loop_sve2_addavg_8x\h
52
     ret
53
 endfunc
54
 .endm
55
@@ -440,7 +440,7 @@
56
     bgt             .vl_gt_16_addAvg_12x\h
57
     ptrue           p0.b, vl16
58
     ptrue           p1.b, vl8
59
-.loop_sve2_addavg_12x\h\():
60
+.Loop_sve2_addavg_12x\h\():
61
     sub             w12, w12, #1
62
     ld1b            {z0.b}, p0/z, x0
63
     ld1b            {z1.b}, p0/z, x1
64
@@ -457,13 +457,13 @@
65
     st1b            {z0.h}, p0, x2
66
     st1b            {z2.h}, p1, x2, #1, mul vl
67
     add             x2, x2, x5
68
-    cbnz            w12, .loop_sve2_addavg_12x\h
69
+    cbnz            w12, .Loop_sve2_addavg_12x\h
70
     ret
71
 .vl_gt_16_addAvg_12x\h\():
72
     mov             x10, #24
73
     mov             x11, #0
74
     whilelt         p0.b, x11, x10
75
-.loop_sve2_gt_16_addavg_12x\h\():
76
+.Loop_sve2_gt_16_addavg_12x\h\():
77
     sub             w12, w12, #1
78
     ld1b            {z0.b}, p0/z, x0
79
     ld1b            {z1.b}, p0/z, x1
80
@@ -476,7 +476,7 @@
81
     add             z2.b, z2.b, #0x80
82
     st1b            {z0.h}, p0, x2
83
     add             x2, x2, x5
84
-    cbnz            w12, .loop_sve2_gt_16_addavg_12x\h
85
+    cbnz            w12, .Loop_sve2_gt_16_addavg_12x\h
86
     ret
87
 endfunc
88
 .endm
89
@@ -491,7 +491,7 @@
90
     cmp             x9, #16
91
     bgt             .vl_gt_16_addAvg_16x\h
92
     ptrue           p0.b, vl16
93
-.loop_eq_16_sve2_addavg_16x\h\():
94
+.Loop_eq_16_sve2_addavg_16x\h\():
95
     sub             w12, w12, #1
96
     ld1b            {z0.b}, p0/z, x0
97
     ld1b            {z1.b}, p0/z, x1
98
@@ -508,13 +508,13 @@
99
     st1b            {z0.h}, p0, x2
100
     st1b            {z2.h}, p0, x2, #1, mul vl
101
     add             x2, x2, x5
102
-    cbnz            w12, .loop_eq_16_sve2_addavg_16x\h
103
+    cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
104
     ret
105
 .vl_gt_16_addAvg_16x\h\():
106
     cmp             x9, #32
107
     bgt             .vl_gt_32_addAvg_16x\h
108
     ptrue           p0.b, vl32
109
-.loop_gt_16_sve2_addavg_16x\h\():
110
+.Loop_gt_16_sve2_addavg_16x\h\():
111
     sub             w12, w12, #1
112
     ld1b            {z0.b}, p0/z, x0
113
     ld1b            {z1.b}, p0/z, x1
114
@@ -525,13 +525,13 @@
115
     add             z0.b, z0.b, #0x80
116
     st1b            {z0.h}, p1, x2
117
     add             x2, x2, x5
118
-    cbnz            w12, .loop_gt_16_sve2_addavg_16x\h
119
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
120
     ret
121
 .vl_gt_32_addAvg_16x\h\():
122
     mov             x10, #48
123
     mov             x11, #0
124
     whilelt         p0.b, x11, x10
125
-.loop_gt_32_sve2_addavg_16x\h\():
126
+.Loop_gt_32_sve2_addavg_16x\h\():
127
     sub             w12, w12, #1
128
     ld1b            {z0.b}, p0/z, x0
129
     add             x0, x0, x3, lsl #1
130
@@ -541,7 +541,7 @@
131
     add             z0.b, z0.b, #0x80
132
     st1b            {z0.h}, p0, x2
133
     add             x2, x2, x5
134
-    cbnz            w12, .loop_gt_32_sve2_addavg_16x\h
135
+    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
136
     ret
137
 endfunc
138
 .endm
139
@@ -561,7 +561,7 @@
140
     cmp             x9, #16
141
     bgt             .vl_gt_16_addAvg_24x\h
142
     addAvg_start
143
-.loop_eq_16_sve2_addavg_24x\h\():
144
+.Loop_eq_16_sve2_addavg_24x\h\():
145
     sub             w12, w12, #1
146
     ld1             {v0.16b-v2.16b}, x0, x3
147
     ld1             {v3.16b-v5.16b}, x1, x4
148
@@ -572,14 +572,14 @@
149
     sqxtun          v1.8b, v1.8h
150
     sqxtun          v2.8b, v2.8h
151
     st1             {v0.8b-v2.8b}, x2, x5
152
-    cbnz            w12, .loop_eq_16_sve2_addavg_24x\h
153
+    cbnz            w12, .Loop_eq_16_sve2_addavg_24x\h
154
     ret
155
 .vl_gt_16_addAvg_24x\h\():
156
     cmp             x9, #48
157
     bgt             .vl_gt_48_addAvg_24x\h
158
     ptrue           p0.b, vl32
159
     ptrue           p1.b, vl16
160
-.loop_gt_16_sve2_addavg_24x\h\():
161
+.Loop_gt_16_sve2_addavg_24x\h\():
162
     sub             w12, w12, #1
163
     ld1b            {z0.b}, p0/z, x0
164
     ld1b            {z1.b}, p1/z, x0, #1, mul vl
165
@@ -596,13 +596,13 @@
166
     st1b            {z0.h}, p0, x2
167
     st1b            {z1.h}, p1, x2, #1, mul vl
168
     add             x2, x2, x5
169
-    cbnz            w12, .loop_gt_16_sve2_addavg_24x\h
170
+    cbnz            w12, .Loop_gt_16_sve2_addavg_24x\h
171
     ret
172
 .vl_gt_48_addAvg_24x\h\():
173
     mov             x10, #48
174
     mov             x11, #0
175
     whilelt         p0.b, x11, x10
176
-.loop_gt_48_sve2_addavg_24x\h\():
177
+.Loop_gt_48_sve2_addavg_24x\h\():
178
     sub             w12, w12, #1
179
     ld1b            {z0.b}, p0/z, x0
180
     ld1b            {z2.b}, p0/z, x1
181
@@ -613,7 +613,7 @@
182
     add             z0.b, z0.b, #0x80
183
     st1b            {z0.h}, p0, x2
184
     add             x2, x2, x5
185
-    cbnz            w12, .loop_gt_48_sve2_addavg_24x\h
186
+    cbnz            w12, .Loop_gt_48_sve2_addavg_24x\h
187
     ret
188
 endfunc
189
 .endm
190
@@ -628,7 +628,7 @@
191
     cmp             x9, #16
192
     bgt             .vl_gt_16_addAvg_32x\h
193
     ptrue           p0.b, vl16
194
-.loop_eq_16_sve2_addavg_32x\h\():
195
+.Loop_eq_16_sve2_addavg_32x\h\():
196
     sub             w12, w12, #1
197
     ld1b            {z0.b}, p0/z, x0
198
     ld1b            {z1.b}, p0/z, x0, #1, mul vl
199
@@ -657,13 +657,13 @@
200
     st1b            {z2.h}, p0, x2, #2, mul vl
201
x265_3.6.tar.gz/source/common/aarch64/mc-a.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a.S Changed
145
 
1
@@ -283,7 +283,7 @@
2
     addAvg_start
3
     mov             w12, #\h / 2
4
     sub             x5, x5, #4
5
-.loop_addavg_6x\h:
6
+.Loop_addavg_6x\h:
7
     sub             w12, w12, #1
8
     ld1             {v0.16b}, x0, x3
9
     ld1             {v1.16b}, x1, x4
10
@@ -305,7 +305,7 @@
11
     st1             {v0.h}2, x2, x5
12
     str             s1, x2, #4
13
     st1             {v1.h}2, x2, x5
14
-    cbnz            w12, .loop_addavg_6x\h
15
+    cbnz            w12, .Loop_addavg_6x\h
16
     ret
17
 endfunc
18
 .endm
19
@@ -344,7 +344,7 @@
20
 function PFX(addAvg_8x\h\()_neon)
21
     addAvg_start
22
     mov             w12, #\h / 2
23
-.loop_addavg_8x\h:
24
+.Loop_addavg_8x\h:
25
     sub             w12, w12, #1
26
     ld1             {v0.16b}, x0, x3
27
     ld1             {v1.16b}, x1, x4
28
@@ -364,7 +364,7 @@
29
     sqxtun          v1.8b, v1.8h
30
     st1             {v0.8b}, x2, x5
31
     st1             {v1.8b}, x2, x5
32
-    cbnz            w12, .loop_addavg_8x\h
33
+    cbnz            w12, .Loop_addavg_8x\h
34
     ret
35
 endfunc
36
 .endm
37
@@ -385,7 +385,7 @@
38
     sub             x4, x4, #16
39
     sub             x5, x5, #8
40
     mov             w12, #\h
41
-.loop_addAvg_12X\h\():
42
+.Loop_addAvg_12X\h\():
43
     sub             w12, w12, #1
44
     ld1             {v0.16b}, x0, #16
45
     ld1             {v1.16b}, x1, #16
46
@@ -403,7 +403,7 @@
47
     sqxtun          v1.8b, v1.8h
48
     st1             {v0.8b}, x2, #8
49
     st1             {v1.s}0, x2, x5
50
-    cbnz            w12, .loop_addAvg_12X\h
51
+    cbnz            w12, .Loop_addAvg_12X\h
52
     ret
53
 endfunc
54
 .endm
55
@@ -415,7 +415,7 @@
56
 function PFX(addAvg_16x\h\()_neon)
57
     addAvg_start
58
     mov             w12, #\h
59
-.loop_addavg_16x\h:
60
+.Loop_addavg_16x\h:
61
     sub             w12, w12, #1
62
     ld1             {v0.8h-v1.8h}, x0, x3
63
     ld1             {v2.8h-v3.8h}, x1, x4
64
@@ -424,7 +424,7 @@
65
     sqxtun          v0.8b, v0.8h
66
     sqxtun2         v0.16b, v1.8h
67
     st1             {v0.16b}, x2, x5
68
-    cbnz            w12, .loop_addavg_16x\h
69
+    cbnz            w12, .Loop_addavg_16x\h
70
     ret
71
 endfunc
72
 .endm
73
@@ -441,7 +441,7 @@
74
 function PFX(addAvg_24x\h\()_neon)
75
     addAvg_start
76
     mov             w12, #\h
77
-.loop_addavg_24x\h\():
78
+.Loop_addavg_24x\h\():
79
     sub             w12, w12, #1
80
     ld1             {v0.16b-v2.16b}, x0, x3
81
     ld1             {v3.16b-v5.16b}, x1, x4
82
@@ -452,7 +452,7 @@
83
     sqxtun          v1.8b, v1.8h
84
     sqxtun          v2.8b, v2.8h
85
     st1             {v0.8b-v2.8b}, x2, x5
86
-    cbnz            w12, .loop_addavg_24x\h
87
+    cbnz            w12, .Loop_addavg_24x\h
88
     ret
89
 endfunc
90
 .endm
91
@@ -464,7 +464,7 @@
92
 function PFX(addAvg_32x\h\()_neon)
93
     addAvg_start
94
     mov             w12, #\h
95
-.loop_addavg_32x\h\():
96
+.Loop_addavg_32x\h\():
97
     sub             w12, w12, #1
98
     ld1             {v0.8h-v3.8h}, x0, x3
99
     ld1             {v4.8h-v7.8h}, x1, x4
100
@@ -477,7 +477,7 @@
101
     sqxtun          v2.8b, v2.8h
102
     sqxtun          v3.8b, v3.8h
103
     st1             {v0.8b-v3.8b}, x2, x5
104
-    cbnz            w12, .loop_addavg_32x\h
105
+    cbnz            w12, .Loop_addavg_32x\h
106
     ret
107
 endfunc
108
 .endm
109
@@ -494,7 +494,7 @@
110
     sub             x3, x3, #64
111
     sub             x4, x4, #64
112
     mov             w12, #64
113
-.loop_addavg_48x64:
114
+.Loop_addavg_48x64:
115
     sub             w12, w12, #1
116
     ld1             {v0.8h-v3.8h}, x0, #64
117
     ld1             {v4.8h-v7.8h}, x1, #64
118
@@ -513,7 +513,7 @@
119
     sqxtun          v2.8b, v20.8h
120
     sqxtun2         v2.16b, v21.8h
121
     st1             {v0.16b-v2.16b}, x2, x5
122
-    cbnz            w12, .loop_addavg_48x64
123
+    cbnz            w12, .Loop_addavg_48x64
124
     ret
125
 endfunc
126
 
127
@@ -523,7 +523,7 @@
128
     mov             w12, #\h
129
     sub             x3, x3, #64
130
     sub             x4, x4, #64
131
-.loop_addavg_64x\h\():
132
+.Loop_addavg_64x\h\():
133
     sub             w12, w12, #1
134
     ld1             {v0.8h-v3.8h}, x0, #64
135
     ld1             {v4.8h-v7.8h}, x1, #64
136
@@ -546,7 +546,7 @@
137
     sqxtun          v3.8b, v22.8h
138
     sqxtun2         v3.16b, v23.8h
139
     st1             {v0.16b-v3.16b}, x2, x5
140
-    cbnz            w12, .loop_addavg_64x\h
141
+    cbnz            w12, .Loop_addavg_64x\h
142
     ret
143
 endfunc
144
 .endm
145
x265_4.0.tar.gz/source/common/aarch64/mem-neon.h Added
201
 
1
@@ -0,0 +1,268 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_MEM_NEON_H
26
+#define X265_COMMON_AARCH64_MEM_NEON_H
27
+
28
+#include <arm_neon.h>
29
+#include <cassert>
30
+#include <stdint.h>
31
+
32
+// Load 4 bytes into the low half of a uint8x8_t, zero the upper half.
33
+static uint8x8_t inline load_u8x4x1(const uint8_t *s)
34
+{
35
+    uint8x8_t ret = vdup_n_u8(0);
36
+
37
+    ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
38
+                                            vreinterpret_u32_u8(ret), 0));
39
+    return ret;
40
+}
41
+
42
+static uint8x8_t inline load_u8x4x2(const uint8_t *s, intptr_t stride)
43
+{
44
+    uint8x8_t ret = vdup_n_u8(0);
45
+
46
+    ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
47
+                                            vreinterpret_u32_u8(ret), 0));
48
+    s += stride;
49
+    ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s,
50
+                                            vreinterpret_u32_u8(ret), 1));
51
+
52
+    return ret;
53
+}
54
+
55
+// Store 4 bytes from the low half of a uint8x8_t.
56
+static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
57
+{
58
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
59
+}
60
+
61
+// Store N blocks of 32-bits from (N / 2) D-Registers.
62
+template<int N>
63
+static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride,
64
+                                         const uint8x8_t *s)
65
+{
66
+    assert(N % 2 == 0);
67
+    for (int i = 0; i < N / 2; ++i)
68
+    {
69
+        vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 0);
70
+        d += stride;
71
+        vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s[i]), 1);
72
+        d += stride;
73
+    }
74
+}
75
+
76
+template<int N>
77
+static void inline load_u8x8xn(const uint8_t *src, const intptr_t stride,
78
+                               uint8x8_t *dst)
79
+{
80
+    for (int i = 0; i < N; ++i)
81
+    {
82
+        dst[i] = vld1_u8(src);
83
+        src += stride;
84
+    }
85
+}
86
+
87
+template<int N>
88
+static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
89
+                                uint8x16_t *dst)
90
+{
91
+    for (int i = 0; i < N; ++i)
92
+    {
93
+        dst[i] = vld1q_u8(src);
94
+        src += stride;
95
+    }
96
+}
97
+
98
+template<int N>
99
+static void inline store_u8x2xn(uint8_t *dst, intptr_t dst_stride,
100
+                                const uint8x8_t *src)
101
+{
102
+    for (int i = 0; i < N; ++i)
103
+    {
104
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(src[i]), 0);
105
+        dst += dst_stride;
106
+    }
107
+}
108
+
109
+template<int N>
110
+static void inline store_u8x4xn(uint8_t *dst, intptr_t dst_stride,
111
+                                const uint8x8_t *src)
112
+{
113
+    for (int i = 0; i < N; ++i)
114
+    {
115
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(src[i]), 0);
116
+        dst += dst_stride;
117
+    }
118
+}
119
+
120
+template<int N>
121
+static void inline store_u8x6xn(uint8_t *dst, intptr_t dst_stride,
122
+                                const uint8x8_t *src)
123
+{
124
+    for (int i = 0; i < N; ++i)
125
+    {
126
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(src[i]), 0);
127
+        vst1_lane_u16((uint16_t *)(dst + 4), vreinterpret_u16_u8(src[i]), 2);
128
+        dst += dst_stride;
129
+    }
130
+}
131
+
132
+template<int N>
133
+static void inline store_u8x8xn(uint8_t *dst, intptr_t dst_stride,
134
+                                const uint8x8_t *src)
135
+{
136
+    for (int i = 0; i < N; ++i)
137
+    {
138
+        vst1_u8(dst, src[i]);
139
+        dst += dst_stride;
140
+    }
141
+}
142
+
143
+template<int N, int M>
144
+static void inline store_u8xnxm(uint8_t *dst, intptr_t dst_stride,
145
+                                const uint8x8_t *src)
146
+{
147
+    switch (N)
148
+    {
149
+    case 2: return store_u8x2xn<M>(dst, dst_stride, src);
150
+    case 4: return store_u8x4xn<M>(dst, dst_stride, src);
151
+    case 6: return store_u8x6xn<M>(dst, dst_stride, src);
152
+    case 8: return store_u8x8xn<M>(dst, dst_stride, src);
153
+    }
154
+}
155
+
156
+template<int N>
157
+static void inline store_u8x16xn(uint8_t *dst, intptr_t dst_stride,
158
+                                 const uint8x16_t *src)
159
+{
160
+    for (int i = 0; i < N; ++i)
161
+    {
162
+        vst1q_u8(dst, src[i]);
163
+        dst += dst_stride;
164
+    }
165
+}
166
+
167
+template<int N>
168
+static void inline load_s16x4xn(const int16_t *src, const intptr_t stride,
169
+                                int16x4_t *dst)
170
+{
171
+    for (int i = 0; i < N; ++i)
172
+    {
173
+        dst[i] = vld1_s16(src);
174
+        src += stride;
175
+    }
176
+}
177
+
178
+template<int N>
179
+static void inline load_s16x8xn(const int16_t *src, const intptr_t stride,
180
+                                int16x8_t *dst)
181
+{
182
+    for (int i = 0; i < N; ++i)
183
+    {
184
+        dst[i] = vld1q_s16(src);
185
+        src += stride;
186
+    }
187
+}
188
+
189
+template<int N>
190
+static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
191
+                                 const int16x4_t *src)
192
+{
193
+    for (int i = 0; i < N; ++i)
194
+    {
195
+        vst1_lane_s32((int32_t*)dst, vreinterpret_s32_s16(src[i]), 0);
196
+        dst += dst_stride;
197
+    }
198
+}
199
+
200
+template<int N>
201
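The templates in this new header mostly exist so block-load/store loops can be written once and unrolled at compile time (N is the row count). A hedged usage sketch, assuming the header behaves as defined above; the copy helper itself is hypothetical:

    #include <arm_neon.h>
    #include "mem-neon.h"   // the header added above

    // Copy an 8x4 block of 8-bit pixels using the row-count templates.
    static void copy_8x4(uint8_t *dst, intptr_t dstStride,
                         const uint8_t *src, intptr_t srcStride)
    {
        uint8x8_t rows[4];
        load_u8x8xn<4>(src, srcStride, rows);
        store_u8x8xn<4>(dst, dstStride, rows);
    }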
x265_4.0.tar.gz/source/common/aarch64/neon-sve-bridge.h Added
69
 
1
@@ -0,0 +1,67 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *          Jonathan Wright <jonathan.wright@arm.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
27
+#define X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
28
+
29
+#include <arm_neon.h>
30
+
31
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
32
+#include <arm_sve.h>
33
+#include <arm_neon_sve_bridge.h>
34
+
35
+/* We can access instructions that are exclusive to the SVE or SVE2 instruction
36
+ * sets from a predominantly Neon context by making use of the Neon-SVE bridge
37
+ * intrinsics to reinterpret Neon vectors as SVE vectors - with the high part of
38
+ * the SVE vector (if it's longer than 128 bits) being "don't care".
39
+ *
40
+ * While sub-optimal on machines that have SVE vector length > 128-bit - as the
41
+ * remainder of the vector is unused - this approach is still beneficial when
42
+ * compared to a Neon-only implementation. */
43
+
44
+static inline int32x4_t x265_vld1sh_s32(const int16_t *ptr)
45
+{
46
+    return svget_neonq_s32(svld1sh_s32(svptrue_pat_b32(SV_VL4), ptr));
47
+}
48
+
49
+static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y)
50
+{
51
+    return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
52
+                                     svset_neonq_s16(svundef_s16(), x),
53
+                                     svset_neonq_s16(svundef_s16(), y)));
54
+}
55
+
56
+static inline int8x16_t x265_sve_mask(const int x, const int endX,
57
+                                      const int8x16_t in)
58
+{
59
+    // Use predicate to shift "unused lanes" outside of range [-2, 2]
60
+    svbool_t svpred = svwhilelt_b8(x, endX);
61
+    svint8_t edge_type = svsel_s8(svpred, svset_neonq_s8(svundef_s8(), in),
62
+                                  svdup_n_s8(-3));
63
+    return svget_neonq_s8(edge_type);
64
+}
65
+
66
+#endif // defined(HAVE_SVE) && HAVE_SVE_BRIDGE
67
+
68
+#endif // X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
69
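As the header comment explains, the bridge lets mostly-Neon kernels borrow SVE/SVE2-only instructions on vectors that live in Neon registers. A minimal sketch of how x265_sdotq_s16 might be used to accumulate a sum of squares (the helper name row_sse is illustrative, and this assumes a build where HAVE_SVE and HAVE_SVE_BRIDGE are set):

    #include <arm_neon.h>
    #include "neon-sve-bridge.h"   // provides x265_sdotq_s16 when SVE is available

    // Sum of squares of 8 int16 residuals, accumulated with the SVE SDOT
    // instruction through the Neon-SVE bridge.
    static inline int64_t row_sse(const int16_t *diff)
    {
        int16x8_t d = vld1q_s16(diff);
        int64x2_t acc = x265_sdotq_s16(vdupq_n_s64(0), d, d);
        return vaddvq_s64(acc);
    }

On hardware with SVE vectors longer than 128 bits the upper lanes go unused, which is the trade-off the comment above acknowledges.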
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S -> x265_4.0.tar.gz/source/common/aarch64/p2s-sve.S Changed
55
 
1
@@ -204,7 +204,7 @@
2
 #else
3
     p2s_start
4
     mov             x9, #\h
5
-.loop_filter_sve_P2S_32x\h:
6
+.Loop_filter_sve_P2S_32x\h:
7
     sub             x9, x9, #1
8
     ld1             {v0.16b-v1.16b}, x0, x1
9
     ushll           v22.8h, v0.8b,  #P2S_SHIFT
10
@@ -216,7 +216,7 @@
11
     add             v24.8h, v24.8h, v31.8h
12
     add             v25.8h, v25.8h, v31.8h
13
     st1             {v22.16b-v25.16b}, x2, x3
14
-    cbnz            x9, .loop_filter_sve_P2S_32x\h
15
+    cbnz            x9, .Loop_filter_sve_P2S_32x\h
16
     ret
17
 #endif
18
 endfunc
19
@@ -331,7 +331,7 @@
20
     p2s_start
21
     sub             x3, x3, #64
22
     mov             x9, #\h
23
-.loop_filter_sve_P2S_64x\h:
24
+.Loop_filter_sve_P2S_64x\h:
25
     sub             x9, x9, #1
26
     ld1             {v0.16b-v3.16b}, x0, x1
27
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
28
@@ -352,7 +352,7 @@
29
     add             v23.8h, v23.8h, v31.8h
30
     st1             {v16.16b-v19.16b}, x2, #64
31
     st1             {v20.16b-v23.16b}, x2, x3
32
-    cbnz            x9, .loop_filter_sve_P2S_64x\h
33
+    cbnz            x9, .Loop_filter_sve_P2S_64x\h
34
     ret
35
 #endif
36
 endfunc
37
@@ -422,7 +422,7 @@
38
     p2s_start
39
     sub             x3, x3, #64
40
     mov             x9, #64
41
-.loop_filterP2S_sve_48x64:
42
+.Loop_filterP2S_sve_48x64:
43
     sub            x9, x9, #1
44
     ld1             {v0.16b-v2.16b}, x0, x1
45
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
46
@@ -439,7 +439,7 @@
47
     add             v21.8h, v21.8h, v31.8h
48
     st1             {v16.16b-v19.16b}, x2, #64
49
     st1             {v20.16b-v21.16b}, x2, x3
50
-    cbnz            x9, .loop_filterP2S_sve_48x64
51
+    cbnz            x9, .Loop_filterP2S_sve_48x64
52
     ret
53
 #endif
54
 endfunc
55
x265_3.6.tar.gz/source/common/aarch64/p2s.S -> x265_4.0.tar.gz/source/common/aarch64/p2s.S Changed
54
 
1
@@ -262,7 +262,7 @@
2
 function PFX(filterPixelToShort_32x\h\()_neon)
3
     p2s_start
4
     mov             x9, #\h
5
-.loop_filterP2S_32x\h:
6
+.Loop_filterP2S_32x\h:
7
     sub             x9, x9, #1
8
 #if HIGH_BIT_DEPTH
9
     ld1             {v0.16b-v3.16b}, x0, x1
10
@@ -282,7 +282,7 @@
11
     add             v24.8h, v24.8h, v31.8h
12
     add             v25.8h, v25.8h, v31.8h
13
     st1             {v22.16b-v25.16b}, x2, x3
14
-    cbnz            x9, .loop_filterP2S_32x\h
15
+    cbnz            x9, .Loop_filterP2S_32x\h
16
     ret
17
 endfunc
18
 .endm
19
@@ -302,7 +302,7 @@
20
 #endif
21
     sub             x3, x3, #64
22
     mov             x9, #\h
23
-.loop_filterP2S_64x\h:
24
+.Loop_filterP2S_64x\h:
25
     sub             x9, x9, #1
26
 #if HIGH_BIT_DEPTH
27
     ld1             {v0.16b-v3.16b}, x0, #64
28
@@ -336,7 +336,7 @@
29
     add             v23.8h, v23.8h, v31.8h
30
     st1             {v16.16b-v19.16b}, x2, #64
31
     st1             {v20.16b-v23.16b}, x2, x3
32
-    cbnz            x9, .loop_filterP2S_64x\h
33
+    cbnz            x9, .Loop_filterP2S_64x\h
34
     ret
35
 endfunc
36
 .endm
37
@@ -353,7 +353,7 @@
38
 #endif
39
     sub             x3, x3, #64
40
     mov             x9, #64
41
-.loop_filterP2S_48x64:
42
+.Loop_filterP2S_48x64:
43
     sub            x9, x9, #1
44
 #if HIGH_BIT_DEPTH
45
     ld1             {v0.16b-v3.16b}, x0, #64
46
@@ -381,6 +381,6 @@
47
     add             v21.8h, v21.8h, v31.8h
48
     st1             {v16.16b-v19.16b}, x2, #64
49
     st1             {v20.16b-v21.16b}, x2, x3
50
-    cbnz            x9, .loop_filterP2S_48x64
51
+    cbnz            x9, .Loop_filterP2S_48x64
52
     ret
53
 endfunc
54
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/pixel-prim.cpp Changed
201
 
1
@@ -7,6 +7,8 @@
2
 #include "arm64-utils.h"
3
 #if HAVE_NEON
4
 
5
+#include "mem-neon.h"
6
+
7
 #include <arm_neon.h>
8
 
9
 using namespace X265_NS;
10
@@ -24,26 +26,32 @@
11
     sub = vsubq_s16(a, b);
12
 }
13
 
14
-static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
15
+static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
16
+                                   const int16x8_t s1, const int16x8_t s2)
17
 {
18
     t1 = vtrn1q_s16(s1, s2);
19
     t2 = vtrn2q_s16(s1, s2);
20
 }
21
 
22
-static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
23
+static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
24
+                                   const int16x8_t s1, const int16x8_t s2)
25
 {
26
-    t1 = vtrn1q_s32(s1, s2);
27
-    t2 = vtrn2q_s32(s1, s2);
28
+    int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
29
+    int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
30
+
31
+    t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
32
+    t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
33
 }
34
 
35
-#if (X265_DEPTH <= 10)
36
-static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
37
+static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
38
+                                   const int16x8_t s1, const int16x8_t s2)
39
 {
40
-    t1 = vtrn1q_s64(s1, s2);
41
-    t2 = vtrn2q_s64(s1, s2);
42
-}
43
-#endif
44
+    int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
45
+    int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
46
 
47
+    t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
48
+    t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
49
+}
50
 
51
 static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
52
                                int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
53
@@ -73,29 +81,25 @@
54
     SUMSUB_AB(v4 , v6 , v16, v18);
55
     SUMSUB_AB(v5 , v7 , v17, v19);
56
 
57
-    v0 = vtrn1q_s16(v4, v5);
58
-    v1 = vtrn2q_s16(v4, v5);
59
-    v2 = vtrn1q_s16(v6, v7);
60
-    v3 = vtrn2q_s16(v6, v7);
61
+    transpose_8h_8h(v0, v1, v4, v5);
62
+    transpose_8h_8h(v2, v3, v6, v7);
63
 
64
     SUMSUB_AB(v16, v17, v0,  v1);
65
     SUMSUB_AB(v18, v19, v2,  v3);
66
 
67
-    v0 = vtrn1q_s32(v16, v18);
68
-    v1 = vtrn2q_s32(v16, v18);
69
-    v2 = vtrn1q_s32(v17, v19);
70
-    v3 = vtrn2q_s32(v17, v19);
71
+    transpose_4s_8h(v0, v1, v16, v18);
72
+    transpose_4s_8h(v2, v3, v17, v19);
73
 
74
-    v0 = vabsq_s16(v0);
75
-    v1 = vabsq_s16(v1);
76
-    v2 = vabsq_s16(v2);
77
-    v3 = vabsq_s16(v3);
78
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
79
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
80
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
81
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
82
 
83
-    v0 = vmaxq_u16(v0, v1);
84
-    v1 = vmaxq_u16(v2, v3);
85
+    uint16x8_t max0 = vmaxq_u16(abs0, abs1);
86
+    uint16x8_t max1 = vmaxq_u16(abs2, abs3);
87
 
88
-    v0 = vaddq_u16(v0, v1);
89
-    return vaddlvq_u16(v0);
90
+    uint16x8_t sum = vaddq_u16(max0, max1);
91
+    return vaddlvq_u16(sum);
92
 }
93
 
94
 static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
95
@@ -103,22 +107,19 @@
96
     int16x8_t v2, v3;
97
     SUMSUB_AB(v2,  v3,  v0,  v1);
98
 
99
-    v0 = vzip1q_s64(v2, v3);
100
-    v1 = vzip2q_s64(v2, v3);
101
+    transpose_2d_8h(v0, v1, v2, v3);
102
     SUMSUB_AB(v2,  v3,  v0,  v1);
103
 
104
-    v0 = vtrn1q_s16(v2, v3);
105
-    v1 = vtrn2q_s16(v2, v3);
106
+    transpose_8h_8h(v0, v1, v2, v3);
107
     SUMSUB_AB(v2,  v3,  v0,  v1);
108
 
109
-    v0 = vtrn1q_s32(v2, v3);
110
-    v1 = vtrn2q_s32(v2, v3);
111
+    transpose_4s_8h(v0, v1, v2, v3);
112
 
113
-    v0 = vabsq_s16(v0);
114
-    v1 = vabsq_s16(v1);
115
-    v0 = vmaxq_u16(v0, v1);
116
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
117
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
118
+    uint16x8_t max = vmaxq_u16(abs0, abs1);
119
 
120
-    return vaddlvq_s16(v0);
121
+    return vaddlvq_u16(max);
122
 }
123
 
124
 static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
125
@@ -131,44 +132,47 @@
126
 
127
     HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
128
 
129
-    transpose_8h(v0,  v1,  v16, v17);
130
-    transpose_8h(v2,  v3,  v18, v19);
131
-    transpose_8h(v4,  v5,  v20, v21);
132
-    transpose_8h(v6,  v7,  v22, v23);
133
+    transpose_8h_8h(v0,  v1,  v16, v17);
134
+    transpose_8h_8h(v2,  v3,  v18, v19);
135
+    transpose_8h_8h(v4,  v5,  v20, v21);
136
+    transpose_8h_8h(v6,  v7,  v22, v23);
137
 
138
     SUMSUB_AB(v16, v17, v0,  v1);
139
     SUMSUB_AB(v18, v19, v2,  v3);
140
     SUMSUB_AB(v20, v21, v4,  v5);
141
     SUMSUB_AB(v22, v23, v6,  v7);
142
 
143
-    transpose_4s(v0,  v2,  v16, v18);
144
-    transpose_4s(v1,  v3,  v17, v19);
145
-    transpose_4s(v4,  v6,  v20, v22);
146
-    transpose_4s(v5,  v7,  v21, v23);
147
-
148
-    v0 = vabsq_s16(v0);
149
-    v1 = vabsq_s16(v1);
150
-    v2 = vabsq_s16(v2);
151
-    v3 = vabsq_s16(v3);
152
-    v4 = vabsq_s16(v4);
153
-    v5 = vabsq_s16(v5);
154
-    v6 = vabsq_s16(v6);
155
-    v7 = vabsq_s16(v7);
156
-
157
-    v0 = vmaxq_u16(v0, v2);
158
-    v1 = vmaxq_u16(v1, v3);
159
-    v2 = vmaxq_u16(v4, v6);
160
-    v3 = vmaxq_u16(v5, v7);
161
-
162
+    transpose_4s_8h(v0,  v2,  v16, v18);
163
+    transpose_4s_8h(v1,  v3,  v17, v19);
164
+    transpose_4s_8h(v4,  v6,  v20, v22);
165
+    transpose_4s_8h(v5,  v7,  v21, v23);
166
+
167
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
168
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
169
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
170
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
171
+    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
172
+    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
173
+    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
174
+    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
175
+
176
+    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
177
+    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
178
+    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
179
+    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
180
 }
181
 
182
 #if HIGH_BIT_DEPTH
183
 
184
 #if (X265_DEPTH > 10)
185
-static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
186
+static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
187
+                                   const int32x4_t s1, const int32x4_t s2)
188
 {
189
-    t1 = vtrn1q_s64(s1, s2);
190
-    t2 = vtrn2q_s64(s1, s2);
191
+    int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
192
+    int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
193
+
194
+    t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
195
+    t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
196
 }
197
 
198
 static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
199
@@ -197,35 +201,35 @@
200
     int16x8_t v16, v17;
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve.S Changed
107
 
1
@@ -190,27 +190,27 @@
2
     ld1b            {z7.h}, p0/z, x2, x11
3
     add             x0, x0, x1
4
     add             x2, x2, x3
5
-    ld1b            {z29.h}, p0/z, x0
6
-    ld1b            {z9.h}, p0/z, x0, x11
7
-    ld1b            {z10.h}, p0/z, x2
8
-    ld1b            {z11.h}, p0/z, x2, x11
9
-    add             x0, x0, x1
10
-    add             x2, x2, x3
11
-    ld1b            {z12.h}, p0/z, x0
12
-    ld1b            {z13.h}, p0/z, x0, x11
13
-    ld1b            {z14.h}, p0/z, x2
14
-    ld1b            {z15.h}, p0/z, x2, x11
15
-    add             x0, x0, x1
16
-    add             x2, x2, x3
17
-
18
     sub             \v0\().h, z0.h, z2.h
19
     sub             \v4\().h, z1.h, z3.h
20
     sub             \v1\().h, z4.h, z6.h
21
     sub             \v5\().h, z5.h, z7.h
22
-    sub             \v2\().h, z29.h, z10.h
23
-    sub             \v6\().h, z9.h, z11.h
24
-    sub             \v3\().h, z12.h, z14.h
25
-    sub             \v7\().h, z13.h, z15.h
26
+
27
+    ld1b            {z0.h}, p0/z, x0
28
+    ld1b            {z1.h}, p0/z, x0, x11
29
+    ld1b            {z2.h}, p0/z, x2
30
+    ld1b            {z3.h}, p0/z, x2, x11
31
+    add             x0, x0, x1
32
+    add             x2, x2, x3
33
+    ld1b            {z4.h}, p0/z, x0
34
+    ld1b            {z5.h}, p0/z, x0, x11
35
+    ld1b            {z6.h}, p0/z, x2
36
+    ld1b            {z7.h}, p0/z, x2, x11
37
+    add             x0, x0, x1
38
+    add             x2, x2, x3
39
+    sub             \v2\().h, z0.h, z2.h
40
+    sub             \v6\().h, z1.h, z3.h
41
+    sub             \v3\().h, z4.h, z6.h
42
+    sub             \v7\().h, z5.h, z7.h
43
 .endm
44
 
45
 // one vertical hadamard pass and two horizontal
46
@@ -314,60 +314,3 @@
47
     mov             x0, x7
48
     ret             x10
49
 endfunc
50
-
51
-/********* ssim ***********/
52
-// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
53
-// No need to fully use sve instructions for this function
54
-function PFX(quant_sve)
55
-    mov             w9, #1
56
-    lsl             w9, w9, w4
57
-    mov             z0.s, w9
58
-    neg             w9, w4
59
-    mov             z1.s, w9
60
-    add             w9, w9, #8
61
-    mov             z2.s, w9
62
-    mov             z3.s, w5
63
-
64
-    lsr             w6, w6, #2
65
-    eor             z4.d, z4.d, z4.d
66
-    eor             w10, w10, w10
67
-    eor             z17.d, z17.d, z17.d
68
-
69
-.loop_quant_sve:
70
-    ld1             {v18.4h}, x0, #8
71
-    ld1             {v7.4s}, x1, #16
72
-    sxtl            v6.4s, v18.4h
73
-
74
-    cmlt            v5.4s, v6.4s, #0
75
-
76
-    abs             v6.4s, v6.4s
77
-
78
-
79
-    mul             v6.4s, v6.4s, v7.4s
80
-
81
-    add             v7.4s, v6.4s, v3.4s
82
-    sshl            v7.4s, v7.4s, v1.4s
83
-
84
-    mls             v6.4s, v7.4s, v0.s0
85
-    sshl            v16.4s, v6.4s, v2.4s
86
-    st1             {v16.4s}, x2, #16
87
-
88
-    // numsig
89
-    cmeq            v16.4s, v7.4s, v17.4s
90
-    add             v4.4s, v4.4s, v16.4s
91
-    add             w10, w10, #4
92
-
93
-    // level *= sign
94
-    eor             z16.d, z7.d, z5.d
95
-    sub             v16.4s, v16.4s, v5.4s
96
-    sqxtn           v5.4h, v16.4s
97
-    st1             {v5.4h}, x3, #8
98
-
99
-    subs            w6, w6, #1
100
-    b.ne             .loop_quant_sve
101
-
102
-    addv            s4, v4.4s
103
-    mov             w9, v4.s0
104
-    add             w0, w10, w9
105
-    ret
106
-endfunc
107
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve2.S Changed
201
 
1
@@ -64,11 +64,11 @@
2
     bgt             .vl_gt_16_pixel_var_16x16
3
     pixel_var_start
4
     mov             w12, #16
5
-.loop_var_16_sve2:
6
+.Loop_var_16_sve2:
7
     sub             w12, w12, #1
8
     ld1             {v4.16b}, x0, x1
9
     pixel_var_1 v4
10
-    cbnz            w12, .loop_var_16_sve2
11
+    cbnz            w12, .Loop_var_16_sve2
12
     pixel_var_end
13
     ret
14
 .vl_gt_16_pixel_var_16x16:
15
@@ -95,12 +95,12 @@
16
     bgt             .vl_gt_16_pixel_var_32x32
17
     pixel_var_start
18
     mov             w12, #32
19
-.loop_var_32_sve2:
20
+.Loop_var_32_sve2:
21
     sub             w12, w12, #1
22
     ld1             {v4.16b-v5.16b}, x0, x1
23
     pixel_var_1 v4
24
     pixel_var_1 v5
25
-    cbnz            w12, .loop_var_32_sve2
26
+    cbnz            w12, .Loop_var_32_sve2
27
     pixel_var_end
28
     ret
29
 .vl_gt_16_pixel_var_32x32:
30
@@ -150,14 +150,14 @@
31
     bgt             .vl_gt_16_pixel_var_64x64
32
     pixel_var_start
33
     mov             w12, #64
34
-.loop_var_64_sve2:
35
+.Loop_var_64_sve2:
36
     sub             w12, w12, #1
37
     ld1             {v4.16b-v7.16b}, x0, x1
38
     pixel_var_1 v4
39
     pixel_var_1 v5
40
     pixel_var_1 v6
41
     pixel_var_1 v7
42
-    cbnz            w12, .loop_var_64_sve2
43
+    cbnz            w12, .Loop_var_64_sve2
44
     pixel_var_end
45
     ret
46
 .vl_gt_16_pixel_var_64x64:
47
@@ -268,7 +268,7 @@
48
     bgt             .vl_gt_16_getResidual32
49
     lsl             x4, x3, #1
50
     mov             w12, #4
51
-.loop_residual_32:
52
+.Loop_residual_32:
53
     sub             w12, w12, #1
54
 .rept 4
55
     ld1             {v0.16b-v1.16b}, x0, x3
56
@@ -286,7 +286,7 @@
57
     st1             {v16.8h-v19.8h}, x2, x4
58
     st1             {v20.8h-v23.8h}, x2, x4
59
 .endr
60
-    cbnz            w12, .loop_residual_32
61
+    cbnz            w12, .Loop_residual_32
62
     ret
63
 .vl_gt_16_getResidual32:
64
     cmp             x9, #48
65
@@ -323,7 +323,7 @@
66
     bgt             .vl_gt_16_pixel_sub_ps_32x32
67
     lsl             x1, x1, #1
68
     mov             w12, #4
69
-.loop_sub_ps_32_sve2:
70
+.Loop_sub_ps_32_sve2:
71
     sub             w12, w12, #1
72
 .rept 4
73
     ld1             {v0.16b-v1.16b}, x2, x4
74
@@ -341,7 +341,7 @@
75
     st1             {v16.8h-v19.8h}, x0, x1
76
     st1             {v20.8h-v23.8h}, x0, x1
77
 .endr
78
-    cbnz            w12, .loop_sub_ps_32_sve2
79
+    cbnz            w12, .Loop_sub_ps_32_sve2
80
     ret
81
 .vl_gt_16_pixel_sub_ps_32x32:
82
     cmp             x9, #48
83
@@ -387,7 +387,7 @@
84
     lsl             x1, x1, #1
85
     sub             x1, x1, #64
86
     mov             w12, #16
87
-.loop_sub_ps_64_sve2:
88
+.Loop_sub_ps_64_sve2:
89
     sub             w12, w12, #1
90
 .rept 4
91
     ld1             {v0.16b-v3.16b}, x2, x4
92
@@ -403,7 +403,7 @@
93
     st1             {v16.8h-v19.8h}, x0, #64
94
     st1             {v20.8h-v23.8h}, x0, x1
95
 .endr
96
-    cbnz            w12, .loop_sub_ps_64_sve2
97
+    cbnz            w12, .Loop_sub_ps_64_sve2
98
     ret
99
 .vl_gt_16_pixel_sub_ps_64x64:
100
     rdvl            x9, #1
101
@@ -473,7 +473,7 @@
102
     bgt             .vl_gt_16_pixel_sub_ps_32x64
103
     lsl             x1, x1, #1
104
     mov             w12, #8
105
-.loop_sub_ps_32x64_sve2:
106
+.Loop_sub_ps_32x64_sve2:
107
     sub             w12, w12, #1
108
 .rept 4
109
     ld1             {v0.16b-v1.16b}, x2, x4
110
@@ -491,7 +491,7 @@
111
     st1             {v16.8h-v19.8h}, x0, x1
112
     st1             {v20.8h-v23.8h}, x0, x1
113
 .endr
114
-    cbnz            w12, .loop_sub_ps_32x64_sve2
115
+    cbnz            w12, .Loop_sub_ps_32x64_sve2
116
     ret
117
 .vl_gt_16_pixel_sub_ps_32x64:
118
     cmp             x9, #48
119
@@ -609,7 +609,7 @@
120
     bgt             .vl_gt_16_pixel_add_ps_32x\h
121
     lsl             x5, x5, #1
122
     mov             w12, #\h / 4
123
-.loop_add_ps__sve2_32x\h\():
124
+.Loop_add_ps__sve2_32x\h\():
125
     sub             w12, w12, #1
126
 .rept 4
127
     ld1             {v0.16b-v1.16b}, x2, x4
128
@@ -628,7 +628,7 @@
129
     sqxtun2         v5.16b, v27.8h
130
     st1             {v4.16b-v5.16b}, x0, x1
131
 .endr
132
-    cbnz            w12, .loop_add_ps__sve2_32x\h
133
+    cbnz            w12, .Loop_add_ps__sve2_32x\h
134
     ret
135
 .vl_gt_16_pixel_add_ps_32x\h\():
136
     cmp             x9, #48
137
@@ -1157,7 +1157,7 @@
138
     bgt             .vl_gt_16_ssimDist16
139
     ssimDist_start
140
     ptrue           p0.s, vl4
141
-.loop_ssimDist16_sve2:
142
+.Loop_ssimDist16_sve2:
143
     sub             w12, w12, #1
144
     ld1b            {z4.s}, p0/z, x0
145
     ld1b            {z5.s}, p0/z, x0, #1, mul vl
146
@@ -1171,7 +1171,7 @@
147
     add             x2, x2, x3
148
     ssimDist_1_sve2 z4, z5, z8, z9
149
     ssimDist_1_sve2 z6, z7, z10, z11
150
-    cbnz            w12, .loop_ssimDist16_sve2
151
+    cbnz            w12, .Loop_ssimDist16_sve2
152
     ssimDist_end
153
     ret
154
 .vl_gt_16_ssimDist16:
155
@@ -1217,7 +1217,7 @@
156
     bgt             .vl_gt_16_ssimDist32
157
     ssimDist_start
158
     ptrue           p0.s, vl4
159
-.loop_ssimDist32_sve2:
160
+.Loop_ssimDist32_sve2:
161
     sub             w12, w12, #1
162
     ld1b            {z2.s}, p0/z, x0
163
     ld1b            {z3.s}, p0/z, x0, #1, mul vl
164
@@ -1241,7 +1241,7 @@
165
     ssimDist_1_sve2 z4, z5, z12, z13
166
     ssimDist_1_sve2 z6, z7, z14, z15
167
     ssimDist_1_sve2 z8, z9, z30, z31
168
-    cbnz            w12, .loop_ssimDist32_sve2
169
+    cbnz            w12, .Loop_ssimDist32_sve2
170
     ssimDist_end
171
     ret
172
 .vl_gt_16_ssimDist32:
173
@@ -1309,7 +1309,7 @@
174
     bgt             .vl_gt_16_ssimDist64
175
     ssimDist_start
176
     ptrue           p0.s, vl4
177
-.loop_ssimDist64_sve2:
178
+.Loop_ssimDist64_sve2:
179
     sub             w12, w12, #1
180
     ld1b            {z2.s}, p0/z, x0
181
     ld1b            {z3.s}, p0/z, x0, #1, mul vl
182
@@ -1357,7 +1357,7 @@
183
     ssimDist_1_sve2 z8, z9, z29, z30
184
     add             x0, x0, x1
185
     add             x2, x2, x3
186
-    cbnz            w12, .loop_ssimDist64_sve2
187
+    cbnz            w12, .Loop_ssimDist64_sve2
188
     ssimDist_end
189
     ret
190
 .vl_gt_16_ssimDist64:
191
@@ -1482,7 +1482,7 @@
192
     bgt             .vl_gt_16_normFact16
193
     normFact_start
194
     ptrue           p0.s, vl4
195
-.loop_normFact16_sve2:
196
+.Loop_normFact16_sve2:
197
     sub             w12, w12, #1
198
     ld1b            {z4.s}, p0/z, x0
199
     ld1b            {z5.s}, p0/z, x0, #1, mul vl
200
@@ -1491,7 +1491,7 @@
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util.S Changed
201
 
1
@@ -60,11 +60,11 @@
2
 function PFX(pixel_var_16x16_neon)
3
     pixel_var_start
4
     mov             w12, #16
5
-.loop_var_16:
6
+.Loop_var_16:
7
     sub             w12, w12, #1
8
     ld1             {v4.16b}, x0, x1
9
     pixel_var_1 v4
10
-    cbnz            w12, .loop_var_16
11
+    cbnz            w12, .Loop_var_16
12
     pixel_var_end
13
     ret
14
 endfunc
15
@@ -72,12 +72,12 @@
16
 function PFX(pixel_var_32x32_neon)
17
     pixel_var_start
18
     mov             w12, #32
19
-.loop_var_32:
20
+.Loop_var_32:
21
     sub             w12, w12, #1
22
     ld1             {v4.16b-v5.16b}, x0, x1
23
     pixel_var_1 v4
24
     pixel_var_1 v5
25
-    cbnz            w12, .loop_var_32
26
+    cbnz            w12, .Loop_var_32
27
     pixel_var_end
28
     ret
29
 endfunc
30
@@ -85,14 +85,14 @@
31
 function PFX(pixel_var_64x64_neon)
32
     pixel_var_start
33
     mov             w12, #64
34
-.loop_var_64:
35
+.Loop_var_64:
36
     sub             w12, w12, #1
37
     ld1             {v4.16b-v7.16b}, x0, x1
38
     pixel_var_1 v4
39
     pixel_var_1 v5
40
     pixel_var_1 v6
41
     pixel_var_1 v7
42
-    cbnz            w12, .loop_var_64
43
+    cbnz            w12, .Loop_var_64
44
     pixel_var_end
45
     ret
46
 endfunc
47
@@ -148,7 +148,7 @@
48
 function PFX(getResidual32_neon)
49
     lsl             x4, x3, #1
50
     mov             w12, #4
51
-.loop_residual_32:
52
+.Loop_residual_32:
53
     sub             w12, w12, #1
54
 .rept 4
55
     ld1             {v0.16b-v1.16b}, x0, x3
56
@@ -166,7 +166,7 @@
57
     st1             {v16.8h-v19.8h}, x2, x4
58
     st1             {v20.8h-v23.8h}, x2, x4
59
 .endr
60
-    cbnz            w12, .loop_residual_32
61
+    cbnz            w12, .Loop_residual_32
62
     ret
63
 endfunc
64
 
65
@@ -221,7 +221,7 @@
66
 function PFX(pixel_sub_ps_32x32_neon)
67
     lsl             x1, x1, #1
68
     mov             w12, #4
69
-.loop_sub_ps_32:
70
+.Loop_sub_ps_32:
71
     sub             w12, w12, #1
72
 .rept 4
73
     ld1             {v0.16b-v1.16b}, x2, x4
74
@@ -239,7 +239,7 @@
75
     st1             {v16.8h-v19.8h}, x0, x1
76
     st1             {v20.8h-v23.8h}, x0, x1
77
 .endr
78
-    cbnz            w12, .loop_sub_ps_32
79
+    cbnz            w12, .Loop_sub_ps_32
80
     ret
81
 endfunc
82
 
83
@@ -247,7 +247,7 @@
84
     lsl             x1, x1, #1
85
     sub             x1, x1, #64
86
     mov             w12, #16
87
-.loop_sub_ps_64:
88
+.Loop_sub_ps_64:
89
     sub             w12, w12, #1
90
 .rept 4
91
     ld1             {v0.16b-v3.16b}, x2, x4
92
@@ -263,7 +263,7 @@
93
     st1             {v16.8h-v19.8h}, x0, #64
94
     st1             {v20.8h-v23.8h}, x0, x1
95
 .endr
96
-    cbnz            w12, .loop_sub_ps_64
97
+    cbnz            w12, .Loop_sub_ps_64
98
     ret
99
 endfunc
100
 
101
@@ -318,7 +318,7 @@
102
 function PFX(pixel_sub_ps_32x64_neon)
103
     lsl             x1, x1, #1
104
     mov             w12, #8
105
-.loop_sub_ps_32x64:
106
+.Loop_sub_ps_32x64:
107
     sub             w12, w12, #1
108
 .rept 4
109
     ld1             {v0.16b-v1.16b}, x2, x4
110
@@ -336,7 +336,7 @@
111
     st1             {v16.8h-v19.8h}, x0, x1
112
     st1             {v20.8h-v23.8h}, x0, x1
113
 .endr
114
-    cbnz            w12, .loop_sub_ps_32x64
115
+    cbnz            w12, .Loop_sub_ps_32x64
116
     ret
117
 endfunc
118
 
119
@@ -383,7 +383,7 @@
120
 function PFX(pixel_add_ps_16x\h\()_neon)
121
     lsl             x5, x5, #1
122
     mov             w12, #\h / 8
123
-.loop_add_ps_16x\h\():
124
+.Loop_add_ps_16x\h\():
125
     sub             w12, w12, #1
126
 .rept 4
127
     ld1             {v0.16b}, x2, x4
128
@@ -405,7 +405,7 @@
129
     st1             {v4.16b}, x0, x1
130
     st1             {v5.16b}, x0, x1
131
 .endr
132
-    cbnz            w12, .loop_add_ps_16x\h
133
+    cbnz            w12, .Loop_add_ps_16x\h
134
     ret
135
 endfunc
136
 .endm
137
@@ -417,7 +417,7 @@
138
  function PFX(pixel_add_ps_32x\h\()_neon)
139
     lsl             x5, x5, #1
140
     mov             w12, #\h / 4
141
-.loop_add_ps_32x\h\():
142
+.Loop_add_ps_32x\h\():
143
     sub             w12, w12, #1
144
 .rept 4
145
     ld1             {v0.16b-v1.16b}, x2, x4
146
@@ -436,7 +436,7 @@
147
     sqxtun2         v5.16b, v27.8h
148
     st1             {v4.16b-v5.16b}, x0, x1
149
 .endr
150
-    cbnz            w12, .loop_add_ps_32x\h
151
+    cbnz            w12, .Loop_add_ps_32x\h
152
     ret
153
 endfunc
154
 .endm
155
@@ -448,7 +448,7 @@
156
     lsl             x5, x5, #1
157
     sub             x5, x5, #64
158
     mov             w12, #32
159
-.loop_add_ps_64x64:
160
+.Loop_add_ps_64x64:
161
     sub             w12, w12, #1
162
 .rept 2
163
     ld1             {v0.16b-v3.16b}, x2, x4
164
@@ -480,7 +480,7 @@
165
     sqxtun2         v3.16b, v7.8h
166
     st1             {v0.16b-v3.16b}, x0, x1
167
 .endr
168
-    cbnz            w12, .loop_add_ps_64x64
169
+    cbnz            w12, .Loop_add_ps_64x64
170
     ret
171
 endfunc
172
 
173
@@ -548,7 +548,7 @@
174
 // void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
175
 function PFX(scale2D_64to32_neon)
176
     mov             w12, #32
177
-.loop_scale2D:
178
+.Loop_scale2D:
179
     ld1             {v0.16b-v3.16b}, x1, x2
180
     sub             w12, w12, #1
181
     ld1             {v4.16b-v7.16b}, x1, x2
182
@@ -561,7 +561,7 @@
183
     uqrshrn         v1.8b, v2.8h, #2
184
     uqrshrn2        v1.16b, v3.8h, #2
185
     st1             {v0.16b-v1.16b}, x0, #32
186
-    cbnz            w12, .loop_scale2D
187
+    cbnz            w12, .Loop_scale2D
188
     ret
189
 endfunc
190
 
191
@@ -569,33 +569,33 @@
192
 function PFX(pixel_planecopy_cp_neon)
193
     dup             v2.16b, w6
194
     sub             x5, x5, #1
195
-.loop_h:
196
+.Loop_h:
197
     mov             x6, x0
198
     mov             x12, x2
199
     mov             x7, #0
200
-.loop_w:
201
x265_3.6.tar.gz/source/common/aarch64/sad-a.S -> x265_4.0.tar.gz/source/common/aarch64/sad-a.S Changed
201
 
1
@@ -1,8 +1,9 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020-2021 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2024 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
  *          Sebastian Pop <spop@amazon.com>
8
+            Hari Limaye <hari.limaye@arm.com>
9
  *
10
  * This program is free software; you can redistribute it and/or modify
11
  * it under the terms of the GNU General Public License as published by
12
@@ -23,7 +24,6 @@
13
  *****************************************************************************/
14
 
15
 #include "asm.S"
16
-#include "sad-a-common.S"
17
 
18
 #ifdef __APPLE__
19
 .section __RODATA,__rodata
20
@@ -35,12 +35,234 @@
21
 
22
 .text
23
 
24
+.macro SAD_START_4 f
25
+    ldr             s0, x0
26
+    ldr             s1, x2
27
+    add             x0, x0, x1
28
+    add             x2, x2, x3
29
+    ld1             {v0.s}1, x0, x1
30
+    ld1             {v1.s}1, x2, x3
31
+    \f              v16.8h, v0.8b, v1.8b
32
+.endm
33
+
34
+.macro SAD_4 h
35
+.rept \h / 2 - 1
36
+    SAD_START_4 uabal
37
+.endr
38
+.endm
39
+
40
+.macro SAD_START_8 f
41
+    ld1             {v0.8b}, x0, x1
42
+    ld1             {v1.8b}, x2, x3
43
+    \f              v16.8h, v0.8b, v1.8b
44
+.endm
45
+
46
+.macro SAD_8 h
47
+.rept \h - 3
48
+    SAD_START_8 uabal
49
+.endr
50
+    ldr             d0, x0
51
+    ldr             d1, x2
52
+    uabal           v16.8h, v0.8b, v1.8b
53
+    ldr             d0, x0, x1
54
+    ldr             d1, x2, x3
55
+    uabal           v16.8h, v0.8b, v1.8b
56
+.endm
57
+
58
+.macro SAD_START_16
59
+    movi            v16.16b, #0
60
+    movi            v17.16b, #0
61
+.endm
62
+
63
+.macro SAD_16
64
+    ld1             {v0.16b}, x0, x1
65
+    ld1             {v1.16b}, x2, x3
66
+    ld1             {v2.16b}, x0, x1
67
+    ld1             {v3.16b}, x2, x3
68
+    uabd            v20.16b, v0.16b, v1.16b
69
+    uadalp          v16.8h, v20.16b
70
+    uabd            v21.16b, v2.16b, v3.16b
71
+    uadalp          v17.8h, v21.16b
72
+.endm
73
+
74
+.macro SAD_END_16
75
+    add             v16.8h, v16.8h, v17.8h
76
+    uaddlv          s0, v16.8h
77
+    fmov            x0, d0
78
+    ret
79
+.endm
80
+
81
+.macro SAD_START_32
82
+    movi            v16.16b, #0
83
+    movi            v17.16b, #0
84
+    movi            v18.16b, #0
85
+    movi            v19.16b, #0
86
+.endm
87
+
88
+.macro SAD_32
89
+    ld1             {v0.16b-v1.16b}, x0, x1
90
+    ld1             {v2.16b-v3.16b}, x2, x3
91
+    ld1             {v4.16b-v5.16b}, x0, x1
92
+    ld1             {v6.16b-v7.16b}, x2, x3
93
+    uabd            v20.16b, v0.16b, v2.16b
94
+    uadalp          v16.8h, v20.16b
95
+    uabd            v21.16b, v1.16b, v3.16b
96
+    uadalp          v17.8h, v21.16b
97
+    uabd            v22.16b, v4.16b, v6.16b
98
+    uadalp          v18.8h, v22.16b
99
+    uabd            v23.16b, v5.16b, v7.16b
100
+    uadalp          v19.8h, v23.16b
101
+.endm
102
+
103
+.macro SAD_END_32
104
+    add             v16.8h, v16.8h, v17.8h
105
+    add             v17.8h, v18.8h, v19.8h
106
+    add             v16.8h, v16.8h, v17.8h
107
+    uaddlv          s0, v16.8h
108
+    fmov            w0, s0
109
+    ret
110
+.endm
111
+
112
+.macro SAD_START_64
113
+    movi            v16.16b, #0
114
+    movi            v17.16b, #0
115
+    movi            v18.16b, #0
116
+    movi            v19.16b, #0
117
+.endm
118
+
119
+.macro SAD_64
120
+    ld1             {v0.16b-v3.16b}, x0, x1
121
+    ld1             {v4.16b-v7.16b}, x2, x3
122
+    ld1             {v24.16b-v27.16b}, x0, x1
123
+    ld1             {v28.16b-v31.16b}, x2, x3
124
+    uabd            v20.16b, v0.16b, v4.16b
125
+    uadalp          v16.8h, v20.16b
126
+    uabd            v21.16b, v1.16b, v5.16b
127
+    uadalp          v17.8h, v21.16b
128
+    uabd            v22.16b, v2.16b, v6.16b
129
+    uadalp          v18.8h, v22.16b
130
+    uabd            v23.16b, v3.16b, v7.16b
131
+    uadalp          v19.8h, v23.16b
132
+    uabd            v20.16b, v24.16b, v28.16b
133
+    uadalp          v16.8h, v20.16b
134
+    uabd            v21.16b, v25.16b, v29.16b
135
+    uadalp          v17.8h, v21.16b
136
+    uabd            v22.16b, v26.16b, v30.16b
137
+    uadalp          v18.8h, v22.16b
138
+    uabd            v23.16b, v27.16b, v31.16b
139
+    uadalp          v19.8h, v23.16b
140
+.endm
141
+
142
+.macro SAD_END_64
143
+    uaddlp          v16.4s, v16.8h
144
+    uadalp          v16.4s, v17.8h
145
+    uadalp          v16.4s, v18.8h
146
+    uadalp          v16.4s, v19.8h
147
+    uaddlv          d0, v16.4s
148
+    fmov            x0, d0
149
+    ret
150
+.endm
151
+
152
+.macro SAD_START_12
153
+    movrel          x12, sad12_mask
154
+    ld1             {v31.16b}, x12
155
+    movi            v16.16b, #0
156
+    movi            v17.16b, #0
157
+.endm
158
+
159
+.macro SAD_12
160
+    ld1             {v0.16b}, x0, x1
161
+    and             v0.16b, v0.16b, v31.16b
162
+    ld1             {v1.16b}, x2, x3
163
+    and             v1.16b, v1.16b, v31.16b
164
+    ld1             {v2.16b}, x0, x1
165
+    and             v2.16b, v2.16b, v31.16b
166
+    ld1             {v3.16b}, x2, x3
167
+    and             v3.16b, v3.16b, v31.16b
168
+    uabd            v20.16b, v0.16b, v1.16b
169
+    uadalp          v16.8h, v20.16b
170
+    uabd            v21.16b, v2.16b, v3.16b
171
+    uadalp          v17.8h, v21.16b
172
+.endm
173
+
174
+.macro SAD_END_12
175
+    add             v16.8h, v16.8h, v17.8h
176
+    uaddlv          s0, v16.8h
177
+    fmov            w0, s0
178
+    ret
179
+.endm
180
+
181
+.macro SAD_START_24
182
+    movi            v16.16b, #0
183
+    movi            v17.16b, #0
184
+    sub             x1, x1, #16
185
+    sub             x3, x3, #16
186
+.endm
187
+
188
+.macro SAD_24
189
+    ld1             {v0.16b}, x0, #16
190
+    ld1             {v1.8b}, x0, x1
191
+    ld1             {v2.16b}, x2, #16
192
+    ld1             {v3.8b}, x2, x3
193
+    ld1             {v4.16b}, x0, #16
194
+    ld1             {v5.8b}, x0, x1
195
+    ld1             {v6.16b}, x2, #16
196
+    ld1             {v7.8b}, x2, x3
197
+    uabd            v20.16b, v0.16b, v2.16b
198
+    uadalp          v16.8h, v20.16b
199
+    uabal           v17.8h, v1.8b, v3.8b
200
+    uabd            v20.16b, v4.16b, v6.16b
201
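
Note on the SAD kernels in sad-a.S above: the core pattern is uabd (per-byte absolute difference) followed by uadalp (pairwise widening accumulate) and a final horizontal add. A rough intrinsics equivalent for one 16-pixel-wide block, illustrative only (the function name and parameters are invented, and the row count is assumed small enough that the 16-bit lanes cannot overflow):

    #include <arm_neon.h>
    #include <cstdint>

    // 16xH SAD using the uabd + uadalp pattern from the assembly above.
    static inline uint32_t sad_16xH(const uint8_t *a, intptr_t strideA,
                                    const uint8_t *b, intptr_t strideB, int h)
    {
        uint16x8_t acc = vdupq_n_u16(0);
        for (int y = 0; y < h; y++)
        {
            uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b)); // uabd
            acc = vpadalq_u8(acc, diff);                          // uadalp
            a += strideA;
            b += strideB;
        }
        return vaddlvq_u16(acc);                                  // uaddlv
    }
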
x265_4.0.tar.gz/source/common/aarch64/sad-neon-dotprod.S Added
201
 
1
@@ -0,0 +1,330 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8.2-a+dotprod
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+// Fully unrolled with single accumulator for smaller block heights.
40
+.macro SAD_NEON_DOTPROD_16_S h
41
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
42
+    movi            v0.16b, #0
43
+    movi            v1.16b, #1
44
+.rept \h - 2
45
+    ldr             q2, x0
46
+    ldr             q3, x2
47
+    add             x0, x0, x1
48
+    add             x2, x2, x3
49
+    uabd            v4.16b, v2.16b, v3.16b
50
+    udot            v0.4s, v4.16b, v1.16b
51
+.endr
52
+    ldr             q2, x0
53
+    ldr             q3, x2
54
+    uabd            v4.16b, v2.16b, v3.16b
55
+    udot            v0.4s, v4.16b, v1.16b
56
+    ldr             q2, x0, x1
57
+    ldr             q3, x2, x3
58
+    uabd            v4.16b, v2.16b, v3.16b
59
+    udot            v0.4s, v4.16b, v1.16b
60
+
61
+    addv            s0, v0.4s
62
+    fmov            w0, s0
63
+    ret
64
+endfunc
65
+.endm
66
+
67
+.macro SAD_NEON_DOTPROD_START
68
+    // v31: 1 across all lanes for use in UDOT instructions.
69
+    movi            v31.16b, #1
70
+    movi            v16.16b, #0
71
+    movi            v17.16b, #0
72
+.endm
73
+
74
+.macro SAD_NEON_DOTPROD_END
75
+    add             v16.4s, v16.4s, v17.4s
76
+    addv            s0, v16.4s
77
+    fmov            w0, s0
78
+    ret
79
+.endm
80
+
81
+// Fully unrolled.
82
+.macro SAD_NEON_DOTPROD_16 h
83
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
84
+    SAD_NEON_DOTPROD_START
85
+.rept \h / 2
86
+    ld1             {v0.16b}, x0, x1
87
+    ld1             {v1.16b}, x0, x1
88
+    ld1             {v2.16b}, x2, x3
89
+    ld1             {v3.16b}, x2, x3
90
+    uabd            v20.16b, v0.16b, v2.16b
91
+    udot            v16.4s, v20.16b, v31.16b
92
+    uabd            v21.16b, v1.16b, v3.16b
93
+    udot            v17.4s, v21.16b, v31.16b
94
+.endr
95
+    SAD_NEON_DOTPROD_END
96
+endfunc
97
+.endm
98
+
99
+// Process four rows of width 32.
100
+.macro SAD_NEON_DOTPROD_32
101
+.rept 4
102
+    ld1             {v0.16b-v1.16b}, x0, x1
103
+    ld1             {v2.16b-v3.16b}, x2, x3
104
+    uabd            v20.16b, v0.16b, v2.16b
105
+    udot            v16.4s, v20.16b, v31.16b
106
+    uabd            v21.16b, v1.16b, v3.16b
107
+    udot            v17.4s, v21.16b, v31.16b
108
+.endr
109
+.endm
110
+
111
+// Process four rows of width 48.
112
+.macro SAD_NEON_DOTPROD_48
113
+.rept 4
114
+    ld1             {v0.16b-v2.16b}, x0, x1
115
+    ld1             {v4.16b-v6.16b}, x2, x3
116
+    uabd            v20.16b, v0.16b, v4.16b
117
+    udot            v16.4s, v20.16b, v31.16b
118
+    uabd            v21.16b, v1.16b, v5.16b
119
+    udot            v17.4s, v21.16b, v31.16b
120
+    uabd            v20.16b, v2.16b, v6.16b
121
+    udot            v16.4s, v20.16b, v31.16b
122
+.endr
123
+.endm
124
+
125
+// Process four rows of width 64.
126
+.macro SAD_NEON_DOTPROD_64
127
+.rept 4
128
+    ld1             {v0.16b-v3.16b}, x0, x1
129
+    ld1             {v4.16b-v7.16b}, x2, x3
130
+    uabd            v20.16b, v0.16b, v4.16b
131
+    udot            v16.4s, v20.16b, v31.16b
132
+    uabd            v21.16b, v1.16b, v5.16b
133
+    udot            v17.4s, v21.16b, v31.16b
134
+    uabd            v20.16b, v2.16b, v6.16b
135
+    udot            v16.4s, v20.16b, v31.16b
136
+    uabd            v21.16b, v3.16b, v7.16b
137
+    udot            v17.4s, v21.16b, v31.16b
138
+.endr
139
+.endm
140
+
141
+// Loop unrolled to process 4 rows per iteration.
142
+.macro SAD_NEON_DOTPROD_LOOP w, h
143
+function PFX(pixel_sad_\w\()x\h\()_neon_dotprod)
144
+    SAD_NEON_DOTPROD_START
145
+    mov             w9, #\h/4
146
+.Loop_\w\()x\h:
147
+    sub             w9, w9, #1
148
+
149
+    SAD_NEON_DOTPROD_\w
150
+
151
+    cbnz            w9, .Loop_\w\()x\h
152
+    SAD_NEON_DOTPROD_END
153
+endfunc
154
+.endm
155
+
156
+SAD_NEON_DOTPROD_16_S 4
157
+SAD_NEON_DOTPROD_16_S 8
158
+SAD_NEON_DOTPROD_16_S 12
159
+SAD_NEON_DOTPROD_16_S 16
160
+SAD_NEON_DOTPROD_16 32
161
+SAD_NEON_DOTPROD_16 64
162
+SAD_NEON_DOTPROD_LOOP  32, 8
163
+SAD_NEON_DOTPROD_LOOP  32, 16
164
+SAD_NEON_DOTPROD_LOOP  32, 24
165
+SAD_NEON_DOTPROD_LOOP  32, 32
166
+SAD_NEON_DOTPROD_LOOP  32, 64
167
+SAD_NEON_DOTPROD_LOOP  48, 64
168
+SAD_NEON_DOTPROD_LOOP  64, 16
169
+SAD_NEON_DOTPROD_LOOP  64, 32
170
+SAD_NEON_DOTPROD_LOOP  64, 48
171
+SAD_NEON_DOTPROD_LOOP  64, 64
172
+
173
+.macro PREP_ARGS_SAD_X_NEON_DOTPROD x
174
+    mov             x9, #FENC_STRIDE
175
+
176
+// Make function arguments for x == 3 look like x == 4.
177
+.if \x == 3
178
+    mov             x6, x5
179
+    mov             x5, x4
180
+.endif
181
+
182
+    // v31: 1 across all lanes for use in UDOT instructions.
183
+    movi            v31.16b, #1
184
+.endm
185
+
186
+.macro SAD_X_NEON_DOTPROD_START x
187
+    movi v16.4s, #0
188
+    movi v17.4s, #0
189
+    movi v18.4s, #0
190
+.if \x == 4
191
+    movi v19.4s, #0
192
+.endif
193
+.endm
194
+
195
+.macro SAD_X_NEON_DOTPROD_END x
196
+.if \x == 3
197
+    addv            s0, v16.4s
198
+    addv            s1, v17.4s
199
+    addv            s2, v18.4s
200
+    stp             s0, s1, x6
201
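
Note on the new sad-neon-dotprod.S above: on cores with the Armv8.4 DotProd extension, UDOT against a vector of ones sums groups of four absolute differences directly into 32-bit lanes, removing the intermediate 16-bit accumulators of the plain Neon path. A rough intrinsics sketch, illustrative only (invented names; requires a DotProd-enabled target such as -march=armv8.2-a+dotprod):

    #include <arm_neon.h>
    #include <cstdint>

    // 16xH SAD using uabd + udot, mirroring the kernels in the file above.
    static inline uint32_t sad_16xH_dotprod(const uint8_t *a, intptr_t strideA,
                                            const uint8_t *b, intptr_t strideB, int h)
    {
        const uint8x16_t ones = vdupq_n_u8(1);   // v31 in the assembly above
        uint32x4_t acc = vdupq_n_u32(0);
        for (int y = 0; y < h; y++)
        {
            uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));
            acc = vdotq_u32(acc, diff, ones);    // 4-way dot product per u32 lane
            a += strideA;
            b += strideB;
        }
        return vaddvq_u32(acc);                  // addv
    }
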
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve.cpp Added
201
 
1
@@ -0,0 +1,271 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+
27
+/*
28
+ * Compute Edge Offset statistics (count and stats).
29
+ * To save some instructions compute count and stats as negative values - since
30
+ * output of Neon comparison instructions for a matched condition is all 1s (-1).
31
+ */
32
+static inline void compute_eo_stats(const int8x16_t edge_type,
33
+                                    const int16_t *diff, int16x8_t *count,
34
+                                    int64x2_t *stats)
35
+{
36
+    // Create a mask for each edge type.
37
+    int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
38
+    int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
39
+    int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
40
+    int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
41
+    int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
42
+
43
+    // Compute negative counts for each edge type.
44
+    count[0] = vpadalq_s8(count[0], mask0);
45
+    count[1] = vpadalq_s8(count[1], mask1);
46
+    count[2] = vpadalq_s8(count[2], mask2);
47
+    count[3] = vpadalq_s8(count[3], mask3);
48
+    count[4] = vpadalq_s8(count[4], mask4);
49
+
50
+    // Widen the masks to 16-bit.
51
+    int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
52
+    int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
53
+    int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
54
+    int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
55
+    int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
56
+    int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
57
+    int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
58
+    int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
59
+    int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
60
+    int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
61
+
62
+    int16x8_t diff_lo = vld1q_s16(diff);
63
+    int16x8_t diff_hi = vld1q_s16(diff + 8);
64
+
65
+    // Compute negative stats for each edge type.
66
+    stats[0] = x265_sdotq_s16(stats[0], diff_lo, mask0_lo);
67
+    stats[0] = x265_sdotq_s16(stats[0], diff_hi, mask0_hi);
68
+    stats[1] = x265_sdotq_s16(stats[1], diff_lo, mask1_lo);
69
+    stats[1] = x265_sdotq_s16(stats[1], diff_hi, mask1_hi);
70
+    stats[2] = x265_sdotq_s16(stats[2], diff_lo, mask2_lo);
71
+    stats[2] = x265_sdotq_s16(stats[2], diff_hi, mask2_hi);
72
+    stats[3] = x265_sdotq_s16(stats[3], diff_lo, mask3_lo);
73
+    stats[3] = x265_sdotq_s16(stats[3], diff_hi, mask3_hi);
74
+    stats[4] = x265_sdotq_s16(stats[4], diff_lo, mask4_lo);
75
+    stats[4] = x265_sdotq_s16(stats[4], diff_hi, mask4_hi);
76
+}
77
+
78
+/*
79
+ * Reduce and store Edge Offset statistics (count and stats).
80
+ */
81
+static inline void reduce_eo_stats(int64x2_t *vstats, int16x8_t *vcount,
82
+                                   int32_t *stats, int32_t *count)
83
+{
84
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
85
+    int16x8_t c01 = vpaddq_s16(vcount[2], vcount[0]);
86
+    int16x8_t c23 = vpaddq_s16(vcount[1], vcount[3]);
87
+    int16x8_t c0123 = vpaddq_s16(c01, c23);
88
+    // Subtract from current count, as we calculate the negation.
89
+    vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123)));
90
+    count[4] -= vaddvq_s16(vcount[4]);
91
+
92
+    int32x4_t s01 = vcombine_s32(vmovn_s64(vstats[2]), vmovn_s64(vstats[0]));
93
+    int32x4_t s23 = vcombine_s32(vmovn_s64(vstats[1]), vmovn_s64(vstats[3]));
94
+    int32x4_t s0123 = vpaddq_s32(s01, s23);
95
+    // Subtract from current stats, as we calculate the negation.
96
+    vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
97
+    stats[4] -= vaddvq_s64(vstats[4]);
98
+}
99
+
100
+namespace X265_NS {
101
+void saoCuStatsE0_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
102
+                      int endX, int endY, int32_t *stats, int32_t *count)
103
+{
104
+    // Separate buffers for each edge type, so that we can vectorise.
105
+    int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
106
+                               vdupq_n_s16(0), vdupq_n_s16(0) };
107
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
108
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
109
+
110
+    for (int y = 0; y < endY; y++)
111
+    {
112
+        // Calculate negated sign_left(x) directly, to save negation when
113
+        // reusing sign_right(x) as sign_left(x + 1).
114
+        int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec[-1] - rec[0]));
115
+        for (int x = 0; x < endX; x += 16)
116
+        {
117
+            int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1);
118
+
119
+            // neg_sign_left(x) = sign_right(x + 1), reusing one from previous
120
+            // iteration.
121
+            neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15);
122
+
123
+            // Subtract instead of add, as sign_left is negated.
124
+            int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left);
125
+
126
+            // For reuse in the next iteration.
127
+            neg_sign_left = sign_right;
128
+
129
+            edge_type = x265_sve_mask(x, endX, edge_type);
130
+            compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats);
131
+        }
132
+
133
+        diff += MAX_CU_SIZE;
134
+        rec += stride;
135
+    }
136
+
137
+    reduce_eo_stats(tmp_stats, tmp_count, stats, count);
138
+}
139
+
140
+void saoCuStatsE1_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
141
+                      int8_t *upBuff1, int endX, int endY, int32_t *stats,
142
+                      int32_t *count)
143
+{
144
+    // Separate buffers for each edge type, so that we can vectorise.
145
+    int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
146
+                               vdupq_n_s16(0), vdupq_n_s16(0) };
147
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
148
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
149
+
150
+    // Negate upBuff1 (sign_up), so we can subtract and save repeated negations.
151
+    for (int x = 0; x < endX; x += 16)
152
+    {
153
+        vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
154
+    }
155
+
156
+    for (int y = 0; y < endY; y++)
157
+    {
158
+        for (int x = 0; x < endX; x += 16)
159
+        {
160
+            int8x16_t sign_up = vld1q_s8(upBuff1 + x);
161
+            int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride);
162
+
163
+            // Subtract instead of add, as sign_up is negated.
164
+            int8x16_t edge_type = vsubq_s8(sign_down, sign_up);
165
+
166
+            // For reuse in the next iteration.
167
+            vst1q_s8(upBuff1 + x, sign_down);
168
+
169
+            edge_type = x265_sve_mask(x, endX, edge_type);
170
+            compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats);
171
+        }
172
+
173
+        diff += MAX_CU_SIZE;
174
+        rec += stride;
175
+    }
176
+
177
+    reduce_eo_stats(tmp_stats, tmp_count, stats, count);
178
+}
179
+
180
+void saoCuStatsE2_sve(const int16_t *diff, const pixel *rec, intptr_t stride,
181
+                      int8_t *upBuff1, int8_t *upBufft, int endX, int endY,
182
+                      int32_t *stats, int32_t *count)
183
+{
184
+    // Separate buffers for each edge type, so that we can vectorise.
185
+    int16x8_t tmp_count[5] = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0),
186
+                               vdupq_n_s16(0), vdupq_n_s16(0) };
187
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
188
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
189
+
190
+    // Negate upBuff1 (sign_up) so we can subtract and save repeated negations.
191
+    for (int x = 0; x < endX; x += 16)
192
+    {
193
+        vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
194
+    }
195
+
196
+    for (int y = 0; y < endY; y++)
197
+    {
198
+        upBufft[0] = x265_signOf(rec[-1] - rec[stride]);
199
+        for (int x = 0; x < endX; x += 16)
200
+        {
201
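
Note on compute_eo_stats in sao-prim-sve.cpp above: Neon compares return all-ones (-1) per matching lane, so the code accumulates the comparison masks directly and obtains negated counts, paying for the sign only once, as a subtraction from the destination, in the final reduction. A small self-contained illustration of the same idea, with invented names and not taken from x265:

    #include <arm_neon.h>
    #include <cstdint>

    // Counts how many bytes of edge_type equal 'wanted'; n is assumed to be a
    // multiple of 16. The loop adds -1 per match and negates once at the end.
    static inline int count_edge_type(const int8_t *edge_type, int n, int8_t wanted)
    {
        int16x8_t neg_count = vdupq_n_s16(0);
        for (int i = 0; i < n; i += 16)
        {
            int8x16_t v = vld1q_s8(edge_type + i);
            int8x16_t mask = vreinterpretq_s8_u8(vceqq_s8(v, vdupq_n_s8(wanted)));
            neg_count = vpadalq_s8(neg_count, mask);
        }
        return -vaddvq_s16(neg_count);
    }
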
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve2.cpp Added
201
 
1
@@ -0,0 +1,317 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+
27
+static inline uint8x16_t sve_count(int8x16_t in)
28
+{
29
+    // We do not care about initialising the values in the rest of the vector,
30
+    // for VL > 128, as HISTSEG counts matching elements in 128-bit segments.
31
+    svint8_t edge_type = svset_neonq_s8(svundef_s8(), in);
32
+
33
+    // Use an arbitrary value outside of range [-2, 2] for lanes we don't
34
+    // need to use the result from.
35
+    const int DC = -3;
36
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
37
+    // We use (edge_class - 2) resulting in   {0, -2, -1, 1, 2}
38
+    int8x16_t idx = { 0, -2, -1, 1, 2, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC,
39
+                      DC };
40
+    svint8_t svidx = svset_neonq_s8(svundef_s8(), idx);
41
+
42
+    svuint8_t count = svhistseg_s8(svidx, edge_type);
43
+    return svget_neonq_u8(count);
44
+}
45
+
46
+/*
47
+ * Compute Edge Offset statistics (stats array).
48
+ * To save some instructions compute stats as negative values - since output of
49
+ * Neon comparison instructions for a matched condition is all 1s (-1).
50
+ */
51
+static inline void compute_eo_stats(const int8x16_t edge_type,
52
+                                    const int16_t *diff, int64x2_t *stats)
53
+{
54
+    // Create a mask for each edge type.
55
+    int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
56
+    int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
57
+    int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
58
+    int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
59
+    int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
60
+
61
+    // Widen the masks to 16-bit.
62
+    int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
63
+    int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
64
+    int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
65
+    int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
66
+    int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
67
+    int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
68
+    int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
69
+    int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
70
+    int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
71
+    int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
72
+
73
+    int16x8_t diff_lo = vld1q_s16(diff);
74
+    int16x8_t diff_hi = vld1q_s16(diff + 8);
75
+
76
+    // Compute negative stats for each edge type.
77
+    stats[0] = x265_sdotq_s16(stats[0], diff_lo, mask0_lo);
78
+    stats[0] = x265_sdotq_s16(stats[0], diff_hi, mask0_hi);
79
+    stats[1] = x265_sdotq_s16(stats[1], diff_lo, mask1_lo);
80
+    stats[1] = x265_sdotq_s16(stats[1], diff_hi, mask1_hi);
81
+    stats[2] = x265_sdotq_s16(stats[2], diff_lo, mask2_lo);
82
+    stats[2] = x265_sdotq_s16(stats[2], diff_hi, mask2_hi);
83
+    stats[3] = x265_sdotq_s16(stats[3], diff_lo, mask3_lo);
84
+    stats[3] = x265_sdotq_s16(stats[3], diff_hi, mask3_hi);
85
+    stats[4] = x265_sdotq_s16(stats[4], diff_lo, mask4_lo);
86
+    stats[4] = x265_sdotq_s16(stats[4], diff_hi, mask4_hi);
87
+}
88
+
89
+/*
90
+ * Reduce and store Edge Offset statistics (count and stats).
91
+ */
92
+static inline void reduce_eo_stats(int64x2_t *vstats, uint16x8_t vcount,
93
+                                   int32_t *stats, int32_t *count)
94
+{
95
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
96
+    // We already have the count values in the correct order for the store,
97
+    // so widen to 32-bit and accumulate to the destination.
98
+    int32x4_t c0123 = vmovl_s16(vget_low_s16(vreinterpretq_s16_u16(vcount)));
99
+    vst1q_s32(count, vaddq_s32(vld1q_s32(count), c0123));
100
+    count[4] += vcount[4];
101
+
102
+    int32x4_t s01 = vcombine_s32(vmovn_s64(vstats[2]), vmovn_s64(vstats[0]));
103
+    int32x4_t s23 = vcombine_s32(vmovn_s64(vstats[1]), vmovn_s64(vstats[3]));
104
+    int32x4_t s0123 = vpaddq_s32(s01, s23);
105
+    // Subtract from current stats, as we calculate the negation.
106
+    vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
107
+    stats[4] -= vaddvq_s64(vstats[4]);
108
+}
109
+
110
+namespace X265_NS {
111
+void saoCuStatsE0_sve2(const int16_t *diff, const pixel *rec, intptr_t stride,
112
+                       int endX, int endY, int32_t *stats, int32_t *count)
113
+{
114
+    // Separate buffers for each edge type, so that we can vectorise.
115
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
116
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
117
+    uint16x8_t count_acc_u16 = vdupq_n_u16(0);
118
+
119
+    for (int y = 0; y < endY; y++)
120
+    {
121
+        uint8x16_t count_acc_u8 = vdupq_n_u8(0);
122
+
123
+        // Calculate negated sign_left(x) directly, to save negation when
124
+        // reusing sign_right(x) as sign_left(x + 1).
125
+        int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec[-1] - rec[0]));
126
+        for (int x = 0; x < endX; x += 16)
127
+        {
128
+            int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1);
129
+
130
+            // neg_sign_left(x) = sign_right(x + 1), reusing one from previous
131
+            // iteration.
132
+            neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15);
133
+
134
+            // Subtract instead of add, as sign_left is negated.
135
+            int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left);
136
+
137
+            // For reuse in the next iteration.
138
+            neg_sign_left = sign_right;
139
+
140
+            edge_type = x265_sve_mask(x, endX, edge_type);
141
+            count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type));
142
+            compute_eo_stats(edge_type, diff + x, tmp_stats);
143
+        }
144
+
145
+        // The width (endX) can be a maximum of 64, so we can safely
146
+        // widen from 8-bit count accumulators after one inner loop iteration.
147
+        // Technically the largest an accumulator could reach after one inner
148
+        // loop iteration is 64, if every input value had the same edge type, so
149
+        // we could complete two iterations (2 * 64 = 128) before widening.
150
+        count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8));
151
+
152
+        diff += MAX_CU_SIZE;
153
+        rec += stride;
154
+    }
155
+
156
+    reduce_eo_stats(tmp_stats, count_acc_u16, stats, count);
157
+}
158
+
159
+void saoCuStatsE1_sve2(const int16_t *diff, const pixel *rec, intptr_t stride,
160
+                       int8_t *upBuff1, int endX, int endY, int32_t *stats,
161
+                       int32_t *count)
162
+{
163
+    // Separate buffers for each edge type, so that we can vectorise.
164
+    int64x2_t tmp_stats[5] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
165
+                               vdupq_n_s64(0), vdupq_n_s64(0) };
166
+    uint16x8_t count_acc_u16 = vdupq_n_u16(0);
167
+
168
+    // Negate upBuff1 (sign_up), so we can subtract and save repeated negations.
169
+    for (int x = 0; x < endX; x += 16)
170
+    {
171
+        vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x)));
172
+    }
173
+
174
+    for (int y = 0; y < endY; y++)
175
+    {
176
+        uint8x16_t count_acc_u8 = vdupq_n_u8(0);
177
+
178
+        for (int x = 0; x < endX; x += 16)
179
+        {
180
+            int8x16_t sign_up = vld1q_s8(upBuff1 + x);
181
+            int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride);
182
+
183
+            // Subtract instead of add, as sign_up is negated.
184
+            int8x16_t edge_type = vsubq_s8(sign_down, sign_up);
185
+
186
+            // For reuse in the next iteration.
187
+            vst1q_s8(upBuff1 + x, sign_down);
188
+
189
+            edge_type = x265_sve_mask(x, endX, edge_type);
190
+            count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type));
191
+            compute_eo_stats(edge_type, diff + x, tmp_stats);
192
+        }
193
+
194
+        // The width (endX) can be a maximum of 64, so we can safely
195
+        // widen from 8-bit count accumulators after one inner loop iteration.
196
+        // Technically the largest an accumulator could reach after one inner
197
+        // loop iteration is 64, if every input value had the same edge type, so
198
+        // we could complete two iterations (2 * 64 = 128) before widening.
199
+        count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8));
200
+
201
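
Note on sve_count in sao-prim-sve2.cpp above: the SVE2 HISTSEG instruction counts, for each byte of its first operand, how many bytes of the second operand within the same 128-bit segment are equal to it, so one instruction yields the per-class counts for a whole 16-byte edge_type vector. A sketch of that use via the Neon-SVE bridge, illustrative only (the helper name is invented; assumes an SVE2-capable core and an SVE2-enabled compile):

    #include <arm_neon.h>
    #include <arm_sve.h>
    #include <arm_neon_sve_bridge.h>

    // Lanes 0-4 of the result hold the counts of the five edge classes (stored
    // offset by -2, as in the file above); -3 cannot occur, so the remaining
    // lanes produce counts that the caller simply ignores.
    static inline uint8x16_t count_edge_classes(int8x16_t edge_type)
    {
        const int8x16_t classes = { 0, -2, -1, 1, 2, -3, -3, -3,
                                    -3, -3, -3, -3, -3, -3, -3, -3 };
        svint8_t needles  = svset_neonq_s8(svundef_s8(), classes);
        svint8_t haystack = svset_neonq_s8(svundef_s8(), edge_type);
        return svget_neonq_u8(svhistseg_s8(needles, haystack));
    }
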
x265_4.0.tar.gz/source/common/aarch64/sao-prim.cpp Added
201
 
1
@@ -0,0 +1,380 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "sao-prim.h"
26
+#include "sao.h"
27
+#include <arm_neon.h>
28
+
29
+// Predicate mask indices.
30
+static const int8_t quad_reg_byte_indices[16] = {
31
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
32
+};
33
+
34
+static inline int8x16_t mask_inactive_elems(const int rem, int8x16_t edge_type)
35
+{
36
+    // Compute a predicate mask where the bits of an element are 0 if the index
37
+    // is less than the remainder (active), and 1 otherwise.
38
+    const int8x16_t indices = vld1q_s8(quad_reg_byte_indices);
39
+    int8x16_t pred = vreinterpretq_s8_u8(vcgeq_s8(indices, vdupq_n_s8(rem)));
40
+
41
+    // Use predicate mask to shift "unused lanes" outside of range [-2, 2]
42
+    pred = vshlq_n_s8(pred, 3);
43
+    return veorq_s8(edge_type, pred);
44
+}
45
+
46
+/*
47
+ * Compute Edge Offset statistics (count and stats).
48
+ * To save some instructions compute count and stats as negative values - since
49
+ * output of Neon comparison instructions for a matched condition is all 1s (-1).
50
+ */
51
+static inline void compute_eo_stats(const int8x16_t edge_type,
52
+                                    const int16_t *diff, int16x8_t *count,
53
+                                    int32x4_t *stats)
54
+{
55
+    // Create a mask for each edge type.
56
+    int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2)));
57
+    int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1)));
58
+    int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0)));
59
+    int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1)));
60
+    int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2)));
61
+
62
+    // Compute negative counts for each edge type.
63
+    count[0] = vpadalq_s8(count[0], mask0);
64
+    count[1] = vpadalq_s8(count[1], mask1);
65
+    count[2] = vpadalq_s8(count[2], mask2);
66
+    count[3] = vpadalq_s8(count[3], mask3);
67
+    count[4] = vpadalq_s8(count[4], mask4);
68
+
69
+    // Widen the masks to 16-bit.
70
+    int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0));
71
+    int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0));
72
+    int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1));
73
+    int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1));
74
+    int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2));
75
+    int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2));
76
+    int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3));
77
+    int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3));
78
+    int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4));
79
+    int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4));
80
+
81
+    int16x8_t diff_lo = vld1q_s16(diff);
82
+    int16x8_t diff_hi = vld1q_s16(diff + 8);
83
+
84
+    // Compute negative stats for each edge type.
85
+    int16x8_t stats0 = vmulq_s16(diff_lo, mask0_lo);
86
+    int16x8_t stats1 = vmulq_s16(diff_lo, mask1_lo);
87
+    int16x8_t stats2 = vmulq_s16(diff_lo, mask2_lo);
88
+    int16x8_t stats3 = vmulq_s16(diff_lo, mask3_lo);
89
+    int16x8_t stats4 = vmulq_s16(diff_lo, mask4_lo);
90
+    stats0 = vmlaq_s16(stats0, diff_hi, mask0_hi);
91
+    stats1 = vmlaq_s16(stats1, diff_hi, mask1_hi);
92
+    stats2 = vmlaq_s16(stats2, diff_hi, mask2_hi);
93
+    stats3 = vmlaq_s16(stats3, diff_hi, mask3_hi);
94
+    stats4 = vmlaq_s16(stats4, diff_hi, mask4_hi);
95
+
96
+    stats[0] = vpadalq_s16(stats[0], stats0);
97
+    stats[1] = vpadalq_s16(stats[1], stats1);
98
+    stats[2] = vpadalq_s16(stats[2], stats2);
99
+    stats[3] = vpadalq_s16(stats[3], stats3);
100
+    stats[4] = vpadalq_s16(stats[4], stats4);
101
+}
102
+
103
+/*
104
+ * Reduce and store Edge Offset statistics (count and stats).
105
+ */
106
+static inline void reduce_eo_stats(int32x4_t *vstats, int16x8_t *vcount,
107
+                                   int32_t *stats, int32_t *count)
108
+{
109
+    // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}.
110
+    int16x8_t c01 = vpaddq_s16(vcount[2], vcount[0]);
111
+    int16x8_t c23 = vpaddq_s16(vcount[1], vcount[3]);
112
+    int16x8_t c0123 = vpaddq_s16(c01, c23);
113
+
114
+    // Subtract from current count, as we calculate the negation.
115
+    vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123)));
116
+    count[4] -= vaddvq_s16(vcount[4]);
117
+
118
+    int32x4_t s01 = vpaddq_s32(vstats[2], vstats[0]);
119
+    int32x4_t s23 = vpaddq_s32(vstats[1], vstats[3]);
120
+    int32x4_t s0123 = vpaddq_s32(s01, s23);
121
+
122
+    // Subtract from current stats, as we calculate the negation.
123
+    vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123));
124
+    stats[4] -= vaddvq_s32(vstats[4]);
125
+}
126
+
127
+namespace X265_NS {
128
+void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
129
+                       int endX, int endY, int32_t *stats, int32_t *count)
130
+{
131
+#if HIGH_BIT_DEPTH
132
+    const int n_elem = 4;
133
+    const int elem_width = 16;
134
+#else
135
+    const int n_elem = 8;
136
+    const int elem_width = 8;
137
+#endif
138
+
139
+    // Additional temporary buffer for accumulation.
140
+    int32_t stats_tmp[32] = { 0 };
141
+    int32_t count_tmp[32] = { 0 };
142
+
143
+    // Byte-addressable pointers to buffers, to optimise address calculation.
144
+    uint8_t *stats_b[2] = {
145
+        reinterpret_cast<uint8_t *>(stats),
146
+        reinterpret_cast<uint8_t *>(stats_tmp),
147
+    };
148
+    uint8_t *count_b[2] = {
149
+        reinterpret_cast<uint8_t *>(count),
150
+        reinterpret_cast<uint8_t *>(count_tmp),
151
+    };
152
+
153
+    // Combine shift for index calculation with shift for address calculation.
154
+    const int right_shift = X265_DEPTH - X265_NS::SAO::SAO_BO_BITS;
155
+    const int left_shift = 2;
156
+    const int shift = right_shift - left_shift;
157
+    // Mask out bits 7, 1 & 0 to account for combination of shifts.
158
+    const int mask = 0x7c;
159
+
160
+    // Compute statistics into temporary buffers.
161
+    for (int y = 0; y < endY; y++)
162
+    {
163
+        int x = 0;
164
+        for (; x + n_elem < endX; x += n_elem)
165
+        {
166
+            uint64_t class_idx_64 =
167
+                *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
168
+
169
+            for (int i = 0; i < n_elem; ++i)
170
+            {
171
+                const int idx = i & 1;
172
+                const int off  = (class_idx_64 >> (i * elem_width)) & mask;
173
+                *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
174
+                *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
175
+            }
176
+        }
177
+
178
+        if (x < endX)
179
+        {
180
+            uint64_t class_idx_64 =
181
+                *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
182
+
183
+            for (int i = 0; (i + x) < endX; ++i)
184
+            {
185
+                const int idx = i & 1;
186
+                const int off  = (class_idx_64 >> (i * elem_width)) & mask;
187
+                *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
188
+                *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
189
+            }
190
+        }
191
+
192
+        diff += MAX_CU_SIZE;
193
+        rec += stride;
194
+    }
195
+
196
+    // Reduce temporary buffers to destination using Neon.
197
+    for (int i = 0; i < 32; i += 4)
198
+    {
199
+        int32x4_t s0 = vld1q_s32(stats_tmp + i);
200
+        int32x4_t s1 = vld1q_s32(stats + i);
201
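
Note on saoCuStatsBO_neon above: the band class of a reconstructed pixel is its top SAO_BO_BITS bits, and the code folds the class computation and the conversion to a byte offset into a single shift plus the 0x7c mask. A scalar sketch of that arithmetic for the 8-bit case (assuming X265_DEPTH is 8 and 32 bands, i.e. SAO_BO_BITS is 5; the helper name is invented):

    #include <cstdint>

    // (rec >> 3) selects one of 32 bands; multiplying by 4 gives the byte
    // offset into an int32 table. Folded into one step: (rec >> 1) & 0x7c,
    // with bits 0, 1 and 7 masked out exactly as in the comment above.
    static inline int bo_byte_offset(uint8_t rec)
    {
        return (rec >> 1) & 0x7c;   // == (rec >> 3) * 4
    }
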
x265_4.0.tar.gz/source/common/aarch64/sao-prim.h Added
72
 
1
@@ -0,0 +1,70 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_COMMON_AARCH64_SAO_PRIM_H
26
+#define X265_COMMON_AARCH64_SAO_PRIM_H
27
+
28
+#include "neon-sve-bridge.h"
29
+#include "primitives.h"
30
+#include <arm_neon.h>
31
+
32
+static inline int8x16_t signOf_neon(const pixel *a, const pixel *b)
33
+{
34
+#if HIGH_BIT_DEPTH
35
+    uint16x8_t s0_lo = vld1q_u16(a);
36
+    uint16x8_t s0_hi = vld1q_u16(a + 8);
37
+    uint16x8_t s1_lo = vld1q_u16(b);
38
+    uint16x8_t s1_hi = vld1q_u16(b + 8);
39
+
40
+    // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
41
+    int16x8_t cmp0_lo = vreinterpretq_s16_u16(vcgtq_u16(s0_lo, s1_lo));
42
+    int16x8_t cmp0_hi = vreinterpretq_s16_u16(vcgtq_u16(s0_hi, s1_hi));
43
+    int16x8_t cmp1_lo = vreinterpretq_s16_u16(vcgtq_u16(s1_lo, s0_lo));
44
+    int16x8_t cmp1_hi = vreinterpretq_s16_u16(vcgtq_u16(s1_hi, s0_hi));
45
+
46
+    int8x16_t cmp0 = vcombine_s8(vmovn_s16(cmp0_lo), vmovn_s16(cmp0_hi));
47
+    int8x16_t cmp1 = vcombine_s8(vmovn_s16(cmp1_lo), vmovn_s16(cmp1_hi));
48
+#else // HIGH_BIT_DEPTH
49
+    uint8x16_t s0 = vld1q_u8(a);
50
+    uint8x16_t s1 = vld1q_u8(b);
51
+
52
+    // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
53
+    int8x16_t cmp0 = vreinterpretq_s8_u8(vcgtq_u8(s0, s1));
54
+    int8x16_t cmp1 = vreinterpretq_s8_u8(vcgtq_u8(s1, s0));
55
+#endif // HIGH_BIT_DEPTH
56
+    return vorrq_s8(vnegq_s8(cmp0), cmp1);
57
+}
58
+
59
+namespace X265_NS {
60
+void setupSaoPrimitives_neon(EncoderPrimitives &p);
61
+
62
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
63
+void setupSaoPrimitives_sve(EncoderPrimitives &p);
64
+#endif
65
+
66
+#if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE
67
+void setupSaoPrimitives_sve2(EncoderPrimitives &p);
68
+#endif
69
+}
70
+
71
+#endif // X265_COMMON_AARCH64_SAO_PRIM_H
72
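
Note on signOf_neon in sao-prim.h above: the helper returns +1, 0 or -1 per lane from two unsigned compares. The scalar form it vectorises is the branch-free sign of a difference (illustrative helper name, not part of the x265 sources):

    // Returns +1 if a > b, -1 if a < b, 0 if equal; the two comparisons map
    // one-to-one onto the vcgtq pair in the header above.
    static inline int signOf_scalar(int a, int b)
    {
        return (a > b) - (a < b);
    }
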
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-common.S Changed
12
 
1
@@ -29,9 +29,7 @@
2
 .arch           armv8-a
3
 
4
 .macro ret_v0_w0
5
-    trn2            v1.2d, v0.2d, v0.2d
6
-    add             v0.2s, v0.2s, v1.2s
7
-    addp            v0.2s, v0.2s, v0.2s
8
+    addv            s0, v0.4s
9
     fmov            w0, s0
10
     ret
11
 .endm
12
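
Note on the ssd-a-common.S change above: the old trn2/add/addp sequence and the new single ADDV both reduce the four 32-bit accumulator lanes to one scalar, so in intrinsics terms the whole ret_v0_w0 macro is a single horizontal add (illustrative only, invented name):

    #include <arm_neon.h>
    #include <cstdint>

    // Horizontal add of the SSE accumulator, i.e. addv s0, v0.4s ; fmov w0, s0.
    static inline int32_t reduce_sse_acc(int32x4_t acc)
    {
        return vaddvq_s32(acc);
    }
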
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-sve2.S Changed
201
 
1
@@ -36,267 +36,6 @@
2
 
3
 .text
4
 
5
-function PFX(pixel_sse_pp_32x32_sve2)
6
-    rdvl            x9, #1
7
-    cmp             x9, #16
8
-    bgt             .vl_gt_16_pixel_sse_pp_32x32
9
-    mov             w12, #8
10
-    movi            v0.16b, #0
11
-    movi            v1.16b, #0
12
-.loop_sse_pp_32_sve2:
13
-    sub             w12, w12, #1
14
-.rept 4
15
-    ld1             {v16.16b,v17.16b}, x0, x1
16
-    ld1             {v18.16b,v19.16b}, x2, x3
17
-    usubl           v2.8h, v16.8b, v18.8b
18
-    usubl2          v3.8h, v16.16b, v18.16b
19
-    usubl           v4.8h, v17.8b, v19.8b
20
-    usubl2          v5.8h, v17.16b, v19.16b
21
-    smlal           v0.4s, v2.4h, v2.4h
22
-    smlal2          v1.4s, v2.8h, v2.8h
23
-    smlal           v0.4s, v3.4h, v3.4h
24
-    smlal2          v1.4s, v3.8h, v3.8h
25
-    smlal           v0.4s, v4.4h, v4.4h
26
-    smlal2          v1.4s, v4.8h, v4.8h
27
-    smlal           v0.4s, v5.4h, v5.4h
28
-    smlal2          v1.4s, v5.8h, v5.8h
29
-.endr
30
-    cbnz            w12, .loop_sse_pp_32_sve2
31
-    add             v0.4s, v0.4s, v1.4s
32
-    ret_v0_w0
33
-.vl_gt_16_pixel_sse_pp_32x32:
34
-    ptrue           p0.b, vl32
35
-    ld1b            {z16.b}, p0/z, x0
36
-    ld1b            {z18.b}, p0/z, x2
37
-    add             x0, x0, x1
38
-    add             x2, x2, x3
39
-    usublb          z1.h, z16.b, z18.b
40
-    usublt          z2.h, z16.b, z18.b
41
-    smullb          z0.s, z1.h, z1.h
42
-    smlalt          z0.s, z1.h, z1.h
43
-    smlalb          z0.s, z2.h, z2.h
44
-    smlalt          z0.s, z2.h, z2.h
45
-.rept 31
46
-    ld1b            {z16.b}, p0/z, x0
47
-    ld1b            {z18.b}, p0/z, x2
48
-    add             x0, x0, x1
49
-    add             x2, x2, x3
50
-    usublb          z1.h, z16.b, z18.b
51
-    usublt          z2.h, z16.b, z18.b
52
-    smullb          z0.s, z1.h, z1.h
53
-    smlalt          z0.s, z1.h, z1.h
54
-    smlalb          z0.s, z2.h, z2.h
55
-    smlalt          z0.s, z2.h, z2.h
56
-.endr
57
-    uaddv           d3, p0, z0.s
58
-    fmov            w0, s3
59
-    ret
60
-endfunc
61
-
62
-function PFX(pixel_sse_pp_32x64_sve2)
63
-    rdvl            x9, #1
64
-    cmp             x9, #16
65
-    bgt             .vl_gt_16_pixel_sse_pp_32x64
66
-    ptrue           p0.b, vl16
67
-    ld1b            {z16.b}, p0/z, x0
68
-    ld1b            {z17.b}, p0/z, x0, #1, mul vl
69
-    ld1b            {z18.b}, p0/z, x2
70
-    ld1b            {z19.b}, p0/z, x2, #1, mul vl
71
-    add             x0, x0, x1
72
-    add             x2, x2, x3
73
-    usublb          z1.h, z16.b, z18.b
74
-    usublt          z2.h, z16.b, z18.b
75
-    usublb          z3.h, z17.b, z19.b
76
-    usublt          z4.h, z17.b, z19.b
77
-    smullb          z20.s, z1.h, z1.h
78
-    smullt          z21.s, z1.h, z1.h
79
-    smlalb          z20.s, z2.h, z2.h
80
-    smlalt          z21.s, z2.h, z2.h
81
-    smlalb          z20.s, z3.h, z3.h
82
-    smlalt          z21.s, z3.h, z3.h
83
-    smlalb          z20.s, z4.h, z4.h
84
-    smlalt          z21.s, z4.h, z4.h
85
-.rept 63
86
-    ld1b            {z16.b}, p0/z, x0
87
-    ld1b            {z17.b}, p0/z, x0, #1, mul vl
88
-    ld1b            {z18.b}, p0/z, x2
89
-    ld1b            {z19.b}, p0/z, x2, #1, mul vl
90
-    add             x0, x0, x1
91
-    add             x2, x2, x3
92
-    usublb          z1.h, z16.b, z18.b
93
-    usublt          z2.h, z16.b, z18.b
94
-    usublb          z3.h, z17.b, z19.b
95
-    usublt          z4.h, z17.b, z19.b
96
-    smlalb          z20.s, z1.h, z1.h
97
-    smlalt          z21.s, z1.h, z1.h
98
-    smlalb          z20.s, z2.h, z2.h
99
-    smlalt          z21.s, z2.h, z2.h
100
-    smlalb          z20.s, z3.h, z3.h
101
-    smlalt          z21.s, z3.h, z3.h
102
-    smlalb          z20.s, z4.h, z4.h
103
-    smlalt          z21.s, z4.h, z4.h
104
-.endr
105
-    uaddv           d3, p0, z20.s
106
-    fmov            w0, s3
107
-    uaddv           d4, p0, z21.s
108
-    fmov            w1, s4
109
-    add             w0, w0, w1
110
-    ret
111
-.vl_gt_16_pixel_sse_pp_32x64:
112
-    ptrue           p0.b, vl32
113
-    ld1b            {z16.b}, p0/z, x0
114
-    ld1b            {z18.b}, p0/z, x2
115
-    add             x0, x0, x1
116
-    add             x2, x2, x3
117
-    usublb          z1.h, z16.b, z18.b
118
-    usublt          z2.h, z16.b, z18.b
119
-    smullb          z20.s, z1.h, z1.h
120
-    smullt          z21.s, z1.h, z1.h
121
-    smlalb          z20.s, z2.h, z2.h
122
-    smlalt          z21.s, z2.h, z2.h
123
-.rept 63
124
-    ld1b            {z16.b}, p0/z, x0
125
-    ld1b            {z18.b}, p0/z, x2
126
-    add             x0, x0, x1
127
-    add             x2, x2, x3
128
-    usublb          z1.h, z16.b, z18.b
129
-    usublt          z2.h, z16.b, z18.b
130
-    smlalb          z20.s, z1.h, z1.h
131
-    smlalt          z21.s, z1.h, z1.h
132
-    smlalb          z20.s, z2.h, z2.h
133
-    smlalt          z21.s, z2.h, z2.h
134
-.endr
135
-    uaddv           d3, p0, z20.s
136
-    fmov            w0, s3
137
-    uaddv           d4, p0, z21.s
138
-    fmov            w1, s4
139
-    add             w0, w0, w1
140
-    ret
141
-endfunc
142
-
143
-function PFX(pixel_sse_pp_64x64_sve2)
144
-    rdvl            x9, #1
145
-    cmp             x9, #16
146
-    bgt             .vl_gt_16_pixel_sse_pp_64x64
147
-    mov             w12, #16
148
-    movi            v0.16b, #0
149
-    movi            v1.16b, #0
150
-
151
-.loop_sse_pp_64_sve2:
152
-    sub             w12, w12, #1
153
-.rept 4
154
-    ld1             {v16.16b-v19.16b}, x0, x1
155
-    ld1             {v20.16b-v23.16b}, x2, x3
156
-
157
-    usubl           v2.8h, v16.8b, v20.8b
158
-    usubl2          v3.8h, v16.16b, v20.16b
159
-    usubl           v4.8h, v17.8b, v21.8b
160
-    usubl2          v5.8h, v17.16b, v21.16b
161
-    smlal           v0.4s, v2.4h, v2.4h
162
-    smlal2          v1.4s, v2.8h, v2.8h
163
-    smlal           v0.4s, v3.4h, v3.4h
164
-    smlal2          v1.4s, v3.8h, v3.8h
165
-    smlal           v0.4s, v4.4h, v4.4h
166
-    smlal2          v1.4s, v4.8h, v4.8h
167
-    smlal           v0.4s, v5.4h, v5.4h
168
-    smlal2          v1.4s, v5.8h, v5.8h
169
-
170
-    usubl           v2.8h, v18.8b, v22.8b
171
-    usubl2          v3.8h, v18.16b, v22.16b
172
-    usubl           v4.8h, v19.8b, v23.8b
173
-    usubl2          v5.8h, v19.16b, v23.16b
174
-    smlal           v0.4s, v2.4h, v2.4h
175
-    smlal2          v1.4s, v2.8h, v2.8h
176
-    smlal           v0.4s, v3.4h, v3.4h
177
-    smlal2          v1.4s, v3.8h, v3.8h
178
-    smlal           v0.4s, v4.4h, v4.4h
179
-    smlal2          v1.4s, v4.8h, v4.8h
180
-    smlal           v0.4s, v5.4h, v5.4h
181
-    smlal2          v1.4s, v5.8h, v5.8h
182
-.endr
183
-    cbnz            w12, .loop_sse_pp_64_sve2
184
-    add             v0.4s, v0.4s, v1.4s
185
-    ret_v0_w0
186
-.vl_gt_16_pixel_sse_pp_64x64:
187
-    cmp             x9, #48
188
-    bgt             .vl_gt_48_pixel_sse_pp_64x64
189
-    ptrue           p0.b, vl32
190
-    ld1b            {z16.b}, p0/z, x0
191
-    ld1b            {z17.b}, p0/z, x0, #1, mul vl
192
-    ld1b            {z20.b}, p0/z, x2
193
-    ld1b            {z21.b}, p0/z, x2, #1, mul vl
194
-    add             x0, x0, x1
195
-    add             x2, x2, x3
196
-    usublb          z1.h, z16.b, z20.b
197
-    usublt          z2.h, z16.b, z20.b
198
-    usublb          z3.h, z17.b, z21.b
199
-    usublt          z4.h, z17.b, z21.b
200
-    smullb          z24.s, z1.h, z1.h
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a.S Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2021 MulticoreWare, Inc
3
  *
4
  * Authors: Sebastian Pop <spop@amazon.com>
5
+ *          Hari Limaye <hari.limaye@arm.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -34,217 +35,145 @@
10
 
11
 .text
12
 
13
-function PFX(pixel_sse_pp_4x4_neon)
14
-    ld1             {v16.s}0, x0, x1
15
-    ld1             {v17.s}0, x2, x3
16
-    ld1             {v18.s}0, x0, x1
17
-    ld1             {v19.s}0, x2, x3
18
-    ld1             {v20.s}0, x0, x1
19
-    ld1             {v21.s}0, x2, x3
20
-    ld1             {v22.s}0, x0, x1
21
-    ld1             {v23.s}0, x2, x3
22
-
23
-    usubl           v1.8h, v16.8b, v17.8b
24
-    usubl           v2.8h, v18.8b, v19.8b
25
-    usubl           v3.8h, v20.8b, v21.8b
26
-    usubl           v4.8h, v22.8b, v23.8b
27
-
28
-    smull           v0.4s, v1.4h, v1.4h
29
-    smlal           v0.4s, v2.4h, v2.4h
30
-    smlal           v0.4s, v3.4h, v3.4h
31
-    smlal           v0.4s, v4.4h, v4.4h
32
-    ret_v0_w0
33
-endfunc
34
+// Fully unrolled.
35
+.macro SSE_PP_4xN h
36
+function PFX(pixel_sse_pp_4x\h\()_neon)
37
+    movi            v0.4s, #0
38
+.rept \h / 2
39
+    ldr             s16, x0
40
+    ldr             s17, x2
41
+    add             x0, x0, x1
42
+    add             x2, x2, x3
43
+    ld1             {v16.s}1, x0, x1
44
+    ld1             {v17.s}1, x2, x3
45
 
46
-function PFX(pixel_sse_pp_4x8_neon)
47
-    ld1             {v16.s}0, x0, x1
48
-    ld1             {v17.s}0, x2, x3
49
-    usubl           v1.8h, v16.8b, v17.8b
50
-    ld1             {v16.s}0, x0, x1
51
-    ld1             {v17.s}0, x2, x3
52
-    smull           v0.4s, v1.4h, v1.4h
53
-.rept 6
54
-    usubl           v1.8h, v16.8b, v17.8b
55
-    ld1             {v16.s}0, x0, x1
56
-    smlal           v0.4s, v1.4h, v1.4h
57
-    ld1             {v17.s}0, x2, x3
58
+    uabd            v1.8b, v16.8b, v17.8b
59
+    umull           v20.8h, v1.8b, v1.8b
60
+    uadalp          v0.4s, v20.8h
61
 .endr
62
-    usubl           v1.8h, v16.8b, v17.8b
63
-    smlal           v0.4s, v1.4h, v1.4h
64
     ret_v0_w0
65
 endfunc
66
+.endm
67
 
68
-function PFX(pixel_sse_pp_8x8_neon)
69
-    ld1             {v16.8b}, x0, x1
70
-    ld1             {v17.8b}, x2, x3
71
-    usubl           v1.8h, v16.8b, v17.8b
72
-    ld1             {v16.8b}, x0, x1
73
-    smull           v0.4s, v1.4h, v1.4h
74
-    smlal2          v0.4s, v1.8h, v1.8h
75
-    ld1             {v17.8b}, x2, x3
76
-
77
-.rept 6
78
-    usubl           v1.8h, v16.8b, v17.8b
79
-    ld1             {v16.8b}, x0, x1
80
-    smlal           v0.4s, v1.4h, v1.4h
81
-    smlal2          v0.4s, v1.8h, v1.8h
82
-    ld1             {v17.8b}, x2, x3
83
-.endr
84
-    usubl           v1.8h, v16.8b, v17.8b
85
-    smlal           v0.4s, v1.4h, v1.4h
86
-    smlal2          v0.4s, v1.8h, v1.8h
87
-    ret_v0_w0
88
-endfunc
89
+SSE_PP_4xN 4
90
+SSE_PP_4xN 8
91
 
92
-function PFX(pixel_sse_pp_8x16_neon)
93
-    ld1             {v16.8b}, x0, x1
94
-    ld1             {v17.8b}, x2, x3
95
-    usubl           v1.8h, v16.8b, v17.8b
96
+// Fully unrolled.
97
+.macro SSE_PP_8xN h
98
+function PFX(pixel_sse_pp_8x\h\()_neon)
99
+    movi            v0.4s, #0
100
+.rept \h
101
     ld1             {v16.8b}, x0, x1
102
-    smull           v0.4s, v1.4h, v1.4h
103
-    smlal2          v0.4s, v1.8h, v1.8h
104
     ld1             {v17.8b}, x2, x3
105
 
106
-.rept 14
107
-    usubl           v1.8h, v16.8b, v17.8b
108
-    ld1             {v16.8b}, x0, x1
109
-    smlal           v0.4s, v1.4h, v1.4h
110
-    smlal2          v0.4s, v1.8h, v1.8h
111
-    ld1             {v17.8b}, x2, x3
112
+    uabd            v1.8b, v16.8b, v17.8b
113
+    umull           v20.8h, v1.8b, v1.8b
114
+    uadalp          v0.4s, v20.8h
115
 .endr
116
-    usubl           v1.8h, v16.8b, v17.8b
117
-    smlal           v0.4s, v1.4h, v1.4h
118
-    smlal2          v0.4s, v1.8h, v1.8h
119
     ret_v0_w0
120
 endfunc
121
+.endm
122
+
123
+SSE_PP_8xN 8
124
+SSE_PP_8xN 16
125
 
126
-.macro sse_pp_16xN h
127
+// Fully unrolled.
128
+.macro SSE_PP_16xN h
129
 function PFX(pixel_sse_pp_16x\h\()_neon)
130
+    movi            v0.4s, #0
131
+    movi            v1.4s, #0
132
+.rept \h
133
     ld1             {v16.16b}, x0, x1
134
     ld1             {v17.16b}, x2, x3
135
-    usubl           v1.8h, v16.8b, v17.8b
136
-    usubl2          v2.8h, v16.16b, v17.16b
137
-    ld1             {v16.16b}, x0, x1
138
-    ld1             {v17.16b}, x2, x3
139
-    smull           v0.4s, v1.4h, v1.4h
140
-    smlal2          v0.4s, v1.8h, v1.8h
141
-    smlal           v0.4s, v2.4h, v2.4h
142
-    smlal2          v0.4s, v2.8h, v2.8h
143
-.rept \h - 2
144
-    usubl           v1.8h, v16.8b, v17.8b
145
-    usubl2          v2.8h, v16.16b, v17.16b
146
-    ld1             {v16.16b}, x0, x1
147
-    smlal           v0.4s, v1.4h, v1.4h
148
-    smlal2          v0.4s, v1.8h, v1.8h
149
-    ld1             {v17.16b}, x2, x3
150
-    smlal           v0.4s, v2.4h, v2.4h
151
-    smlal2          v0.4s, v2.8h, v2.8h
152
+
153
+    uabd            v2.16b, v16.16b, v17.16b
154
+    umull           v20.8h, v2.8b, v2.8b
155
+    uadalp          v0.4s, v20.8h
156
+    umull2          v21.8h, v2.16b, v2.16b
157
+    uadalp          v1.4s, v21.8h
158
 .endr
159
-    usubl           v1.8h, v16.8b, v17.8b
160
-    usubl2          v2.8h, v16.16b, v17.16b
161
-    smlal           v0.4s, v1.4h, v1.4h
162
-    smlal2          v0.4s, v1.8h, v1.8h
163
-    smlal           v0.4s, v2.4h, v2.4h
164
-    smlal2          v0.4s, v2.8h, v2.8h
165
+    add             v0.4s, v0.4s, v1.4s
166
     ret_v0_w0
167
 endfunc
168
 .endm
169
 
170
-sse_pp_16xN 16
171
-sse_pp_16xN 32
172
+SSE_PP_16xN 16
173
+SSE_PP_16xN 32
174
 
175
-function PFX(pixel_sse_pp_32x32_neon)
176
-    mov             w12, #8
177
-    movi            v0.16b, #0
178
-    movi            v1.16b, #0
179
-.loop_sse_pp_32:
180
-    sub             w12, w12, #1
181
+// Loop unrolled to process 4 rows per iteration.
182
+function PFX(pixel_sse_pp_32xh_neon), export=0
183
+    movi            v0.4s, #0
184
+    movi            v1.4s, #0
185
+.Loop_sse_pp_32xh:
186
+    sub             w4, w4, #1
187
 .rept 4
188
     ld1             {v16.16b,v17.16b}, x0, x1
189
     ld1             {v18.16b,v19.16b}, x2, x3
190
-    usubl           v2.8h, v16.8b, v18.8b
191
-    usubl2          v3.8h, v16.16b, v18.16b
192
-    usubl           v4.8h, v17.8b, v19.8b
193
-    usubl2          v5.8h, v17.16b, v19.16b
194
-    smlal           v0.4s, v2.4h, v2.4h
195
-    smlal2          v1.4s, v2.8h, v2.8h
196
-    smlal           v0.4s, v3.4h, v3.4h
197
-    smlal2          v1.4s, v3.8h, v3.8h
198
-    smlal           v0.4s, v4.4h, v4.4h
199
-    smlal2          v1.4s, v4.8h, v4.8h
200
-    smlal           v0.4s, v5.4h, v5.4h
201
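The rewritten sse_pp kernels above drop the signed widening-subtract/multiply-accumulate chains in favour of an unsigned absolute difference, a widening square and a pairwise accumulate. A sketch of one 8-pixel step with intrinsics (illustrative only; the shipped code is assembly):

    #include <arm_neon.h>

    static inline uint32x4_t sse_step_8(uint32x4_t acc, uint8x8_t a, uint8x8_t b)
    {
        uint8x8_t  d  = vabd_u8(a, b);    // |a - b| per pixel        (UABD)
        uint16x8_t sq = vmull_u8(d, d);   // widen and square         (UMULL)
        return vpadalq_u16(acc, sq);      // accumulate into 32 bits  (UADALP)
    }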
x265_4.0.tar.gz/source/common/aarch64/ssd-neon-dotprod.S Added
171
 
1
@@ -0,0 +1,169 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2024 MulticoreWare, Inc
4
+ *
5
+ * Authors: Hari Limaye <hari.limaye@arm.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8.2-a+dotprod
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+// Fully unrolled.
40
+.macro SSE_PP_4xN h
41
+function PFX(pixel_sse_pp_4x\h\()_neon_dotprod)
42
+    movi            v0.4s, #0
43
+.rept \h / 4
44
+    ldr             s16, x0
45
+    ldr             s17, x2
46
+    add             x0, x0, x1
47
+    add             x2, x2, x3
48
+    ld1             {v16.s}1, x0, x1
49
+    ld1             {v16.s}2, x0, x1
50
+    ld1             {v16.s}3, x0, x1
51
+    ld1             {v17.s}1, x2, x3
52
+    ld1             {v17.s}2, x2, x3
53
+    ld1             {v17.s}3, x2, x3
54
+
55
+    uabd            v1.16b, v16.16b, v17.16b
56
+    udot            v0.4s, v1.16b, v1.16b
57
+.endr
58
+    addv            s0, v0.4s
59
+    fmov            w0, s0
60
+    ret
61
+endfunc
62
+.endm
63
+
64
+SSE_PP_4xN 4
65
+SSE_PP_4xN 8
66
+
67
+// Fully unrolled.
68
+.macro SSE_PP_8xN h
69
+function PFX(pixel_sse_pp_8x\h\()_neon_dotprod)
70
+    movi            v0.4s, #0
71
+.rept \h
72
+    ld1             {v16.8b}, x0, x1
73
+    ld1             {v17.8b}, x2, x3
74
+
75
+    uabd            v1.8b, v16.8b, v17.8b
76
+    udot            v0.2s, v1.8b, v1.8b
77
+.endr
78
+    addv            s0, v0.4s
79
+    fmov            w0, s0
80
+    ret
81
+endfunc
82
+.endm
83
+
84
+SSE_PP_8xN 8
85
+SSE_PP_8xN 16
86
+
87
+// Fully unrolled.
88
+.macro SSE_PP_16xN h
89
+function PFX(pixel_sse_pp_16x\h\()_neon_dotprod)
90
+    movi            v0.4s, #0
91
+    movi            v1.4s, #0
92
+.rept \h / 2
93
+    ld1             {v16.16b}, x0, x1
94
+    ld1             {v17.16b}, x2, x3
95
+    ld1             {v18.16b}, x0, x1
96
+    ld1             {v19.16b}, x2, x3
97
+
98
+    uabd            v2.16b, v16.16b, v17.16b
99
+    udot            v0.4s, v2.16b, v2.16b
100
+    uabd            v3.16b, v18.16b, v19.16b
101
+    udot            v1.4s, v3.16b, v3.16b
102
+.endr
103
+    add             v0.4s, v0.4s, v1.4s
104
+    addv            s0, v0.4s
105
+    fmov            w0, s0
106
+    ret
107
+endfunc
108
+.endm
109
+
110
+SSE_PP_16xN 16
111
+SSE_PP_16xN 32
112
+
113
+// Loop unrolled to process 4 rows per iteration.
114
+function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
115
+    movi            v0.4s, #0
116
+    movi            v1.4s, #0
117
+.Loop_sse_pp_32xh:
118
+    sub             w4, w4, #1
119
+.rept 4
120
+    ld1             {v16.16b,v17.16b}, x0, x1
121
+    ld1             {v18.16b,v19.16b}, x2, x3
122
+
123
+    uabd            v2.16b, v16.16b, v18.16b
124
+    udot            v0.4s, v2.16b, v2.16b
125
+    uabd            v3.16b, v17.16b, v19.16b
126
+    udot            v1.4s, v3.16b, v3.16b
127
+.endr
128
+    cbnz            w4, .Loop_sse_pp_32xh
129
+    add             v0.4s, v0.4s, v1.4s
130
+    addv            s0, v0.4s
131
+    fmov            w0, s0
132
+    ret
133
+endfunc
134
+
135
+.macro SSE_PP_32xN h
136
+function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
137
+    mov             w4, \h / 4
138
+    b               PFX(pixel_sse_pp_32xh_neon_dotprod)
139
+endfunc
140
+.endm
141
+
142
+SSE_PP_32xN 32
143
+SSE_PP_32xN 64
144
+
145
+// Loop unrolled to process 4 rows per iteration.
146
+function PFX(pixel_sse_pp_64x64_neon_dotprod)
147
+    mov             w12, #16
148
+    movi            v0.4s, #0
149
+    movi            v1.4s, #0
150
+.Loop_sse_pp_64:
151
+    sub             w12, w12, #1
152
+.rept 4
153
+    ld1             {v16.16b-v19.16b}, x0, x1
154
+    ld1             {v20.16b-v23.16b}, x2, x3
155
+
156
+    uabd            v2.16b, v16.16b, v20.16b
157
+    udot            v0.4s, v2.16b, v2.16b
158
+    uabd            v3.16b, v17.16b, v21.16b
159
+    udot            v1.4s, v3.16b, v3.16b
160
+    uabd            v4.16b, v18.16b, v22.16b
161
+    udot            v0.4s, v4.16b, v4.16b
162
+    uabd            v5.16b, v19.16b, v23.16b
163
+    udot            v1.4s, v5.16b, v5.16b
164
+.endr
165
+    cbnz            w12, .Loop_sse_pp_64
166
+    add             v0.4s, v0.4s, v1.4s
167
+    addv            s0, v0.4s
168
+    fmov            w0, s0
169
+    ret
170
+endfunc
171
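The new neon_dotprod file repeats the pattern above but lets UDOT do the squaring and accumulation in one step: dotting the absolute-difference vector with itself sums d*d over each group of four lanes directly into the 32-bit accumulators. A hedged intrinsics equivalent of one 16-pixel step (sketch, assuming a compiler targeting armv8.2-a+dotprod):

    #include <arm_neon.h>

    static inline uint32x4_t sse_step_16_dotprod(uint32x4_t acc, uint8x16_t a, uint8x16_t b)
    {
        uint8x16_t d = vabdq_u8(a, b);    // |a - b| per pixel (UABD)
        return vdotq_u32(acc, d, d);      // acc += d*d, four lanes at a time (UDOT)
    }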
x265_3.6.tar.gz/source/common/arm/blockcopy8.S -> x265_4.0.tar.gz/source/common/arm/blockcopy8.S Changed
19
 
1
@@ -795,7 +795,7 @@
2
     vmov            q2, q12
3
     vmov            q3, q14
4
 
5
-.loop:    
6
+.Loop:
7
     vldm            r0!, {q8-q15}
8
     subs            r1, #1
9
 
10
@@ -817,7 +817,7 @@
11
     vadd.s8         q1, q10
12
     vadd.s8         q2, q12
13
     vadd.s8         q3, q14
14
-    bgt            .loop
15
+    bgt            .Loop
16
 
17
     // sum
18
     vadd.s8         q0, q1
19
x265_3.6.tar.gz/source/common/arm/dct-a.S -> x265_4.0.tar.gz/source/common/arm/dct-a.S Changed
37
 
1
@@ -422,7 +422,7 @@
2
     mov lr, #4*16*2
3
 
4
     // DCT-1D
5
-.loop1:
6
+.Loop1:
7
     // Row0-3
8
     vld1.16 {q8-q9}, r0, :64, r2      // q8  = 07 06 05 04 03 02 01 00, q9  = 0F 0E 0D 0C 0B 0A 09 08
9
     vld1.16 {q10-q11}, r0, :64, r2    // q10 = 17 16 15 14 13 12 11 10, q11 = 1F 1E 1D 1C 1B 1A 19 18
10
@@ -628,7 +628,7 @@
11
     // loop into next process group
12
     sub r3, #3*4*16*2
13
     subs r12, #1
14
-    bgt .loop1
15
+    bgt .Loop1
16
 
17
 
18
     // DCT-2D
19
@@ -637,7 +637,7 @@
20
     mov r3, #16*2*2
21
     mov r12, #16/4                      // Process 4 rows every loop
22
 
23
-.loop2:
24
+.Loop2:
25
     vldm r2, {q8-q15}
26
 
27
     // d16 = 30 20 10 00
28
@@ -887,7 +887,7 @@
29
 
30
     sub r1, #(17*16-4)*2
31
     subs r12, #1
32
-    bgt .loop2
33
+    bgt .Loop2
34
 
35
     add sp, #16*16*2
36
     vpop {q4-q7}
37
x265_3.6.tar.gz/source/common/arm/ipfilter8.S -> x265_4.0.tar.gz/source/common/arm/ipfilter8.S Changed
201
 
1
@@ -372,7 +372,7 @@
2
     vmov.u16    q1, #8192
3
     vneg.s16    q1, q1
4
     mov         r12, #8
5
-.loop_filterP2S_32x16:
6
+.Loop_filterP2S_32x16:
7
     subs        r12, #1
8
 .rept 2
9
     vld1.u8     {q9-q10}, r0, r1
10
@@ -391,7 +391,7 @@
11
     vmla.s16    q3, q10, q0
12
     vst1.16     {q2-q3}, r2, r3
13
 .endr
14
-    bgt         .loop_filterP2S_32x16
15
+    bgt         .Loop_filterP2S_32x16
16
     bx          lr
17
 endfunc
18
 
19
@@ -402,7 +402,7 @@
20
     vmov.u16    q1, #8192
21
     vneg.s16    q1, q1
22
     mov         r12, #12
23
-.loop_filterP2S_32x24:
24
+.Loop_filterP2S_32x24:
25
     subs        r12, #1
26
 .rept 2
27
     vld1.u8     {q9-q10}, r0, r1
28
@@ -421,7 +421,7 @@
29
     vmla.s16    q3, q10, q0
30
     vst1.16     {q2-q3}, r2, r3
31
 .endr
32
-    bgt         .loop_filterP2S_32x24
33
+    bgt         .Loop_filterP2S_32x24
34
     bx          lr
35
 endfunc
36
 
37
@@ -432,7 +432,7 @@
38
     vmov.u16    q1, #8192
39
     vneg.s16    q1, q1
40
     mov         r12, #16
41
-.loop_filterP2S_32x32:
42
+.Loop_filterP2S_32x32:
43
     subs        r12, #1
44
 .rept 2
45
     vld1.u8     {q9-q10}, r0, r1
46
@@ -451,7 +451,7 @@
47
     vmla.s16    q3, q10, q0
48
     vst1.16     {q2-q3}, r2, r3
49
 .endr
50
-    bgt         .loop_filterP2S_32x32
51
+    bgt         .Loop_filterP2S_32x32
52
     bx          lr
53
 endfunc
54
 
55
@@ -462,7 +462,7 @@
56
     vmov.u16    q1, #8192
57
     vneg.s16    q1, q1
58
     mov         r12, #32
59
-.loop_filterP2S_32x64:
60
+.Loop_filterP2S_32x64:
61
     subs        r12, #1
62
 .rept 2
63
     vld1.u8     {q9-q10}, r0, r1
64
@@ -481,7 +481,7 @@
65
     vmla.s16    q3, q10, q0
66
     vst1.16     {q2-q3}, r2, r3
67
 .endr
68
-    bgt         .loop_filterP2S_32x64
69
+    bgt         .Loop_filterP2S_32x64
70
     bx          lr
71
 endfunc
72
 
73
@@ -493,7 +493,7 @@
74
     vmov.u16    q1, #8192
75
     vneg.s16    q1, q1
76
     mov         r12, #8
77
-.loop_filterP2S_64x16:
78
+.Loop_filterP2S_64x16:
79
     subs        r12, #1
80
 .rept 2
81
     vld1.u8     {q9-q10}, r0!
82
@@ -528,7 +528,7 @@
83
     vmla.s16    q3, q10, q0
84
     vst1.16     {q2-q3}, r2, r3
85
 .endr
86
-    bgt         .loop_filterP2S_64x16
87
+    bgt         .Loop_filterP2S_64x16
88
     bx          lr
89
 endfunc
90
 
91
@@ -540,7 +540,7 @@
92
     vmov.u16    q1, #8192
93
     vneg.s16    q1, q1
94
     mov         r12, #16
95
-.loop_filterP2S_64x32:
96
+.Loop_filterP2S_64x32:
97
     subs        r12, #1
98
 .rept 2
99
     vld1.u8     {q9-q10}, r0!
100
@@ -575,7 +575,7 @@
101
     vmla.s16    q3, q10, q0
102
     vst1.16     {q2-q3}, r2, r3
103
 .endr
104
-    bgt         .loop_filterP2S_64x32
105
+    bgt         .Loop_filterP2S_64x32
106
     bx          lr
107
 endfunc
108
 
109
@@ -587,7 +587,7 @@
110
     vmov.u16    q1, #8192
111
     vneg.s16    q1, q1
112
     mov         r12, #24
113
-.loop_filterP2S_64x48:
114
+.Loop_filterP2S_64x48:
115
     subs        r12, #1
116
 .rept 2
117
     vld1.u8     {q9-q10}, r0!
118
@@ -622,7 +622,7 @@
119
     vmla.s16    q3, q10, q0
120
     vst1.16     {q2-q3}, r2, r3
121
 .endr
122
-    bgt         .loop_filterP2S_64x48
123
+    bgt         .Loop_filterP2S_64x48
124
     bx          lr
125
 endfunc
126
 
127
@@ -634,7 +634,7 @@
128
     vmov.u16    q1, #8192
129
     vneg.s16    q1, q1
130
     mov         r12, #32
131
-.loop_filterP2S_64x64:
132
+.Loop_filterP2S_64x64:
133
     subs        r12, #1
134
 .rept 2
135
     vld1.u8     {q9-q10}, r0!
136
@@ -669,7 +669,7 @@
137
     vmla.s16    q3, q10, q0
138
     vst1.16     {q2-q3}, r2, r3
139
 .endr
140
-    bgt         .loop_filterP2S_64x64
141
+    bgt         .Loop_filterP2S_64x64
142
     bx          lr
143
 endfunc
144
 
145
@@ -681,7 +681,7 @@
146
     vmov.u16    q1, #8192
147
     vneg.s16    q1, q1
148
     mov         r12, #32
149
-.loop_filterP2S_48x64:
150
+.Loop_filterP2S_48x64:
151
     subs        r12, #1
152
 .rept 2
153
     vld1.u8     {q9-q10}, r0!
154
@@ -709,7 +709,7 @@
155
     vmla.s16    q3, q9, q0
156
     vst1.16     {q2-q3}, r2, r3
157
 .endr
158
-    bgt         .loop_filterP2S_48x64
159
+    bgt         .Loop_filterP2S_48x64
160
     bx          lr
161
 endfunc
162
 
163
@@ -756,7 +756,7 @@
164
     vmovl.u8    q2, d4
165
     vmovl.u8    q3, d6
166
 
167
-.loop_4x\h:
168
+.Loop_4x\h:
169
     // TODO: read extra 1 row for speed optimize, may made crash on OS X platform!
170
     vld1.u32    {d160}, r0, r1
171
     vld1.u32    {d161}, r0, r1
172
@@ -795,7 +795,7 @@
173
     vst1.u32    {d181}, r2, r3
174
 
175
     subs        r12, #2
176
-    bne        .loop_4x4
177
+    bne        .Loop_4x4
178
 
179
     pop         {pc}
180
     .ltorg
181
@@ -945,13 +945,13 @@
182
 
183
 .macro FILTER_VPP a b filterv
184
 
185
-.loop_\filterv\()_\a\()x\b:
186
+.Loop_\filterv\()_\a\()x\b:
187
 
188
     mov             r7, r2
189
     mov             r6, r0
190
     eor             r8, r8
191
 
192
-.loop_w8_\filterv\()_\a\()x\b:
193
+.Loop_w8_\filterv\()_\a\()x\b:
194
 
195
     add             r6, r0, r8
196
 
197
@@ -988,12 +988,12 @@
198
 
199
     add             r8, #8
200
     cmp             r8, #\a
201
x265_3.6.tar.gz/source/common/arm/mc-a.S -> x265_4.0.tar.gz/source/common/arm/mc-a.S Changed
37
 
1
@@ -554,7 +554,7 @@
2
     vsri.s16        q1, #1
3
     vneg.s16        q0, q0
4
     mov             r3, #4
5
-.loop_cpy2Dto1D_shr_16:
6
+.Loop_cpy2Dto1D_shr_16:
7
     subs            r3, #1
8
 .rept 4
9
     vld1.s16        {q2-q3}, r1, r2
10
@@ -564,7 +564,7 @@
11
     vshl.s16        q3, q0
12
     vst1.16         {q2-q3}, r0!
13
 .endr
14
-    bgt             .loop_cpy2Dto1D_shr_16
15
+    bgt             .Loop_cpy2Dto1D_shr_16
16
     bx              lr
17
 endfunc
18
 
19
@@ -577,7 +577,7 @@
20
     vsri.s16        q1, #1
21
     vneg.s16        q0, q0
22
     mov             r3, 16
23
-.loop_cpy2Dto1D_shr_32:
24
+.Loop_cpy2Dto1D_shr_32:
25
     subs            r3, #1
26
 .rept 2
27
     vld1.s16        {q2-q3}, r1!
28
@@ -593,7 +593,7 @@
29
     vst1.16         {q2-q3}, r0!
30
     vst1.16         {q8-q9}, r0!
31
 .endr
32
-    bgt             .loop_cpy2Dto1D_shr_32
33
+    bgt             .Loop_cpy2Dto1D_shr_32
34
     bx              lr
35
 endfunc
36
 
37
x265_3.6.tar.gz/source/common/arm/pixel-util.S -> x265_4.0.tar.gz/source/common/arm/pixel-util.S Changed
116
 
1
@@ -848,36 +848,36 @@
2
     vdup.8          q2, r12
3
     sub             r5, #1
4
 
5
-.loop_h:
6
+.Loop_h:
7
     mov             r6, r0
8
     mov             r12, r2
9
     eor             r7, r7
10
-.loop_w:
11
+.Loop_w:
12
     vld1.u8         {q0}, r6!
13
     vshl.u8         q0, q0, q2
14
     vst1.u8         {q0}, r12!
15
 
16
     add             r7, #16
17
     cmp             r7, r4
18
-    blt             .loop_w
19
+    blt             .Loop_w
20
 
21
     add             r0, r1
22
     add             r2, r3
23
 
24
     subs             r5, #1
25
-    bgt             .loop_h
26
+    bgt             .Loop_h
27
 
28
 // handle last row
29
     mov             r5, r4
30
     lsr             r5, #3
31
 
32
-.loopW8:
33
+.LoopW8:
34
     vld1.u8         d0, r0!
35
     vshl.u8         d0, d0, d4
36
     vst1.u8         d0, r2!
37
     subs            r4, r4, #8
38
     subs            r5, #1
39
-    bgt             .loopW8
40
+    bgt             .LoopW8
41
 
42
     mov             r5,#8
43
     sub             r5, r4
44
@@ -1970,7 +1970,7 @@
45
     eor             r5, r5
46
     veor.s32        q12, q12
47
 
48
-.loop_quant:
49
+.Loop_quant:
50
 
51
     vld1.s16        d16, r0!
52
     vmovl.s16       q9, d16                // q9= coefblockpos
53
@@ -1999,7 +1999,7 @@
54
     vst1.s16        d16, r3!
55
 
56
     subs            r4, #1
57
-    bne             .loop_quant
58
+    bne             .Loop_quant
59
 
60
     vadd.u32        d8, d9
61
     vpadd.u32       d8, d8
62
@@ -2023,7 +2023,7 @@
63
     eor             r4, r4
64
     veor.s32        q12, q12
65
 
66
-.loop_nquant:
67
+.Loop_nquant:
68
 
69
     vld1.s16        d16, r0!
70
     vmovl.s16       q9, d16                // q9= coefblockpos
71
@@ -2049,7 +2049,7 @@
72
     vst1.s16        d17, r2!
73
 
74
     subs            r3, #1
75
-    bne             .loop_nquant
76
+    bne             .Loop_nquant
77
 
78
     vadd.u32        d8, d9
79
     vpadd.u32       d8, d8
80
@@ -2148,7 +2148,7 @@
81
     mov             r10, #4
82
     eor             r9, r9
83
 
84
-.loop_32:
85
+.Loop_32:
86
 
87
     sa8d_16x16 r4
88
 
89
@@ -2166,7 +2166,7 @@
90
     sub             r2,  r2,  #24
91
 
92
     subs            r10, #1
93
-    bgt            .loop_32
94
+    bgt            .Loop_32
95
 
96
     mov             r0, r9
97
     vpop            {d8-d11}
98
@@ -2183,7 +2183,7 @@
99
     mov             r10, #4
100
     eor             r9, r9
101
 
102
-.loop_1:
103
+.Loop_1:
104
 
105
     sa8d_16x16 r4
106
 
107
@@ -2217,7 +2217,7 @@
108
     sub             r2,  r2,  #56
109
 
110
     subs            r10, #1
111
-    bgt            .loop_1
112
+    bgt            .Loop_1
113
 
114
     mov             r0, r9
115
     vpop            {d8-d11}
116
x265_3.6.tar.gz/source/common/arm/sad-a.S -> x265_4.0.tar.gz/source/common/arm/sad-a.S Changed
151
 
1
@@ -103,7 +103,7 @@
2
     vabal.u8        q9, d5, d7
3
     mov             r12, #(\h-2)/2
4
 
5
-.loop_16x\h:
6
+.Loop_16x\h:
7
 
8
     subs            r12, #1
9
     vld1.8          {q0}, r0, r1
10
@@ -115,7 +115,7 @@
11
     vabal.u8        q9, d1, d3
12
     vabal.u8        q8, d4, d6
13
     vabal.u8        q9, d5, d7
14
-    bne             .loop_16x\h
15
+    bne             .Loop_16x\h
16
 
17
     vadd.u16        q8, q8, q9
18
 .if \h == 64
19
@@ -147,7 +147,7 @@
20
     veor.u8         q11, q11
21
     mov             r12, #\h/8
22
 
23
-.loop_32x\h:
24
+.Loop_32x\h:
25
 
26
     subs            r12, #1
27
 .rept 4
28
@@ -166,7 +166,7 @@
29
     vabal.u8        q10, d26, d30
30
     vabal.u8        q11, d27, d31
31
 .endr
32
-    bne             .loop_32x\h
33
+    bne             .Loop_32x\h
34
 
35
     vadd.u16        q8, q8, q9
36
     vadd.u16        q10, q10, q11
37
@@ -213,7 +213,7 @@
38
     sub             r3, r12
39
     mov             r12, #\h/8
40
 
41
-.loop_64x\h:
42
+.Loop_64x\h:
43
 
44
     subs            r12, #1
45
 .rept 4
46
@@ -246,7 +246,7 @@
47
     vabal.u8        q10, d26, d30
48
     vabal.u8        q11, d27, d31
49
 .endr
50
-    bne             .loop_64x\h
51
+    bne             .Loop_64x\h
52
 
53
     vadd.u16        q8, q8, q9
54
     vadd.u16        q10, q10, q11
55
@@ -283,7 +283,7 @@
56
     sub             r3, #16
57
     mov             r12, #8
58
 
59
-.loop_24x32:
60
+.Loop_24x32:
61
 
62
     subs            r12, #1
63
 .rept 4
64
@@ -296,7 +296,7 @@
65
     vld1.8          {d1}, r2, r3
66
     vabal.u8        q10, d0, d1
67
 .endr
68
-    bne             .loop_24x32
69
+    bne             .Loop_24x32
70
 
71
     vadd.u16        q8, q8, q9
72
     vadd.u16        d16, d16, d17
73
@@ -322,7 +322,7 @@
74
     sub             r3, #32
75
     mov             r12, #16
76
 
77
-.loop_48x64:
78
+.Loop_48x64:
79
 
80
     subs            r12, #1
81
 .rept 4
82
@@ -337,7 +337,7 @@
83
     vabal.u8        q14, d4, d20
84
     vabal.u8        q15, d5, d21
85
 .endr
86
-    bne             .loop_48x64
87
+    bne             .Loop_48x64
88
 
89
     vadd.u16        q3, q3, q11
90
     vadd.u16        d6, d6, d7
91
@@ -635,12 +635,12 @@
92
     veor.u8         q15, q15
93
 .endif
94
 
95
-.loop_sad_x\x\()_16x\h:
96
+.Loop_sad_x\x\()_16x\h:
97
 .rept 8
98
     SAD_X_16 \x
99
 .endr
100
     subs            r6, #1
101
-    bne             .loop_sad_x\x\()_16x\h
102
+    bne             .Loop_sad_x\x\()_16x\h
103
 
104
     vadd.u16        q8, q8, q9
105
     vadd.u16        q10, q10, q11
106
@@ -929,12 +929,12 @@
107
     veor.u8         q14, q14
108
     veor.u8         q15, q15
109
 .endif
110
-.loop_sad_x\x\()_64x\h:
111
+.Loop_sad_x\x\()_64x\h:
112
 .rept 8
113
     SAD_X_64 \x
114
 .endr
115
     subs            r6, #1
116
-    bne             .loop_sad_x\x\()_64x\h
117
+    bne             .Loop_sad_x\x\()_64x\h
118
 
119
 .if \h <= 16
120
     vadd.u16        q8, q8, q9
121
@@ -1071,12 +1071,12 @@
122
     veor.u8         q15, q15
123
 .endif
124
 
125
-.loop_sad_x\x\()_48x64:
126
+.Loop_sad_x\x\()_48x64:
127
 .rept 8
128
     SAD_X_48 \x
129
 .endr
130
     subs            r6, #1
131
-    bne             .loop_sad_x\x\()_48x64
132
+    bne             .Loop_sad_x\x\()_48x64
133
 
134
     vpaddl.u16      q8, q8
135
     vpaddl.u16      q9, q9
136
@@ -1179,12 +1179,12 @@
137
     veor.u8         q15, q15
138
 .endif
139
 
140
-.loop_sad_x\x\()_24x32:
141
+.Loop_sad_x\x\()_24x32:
142
 .rept 8
143
     SAD_X_24 \x
144
 .endr
145
     subs            r6, #1
146
-    bne             .loop_sad_x\x\()_24x32
147
+    bne             .Loop_sad_x\x\()_24x32
148
 
149
     vadd.u16        q8, q8, q9
150
     vadd.u16        q10, q10, q11
151
x265_3.6.tar.gz/source/common/arm/ssd-a.S -> x265_4.0.tar.gz/source/common/arm/ssd-a.S Changed
127
 
1
@@ -121,7 +121,7 @@
2
     veor.u8     q0, q0
3
     veor.u8     q1, q1
4
 
5
-.loop_sse_pp_32:
6
+.Loop_sse_pp_32:
7
     subs        r12, #1
8
 .rept 4
9
     vld1.64     {q8-q9}, r0, r1
10
@@ -139,7 +139,7 @@
11
     vmlal.s16   q0, d26, d26
12
     vmlal.s16   q1, d27, d27
13
 .endr
14
-    bne         .loop_sse_pp_32
15
+    bne         .Loop_sse_pp_32
16
     vadd.s32    q0, q1
17
     vadd.s32    d0, d0, d1
18
     vpadd.s32   d0, d0, d0
19
@@ -154,7 +154,7 @@
20
     veor.u8     q0, q0
21
     veor.u8     q1, q1
22
 
23
-.loop_sse_pp_64:
24
+.Loop_sse_pp_64:
25
     subs        r12, #1
26
 .rept 4
27
     vld1.64     {q8-q9}, r0!
28
@@ -187,7 +187,7 @@
29
     vmlal.s16   q0, d26, d26
30
     vmlal.s16   q1, d27, d27
31
 .endr
32
-    bne         .loop_sse_pp_64
33
+    bne         .Loop_sse_pp_64
34
     vadd.s32    q0, q1
35
     vadd.s32    d0, d0, d1
36
     vpadd.s32   d0, d0, d0
37
@@ -257,7 +257,7 @@
38
     veor.u8     q0, q0
39
     veor.u8     q1, q1
40
 
41
-.loop_sse_ss_16:
42
+.Loop_sse_ss_16:
43
     subs        r12, #1
44
 .rept 4
45
     vld1.s16    {q8-q9}, r0, r1
46
@@ -269,7 +269,7 @@
47
     vmlal.s16   q0, d18, d18
48
     vmlal.s16   q1, d19, d19
49
 .endr
50
-    bne         .loop_sse_ss_16
51
+    bne         .Loop_sse_ss_16
52
     vadd.s32    q0, q1
53
     vadd.s32    d0, d0, d1
54
     vpadd.s32   d0, d0, d0
55
@@ -286,7 +286,7 @@
56
     veor.u8     q0, q0
57
     veor.u8     q1, q1
58
 
59
-.loop_sse_ss_32:
60
+.Loop_sse_ss_32:
61
     subs        r12, #1
62
 .rept 4
63
     vld1.s16    {q8-q9}, r0!
64
@@ -307,7 +307,7 @@
65
     vmlal.s16   q0, d18, d18
66
     vmlal.s16   q1, d19, d19
67
 .endr
68
-    bne         .loop_sse_ss_32
69
+    bne         .Loop_sse_ss_32
70
     vadd.s32    q0, q1
71
     vadd.s32    d0, d0, d1
72
     vpadd.s32   d0, d0, d0
73
@@ -324,7 +324,7 @@
74
     veor.u8     q0, q0
75
     veor.u8     q1, q1
76
 
77
-.loop_sse_ss_64:
78
+.Loop_sse_ss_64:
79
     subs        r12, #1
80
 .rept 2
81
     vld1.s16    {q8-q9}, r0!
82
@@ -363,7 +363,7 @@
83
     vmlal.s16   q0, d18, d18
84
     vmlal.s16   q1, d19, d19
85
 .endr
86
-    bne         .loop_sse_ss_64
87
+    bne         .Loop_sse_ss_64
88
     vadd.s32    q0, q1
89
     vadd.s32    d0, d0, d1
90
     vpadd.s32   d0, d0, d0
91
@@ -417,7 +417,7 @@
92
     veor.u8     q0, q0
93
     veor.u8     q1, q1
94
 
95
-.loop_ssd_s_16:
96
+.Loop_ssd_s_16:
97
     subs        r12, #1
98
 .rept 2
99
     vld1.s16    {q8-q9}, r0, r1
100
@@ -431,7 +431,7 @@
101
     vmlal.s16   q0, d22, d22
102
     vmlal.s16   q1, d23, d23
103
 .endr
104
-    bne         .loop_ssd_s_16
105
+    bne         .Loop_ssd_s_16
106
     vadd.s32    q0, q1
107
     vadd.s32    d0, d0, d1
108
     vpadd.s32   d0, d0, d0
109
@@ -446,7 +446,7 @@
110
     veor.u8     q0, q0
111
     veor.u8     q1, q1
112
 
113
-.loop_ssd_s_32:
114
+.Loop_ssd_s_32:
115
     subs        r12, #1
116
 .rept 4
117
     vld1.s16    {q8-q9}, r0!
118
@@ -460,7 +460,7 @@
119
     vmlal.s16   q0, d22, d22
120
     vmlal.s16   q1, d23, d23
121
 .endr
122
-    bne         .loop_ssd_s_32
123
+    bne         .Loop_ssd_s_32
124
     vadd.s32    q0, q1
125
     vadd.s32    d0, d0, d1
126
     vpadd.s32   d0, d0, d0
127
x265_3.6.tar.gz/source/common/common.h -> x265_4.0.tar.gz/source/common/common.h Changed
14
 
1
@@ -176,6 +176,12 @@
2
 template<typename T> /* clip to pixel range, 0..255 or 0..1023 */
3
 inline pixel x265_clip(T x) { return (pixel)x265_min<T>(T((1 << X265_DEPTH) - 1), x265_max<T>(T(0), x)); }
4
 
5
+/* get the sign of input variable */
6
+static inline int8_t x265_signOf(int32_t x)
7
+{
8
+    return (x >> 31) | ((int32_t)((((uint32_t) - x)) >> 31));
9
+}
10
+
11
 typedef int16_t  coeff_t;      // transform coefficient
12
 
13
 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
14
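The new x265_signOf() helper is branchless: for positive x the arithmetic shift term is 0 and the unsigned-negation term contributes 1, for negative x the shift term is -1 (all bits set) and the OR keeps it, and for zero both terms are 0. A small self-check of that reasoning (illustrative only, not part of the sources):

    #include <assert.h>
    #include <stdint.h>

    static inline int8_t sign_ref(int32_t x) { return (int8_t)((x > 0) - (x < 0)); }

    void check_signOf()
    {
        const int32_t samples[] = { -7, -1, 0, 1, 42 };
        for (int32_t x : samples)
            assert(((x >> 31) | (int32_t)(((uint32_t)-x) >> 31)) == sign_ref(x));
    }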
x265_3.6.tar.gz/source/common/cpu.cpp -> x265_4.0.tar.gz/source/common/cpu.cpp Changed
45
 
1
@@ -115,6 +115,12 @@
2
 #if defined(HAVE_SVE2)
3
     { "SVE2",            X265_CPU_SVE2 },
4
 #endif
5
+#if defined(HAVE_NEON_DOTPROD)
6
+    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
7
+#endif
8
+#if defined(HAVE_NEON_I8MM)
9
+    { "Neon_I8MM",       X265_CPU_NEON_I8MM },
10
+#endif
11
 #elif X265_ARCH_POWER8
12
     { "Altivec",         X265_CPU_ALTIVEC },
13
 
14
@@ -389,17 +395,22 @@
15
 {
16
     int flags = 0;
17
 
18
-    #if defined(HAVE_SVE2)
19
-         flags |= X265_CPU_SVE2;
20
-         flags |= X265_CPU_SVE;
21
+    #if HAVE_NEON
22
          flags |= X265_CPU_NEON;
23
-    #elif defined(HAVE_SVE)
24
+    #endif
25
+    #if HAVE_NEON_DOTPROD
26
+         flags |= X265_CPU_NEON_DOTPROD;
27
+    #endif
28
+    #if HAVE_NEON_I8MM
29
+         flags |= X265_CPU_NEON_I8MM;
30
+    #endif
31
+    #if HAVE_SVE
32
          flags |= X265_CPU_SVE;
33
-         flags |= X265_CPU_NEON;
34
-    #elif HAVE_NEON
35
-         flags |= X265_CPU_NEON;
36
     #endif
37
-        
38
+    #if HAVE_SVE2
39
+         flags |= X265_CPU_SVE2;
40
+    #endif
41
+
42
     return flags;
43
 }
44
 
45
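With this change each Arm capability bit is reported independently instead of being inferred from the highest extension that was compiled in, so Neon, DotProd, I8MM, SVE and SVE2 can all be present in the same mask. A hypothetical consumer of that mask (only the X265_CPU_* names come from the diff; the function itself is a sketch):

    #include "x265.h"   // defines the X265_CPU_* bit masks

    static bool canUseDotProdKernels(uint32_t cpuMask)
    {
        // DotProd kernels extend the baseline Neon paths, so both bits are expected.
        return (cpuMask & X265_CPU_NEON) && (cpuMask & X265_CPU_NEON_DOTPROD);
    }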
x265_3.6.tar.gz/source/common/cudata.cpp -> x265_4.0.tar.gz/source/common/cudata.cpp Changed
201
 
1
@@ -290,6 +290,10 @@
2
     m_bFirstRowInSlice = (uint8_t)firstRowInSlice;
3
     m_bLastRowInSlice  = (uint8_t)lastRowInSlice;
4
     m_bLastCuInSlice   = (uint8_t)lastCuInSlice;
5
+#if ENABLE_SCC_EXT
6
+    m_lastIntraBCMv[0].set(0, 0);
7
+    m_lastIntraBCMv[1].set(0, 0);
8
+#endif
9
 
10
     /* sequential memsets */
11
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
12
@@ -323,7 +327,7 @@
13
 }
14
 
15
 // initialize Sub partition
16
-void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp)
17
+void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv[2])
18
 {
19
     m_absIdxInCTU   = cuGeom.absPartIdx;
20
     m_encData       = ctu.m_encData;
21
@@ -360,6 +364,14 @@
22
     /* initialize the remaining CU data in one memset */
23
     memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 13 : BytesPerPartition - 9) * m_numPartitions);
24
     memset(m_distortion, 0, m_numPartitions * sizeof(sse_t));
25
+
26
+#if ENABLE_SCC_EXT
27
+    if (lastIntraBCMv)
28
+    {
29
+        for (int i = 0; i < 2; i++)
30
+            m_lastIntraBCMv[i] = lastIntraBCMv[i];
31
+    }
32
+#endif
33
 }
34
 
35
 /* Copy the results of a sub-part (split) CU to the parent CU */
36
@@ -415,6 +427,10 @@
37
         memcpy(m_trCoeff1 + tmpC2, subCU.m_trCoeff1, sizeof(coeff_t) * tmpC);
38
         memcpy(m_trCoeff2 + tmpC2, subCU.m_trCoeff2, sizeof(coeff_t) * tmpC);
39
     }
40
+#if ENABLE_SCC_EXT
41
+    for (int i = 0; i < 2; i++)
42
+        m_lastIntraBCMv[i] = subCU.m_lastIntraBCMv[i];
43
+#endif
44
 }
45
 
46
 /* If a sub-CU part is not present (off the edge of the picture) its depth and
47
@@ -1591,7 +1607,11 @@
48
                 return maxNumMergeCand;
49
         }
50
     }
51
+#if ENABLE_SCC_EXT
52
+    if (m_slice->m_bTemporalMvp)
53
+#else
54
     if (m_slice->m_sps->bTemporalMVPEnabled)
55
+#endif
56
     {
57
         uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
58
         MV colmv;
59
@@ -1681,10 +1701,15 @@
60
             }
61
         }
62
     }
63
-    int numRefIdx = (isInterB) ? X265_MIN(m_slice->m_numRefIdx[0], m_slice->m_numRefIdx[1]) : m_slice->m_numRefIdx[0];
64
+    int numRefIdx0 = m_slice->m_numRefIdx[0];
65
+#if ENABLE_SCC_EXT
66
+    if (m_slice->m_param->bEnableSCC)
67
+        numRefIdx0--;
68
+#endif
69
+    int numRefIdx = (isInterB) ? X265_MIN(numRefIdx0, m_slice->m_numRefIdx[1]) : numRefIdx0;
70
     int r = 0;
71
     int refcnt = 0;
72
-    while (count < maxNumMergeCand)
73
+    while (numRefIdx && (count < maxNumMergeCand))
74
     {
75
         candDir[count] = 1;
76
         candMvField[count][0].mv.word = 0;
77
@@ -1712,28 +1737,61 @@
78
 }
79
 
80
 // Create the PMV list. Called for each reference index.
81
-int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv) const
82
+int CUData::getPMV(InterNeighbourMV* neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx, uint32_t absPartIdx) const
83
 {
84
     MV directMV[MD_ABOVE_LEFT + 1];
85
     MV indirectMV[MD_ABOVE_LEFT + 1];
86
     bool validDirect[MD_ABOVE_LEFT + 1];
87
     bool validIndirect[MD_ABOVE_LEFT + 1];
88
 
89
-    // Left candidate.
90
-    validDirectMD_BELOW_LEFT  = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
91
-    validDirectMD_LEFT        = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
92
-    // Top candidate.
93
-    validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
94
-    validDirectMD_ABOVE       = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
95
-    validDirectMD_ABOVE_LEFT  = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
96
-
97
-    // Left candidate.
98
-    validIndirectMD_BELOW_LEFT  = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
99
-    validIndirectMD_LEFT        = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
100
-    // Top candidate.
101
-    validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
102
-    validIndirectMD_ABOVE       = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
103
-    validIndirectMD_ABOVE_LEFT  = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
104
+#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
105
+    if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC)
106
+    {
107
+        // Left candidate.
108
+        if ((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable)
109
+        {
110
+            validIndirectMD_ABOVE_RIGHT = validIndirectMD_ABOVE = validIndirectMD_ABOVE_LEFT = false;
111
+
112
+            validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
113
+            validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
114
+
115
+            validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
116
+            validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
117
+        }
118
+
119
+        // Top candidate.
120
+        validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
121
+        validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
122
+        validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
123
+
124
+        // Top candidate.
125
+        if (!((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable))
126
+        {
127
+            validDirectMD_BELOW_LEFT = validDirectMD_LEFT = validIndirectMD_BELOW_LEFT = validIndirectMD_LEFT = false;
128
+            validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
129
+            validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
130
+            validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
131
+        }
132
+    }
133
+    else
134
+#endif
135
+    {
136
+        // Left candidate.
137
+        validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
138
+        validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
139
+        // Top candidate.
140
+        validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
141
+        validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
142
+        validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
143
+
144
+        // Left candidate.
145
+        validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx);
146
+        validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx);
147
+        // Top candidate.
148
+        validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx);
149
+        validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx);
150
+        validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx);
151
+    }
152
 
153
     int num = 0;
154
     // Left predictor search
155
@@ -1781,27 +1839,80 @@
156
 
157
     // Get the collocated candidate. At this step, either the first candidate
158
     // was found or its value is 0.
159
-    if (m_slice->m_sps->bTemporalMVPEnabled && num < 2)
160
+#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT
161
+    if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC)
162
     {
163
-        int tempRefIdx = neighboursMD_COLLOCATED.refIdxpicList;
164
-        if (tempRefIdx != -1)
165
+        if (m_slice->m_bTemporalMvp && num < 2)
166
         {
167
-            uint32_t cuAddr = neighboursMD_COLLOCATED.cuAddrpicList;
168
-            const Frame* colPic = m_slice->m_refFrameListm_slice->isInterB() && !m_slice->m_colFromL0Flagm_slice->m_colRefIdx;
169
-            const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
170
+            int refId = refIdx;
171
+            uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;
172
+            uint32_t partIdxRB = deriveRightBottomIdx(puIdx);
173
+            bool isValid;
174
+
175
+            // co-located RightBottom temporal predictor (H)
176
+            int ctuIdx = -1;
177
 
178
-            // Scale the vector
179
-            int colRefPOC = colCU->m_slice->m_refPOCListtempRefIdx >> 4tempRefIdx & 0xf;
180
-            int colPOC = colCU->m_slice->m_poc;
181
+            // image boundary check
182
+            if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelXpartIdxRB + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples &&
183
+                m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelYpartIdxRB + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples)
184
+            {
185
+                uint32_t absPartIdxRB = g_zscanToRasterpartIdxRB;
186
+                uint32_t numUnits = s_numPartInCUSize;
187
+                bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1); // is not at the last column of CTU
188
+                bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1); // is not at the last row    of CTU
189
 
190
-            int curRefPOC = m_slice->m_refPOCListpicListrefIdx;
191
-            int curPOC = m_slice->m_poc;
192
-            pmvnumMvc++ = amvpCandnum++ = scaleMvByPOCDist(neighboursMD_COLLOCATED.mvpicList, curPOC, curRefPOC, colPOC, colRefPOC);
193
+                if (bNotLastCol && bNotLastRow)
194
+                {
195
+                    absPartAddr = g_rasterToZscanabsPartIdxRB + RASTER_SIZE + 1;
196
+                    ctuIdx = m_cuAddr;
197
+                }
198
+                else if (bNotLastCol)
199
+                    absPartAddr = g_rasterToZscan(absPartIdxRB + 1) & (numUnits - 1);
200
+                else if (bNotLastRow)
201
x265_3.6.tar.gz/source/common/cudata.h -> x265_4.0.tar.gz/source/common/cudata.h Changed
79
 
1
@@ -37,6 +37,9 @@
2
 class Slice;
3
 struct TUEntropyCodingParameters;
4
 struct CUDataMemPool;
5
+#if ENABLE_SCC_EXT
6
+struct IBC;
7
+#endif
8
 
9
 enum PartSize
10
 {
11
@@ -107,6 +110,8 @@
12
     // Collocated right bottom CU addr.
13
     uint32_t cuAddr[2];
14
 
15
+    bool isAvailable;
16
+
17
     // For spatial prediction, this field contains the reference index
18
     // in each list (-1 if not available).
19
     //
20
@@ -118,6 +123,14 @@
21
     union { int16_t refIdx[2]; int32_t unifiedRef; };
22
 };
23
 
24
+struct IBC
25
+{
26
+    int             m_numBVs;
27
+    int             m_numBV16s;
28
+    MV              m_BVs[64];
29
+    MV              m_lastIntraBCMv[2];
30
+};
31
+
32
 typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32)
33
 typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32)
34
 
35
@@ -230,13 +243,17 @@
36
     uint32_t*       m_collectCUVariance;
37
     uint32_t*       m_collectCUCount;
38
 
39
+#if ENABLE_SCC_EXT
40
+    MV              m_lastIntraBCMv[2];
41
+#endif
42
+
43
     CUData();
44
 
45
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance);
46
     static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArrayCUGeom::MAX_GEOMS);
47
 
48
     void     initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCUInSlice);
49
-    void     initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp);
50
+    void     initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv2 = 0);
51
     void     initLosslessCU(const CUData& cu, const CUGeom& cuGeom);
52
 
53
     void     copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx);
54
@@ -272,7 +289,7 @@
55
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
56
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)2, uint8_t* candDir) const;
57
     void     clipMv(MV& outMV) const;
58
-    int      getPMV(InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv) const;
59
+    int      getPMV(InterNeighbourMV* neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx = 0, uint32_t absPartIdx = 0) const;
60
     void     getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const;
61
     void     getIntraTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const;
62
     void     getInterTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const;
63
@@ -309,6 +326,15 @@
64
     const CUData* getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
65
     const CUData* getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const;
66
 
67
+#if ENABLE_SCC_EXT
68
+    void getIntraBCMVPsEncOnly(uint32_t absPartIdx, MV* MvPred, int& nbPred, int puIdx);
69
+    bool getDerivedBV(uint32_t absPartIdx, const MV& currentMv, MV& derivedMv, uint32_t width, uint32_t height);
70
+    bool isIntraBC(const CUData* cu, uint32_t absPartIdx) const;
71
+    bool getColMVPIBC(int ctuRsAddr, int partUnitIdx, MV& rcMv);
72
+    void roundMergeCandidates(MVField(*candMvField)2, int iCount) const;
73
+    bool is8x8BipredRestriction(MV mvL0, MV mvL1, int iRefIdxL0, int iRefIdxL1) const;
74
+#endif
75
+
76
 protected:
77
 
78
     template<typename T>
79
x265_3.6.tar.gz/source/common/dct.cpp -> x265_4.0.tar.gz/source/common/dct.cpp Changed
100
 
1
@@ -439,7 +439,8 @@
2
     }
3
 }
4
 
5
-static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
6
+namespace X265_NS {
7
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
8
 {
9
     const int shift_1st = 1 + X265_DEPTH - 8;
10
     const int shift_2nd = 8;
11
@@ -456,7 +457,7 @@
12
     fastForwardDst(coef, dst, shift_2nd);
13
 }
14
 
15
-static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
16
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
17
 {
18
     const int shift_1st = 1 + X265_DEPTH - 8;
19
     const int shift_2nd = 8;
20
@@ -473,7 +474,7 @@
21
     partialButterfly4(coef, dst, shift_2nd, 4);
22
 }
23
 
24
-static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
25
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
26
 {
27
     const int shift_1st = 2 + X265_DEPTH - 8;
28
     const int shift_2nd = 9;
29
@@ -490,7 +491,7 @@
30
     partialButterfly8(coef, dst, shift_2nd, 8);
31
 }
32
 
33
-static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
34
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
35
 {
36
     const int shift_1st = 3 + X265_DEPTH - 8;
37
     const int shift_2nd = 10;
38
@@ -507,7 +508,7 @@
39
     partialButterfly16(coef, dst, shift_2nd, 16);
40
 }
41
 
42
-static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
43
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
44
 {
45
     const int shift_1st = 4 + X265_DEPTH - 8;
46
     const int shift_2nd = 11;
47
@@ -524,7 +525,7 @@
48
     partialButterfly32(coef, dst, shift_2nd, 32);
49
 }
50
 
51
-static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
52
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
53
 {
54
     const int shift_1st = 7;
55
     const int shift_2nd = 12 - (X265_DEPTH - 8);
56
@@ -541,7 +542,7 @@
57
     }
58
 }
59
 
60
-static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
61
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
62
 {
63
     const int shift_1st = 7;
64
     const int shift_2nd = 12 - (X265_DEPTH - 8);
65
@@ -558,7 +559,7 @@
66
     }
67
 }
68
 
69
-static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
70
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
71
 {
72
     const int shift_1st = 7;
73
     const int shift_2nd = 12 - (X265_DEPTH - 8);
74
@@ -575,7 +576,7 @@
75
     }
76
 }
77
 
78
-static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
79
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
80
 {
81
     const int shift_1st = 7;
82
     const int shift_2nd = 12 - (X265_DEPTH - 8);
83
@@ -592,7 +593,7 @@
84
     }
85
 }
86
 
87
-static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
88
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
89
 {
90
     const int shift_1st = 7;
91
     const int shift_2nd = 12 - (X265_DEPTH - 8);
92
@@ -608,6 +609,7 @@
93
         memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
94
     }
95
 }
96
+} // namespace X265_NS
97
 
98
 static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
99
 {
100
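Dropping the static qualifiers and wrapping the C transforms in the X265_NS namespace makes them reachable from other translation units, which the Arm primitive-setup code can then declare and fall back to. A sketch of such a declaration (the exact header it lives in is an assumption; the signatures are taken from the diff):

    namespace X265_NS {
    void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride);
    void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride);
    }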
x265_3.6.tar.gz/source/common/deblock.cpp -> x265_4.0.tar.gz/source/common/deblock.cpp Changed
19
 
1
@@ -316,7 +316,7 @@
2
 
3
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength)
4
 {
5
-    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
6
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
7
     pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
8
     intptr_t stride = reconPic->m_stride;
9
     const PPS* pps = cuQ->m_slice->m_pps;
10
@@ -429,7 +429,7 @@
11
                 : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
12
                "invalid edge\n");
13
 
14
-    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
15
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
16
     intptr_t stride = reconPic->m_strideC;
17
     intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
18
     bool bCheckNoFilter = pps->bTransquantBypassEnabled;
19
x265_3.6.tar.gz/source/common/frame.cpp -> x265_4.0.tar.gz/source/common/frame.cpp Changed
147
 
1
@@ -37,7 +37,8 @@
2
     m_reconColCount = NULL;
3
     m_countRefEncoders = 0;
4
     m_encData = NULL;
5
-    m_reconPic = NULL;
6
+    for (int i = 0; i < NUM_RECON_VERSION; i++)
7
+        m_reconPic[i] = NULL;
8
     m_quantOffsets = NULL;
9
     m_next = NULL;
10
     m_prev = NULL;
11
@@ -75,6 +76,11 @@
12
 
13
     m_tempLayer = 0;
14
     m_sameLayerRefPic = false;
15
+
16
+    m_viewId = 0;
17
+    m_valid = 0;
18
+    m_nextSubDPB = NULL;
19
+    m_prevSubDPB = NULL;
20
 }
21
 
22
 bool Frame::create(x265_param *param, float* quantOffsets)
23
@@ -85,6 +91,7 @@
24
     if (m_param->bEnableTemporalFilter)
25
     {
26
         m_mcstf = new TemporalFilter;
27
+        m_mcstf->m_range = param->mcstfFrameRange;
28
         m_mcstf->init(param);
29
 
30
         m_fencPicSubsampled2 = new PicYuv;
31
@@ -198,29 +205,35 @@
32
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
33
 {
34
     m_encData = new FrameData;
35
-    m_reconPic = new PicYuv;
36
     m_param = param;
37
-    m_encData->m_reconPic = m_reconPic;
38
-    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param);
39
+    for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
40
+    {
41
+        m_reconPic[i] = new PicYuv;
42
+        m_encData->m_reconPic[i] = m_reconPic[i];
43
+    }
44
+    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic[0]->create(param) && (param->bEnableSCC ? (param->bEnableSCC && m_reconPic[1]->create(param)) : 1);
45
     if (ok)
46
     {
47
-        /* initialize right border of m_reconpicYuv as SAO may read beyond the
48
+        /* initialize right border of m_reconPicYuv as SAO may read beyond the
49
          * end of the picture accessing uninitialized pixels */
50
         int maxHeight = sps.numCuInHeight * param->maxCUSize;
51
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
52
+        memset(m_reconPic[0]->m_picOrg[0], 0, sizeof(pixel)* m_reconPic[0]->m_stride * maxHeight);
53
 
54
-        /* use pre-calculated cu/pu offsets cached in the SPS structure */
55
-        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
56
-        m_reconPic->m_buOffsetY = sps.buOffsetY;
57
-
58
-        if (param->internalCsp != X265_CSP_I400)
59
+        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
60
         {
61
-            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
62
-            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
63
-
64
             /* use pre-calculated cu/pu offsets cached in the SPS structure */
65
-            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
66
-            m_reconPic->m_buOffsetC = sps.buOffsetC;
67
+            m_reconPic[i]->m_cuOffsetY = sps.cuOffsetY;
68
+            m_reconPic[i]->m_buOffsetY = sps.buOffsetY;
69
+
70
+            if (param->internalCsp != X265_CSP_I400)
71
+            {
72
+                memset(m_reconPic[i]->m_picOrg[1], 0, sizeof(pixel) * m_reconPic[i]->m_strideC * (maxHeight >> m_reconPic[i]->m_vChromaShift));
73
+                memset(m_reconPic[i]->m_picOrg[2], 0, sizeof(pixel) * m_reconPic[i]->m_strideC * (maxHeight >> m_reconPic[i]->m_vChromaShift));
74
+
75
+                /* use pre-calculated cu/pu offsets cached in the SPS structure */
76
+                m_reconPic[i]->m_cuOffsetC = sps.cuOffsetC;
77
+                m_reconPic[i]->m_buOffsetC = sps.buOffsetC;
78
+            }
79
         }
80
     }
81
     return ok;
82
@@ -230,7 +243,8 @@
83
 void Frame::reinit(const SPS& sps)
84
 {
85
     m_bChromaExtended = false;
86
-    m_reconPic = m_encData->m_reconPic;
87
+    for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
88
+        m_reconPic[i] = m_encData->m_reconPic[i];
89
     m_encData->reinit(sps);
90
 }
91
 
92
@@ -243,6 +257,35 @@
93
         m_encData = NULL;
94
     }
95
 
96
+#if ENABLE_MULTIVIEW
97
+    //Destroy interlayer References
98
+    if (refPicSetInterLayer0.size())
99
+    {
100
+        Frame* iterFrame = refPicSetInterLayer0.first();
101
+
102
+        while (iterFrame)
103
+        {
104
+            Frame* curFrame = iterFrame;
105
+            iterFrame = iterFrame->m_nextSubDPB;
106
+            refPicSetInterLayer0.removeSubDPB(*curFrame);
107
+            iterFrame = refPicSetInterLayer0.first();
108
+        }
109
+    }
110
+
111
+    if (refPicSetInterLayer1.size())
112
+    {
113
+        Frame* iterFrame = refPicSetInterLayer1.first();
114
+
115
+        while (iterFrame)
116
+        {
117
+            Frame* curFrame = iterFrame;
118
+            iterFrame = iterFrame->m_nextSubDPB;
119
+            refPicSetInterLayer1.removeSubDPB(*curFrame);
120
+            iterFrame = refPicSetInterLayer1.first();
121
+        }
122
+    }
123
+#endif
124
+
125
     if (m_fencPic)
126
     {
127
         if (m_param->bCopyPicToFrame)
128
@@ -271,11 +314,14 @@
129
         X265_FREE(m_isSubSampled);
130
     }
131
 
132
-    if (m_reconPic)
133
+    for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
134
     {
135
-        m_reconPic->destroy();
136
-        delete m_reconPic;
137
-        m_reconPic = NULL;
138
+        if (m_reconPic[i])
139
+        {
140
+            m_reconPic[i]->destroy();
141
+            delete m_reconPic[i];
142
+            m_reconPic[i] = NULL;
143
+        }
144
     }
145
 
146
     if (m_reconRowFlag)
147
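Editor's note: Frame now carries an array of reconstructed pictures (m_reconPic[NUM_RECON_VERSION]); the second slot is only allocated when SCC is enabled. A one-line sketch of the loop bound used throughout this file (assumption: bEnableSCC is the x265_param flag added by this release):

    // !!bEnableSCC + 1 evaluates to 1 for normal encodes and 2 when --scc is on,
    // so every allocate/reinit/destroy loop walks exactly the versions that exist.
    inline int reconVersions(int bEnableSCC) { return !!bEnableSCC + 1; }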
x265_3.6.tar.gz/source/common/frame.h -> x265_4.0.tar.gz/source/common/frame.h Changed
33
 
1
@@ -81,13 +81,16 @@
2
     /* These two items will be NULL until the Frame begins to be encoded, at which point
3
      * it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
4
     FrameData*             m_encData;
5
-    PicYuv*                m_reconPic;
6
+    PicYuv*                m_reconPic[NUM_RECON_VERSION];
7
 
8
     /* Data associated with x265_picture */
9
     PicYuv*                m_fencPic;
10
     PicYuv*                m_fencPicSubsampled2;
11
     PicYuv*                m_fencPicSubsampled4;
12
 
13
+    PicList                refPicSetInterLayer0;
14
+    PicList                refPicSetInterLayer1;
15
+
16
     int                    m_poc;
17
     int                    m_encodeOrder;
18
     int                    m_gopOffset;
19
@@ -161,6 +164,13 @@
20
     int8_t                 m_gopId;
21
     bool                   m_sameLayerRefPic;
22
 
23
+    int                    m_sLayerId;
24
+    bool                   m_valid;
25
+
26
+    int                    m_viewId;
27
+    Frame*                 m_nextSubDPB;           // PicList doubly linked list pointers
28
+    Frame*                 m_prevSubDPB;
29
+
30
     Frame();
31
 
32
     bool create(x265_param *param, float* quantOffsets);
33
x265_3.6.tar.gz/source/common/framedata.h -> x265_4.0.tar.gz/source/common/framedata.h Changed
10
 
1
@@ -115,7 +115,7 @@
2
     const x265_param* m_param;
3
 
4
     FrameData*     m_freeListNext;
5
-    PicYuv*        m_reconPic;
6
+    PicYuv*        m_reconPic[NUM_RECON_VERSION];
7
     bool           m_bHasReferences;   /* used during DPB/RPS updates */
8
     int            m_frameEncoderID;   /* the ID of the FrameEncoder encoding this frame */
9
     JobProvider*   m_jobProvider;
10
x265_3.6.tar.gz/source/common/ipfilter.cpp -> x265_4.0.tar.gz/source/common/ipfilter.cpp Changed
23
 
1
@@ -34,8 +34,8 @@
2
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
3
 #endif
4
 
5
-namespace {
6
-// file local namespace
7
+namespace X265_NS {
8
+// x265 private namespace
9
 
10
 template<int width, int height>
11
 void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
12
@@ -367,10 +367,6 @@
13
     interp_horiz_ps_c<N, width, height>(src, srcStride, immed, width, idxX, 1);
14
     filterVertical_sp_c<N>(immed + (N / 2 - 1) * width, width, dst, dstStride, width, height, idxY);
15
 }
16
-}
17
-
18
-namespace X265_NS {
19
-// x265 private namespace
20
 
21
 #define CHROMA_420(W, H) \
22
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
23
x265_3.6.tar.gz/source/common/loopfilter.cpp -> x265_4.0.tar.gz/source/common/loopfilter.cpp Changed
55
 
1
@@ -30,16 +30,10 @@
2
 
3
 namespace {
4
 
5
-/* get the sign of input variable (TODO: this is a dup, make common) */
6
-inline int8_t signOf(int x)
7
-{
8
-    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
9
-}
10
-
11
 static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
12
 {
13
     for (int x = 0; x < endX; x++)
14
-        dst[x] = signOf(src1[x] - src2[x]);
15
+        dst[x] = x265_signOf(src1[x] - src2[x]);
16
 }
17
 
18
 static void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
19
@@ -70,7 +64,7 @@
20
 
21
     for (x = 0; x < width; x++)
22
     {
23
-        signDown = signOf(rec[x] - rec[x + stride]);
24
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
25
         edgeType = signDown + upBuff1[x] + 2;
26
         upBuff1[x] = -signDown;
27
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
28
@@ -87,7 +81,7 @@
29
     {
30
         for (x = 0; x < width; x++)
31
         {
32
-            signDown = signOf(rec[x] - rec[x + stride]);
33
+            signDown = x265_signOf(rec[x] - rec[x + stride]);
34
             edgeType = signDown + upBuff1[x] + 2;
35
             upBuff1[x] = -signDown;
36
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
37
@@ -101,7 +95,7 @@
38
     int x;
39
     for (x = 0; x < width; x++)
40
     {
41
-        int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
42
+        int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
43
         int edgeType = signDown + buff1[x] + 2;
44
         bufft[x + 1] = -signDown;
45
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
46
@@ -115,7 +109,7 @@
47
 
48
     for (int x = startX + 1; x < endX; x++)
49
     {
50
-        signDown = signOf(rec[x] - rec[x + stride]);
51
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
52
         edgeType = signDown + upBuff1[x] + 2;
53
         upBuff1[x - 1] = -signDown;
54
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
55
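Editor's note: the SAO helpers switch from a file-local signOf() to the shared x265_signOf(). A sketch of the branchless form being shared (it mirrors the deleted local helper shown above; the canonical definition lives in the common headers):

    // Returns -1, 0 or +1 without branches: the arithmetic shift produces the
    // "negative" mask, the unsigned negate-and-shift produces the "positive" bit.
    inline int8_t signOfSketch(int x)
    {
        return (int8_t)((x >> 31) | ((int)(((uint32_t)-x) >> 31)));
    }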
x265_3.6.tar.gz/source/common/lowpassdct.cpp -> x265_4.0.tar.gz/source/common/lowpassdct.cpp Changed
28
 
1
@@ -58,7 +58,7 @@
2
     }
3
 
4
     // replace first coef with total block average
5
-    dst[0] = totalSum << 1;
6
+    dst[0] = (X265_DEPTH == 8) ? (totalSum << 1) : (totalSum >> ((X265_DEPTH - 9)));
7
 }
8
 
9
 static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
10
@@ -83,7 +83,7 @@
11
     {
12
         memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t));
13
     }
14
-    dst[0] = static_cast<int16_t>(totalSum >> 1);
15
+    dst[0] = static_cast<int16_t>(totalSum >> (1 + (X265_DEPTH - 8)));
16
 }
17
 
18
 static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
19
@@ -108,7 +108,7 @@
20
     {
21
         memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t));
22
     }
23
-    dst[0] = static_cast<int16_t>(totalSum >> 3);
24
+    dst[0] = static_cast<int16_t>(totalSum >> (3 + (X265_DEPTH - 8)));
25
 }
26
 
27
 namespace X265_NS {
28
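Editor's note: the lowpass DCT fix scales the DC substitute by the extra transform gain at high bit depth. A worked check of the new shift (assumption: totalSum is the block sum computed just above each assignment):

    // lowPassDct16: DC = totalSum >> (1 + (X265_DEPTH - 8))
    //   8-bit build  -> totalSum >> 1
    //   10-bit build -> totalSum >> 3   (two extra bits of forward-transform gain)
    constexpr int dc16Shift(int bitDepth) { return 1 + (bitDepth - 8); }
    static_assert(dc16Shift(8) == 1 && dc16Shift(10) == 3, "DC shift per bit depth");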
x265_3.6.tar.gz/source/common/param.cpp -> x265_4.0.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -183,6 +183,7 @@
2
     param->bEnableSceneCutAwareQp = 0;
3
     param->fwdMaxScenecutWindow = 1200;
4
     param->bwdMaxScenecutWindow = 600;
5
+    param->mcstfFrameRange = 2;
6
     for (int i = 0; i < 6; i++)
7
     {
8
         int deltas[6] = { 5, 4, 3, 2, 1, 0 };
9
@@ -391,6 +392,10 @@
10
     param->bEnableTemporalFilter = 0;
11
     param->temporalFilterStrength = 0.95;
12
 
13
+    /*Alpha Channel Encoding*/
14
+    param->bEnableAlpha = 0;
15
+    param->numScalableLayers = 1;
16
+
17
 #ifdef SVT_HEVC
18
     param->svtHevcParam = svtParam;
19
     svt_param_default(param);
20
@@ -398,6 +403,15 @@
21
     /* Film grain characteristics model filename */
22
     param->filmGrain = NULL;
23
     param->bEnableSBRC = 0;
24
+
25
+    /* Multi-View Encoding*/
26
+    param->numViews = 1;
27
+    param->format = 0;
28
+
29
+    param->numLayers = 1;
30
+
31
+    /* SCC */
32
+    param->bEnableSCC = 0;
33
 }
34
 
35
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
36
@@ -417,6 +431,7 @@
37
 
38
         if (!strcmp(preset, "ultrafast"))
39
         {
40
+            param->mcstfFrameRange = 1;
41
             param->maxNumMergeCand = 2;
42
             param->bIntraInBFrames = 0;
43
             param->lookaheadDepth = 5;
44
@@ -441,6 +456,7 @@
45
         }
46
         else if (!strcmp(preset, "superfast"))
47
         {
48
+            param->mcstfFrameRange = 1;
49
             param->maxNumMergeCand = 2;
50
             param->bIntraInBFrames = 0;
51
             param->lookaheadDepth = 10;
52
@@ -461,6 +477,7 @@
53
         }
54
         else if (!strcmp(preset, "veryfast"))
55
         {
56
+            param->mcstfFrameRange = 1;
57
             param->maxNumMergeCand = 2;
58
             param->limitReferences = 3;
59
             param->bIntraInBFrames = 0;
60
@@ -474,6 +491,7 @@
61
         }
62
         else if (!strcmp(preset, "faster"))
63
         {
64
+            param->mcstfFrameRange = 1;
65
             param->maxNumMergeCand = 2;
66
             param->limitReferences = 3;
67
             param->bIntraInBFrames = 0;
68
@@ -485,6 +503,7 @@
69
         }
70
         else if (!strcmp(preset, "fast"))
71
         {
72
+            param->mcstfFrameRange = 1;
73
             param->maxNumMergeCand = 2;
74
             param->limitReferences = 3;
75
             param->bEnableEarlySkip = 0;
76
@@ -497,6 +516,7 @@
77
         }
78
         else if (!strcmp(preset, "medium"))
79
         {
80
+            param->mcstfFrameRange = 1;
81
             /* defaults */
82
         }
83
         else if (!strcmp(preset, "slow"))
84
@@ -1437,6 +1457,33 @@
85
         OPT("film-grain") p->filmGrain = (char* )value;
86
         OPT("mcstf") p->bEnableTemporalFilter = atobool(value);
87
         OPT("sbrc") p->bEnableSBRC = atobool(value);
88
+#if ENABLE_ALPHA
89
+        OPT("alpha")
90
+        {
91
+            if (atobool(value))
92
+            {
93
+                p->bEnableAlpha = 1;
94
+                p->numScalableLayers = 2;
95
+                p->numLayers = 2;
96
+            }
97
+        }
98
+#endif
99
+#if ENABLE_MULTIVIEW
100
+        OPT("format")
101
+            p->format = atoi(value);
102
+        OPT("num-views")
103
+        {
104
+            p->numViews = atoi(value);
105
+        }
106
+#endif
107
+#if ENABLE_SCC_EXT
108
+        OPT("scc")
109
+        {
110
+            p->bEnableSCC = atoi(value);
111
+            if (p->bEnableSCC)
112
+                p->bEnableWeightedPred = false;
113
+        }
114
+#endif
115
         else
116
             return X265_PARAM_BAD_NAME;
117
     }
118
@@ -1674,7 +1721,7 @@
119
         CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f,
120
               "Minimum edge density percentage for a CU should be an integer between 0 to 100");
121
     }
122
-    CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead,
123
+    CHECK(param->bframes && (param->bEnableTemporalFilter ? (param->bframes > param->lookaheadDepth) : (param->bframes >= param->lookaheadDepth)) && !param->rc.bStatRead,
124
           "Lookahead depth must be greater than the max consecutive bframe count");
125
     CHECK(param->bframes < 0,
126
           "bframe count should be greater than zero");
127
@@ -1908,6 +1955,21 @@
128
         }
129
     }
130
     CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. It must be one of the X265_DATA_SHARE_MODES enum values\n" );
131
+#if ENABLE_ALPHA
132
+    if (param->bEnableAlpha)
133
+    {
134
+        CHECK((param->internalCsp != X265_CSP_I420), "Alpha encode supported only with i420a colorspace");
135
+        CHECK((param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Alpha encode doesnot support multipass feature");
136
+    }
137
+#endif
138
+#if ENABLE_MULTIVIEW
139
+    CHECK((param->numViews > 2), "Multi-View Encoding currently support only 2 views");
140
+    CHECK((param->numViews > 1) && (param->internalBitDepth != 8), "BitDepthConstraint must be 8 for Multiview main profile");
141
+    CHECK((param->numViews > 1) && (param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Multiview encode doesnot support multipass feature");
142
+#endif
143
+#if ENABLE_SCC_EXT
144
+    CHECK(!!param->bEnableSCC&& param->rdLevel != 6, "Enabling scc extension in x265 requires rdlevel of 6 ");
145
+#endif
146
     return check_failed;
147
 }
148
 
149
@@ -2072,6 +2134,12 @@
150
     TOOLOPT(param->rc.bStatWrite, "stats-write");
151
     TOOLOPT(param->rc.bStatRead,  "stats-read");
152
     TOOLOPT(param->bSingleSeiNal, "single-sei");
153
+#if ENABLE_ALPHA
154
+    TOOLOPT(param->numScalableLayers > 1, "alpha");
155
+#endif
156
+#if ENABLE_MULTIVIEW
157
+    TOOLOPT(param->numViews > 1, "multi-view");
158
+#endif
159
 #if ENABLE_HDR10_PLUS
160
     TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
161
 #endif
162
@@ -2336,6 +2404,16 @@
163
     if (p->filmGrain)
164
         s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename
165
     BOOL(p->bEnableTemporalFilter, "mcstf");
166
+#if ENABLE_ALPHA
167
+    BOOL(p->bEnableAlpha, "alpha");
168
+#endif
169
+#if ENABLE_MULTIVIEW
170
+    s += sprintf(s, " num-views=%d", p->numViews);
171
+    s += sprintf(s, " format=%d", p->format);
172
+#endif
173
+#if ENABLE_SCC_EXT
174
+    s += sprintf(s, "scc=%d", p->bEnableSCC);
175
+#endif
176
     BOOL(p->bEnableSBRC, "sbrc");
177
 #undef BOOL
178
     return buf;
179
@@ -2558,6 +2636,7 @@
180
 
181
 void x265_copy_params(x265_param* dst, x265_param* src)
182
 {
183
+    dst->mcstfFrameRange = src->mcstfFrameRange;
184
     dst->cpuid = src->cpuid;
185
     dst->frameNumThreads = src->frameNumThreads;
186
     if (src->numaPools) dst->numaPools = strdup(src->numaPools);
187
@@ -2856,6 +2935,18 @@
188
     dst->confWinRightOffset = src->confWinRightOffset;
189
     dst->confWinBottomOffset = src->confWinBottomOffset;
190
     dst->bliveVBV2pass = src->bliveVBV2pass;
191
+#if ENABLE_ALPHA
192
+    dst->bEnableAlpha = src->bEnableAlpha;
193
+    dst->numScalableLayers = src->numScalableLayers;
194
+#endif
195
+#if ENABLE_MULTIVIEW
196
+    dst->numViews = src->numViews;
197
+    dst->format = src->format;
198
+#endif
199
+    dst->numLayers = src->numLayers;
200
+#if ENABLE_SCC_EXT
201
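Editor's note: the new alpha/multiview/SCC switches are plain key-value options, so API users can reach them through the public x265_param_parse() entry point as well as the CLI. A hedged usage sketch (the keys are only recognized in builds configured with ENABLE_ALPHA, ENABLE_MULTIVIEW and ENABLE_SCC_EXT respectively):

    #include <x265.h>

    void enableLayeredFeatures(x265_param* p)
    {
        x265_param_parse(p, "alpha", "1");      // needs ENABLE_ALPHA; forces two scalable layers
        x265_param_parse(p, "num-views", "2");  // needs ENABLE_MULTIVIEW
        x265_param_parse(p, "scc", "1");        // needs ENABLE_SCC_EXT; validated against rd level 6
    }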
x265_3.6.tar.gz/source/common/piclist.cpp -> x265_4.0.tar.gz/source/common/piclist.cpp Changed
160
 
1
@@ -82,6 +82,82 @@
2
     m_count++;
3
 }
4
 
5
+#if ENABLE_MULTIVIEW
6
+Frame* PicList::popFrontSubDPB()
7
+{
8
+    if (m_start)
9
+    {
10
+        Frame* temp = m_start;
11
+        m_count--;
12
+
13
+        if (m_count)
14
+        {
15
+            m_start = m_start->m_nextSubDPB;
16
+            m_start->m_prevSubDPB = NULL;
17
+        }
18
+        else
19
+        {
20
+            m_start = m_end = NULL;
21
+        }
22
+        temp->m_next = temp->m_prev = NULL;
23
+        return temp;
24
+    }
25
+    else
26
+        return NULL;
27
+}
28
+
29
+void PicList::pushBackSubDPB(Frame& curFrame)
30
+{
31
+    X265_CHECK(!curFrame.m_nextSubDPB && !curFrame.m_prevSubDPB, "piclist: picture already in Sub DPB list\n"); // ensure frame is not in a list
32
+    curFrame.m_nextSubDPB = NULL;
33
+    curFrame.m_prevSubDPB = m_end;
34
+
35
+    if (m_count)
36
+    {
37
+        m_end->m_nextSubDPB = &curFrame;
38
+        m_end = &curFrame;
39
+    }
40
+    else
41
+    {
42
+        m_start = m_end = &curFrame;
43
+    }
44
+    m_count++;
45
+}
46
+
47
+void PicList::removeSubDPB(Frame& curFrame)
48
+{
49
+#if _DEBUG
50
+    Frame* tmp = m_start;
51
+    while (tmp && tmp != &curFrame)
52
+    {
53
+        tmp = tmp->m_nextSubDPB;
54
+    }
55
+
56
+    X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list
57
+#endif
58
+
59
+    m_count--;
60
+    if (m_count)
61
+    {
62
+        if (m_start == &curFrame)
63
+            m_start = curFrame.m_nextSubDPB;
64
+        if (m_end == &curFrame)
65
+            m_end = curFrame.m_prevSubDPB;
66
+
67
+        if (curFrame.m_nextSubDPB)
68
+            curFrame.m_nextSubDPB->m_prevSubDPB = curFrame.m_prevSubDPB;
69
+        if (curFrame.m_prevSubDPB)
70
+            curFrame.m_prevSubDPB->m_nextSubDPB = curFrame.m_nextSubDPB;
71
+    }
72
+    else
73
+    {
74
+        m_start = m_end = NULL;
75
+    }
76
+
77
+    curFrame.m_nextSubDPB = curFrame.m_prevSubDPB = NULL;
78
+}
79
+#endif
80
+
81
 void PicList::pushBackMCSTF(Frame& curFrame)
82
 {
83
     X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
84
@@ -123,11 +199,16 @@
85
         return NULL;
86
 }
87
 
88
-Frame* PicList::getPOC(int poc)
89
+Frame* PicList::getPOC(int poc, int sLayerId)
90
 {
91
     Frame *curFrame = m_start;
92
-    while (curFrame && curFrame->m_poc != poc)
93
+    int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
94
+    while (curFrame && (curFrame->m_poc != poc || layer != sLayerId))
95
+    {
96
         curFrame = curFrame->m_next;
97
+        if(curFrame)
98
+            layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
99
+    }
100
     return curFrame;
101
 }
102
 
103
@@ -185,10 +266,11 @@
104
         return NULL;
105
 }
106
 
107
-Frame* PicList::getCurFrame(void)
108
+Frame* PicList::getCurFrame(int sLayer)
109
 {
110
     Frame *curFrame = m_start;
111
-    if (curFrame != NULL)
112
+    int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0;
113
+    if (layer == sLayer && curFrame != NULL)
114
         return curFrame;
115
     else
116
         return NULL;
117
@@ -227,6 +309,42 @@
118
     curFrame.m_next = curFrame.m_prev = NULL;
119
 }
120
 
121
+
122
+Frame* PicList::removeFrame(Frame& curFrame)
123
+{
124
+    Frame* tmp = &curFrame;
125
+#if _DEBUG
126
+    tmp = m_start;
127
+    while (tmp && tmp != &curFrame)
128
+    {
129
+        tmp = tmp->m_next;
130
+    }
131
+
132
+    X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list
133
+#endif
134
+
135
+    m_count--;
136
+    if (m_count)
137
+    {
138
+        if (m_start == &curFrame)
139
+            m_start = curFrame.m_next;
140
+        if (m_end == &curFrame)
141
+            m_end = curFrame.m_prev;
142
+
143
+        if (curFrame.m_next)
144
+            curFrame.m_next->m_prev = curFrame.m_prev;
145
+        if (curFrame.m_prev)
146
+            curFrame.m_prev->m_next = curFrame.m_next;
147
+    }
148
+    else
149
+    {
150
+        m_start = m_end = NULL;
151
+    }
152
+
153
+    curFrame.m_next = curFrame.m_prev = NULL;
154
+    return tmp;
155
+}
156
+
157
 void PicList::removeMCSTF(Frame& curFrame)
158
 {
159
 #if _DEBUG
160
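Editor's note: the new sub-DPB helpers maintain a second doubly linked list through m_nextSubDPB/m_prevSubDPB alongside the existing m_next/m_prev chain. A standalone sketch of the unlink invariant they implement (illustrative, not the actual PicList code):

    struct Node { Node* prev; Node* next; };

    // Remove n from the list while keeping head/tail consistent, then clear its links.
    void unlink(Node*& head, Node*& tail, Node* n)
    {
        if (n->prev) n->prev->next = n->next; else head = n->next;
        if (n->next) n->next->prev = n->prev; else tail = n->prev;
        n->prev = n->next = nullptr;
    }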
x265_3.6.tar.gz/source/common/piclist.h -> x265_4.0.tar.gz/source/common/piclist.h Changed
45
 
1
@@ -50,10 +50,16 @@
2
     /** Push picture to end of the list */
3
     void pushBack(Frame& pic);
4
     void pushBackMCSTF(Frame& pic);
5
+#if ENABLE_MULTIVIEW
6
+    void pushBackSubDPB(Frame& pic);
7
+#endif
8
 
9
     /** Push picture to beginning of the list */
10
     void pushFront(Frame& pic);
11
     void pushFrontMCSTF(Frame& pic);
12
+#if ENABLE_MULTIVIEW
13
+    Frame* popFrontSubDPB();
14
+#endif
15
 
16
     /** Pop picture from end of the list */
17
     Frame* popBack();
18
@@ -63,17 +69,24 @@
19
     Frame* popFront();
20
 
21
     /** Find frame with specified POC */
22
-    Frame* getPOC(int poc);
23
+    Frame* getPOC(int poc, int sLayerId = 0);
24
     /* Find next MCSTF frame with specified POC */
25
     Frame* getPOCMCSTF(int poc);
26
 
27
     /** Get the current Frame from the list **/
28
-    Frame* getCurFrame(void);
29
+    Frame* getCurFrame(int sLayer);
30
 
31
     /** Remove picture from list */
32
     void remove(Frame& pic);
33
+
34
+    /** Remove picture from list */
35
+    Frame* removeFrame(Frame& pic);
36
     /* Remove MCSTF picture from list */
37
     void removeMCSTF(Frame& pic);
38
+#if ENABLE_MULTIVIEW
39
+    /** Remove picture from Sub list */
40
+    void removeSubDPB(Frame& pic);
41
+#endif
42
 
43
     Frame* first()        { return m_start;   }
44
 
45
x265_3.6.tar.gz/source/common/picyuv.cpp -> x265_4.0.tar.gz/source/common/picyuv.cpp Changed
201
 
1
@@ -258,7 +258,7 @@
2
 
3
 /* Copy pixels from an x265_picture into internal PicYuv instance.
4
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
5
-void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
6
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady, bool isBase)
7
 {
8
     /* m_picWidth is the width that is being encoded, padx indicates how many
9
      * of those pixels are padding to reach multiple of MinCU(4) size.
10
@@ -321,78 +321,157 @@
11
 #else /* Case for (X265_DEPTH == 8) */
12
             // TODO: Does we need this path? may merge into above in future
13
         {
14
-            pixel *yPixel = m_picOrg[0];
15
-            uint8_t *yChar = (uint8_t*)pic.planes[0];
16
-
17
-            for (int r = 0; r < height; r++)
18
+            if (isBase || param.numViews > 1)
19
             {
20
-                memcpy(yPixel, yChar, width * sizeof(pixel));
21
+                int offsetX, offsetY;
22
+                offsetX = (!isBase && pic.format == 1 ? width : 0);
23
+                offsetY = (!isBase && pic.format == 2 ? pic.stride[0] * height : 0);
24
+                pixel *yPixel = m_picOrg[0];
25
+                uint8_t* yChar = (uint8_t*)pic.planes[0] + offsetX + offsetY;
26
 
27
-                yPixel += m_stride;
28
-                yChar += pic.stride[0] / sizeof(*yChar);
29
-            }
30
+                for (int r = 0; r < height; r++)
31
+                {
32
+                    memcpy(yPixel, yChar, width * sizeof(pixel));
33
 
34
-            if (param.internalCsp != X265_CSP_I400)
35
+                    yPixel += m_stride;
36
+                    yChar += pic.stride[0] / sizeof(*yChar);
37
+                }
38
+
39
+                if (param.internalCsp != X265_CSP_I400)
40
+                {
41
+                    offsetX = offsetX >> m_hChromaShift;
42
+                    int offsetYU = (!isBase && pic.format == 2 ? pic.stride[1] * (height >> m_vChromaShift) : 0);
43
+                    int offsetYV = (!isBase && pic.format == 2 ? pic.stride[2] * (height >> m_vChromaShift) : 0);
44
+
45
+                    pixel *uPixel = m_picOrg[1];
46
+                    pixel *vPixel = m_picOrg[2];
47
+
48
+                    uint8_t* uChar = (uint8_t*)pic.planes[1] + offsetX + offsetYU;
49
+                    uint8_t* vChar = (uint8_t*)pic.planes[2] + offsetX + offsetYV;
50
+
51
+                    for (int r = 0; r < height >> m_vChromaShift; r++)
52
+                    {
53
+                        memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
54
+                        memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
55
+
56
+                        uPixel += m_strideC;
57
+                        vPixel += m_strideC;
58
+                        uChar += pic.stride[1] / sizeof(*uChar);
59
+                        vChar += pic.stride[2] / sizeof(*vChar);
60
+                    }
61
+                }
62
+            }
63
+#if ENABLE_ALPHA
64
+            if (!isBase && param.bEnableAlpha)
65
             {
66
-                pixel *uPixel = m_picOrg[1];
67
-                pixel *vPixel = m_picOrg[2];
68
+                pixel* aPixel = m_picOrg[0];
69
+                uint8_t* aChar = (uint8_t*)pic.planes[3];
70
 
71
-                uint8_t *uChar = (uint8_t*)pic.planes[1];
72
-                uint8_t *vChar = (uint8_t*)pic.planes[2];
73
+                for (int r = 0; r < height; r++)
74
+                {
75
+                    memcpy(aPixel, aChar, width * sizeof(pixel));
76
+
77
+                    aPixel += m_stride;
78
+                    aChar += pic.stride[0] / sizeof(*aChar);
79
+                }
80
+
81
+                pixel* uPixel = m_picOrg[1];
82
+                pixel* vPixel = m_picOrg[2];
83
 
84
                 for (int r = 0; r < height >> m_vChromaShift; r++)
85
                 {
86
-                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
87
-                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
88
+                    memset(uPixel, 128, (width >> m_hChromaShift) * sizeof(pixel));
89
+                    memset(vPixel, 128, (width >> m_hChromaShift) * sizeof(pixel));
90
 
91
                     uPixel += m_strideC;
92
                     vPixel += m_strideC;
93
-                    uChar += pic.stride[1] / sizeof(*uChar);
94
-                    vChar += pic.stride[2] / sizeof(*vChar);
95
                 }
96
             }
97
+#endif
98
         }
99
 #endif /* (X265_DEPTH > 8) */
100
         }
101
         else /* pic.bitDepth > 8 */
102
         {
103
             /* defensive programming, mask off bits that are supposed to be zero */
104
-            uint16_t mask = (1 << X265_DEPTH) - 1;
105
-            int shift = abs(pic.bitDepth - X265_DEPTH);
106
-            pixel *yPixel = m_picOrg[0];
107
+            if (isBase)
108
+            {
109
+                uint16_t mask = (1 << X265_DEPTH) - 1;
110
+                int shift = abs(pic.bitDepth - X265_DEPTH);
111
+                pixel* yPixel = m_picOrg[0];
112
 
113
-            uint16_t *yShort = (uint16_t*)pic.planes[0];
114
+                uint16_t* yShort = (uint16_t*)pic.planes[0];
115
 
116
-            if (pic.bitDepth > X265_DEPTH)
117
-            {
118
-                /* shift right and mask pixels to final size */
119
-                primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
120
-            }
121
-            else /* Case for (pic.bitDepth <= X265_DEPTH) */
122
-            {
123
-                /* shift left and mask pixels to final size */
124
-                primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
125
-            }
126
+                if (pic.bitDepth > X265_DEPTH)
127
+                {
128
+                    /* shift right and mask pixels to final size */
129
+                    primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
130
+                }
131
+                else /* Case for (pic.bitDepth <= X265_DEPTH) */
132
+                {
133
+                    /* shift left and mask pixels to final size */
134
+                    primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
135
+                }
136
 
137
-            if (param.internalCsp != X265_CSP_I400)
138
+                if (param.internalCsp != X265_CSP_I400)
139
+                {
140
+                    pixel* uPixel = m_picOrg[1];
141
+                    pixel* vPixel = m_picOrg[2];
142
+
143
+                    uint16_t* uShort = (uint16_t*)pic.planes[1];
144
+                    uint16_t* vShort = (uint16_t*)pic.planes[2];
145
+
146
+                    if (pic.bitDepth > X265_DEPTH)
147
+                    {
148
+                        primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
149
+                        primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
150
+                    }
151
+                    else /* Case for (pic.bitDepth <= X265_DEPTH) */
152
+                    {
153
+                        primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
154
+                        primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
155
+                    }
156
+                }
157
+            }
158
+#if ENABLE_ALPHA
159
+            if (!isBase && param.bEnableAlpha)
160
             {
161
-                pixel *uPixel = m_picOrg[1];
162
-                pixel *vPixel = m_picOrg[2];
163
+                /* defensive programming, mask off bits that are supposed to be zero */
164
+                uint16_t mask = (1 << X265_DEPTH) - 1;
165
+                int shift = abs(pic.bitDepth - X265_DEPTH);
166
+                pixel* yPixel = m_picOrg[0];
167
 
168
-                uint16_t *uShort = (uint16_t*)pic.planes[1];
169
-                uint16_t *vShort = (uint16_t*)pic.planes[2];
170
+                uint16_t* yShort = (uint16_t*)pic.planes[3];
171
 
172
                 if (pic.bitDepth > X265_DEPTH)
173
                 {
174
-                    primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
175
-                    primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
176
+                    /* shift right and mask pixels to final size */
177
+                    primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
178
                 }
179
                 else /* Case for (pic.bitDepth <= X265_DEPTH) */
180
                 {
181
-                    primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
182
-                    primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
183
+                    /* shift left and mask pixels to final size */
184
+                    primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
185
+                }
186
+
187
+                if (param.internalCsp != X265_CSP_I400)
188
+                {
189
+                    pixel* uPixel = m_picOrg[1];
190
+                    pixel* vPixel = m_picOrg[2];
191
+
192
+                    for (int r = 0; r < height >> m_vChromaShift; r++)
193
+                    {
194
+                        for (int c = 0; c < (width >> m_hChromaShift); c++)
195
+                        {
196
+                            uPixel[c] = ((1 << X265_DEPTH) >> 1);
197
+                            vPixel[c] = ((1 << X265_DEPTH) >> 1);
198
+                        }
199
+                        uPixel += m_strideC;
200
+                        vPixel += m_strideC;
201
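Editor's note: when the auxiliary alpha layer is copied in, its luma comes from pic.planes[3] and the chroma planes are filled with mid-grey. A quick check of that neutral value (X265_DEPTH is the build-time bit depth):

    constexpr int neutralChroma(int bitDepth) { return (1 << bitDepth) >> 1; }
    static_assert(neutralChroma(8) == 128 && neutralChroma(10) == 512, "mid-grey per bit depth");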
x265_3.6.tar.gz/source/common/picyuv.h -> x265_4.0.tar.gz/source/common/picyuv.h Changed
10
 
1
@@ -83,7 +83,7 @@
2
     void  destroy();
3
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
4
 
5
-    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
6
+    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady, bool isBase = true);
7
     void  copyFromFrame(PicYuv* source);
8
 
9
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
10
x265_3.6.tar.gz/source/common/pixel.cpp -> x265_4.0.tar.gz/source/common/pixel.cpp Changed
23
 
1
@@ -266,10 +266,6 @@
2
 {
3
     int satd = 0;
4
 
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
6
-    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
7
-#endif
8
-
9
     for (int row = 0; row < h; row += 4)
10
         for (int col = 0; col < w; col += 4)
11
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
12
@@ -284,10 +280,6 @@
13
 {
14
     int satd = 0;
15
 
16
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
17
-    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
18
-#endif
19
-
20
     for (int row = 0; row < h; row += 4)
21
         for (int col = 0; col < w; col += 8)
22
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
23
x265_3.6.tar.gz/source/common/predict.cpp -> x265_4.0.tar.gz/source/common/predict.cpp Changed
98
 
1
@@ -112,10 +112,22 @@
2
         }
3
         else
4
         {
5
-            if (bLuma)
6
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
7
-            if (bChroma)
8
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
9
+#if ENABLE_SCC_EXT
10
+            if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
11
+            {
12
+                if (bLuma)
13
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
14
+                if (bChroma)
15
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
16
+            }
17
+            else
18
+#endif
19
+            {
20
+                if (bLuma)
21
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
22
+                if (bChroma)
23
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
24
+            }
25
         }
26
     }
27
     else
28
@@ -174,12 +186,22 @@
29
 
30
             if (bLuma)
31
             {
32
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
33
+#if ENABLE_SCC_EXT
34
+                if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
35
+                    predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
36
+                else
37
+#endif
38
+                    predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
39
                 predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
40
             }
41
             if (bChroma)
42
             {
43
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
44
+#if ENABLE_SCC_EXT
45
+                if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
46
+                    predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
47
+                else
48
+#endif
49
+                    predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
50
                 predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
51
             }
52
 
53
@@ -206,10 +228,22 @@
54
             }
55
             else
56
             {
57
-                if (bLuma)
58
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
59
-                if (bChroma)
60
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
61
+#if ENABLE_SCC_EXT
62
+                if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx[0] - 1))
63
+                {
64
+                    if (bLuma)
65
+                        predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
66
+                    if (bChroma)
67
+                        predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList[0][refIdx0]->m_reconPic[1], mv0);
68
+                }
69
+                else
70
+#endif
71
+                {
72
+                    if (bLuma)
73
+                        predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
74
+                    if (bChroma)
75
+                        predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
76
+                }
77
             }
78
         }
79
         else
80
@@ -602,7 +636,7 @@
81
     int tuSize = 1 << intraNeighbors.log2TrSize;
82
     int tuSize2 = tuSize << 1;
83
 
84
-    PicYuv* reconPic = cu.m_encData->m_reconPic;
85
+    PicYuv* reconPic = cu.m_encData->m_reconPic[0];
86
     pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
87
     intptr_t picStride = reconPic->m_stride;
88
 
89
@@ -651,7 +685,7 @@
90
 
91
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
92
 {
93
-    PicYuv* reconPic = cu.m_encData->m_reconPic;
94
+    PicYuv* reconPic = cu.m_encData->m_reconPic[0];
95
     const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
96
     intptr_t picStride = reconPic->m_strideC;
97
 
98
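Editor's note: with SCC enabled, the last index of reference list 0 denotes the current picture itself, so prediction reads the unfiltered reconstruction (m_reconPic[1]) rather than a loop-filtered reference. A sketch of the guard the hunks above repeat (names mirror the diff; illustrative only):

    inline bool usesCurrentPictureRef(bool bEnableSCC, int refIdx0, int numRefIdxL0)
    {
        // true only for the synthetic "current picture" entry appended for intra block copy
        return bEnableSCC && refIdx0 == numRefIdxL0 - 1;
    }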
x265_3.6.tar.gz/source/common/primitives.cpp -> x265_4.0.tar.gz/source/common/primitives.cpp Changed
12
 
1
@@ -258,8 +258,8 @@
2
             primitives.cu[i].intra_pred_allangs = NULL;
3
 
4
 #if ENABLE_ASSEMBLY
5
-#if X265_ARCH_X86
6
-        setupInstrinsicPrimitives(primitives, param->cpuid);
7
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
8
+        setupIntrinsicPrimitives(primitives, param->cpuid);
9
 #endif
10
         setupAssemblyPrimitives(primitives, param->cpuid);
11
 #endif
12
x265_3.6.tar.gz/source/common/primitives.h -> x265_4.0.tar.gz/source/common/primitives.h Changed
15
 
1
@@ -470,12 +470,9 @@
2
 }
3
 
4
 void setupCPrimitives(EncoderPrimitives &p);
5
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
6
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
7
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
8
 void setupAliasPrimitives(EncoderPrimitives &p);
9
-#if X265_ARCH_ARM64
10
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
11
-#endif
12
 #if HAVE_ALTIVEC
13
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
14
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
15
x265_3.6.tar.gz/source/common/slice.cpp -> x265_4.0.tar.gz/source/common/slice.cpp Changed
201
 
1
@@ -29,17 +29,83 @@
2
 
3
 using namespace X265_NS;
4
 
5
-void Slice::setRefPicList(PicList& picList)
6
+#if ENABLE_MULTIVIEW
7
+void Slice::createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1)
8
 {
9
+
10
+    for (int i = 0; i < 1; i++)
11
+    {
12
+        int layerIdRef = 0;// getRefPicLayerId(i);
13
+        Frame* refPic = picList.getPOC(m_poc, 0);
14
+        int viewIdCur = 0;
15
+        int viewIdZero = 1;
16
+        int viewIdRef = 1;
17
+
18
+        if ((viewIdCur <= viewIdZero && viewIdCur <= viewIdRef) || (viewIdCur >= viewIdZero && viewIdCur >= viewIdRef))
19
+        {
20
+            refPicSetInterLayer0.pushBackSubDPB(*refPic);
21
+        }
22
+        else
23
+        {
24
+            refPicSetInterLayer1.pushBackSubDPB(*refPic);
25
+        }
26
+    }
27
+}
28
+#endif
29
+
30
+void Slice::setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int sLayerId)
31
+{
32
+    bool checkNumPocTotalCurr = m_param->bEnableSCC ? false : true;
33
     if (m_sliceType == I_SLICE)
34
     {
35
         memset(m_refFrameList, 0, sizeof(m_refFrameList));
36
         memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
37
         memset(m_refPOCList, 0, sizeof(m_refPOCList));
38
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
39
+
40
+#if ENABLE_SCC_EXT
41
+        if (!checkNumPocTotalCurr)
42
+        {
43
+            if (m_rps.numberOfPictures == 0)
44
+            {
45
+                Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1));
46
+                if (prevPic->m_poc != X265_MAX(0, m_poc - 1))
47
+                {
48
+                    prevPic = picList.getPOC(m_poc);
49
+                }
50
+                m_lastEncPic = prevPic;
51
+            }
52
+            return;
53
+        }
54
+#endif
55
+
56
         return;
57
     }
58
 
59
+#if ENABLE_SCC_EXT || ENABLE_MULTIVIEW || ENABLE_ALPHA
60
+    /*Reset the number of references for I-slice marked as P-slice*/
61
+    if ((m_param->bEnableSCC || sLayerId) && m_sliceType != m_origSliceType)
62
+    {
63
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
64
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
65
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
66
+        m_numRefIdx[0] = 1;
67
+    }
68
+#endif
69
+
70
+#if ENABLE_SCC_EXT
71
+    if (!checkNumPocTotalCurr && m_rps.numberOfPictures == 0)
72
+    {
73
+        Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1));
74
+        if (prevPic->m_poc != X265_MAX(0, m_poc - 1))
75
+        {
76
+            prevPic = picList.getPOC(m_poc);
77
+
78
+        }
79
+        m_lastEncPic = prevPic;
80
+    }
81
+#endif
82
+
83
     Frame* refPic = NULL;
84
     Frame* refPicSetStCurr0[MAX_NUM_REF];
85
     Frame* refPicSetStCurr1[MAX_NUM_REF];
86
@@ -51,9 +117,9 @@
87
 
88
     for (i = 0; i < m_rps.numberOfNegativePictures; i++)
89
     {
90
-        if (m_rps.bUsed[i])
91
+        if (m_rps.bUsed[i] && m_origSliceType != I_SLICE)
92
         {
93
-            refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i]);
94
+            refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i], m_rps.deltaPOC[i] ? sLayerId : 0);
95
             refPicSetStCurr0[numPocStCurr0] = refPic;
96
             numPocStCurr0++;
97
         }
98
@@ -61,9 +127,9 @@
99
 
100
     for (; i < m_rps.numberOfNegativePictures + m_rps.numberOfPositivePictures; i++)
101
     {
102
-        if (m_rps.bUsedi)
103
+        if (m_rps.bUsedi && m_origSliceType != I_SLICE)
104
         {
105
-            refPic = picList.getPOC(m_poc + m_rps.deltaPOCi);
106
+            refPic = picList.getPOC(m_poc + m_rps.deltaPOCi, m_rps.deltaPOCi ? sLayerId : 0);
107
             refPicSetStCurr1numPocStCurr1 = refPic;
108
             numPocStCurr1++;
109
         }
110
@@ -75,18 +141,44 @@
111
     // ref_pic_list_init
112
     Frame* rpsCurrList0MAX_NUM_REF + 1;
113
     Frame* rpsCurrList1MAX_NUM_REF + 1;
114
+#if ENABLE_MULTIVIEW
115
+    int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr + refPicSetInterLayer0.size() + refPicSetInterLayer1.size();
116
+#else
117
     int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr;
118
+#endif
119
+
120
+#if ENABLE_SCC_EXT
121
+    if (m_param->bEnableSCC)
122
+        numPocTotalCurr++;
123
+#endif
124
 
125
     int cIdx = 0;
126
     for (i = 0; i < numPocStCurr0; i++, cIdx++)
127
         rpsCurrList0cIdx = refPicSetStCurr0i;
128
 
129
+#if ENABLE_MULTIVIEW
130
+    if (m_param->numViews > 1)
131
+        for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++)
132
+            rpsCurrList0cIdx = refPicSetInterLayer0.getPOC(m_poc, 0);
133
+#endif
134
+
135
     for (i = 0; i < numPocStCurr1; i++, cIdx++)
136
         rpsCurrList0cIdx = refPicSetStCurr1i;
137
 
138
     for (i = 0; i < numPocLtCurr; i++, cIdx++)
139
         rpsCurrList0cIdx = refPicSetLtCurri;
140
 
141
+#if ENABLE_MULTIVIEW
142
+    if (m_param->numViews > 1)
143
+        for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++)
144
+            rpsCurrList0cIdx = refPicSetInterLayer1.getPOC(m_poc, 0);
145
+#endif
146
+
147
+#if ENABLE_SCC_EXT
148
+    if (m_param->bEnableSCC)
149
+        rpsCurrList0cIdx++ = picList.getPOC(m_poc);
150
+#endif
151
+
152
     X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n");
153
 
154
     if (m_sliceType == B_SLICE)
155
@@ -95,12 +187,29 @@
156
         for (i = 0; i < numPocStCurr1; i++, cIdx++)
157
             rpsCurrList1cIdx = refPicSetStCurr1i;
158
 
159
+#if ENABLE_MULTIVIEW
160
+        if (m_param->numViews > 1)
161
+            for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++)
162
+                rpsCurrList1cIdx = refPicSetInterLayer1.getPOC(m_poc, 0);
163
+#endif
164
+
165
         for (i = 0; i < numPocStCurr0; i++, cIdx++)
166
             rpsCurrList1cIdx = refPicSetStCurr0i;
167
 
168
         for (i = 0; i < numPocLtCurr; i++, cIdx++)
169
             rpsCurrList1cIdx = refPicSetLtCurri;
170
 
171
+#if ENABLE_MULTIVIEW
172
+        if (m_param->numViews > 1)
173
+            for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++)
174
+                rpsCurrList1cIdx = refPicSetInterLayer0.getPOC(m_poc, 0);
175
+#endif
176
+
177
+#if  ENABLE_SCC_EXT
178
+        if (m_param->bEnableSCC)
179
+            rpsCurrList1cIdx++ = picList.getPOC(m_poc);
180
+#endif
181
+
182
         X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n");
183
     }
184
 
185
@@ -109,8 +218,18 @@
186
         cIdx = rIdx % numPocTotalCurr;
187
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
188
         m_refFrameList0rIdx = rpsCurrList0cIdx;
189
+#if ENABLE_MULTIVIEW
190
+        m_refFrameList0rIdx = rpsCurrList0cIdx;
191
+#endif
192
     }
193
 
194
+#if  ENABLE_SCC_EXT
195
+    if (m_param->bEnableSCC && numPocTotalCurr > m_numRefIdx0)
196
+    {
197
+        m_refFrameList0m_numRefIdx0 - 1 = picList.getPOC(m_poc);
198
+    }
199
+#endif
200
+
201
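Editor's note: list construction now interleaves the inter-layer reference sets with the short-term and long-term candidates, and with SCC appends the current picture; the final lists are then filled modulo the candidate count. A minimal sketch of that wrap-around fill (illustrative, assuming cand[] already holds the ordered candidates):

    template <typename Pic>
    void fillRefList(Pic* dst[], Pic* const cand[], int numRefIdx, int numPocTotalCurr)
    {
        for (int rIdx = 0; rIdx < numRefIdx; rIdx++)
            dst[rIdx] = cand[rIdx % numPocTotalCurr];   // wrap when more refs are requested than candidates
    }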
x265_3.6.tar.gz/source/common/slice.h -> x265_4.0.tar.gz/source/common/slice.h Changed
124
 
1
@@ -73,7 +73,11 @@
2
         MAIN10 = 2,
3
         MAINSTILLPICTURE = 3,
4
         MAINREXT = 4,
5
-        HIGHTHROUGHPUTREXT = 5
6
+        HIGHTHROUGHPUTREXT = 5,
7
+        MULTIVIEWMAIN = 6,
8
+        SCALABLEMAIN = 7,
9
+        SCALABLEMAIN10 = 8,
10
+        MAINSCC = 9
11
     };
12
 }
13
 
14
@@ -106,7 +110,7 @@
15
 
16
 struct ProfileTierLevel
17
 {
18
-    int      profileIdc;
19
+    int      profileIdc[MAX_LAYERS];
20
     int      levelIdc;
21
     uint32_t minCrForLevel;
22
     uint32_t maxLumaSrForLevel;
23
@@ -159,6 +163,27 @@
24
     uint32_t         numReorderPics[MAX_T_LAYERS];
25
     uint32_t         maxDecPicBuffering[MAX_T_LAYERS];
26
     uint32_t         maxLatencyIncrease[MAX_T_LAYERS];
27
+    int              m_numLayers;
28
+    int              m_numViews;
29
+    bool             vps_extension_flag;
30
+
31
+#if (ENABLE_ALPHA || ENABLE_MULTIVIEW)
32
+    bool             splitting_flag;
33
+    int              m_scalabilityMask[MAX_VPS_NUM_SCALABILITY_TYPES];
34
+    int              scalabilityTypes;
35
+    uint8_t          m_dimensionIdLen[MAX_VPS_NUM_SCALABILITY_TYPES];
36
+    uint8_t          m_dimensionId[MAX_VPS_LAYER_ID_PLUS1][MAX_VPS_NUM_SCALABILITY_TYPES];
37
+    bool             m_nuhLayerIdPresentFlag;
38
+    uint8_t          m_layerIdInNuh[MAX_VPS_LAYER_ID_PLUS1];
39
+    uint8_t          m_layerIdInVps[MAX_VPS_LAYER_ID_PLUS1];
40
+    int              m_viewIdLen;
41
+    int              m_vpsNumLayerSetsMinus1;
42
+    int              m_numLayersInIdList[1023];
43
+#endif
44
+
45
+#if ENABLE_MULTIVIEW
46
+    int              m_layerIdIncludedFlag;
47
+#endif
48
 };
49
 
50
 struct Window
51
@@ -252,6 +277,13 @@
52
 
53
     Window   conformanceWindow;
54
     VUI      vuiParameters;
55
+    bool     sps_extension_flag;
56
+
57
+#if ENABLE_MULTIVIEW
58
+    int      setSpsExtOrMaxSubLayersMinus1;
59
+    int      maxViews;
60
+    bool     vui_parameters_present_flag;
61
+#endif
62
 
63
     SPS()
64
     {
65
@@ -290,6 +322,11 @@
66
 
67
     int      numRefIdxDefault[2];
68
     bool     pps_slice_chroma_qp_offsets_present_flag;
69
+
70
+    bool     pps_extension_flag;
71
+    int      maxViews;
72
+
73
+    int      profileIdc;
74
 };
75
 
76
 struct WeightParam
77
@@ -339,6 +376,7 @@
78
 
79
     NalUnitType m_nalUnitType;
80
     SliceType   m_sliceType;
81
+    SliceType   m_origSliceType;
82
     int         m_sliceQp;
83
     int         m_chromaQpOffset[2];
84
     int         m_poc;
85
@@ -365,6 +403,13 @@
86
     int         m_fieldNum;
87
     Frame*      m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
88
 
89
+#if  ENABLE_SCC_EXT
90
+    Frame*      m_lastEncPic;
91
+    bool        m_bLMvdL1Zero;
92
+    bool        m_useIntegerMv;
93
+#endif
94
+    bool        m_bTemporalMvp;
95
+
96
     Slice()
97
     {
98
         m_lastIDR = 0;
99
@@ -380,11 +425,23 @@
100
         m_rpsIdx = -1;
101
         m_chromaQpOffset[0] = m_chromaQpOffset[1] = 0;
102
         m_fieldNum = 0;
103
+#if  ENABLE_SCC_EXT
104
+        m_lastEncPic = NULL;
105
+        m_useIntegerMv = false;
106
+#endif
107
+        m_bTemporalMvp = false;
108
     }
109
 
110
     void disableWeights();
111
 
112
-    void setRefPicList(PicList& picList);
113
+    void setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int viewId);
114
+#if ENABLE_MULTIVIEW
115
+    void createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1);
116
+#endif
117
+
118
+#if  ENABLE_SCC_EXT
119
+    bool isOnlyCurrentPictureAsReference() const;
120
+#endif
121
 
122
     bool getRapPicFlag() const
123
     {
124
x265_3.6.tar.gz/source/common/threadpool.cpp -> x265_4.0.tar.gz/source/common/threadpool.cpp Changed
13
 
1
@@ -669,7 +669,11 @@
2
     else if (cpuCount >= 16)
3
         p->frameNumThreads = 4; 
4
     else if (cpuCount >= 8)
5
+#if _WIN32 && X265_ARCH_ARM64
6
+        p->frameNumThreads = cpuCount;
7
+#else
8
         p->frameNumThreads = 3;
9
+#endif
10
     else if (cpuCount >= 4)
11
         p->frameNumThreads = 2;
12
     else
13
x265_3.6.tar.gz/source/common/vec/vec-primitives.cpp -> x265_4.0.tar.gz/source/common/vec/vec-primitives.cpp Changed
10
 
1
@@ -59,7 +59,7 @@
2
 void setupIntrinsicDCT_sse41(EncoderPrimitives&);
3
 
4
 /* Use primitives for the best available vector architecture */
5
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
6
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
7
 {
8
 #ifdef HAVE_SSE3
9
     if (cpuMask & X265_CPU_SSE3)
10
x265_3.6.tar.gz/source/common/wavefront.cpp -> x265_4.0.tar.gz/source/common/wavefront.cpp Changed
22
 
1
@@ -58,6 +58,11 @@
2
     x265_free((void*)m_externalDependencyBitmap);
3
 }
4
 
5
+void WaveFront::setLayerId(int layer)
6
+{
7
+    m_sLayerId = layer;
8
+}
9
+
10
 void WaveFront::clearEnabledRowMask()
11
 {
12
     memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
13
@@ -103,7 +108,7 @@
14
             if (ATOMIC_AND(&m_internalDependencyBitmapw, ~bit) & bit)
15
             {
16
                 /* we cleared the bit, we get to process the row */
17
-                processRow(w * 32 + id, threadId);
18
+                processRow(w * 32 + id, threadId, m_sLayerId);
19
                 m_helpWanted = true;
20
                 return; /* check for a higher priority task */
21
             }
22
x265_3.6.tar.gz/source/common/wavefront.h -> x265_4.0.tar.gz/source/common/wavefront.h Changed
21
 
1
@@ -52,6 +52,8 @@
2
 
3
     int m_numRows;
4
 
5
+    int m_sLayerId;
6
+
7
 protected:
8
     uint32_t *m_row_to_idx;
9
     uint32_t *m_idx_to_row;
10
@@ -95,7 +97,9 @@
11
 
12
     // Start or resume encode processing of this row, must be implemented by
13
     // derived classes.
14
-    virtual void processRow(int row, int threadId) = 0;
15
+    virtual void processRow(int row, int threadId, int layer) = 0;
16
+
17
+    void setLayerId(int layer);
18
 };
19
 } // end namespace X265_NS
20
 
21
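Editor's note: processRow() gains a layer argument so WPP rows can be dispatched per scalable or view layer, with setLayerId() recording which layer a frame encoder is working on. A standalone sketch of the widened hook (not the real WaveFront class, which also inherits the job-provider machinery):

    struct RowProcessor
    {
        virtual ~RowProcessor() = default;
        // row and threadId as before; layer selects the scalable/view layer being encoded
        virtual void processRow(int row, int threadId, int layer) = 0;
    };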
x265_3.6.tar.gz/source/encoder/analysis.cpp -> x265_4.0.tar.gz/source/encoder/analysis.cpp Changed
201
 
1
@@ -223,7 +223,12 @@
2
     }
3
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
4
 
5
-    if (m_slice->m_sliceType == I_SLICE)
6
+#if  ENABLE_SCC_EXT
7
+    memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs));
8
+    memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv));
9
+    m_ibc.m_numBV16s = 0; m_ibc.m_numBVs = 0;
10
+#endif
11
+    if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx[0] == 1) && m_slice->m_refPOCList[0][0] == m_slice->m_poc))
12
     {
13
         x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
14
         if (m_param->analysisLoadReuseLevel > 1)
15
@@ -233,7 +238,11 @@
16
             memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
17
             memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
18
         }
19
+#if ENABLE_SCC_EXT
20
+        compressIntraCU(ctu, cuGeom, qp, &m_ibc);
21
+#else
22
         compressIntraCU(ctu, cuGeom, qp);
23
+#endif
24
     }
25
     else
26
     {
27
@@ -271,7 +280,7 @@
28
         {
29
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
30
              * they are available for intra predictions */
31
-            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
32
+            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic[0], ctu.m_cuAddr, 0);
33
 
34
             compressInterCU_rd0_4(ctu, cuGeom, qp);
35
 
36
@@ -304,7 +313,11 @@
37
         else if (m_param->rdLevel <= 4)
38
             compressInterCU_rd0_4(ctu, cuGeom, qp);
39
         else
40
+#if ENABLE_SCC_EXT
41
+            compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc);
42
+#else
43
             compressInterCU_rd5_6(ctu, cuGeom, qp);
44
+#endif
45
     }
46
 
47
     if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
48
@@ -508,15 +521,22 @@
49
 
50
     /* Copy best data to encData CTU and recon */
51
     md.bestMode->cu.copyToPic(depth);
52
-    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
53
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
54
 }
55
 
56
+#if ENABLE_SCC_EXT
57
+uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
58
+#else
59
 uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
60
+#endif
61
 {
62
     uint32_t depth = cuGeom.depth;
63
     ModeDepth& md = m_modeDepth[depth];
64
     md.bestMode = NULL;
65
 
66
+    MV iMVCandList[4][10];
67
+    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);
68
+
69
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
70
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
71
 
72
@@ -567,6 +587,43 @@
73
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
74
         }
75
 
76
+#if ENABLE_SCC_EXT
77
+        bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false;
78
+        if (m_param->bEnableSCC)
79
+        {
80
+            md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
81
+            checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);
82
+
83
+            md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
84
+            checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
85
+            checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);
86
+
87
+            if (intraBlockCopyFastSearch)
88
+            {
89
+                if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
90
+                {
91
+                    md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
92
+                    checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
93
+                    checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
94
+
95
+                    md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
96
+                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
97
+                    checkBestMode(md.pred[PRED_IBC_2NxN], depth);
98
+                }
99
+            }
100
+            else
101
+            {
102
+                md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
103
+                checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
104
+                checkBestMode(md.pred[PRED_IBC_2NxN], depth);
105
+
106
+                md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
107
+                checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
108
+                checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
109
+            }
110
+        }
111
+#endif
112
+
113
         if (m_bTryLossless)
114
             tryLossless(cuGeom);
115
 
116
@@ -574,6 +631,91 @@
117
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
118
     }
119
 
120
+#if ENABLE_SCC_EXT
121
+    // If Intra BC keep last coded Mv
122
+    if (md.bestMode && md.bestMode->cu.isInter(0))
123
+    {
124
+        MVField mvField;
125
+        const CUData* cu = &md.bestMode->cu;
126
+        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
127
+        int iRefIdxFirst = mvField.refIdx;
128
+        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
129
+        int iRefIdxLast = mvField.refIdx;
130
+        bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
131
+        bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;
132
+
133
+        if (isIntraBCFirst || isIntraBCLast)
134
+        {
135
+            if (cu->m_partSize[0] == SIZE_2Nx2N)
136
+            {
137
+                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
138
+                if (mvField.mv != cu->m_lastIntraBCMv[0])
139
+                {
140
+                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
141
+                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
142
+                }
143
+            }
144
+            else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
145
+            {
146
+                // mixed PU, only one partition is IntraBC coded
147
+                if (isIntraBCFirst != isIntraBCLast)
148
+                {
149
+                    if (isIntraBCFirst)
150
+                    {
151
+                        // Part 0
152
+                        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
153
+                        if (mvField.mv != cu->m_lastIntraBCMv[0])
154
+                        {
155
+                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
156
+                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
157
+                        }
158
+                    }
159
+                    else if (isIntraBCLast)
160
+                    {
161
+                        // Part 1
162
+                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
163
+                        if (mvField.mv != cu->m_lastIntraBCMv[0])
164
+                        {
165
+                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
166
+                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
167
+                        }
168
+                    }
169
+                }
170
+                else // normal IntraBC CU
171
+                {
172
+                    // Part 0
173
+                    md.bestMode->cu.getMvField(cu, 0, 0, mvField);
174
+                    if (mvField.mv != cu->m_lastIntraBCMv[0])
175
+                    {
176
+                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
177
+                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
178
+                    }
179
+                    // Part 1
180
+                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
181
+                    if (mvField.mv != cu->m_lastIntraBCMv[0])
182
+                    {
183
+                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
184
+                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
185
+                    }
186
+                }
187
+            }
188
+            else
189
+            {
190
+                // NxN
191
+                for (int part = 0; part < 4; part++)
192
+                {
193
+                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
194
+                    if (mvField.mv != cu->m_lastIntraBCMv[0])
195
+                    {
196
+                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
197
+                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
198
+                    }
199
+                }
200
+            }
201
x265_3.6.tar.gz/source/encoder/analysis.h -> x265_4.0.tar.gz/source/encoder/analysis.h Changed
70
 
1
@@ -75,6 +75,14 @@
2
         PRED_nRx2N,
3
         PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */
4
         PRED_LOSSLESS,  /* lossless encode of best mode */
5
+#if ENABLE_SCC_EXT
6
+        PRED_IBC_2Nx2N,
7
+        PRED_IBC_Nx2N,
8
+        PRED_IBC_2NxN,
9
+        PRED_MIXED_IBC_NX2N,
10
+        PRED_MIXED_IBC_2NXN,
11
+        PRED_MERGE_IBC,
12
+#endif
13
         MAX_PRED_TYPES
14
     };
15
 
16
@@ -113,6 +121,7 @@
17
     bool      m_modeFlag[2];
18
     bool      m_checkMergeAndSkipOnly[2];
19
 
20
+    IBC       m_ibc;
21
     Analysis();
22
 
23
     bool create(ThreadLocalData* tld);
24
@@ -120,6 +129,7 @@
25
 
26
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
27
     int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
28
+
29
 protected:
30
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
31
     x265_analysis_inter_data*  m_reuseInterDataCTU;
32
@@ -162,12 +172,20 @@
33
     void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
34
 
35
     /* full analysis for an I-slice CU */
36
+#if ENABLE_SCC_EXT
37
+    uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL);
38
+#else
39
     uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
40
+#endif
41
 
42
     /* full analysis for a P or B slice CU */
43
     uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
44
     SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
45
+#if ENABLE_SCC_EXT
46
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL);
47
+#else
48
     SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
49
+#endif
50
 
51
     void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
52
 
53
@@ -177,10 +195,15 @@
54
 
55
     /* measure inter options */
56
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
57
-    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
58
+    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2], MV* iMVCandList = NULL);
59
 
60
     void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
61
 
62
+#if ENABLE_SCC_EXT
63
+    void checkRDCostIntraBCMerge2Nx2N(Mode& merge, const CUGeom& cuGeom);
64
+    void checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList = NULL);
65
+#endif
66
+
67
     /* encode current bestMode losslessly, pick best RD cost */
68
     void tryLossless(const CUGeom& cuGeom);
69
 
70
x265_3.6.tar.gz/source/encoder/api.cpp -> x265_4.0.tar.gz/source/encoder/api.cpp Changed
201
 
1
@@ -20,7 +20,6 @@
2
  * This program is also available under a commercial proprietary license.
3
  * For more information, contact us at license @ x265.com.
4
  *****************************************************************************/
5
-
6
 #include "common.h"
7
 #include "bitstream.h"
8
 #include "param.h"
9
@@ -185,7 +184,7 @@
10
     // will detect and set profile/tier/level in VPS
11
     determineLevel(*param, encoder->m_vps);
12
 
13
-    if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc == Profile::NONE)
14
+    if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc[0] == Profile::NONE)
15
     {
16
         x265_log(param, X265_LOG_INFO, "non-conformant bitstreams not allowed (--allow-non-conformance)\n");
17
         goto fail;
18
@@ -357,11 +356,11 @@
19
             VPS saveVPS;
20
             memcpy(&saveVPS.ptl, &encoder->m_vps.ptl, sizeof(saveVPS.ptl));
21
             determineLevel(*encoder->m_latestParam, encoder->m_vps);
22
-            if (saveVPS.ptl.profileIdc != encoder->m_vps.ptl.profileIdc || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc
23
+            if (saveVPS.ptl.profileIdc[0] != encoder->m_vps.ptl.profileIdc[0] || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc
24
                 || saveVPS.ptl.tierFlag != encoder->m_vps.ptl.tierFlag)
25
             {
26
                 x265_log(encoder->m_param, X265_LOG_WARNING, "Profile/Level/Tier has changed from %d/%d/%s to %d/%d/%s.Cannot reconfigure rate-control.\n",
27
-                         saveVPS.ptl.profileIdc, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc,
28
+                         saveVPS.ptl.profileIdc[0], saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc[0],
29
                          encoder->m_vps.ptl.levelIdc, encoder->m_vps.ptl.tierFlag ? "High" : "Main");
30
                 x265_copy_params(encoder->m_latestParam, &save);
31
                 memcpy(&encoder->m_vps.ptl, &saveVPS.ptl, sizeof(saveVPS.ptl));
32
@@ -406,7 +405,7 @@
33
     return 0;
34
 }
35
 
36
-int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out)
37
+int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out)
38
 {
39
     if (!enc)
40
         return -1;
41
@@ -602,7 +601,10 @@
42
         *pi_nal = 0;
43
 
44
     if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
45
-        x265_csvlog_frame(encoder->m_param, pic_out);
46
+    {
47
+        for (int layer = 0; layer < encoder->m_param->numLayers; layer++)
48
+            x265_csvlog_frame(encoder->m_param, pic_out[layer]);
49
+    }
50
 
51
     if (numEncoded < 0)
52
         encoder->m_aborted = true;
53
@@ -653,11 +655,14 @@
54
     if (enc)
55
     {
56
         Encoder *encoder = static_cast<Encoder*>(enc);
57
-        x265_stats stats;       
58
-        encoder->fetchStats(&stats, sizeof(stats));
59
+        x265_stats stats[MAX_LAYERS];
60
         int padx = encoder->m_sps.conformanceWindow.rightOffset;
61
         int pady = encoder->m_sps.conformanceWindow.bottomOffset;
62
-        x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv);
63
+        for (int layer = 0; layer < encoder->m_param->numLayers; layer++)
64
+        {
65
+            encoder->fetchStats(stats, sizeof(stats[layer]), layer);
66
+            x265_csvlog_encode(encoder->m_param, &stats[0], padx, pady, argc, argv);
67
+        }
68
     }
69
 }
70
 
71
@@ -744,7 +749,7 @@
72
     if (!enc)
73
         return -1;
74
     Encoder *encoder = static_cast<Encoder*>(enc);
75
-    if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut))
76
+    if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut, 0))
77
         return 0;
78
     return -1;
79
 }
80
@@ -1295,7 +1300,7 @@
81
         {
82
             if (param->csvLogLevel)
83
             {
84
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
85
+                fprintf(csvfp, "Layer , Encode Order, Type, POC, QP, Bits, Scenecut, ");
86
                 if (!!param->bEnableTemporalSubLayers)
87
                     fprintf(csvfp, "Temporal Sub Layer ID, ");
88
                 if (param->csvLogLevel >= 2)
89
@@ -1409,7 +1414,7 @@
90
         return;
91
 
92
     const x265_frame_stats* frameStats = &pic->frameData;
93
-    fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
94
+    fprintf(param->csvfpt, "%d, %d, %c-SLICE, %4d, %2.2lf, %10d, %d,", pic->layerID, frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
95
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
96
     if (!!param->bEnableTemporalSubLayers)
97
         fprintf(param->csvfpt, "%d,", frameStats->tLayer);
98
@@ -1806,6 +1811,219 @@
99
     return ret;
100
 }
101
 
102
+static enum VmafOutputFormat log_fmt_map(const char *log_fmt)
103
+{
104
+   if (log_fmt) {
105
+       if (!strcmp(log_fmt, "xml"))
106
+           return VMAF_OUTPUT_FORMAT_XML;
107
+       if (!strcmp(log_fmt, "json"))
108
+           return VMAF_OUTPUT_FORMAT_JSON;
109
+       if (!strcmp(log_fmt, "csv"))
110
+           return VMAF_OUTPUT_FORMAT_CSV;
111
+       if (!strcmp(log_fmt, "sub"))
112
+           return VMAF_OUTPUT_FORMAT_SUB;
113
+   }
114
+
115
+   return VMAF_OUTPUT_FORMAT_NONE;
116
+}
117
+
118
+static enum VmafPoolingMethod pool_method_map(const char *pool_method)
119
+{
120
+   if (pool_method) {
121
+       if (!strcmp(pool_method, "min"))
122
+           return VMAF_POOL_METHOD_MIN;
123
+       if (!strcmp(pool_method, "mean"))
124
+           return VMAF_POOL_METHOD_MEAN;
125
+       if (!strcmp(pool_method, "harmonic_mean"))
126
+           return VMAF_POOL_METHOD_HARMONIC_MEAN;
127
+   }
128
+   return VMAF_POOL_METHOD_MEAN;
129
+}
130
+
131
+static enum VmafPixelFormat pix_fmt_map(const char *fmt)
132
+{
133
+   if (fmt) {
134
+       if (!strcmp(fmt, "yuv420p") || !strcmp(fmt, "yuv420p10le") || !strcmp(fmt, "yuv420p12le") || !strcmp(fmt, "yuv420p16le"))
135
+            return VMAF_PIX_FMT_YUV420P;
136
+        if (!strcmp(fmt, "yuv422p") || !strcmp(fmt, "yuv422p10le"))
137
+            return VMAF_PIX_FMT_YUV422P;
138
+        if (!strcmp(fmt, "yuv444p") || !strcmp(fmt, "yuv444p10le"))
139
+            return VMAF_PIX_FMT_YUV444P;
140
+   }
141
+   return VMAF_PIX_FMT_UNKNOWN;
142
+}
143
+
144
+static void copy_picture(float *src, VmafPicture *dst, unsigned width, unsigned height, int src_stride, unsigned bpc)
145
+{
146
+    const int bytes_per_value = bpc > 8 ? 2 : 1;
147
+    const int dst_stride = dst->stride[0] / bytes_per_value;
148
+    const unsigned b_shift = (bpc > 8) ? (bpc - 8) : 0;
149
+
150
+    uint8_t *dst_data = static_cast<uint8_t*>(dst->data[0]);
151
+
152
+    for (unsigned i = 0; i < height; i++) {
153
+        if (bpc > 8) {
154
+            uint16_t *dst_row = reinterpret_cast<uint16_t*>(dst_data);
155
+            for (unsigned j = 0; j < width; j++) {
156
+                dst_row[j] = static_cast<uint16_t>(src[j] * (1 << b_shift));
157
+            }
158
+        } else {
159
+            for (unsigned j = 0; j < width; j++) {
160
+                dst_data[j] = static_cast<uint8_t>(src[j]);
161
+            }
162
+        }
163
+        src += src_stride / sizeof(float);
164
+        dst_data += dst_stride * bytes_per_value;
165
+    }
166
+}
167
+
168
+int load_feature(VmafContext *vmaf, const char *feature_name, VmafFeatureDictionary *d) {
169
+    int err = vmaf_use_feature(vmaf, feature_name, d);
170
+    if (err) {
171
+        printf("problem loading feature extractor: %s\n", feature_name);
172
+    }
173
+    return err;
174
+}
175
+
176
+int compute_vmaf(double* vmaf_score, char* fmt, int width, int height, int bitdepth, int(*read_frame)(float *ref_data, float *main_data, float *temp_data, int stride_byte, void *user_data),
177
+   void *user_data, char *model_path, char *log_path, char *log_fmt, int disable_clip, int disable_avx, int enable_transform, int phone_model, int do_psnr, int do_ssim, int do_ms_ssim,
178
+   char *pool_method, int n_thread, int n_subsample)
179
+{
180
+   int err = 0;
181
+
182
+   VmafConfiguration cfg = {
183
+       .log_level = VMAF_LOG_LEVEL_INFO,
184
+       .n_threads = n_thread,
185
+       .n_subsample = n_subsample,
186
+       .cpumask = disable_avx ? -1 : 0,
187
+       .gpumask = 0,
188
+   };
189
+
190
+   VmafContext *vmaf;
191
+   err = vmaf_init(&vmaf, cfg);
192
+   if (err) {
193
+       printf("problem initializing VMAF context\n");
194
+       return -1;
195
+   }
196
+
197
+   uint64_t flags = VMAF_MODEL_FLAGS_DEFAULT;
198
+   if (disable_clip)
199
+       flags |= VMAF_MODEL_FLAG_DISABLE_CLIP;
200
+   if (enable_transform || phone_model)
201
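
The api.cpp hunk above changes the public encode call: x265_encoder_encode() now takes the output picture as x265_picture** (one slot per output layer), and per-layer statistics are fetched and logged in a loop. A hedged sketch of how a single-layer 3.6-style caller might adapt; the helper function and its names below are illustrative and not part of the x265 sources:

    /* Illustrative adaptation of a 3.6-style caller to the 4.0 signature. */
    #include <x265.h>

    int encode_one(x265_param* param, x265_encoder* enc, x265_picture* in,
                   x265_nal** nal, uint32_t* nalCount)
    {
        x265_picture recon;                       /* base-layer reconstructed picture */
        x265_picture_init(param, &recon);
        x265_picture* reconList[1] = { &recon };  /* 4.0 expects an array of pointers */

        /* 3.6 passed &recon directly; 4.0 takes x265_picture**. */
        return x265_encoder_encode(enc, nal, nalCount, in, reconList);
    }
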
x265_3.6.tar.gz/source/encoder/dpb.cpp -> x265_4.0.tar.gz/source/encoder/dpb.cpp Changed
201
 
1
@@ -53,8 +53,8 @@
2
         FrameData* next = m_frameDataFreeList->m_freeListNext;
3
         m_frameDataFreeList->destroy();
4
 
5
-        m_frameDataFreeList->m_reconPic->destroy();
6
-        delete m_frameDataFreeList->m_reconPic;
7
+        m_frameDataFreeList->m_reconPic[0]->destroy();
8
+        delete m_frameDataFreeList->m_reconPic[0];
9
 
10
         delete m_frameDataFreeList;
11
         m_frameDataFreeList = next;
12
@@ -75,7 +75,7 @@
13
         if (curFrame->m_param->bEnableTemporalFilter)
14
             isMCSTFReferenced =!!(curFrame->m_refPicCnt[1]);
15
 
16
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
17
+        if (curFrame->m_valid && !curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
18
         {
19
             curFrame->m_bChromaExtended = false;
20
 
21
@@ -95,6 +95,12 @@
22
 
23
             // iterator is invalidated by remove, restart scan
24
             m_picList.remove(*curFrame);
25
+#if ENABLE_MULTIVIEW
26
+            if (curFrame->m_param->numViews > 1 && !curFrame->m_viewId && m_picList.getPOC(curFrame->m_poc, 1) && curFrame == m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.getPOC(curFrame->m_poc, curFrame->m_viewId))
27
+            {
28
+                m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.removeSubDPB(*curFrame);
29
+            }
30
+#endif
31
             iterFrame = m_picList.first();
32
 
33
             m_freeList.pushBack(*curFrame);
34
@@ -126,7 +132,8 @@
35
                 curFrame->m_prevCtuInfoChange = NULL;
36
             }
37
             curFrame->m_encData = NULL;
38
-            curFrame->m_reconPic = NULL;
39
+            for (int i = 0; i < !!curFrame->m_param->bEnableSCC + 1; i++)
40
+                curFrame->m_reconPic[i] = NULL;
41
         }
42
     }
43
 }
44
@@ -145,6 +152,11 @@
45
         m_lastIDR = pocCurr;
46
     slice->m_lastIDR = m_lastIDR;
47
     slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
48
+#if ENABLE_SCC_EXT
49
+    if (slice->m_param->bEnableSCC)        slice->m_origSliceType = slice->m_sliceType;
50
+    if (slice->m_param->bEnableSCC && IS_X265_TYPE_I(type))
51
+        slice->m_sliceType = P_SLICE;
52
+#endif
53
 
54
     if (type == X265_TYPE_B)
55
     {
56
@@ -177,7 +189,8 @@
57
 
58
     m_picList.pushFront(*newFrame);
59
 
60
-    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
61
+    int layer = slice->m_param->numViews > 1 ? newFrame->m_viewId : (slice->m_param->numScalableLayers > 1) ? newFrame->m_sLayerId : 0;
62
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag(layer))
63
     {
64
         switch (slice->m_nalUnitType)
65
         {
66
@@ -195,12 +208,13 @@
67
         }
68
     }
69
     // Do decoding refresh marking if any
70
-    decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
71
+    decodingRefreshMarking(pocCurr, slice->m_nalUnitType, layer);
72
 
73
-    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
74
+    uint32_t maxDecBuffer = (slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer] >= 8 && slice->m_param->bEnableSCC) ? 7 : slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer];
75
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, maxDecBuffer, layer);
76
     bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
77
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
78
-    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
79
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic, layer);
80
 
81
 
82
     if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
83
@@ -210,9 +224,9 @@
84
             || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
85
         )
86
     {
87
-        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
88
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer, layer) || (slice->m_sps->maxTempSubLayers == 1))
89
         {
90
-            if (getTemporalLayerNonReferenceFlag())
91
+            if (getTemporalLayerNonReferenceFlag(layer))
92
             {
93
                 slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
94
             }
95
@@ -221,7 +235,7 @@
96
                 slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
97
             }
98
         }
99
-        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
100
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer, layer))
101
         {
102
             bool isSTSA = true;
103
             int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
104
@@ -254,7 +268,7 @@
105
             }
106
             if (isSTSA == true)
107
             {
108
-                if (getTemporalLayerNonReferenceFlag())
109
+                if (getTemporalLayerNonReferenceFlag(layer))
110
                 {
111
                     slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
112
                 }
113
@@ -266,12 +280,22 @@
114
         }
115
     }
116
 
117
+#if ENABLE_MULTIVIEW
118
+    if (newFrame->m_viewId)
119
+        slice->createInterLayerReferencePictureSet(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1);
120
+#endif
121
+    int numRef = slice->m_param->bEnableSCC ? slice->m_rps.numberOfNegativePictures + 1 : slice->m_rps.numberOfNegativePictures;
122
     if (slice->m_sliceType != I_SLICE)
123
-        slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
124
+        slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, numRef + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size());
125
+    else
126
+        slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, numRef); // Ensuring L0 contains just the -ve POC
127
+#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT
128
+    if(slice->m_param->numViews > 1 || !!slice->m_param->bEnableSCC)
129
+        slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 3 : 2, slice->m_rps.numberOfPositivePictures + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size());
130
     else
131
-        slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
132
-    slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
133
-    slice->setRefPicList(m_picList);
134
+#endif
135
+        slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
136
+    slice->setRefPicList(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1, layer);
137
 
138
     X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx[1], "B slice without L1 references (non-fatal)\n");
139
 
140
@@ -280,9 +304,29 @@
141
         /* TODO: the lookahead should be able to tell which reference picture
142
          * had the least motion residual.  We should be able to use that here to
143
          * select a colocation reference list and index */
144
-        slice->m_colFromL0Flag = false;
145
+
146
+        bool bLowDelay = true;
147
+        int  iCurrPOC = slice->m_poc;
148
+        int iRefIdx = 0;
149
+
150
+        for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx[0] && bLowDelay; iRefIdx++)
151
+        {
152
+            if (slice->m_refPOCList[0][iRefIdx] > iCurrPOC)
153
+            {
154
+                bLowDelay = false;
155
+            }
156
+        }
157
+        for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx[1] && bLowDelay; iRefIdx++)
158
+        {
159
+            if (slice->m_refPOCList[1][iRefIdx] > iCurrPOC)
160
+            {
161
+                bLowDelay = false;
162
+            }
163
+        }
164
+
165
+        slice->m_bCheckLDC = bLowDelay;
166
+        slice->m_colFromL0Flag = bLowDelay;
167
         slice->m_colRefIdx = 0;
168
-        slice->m_bCheckLDC = false;
169
     }
170
     else
171
     {
172
@@ -291,6 +335,59 @@
173
         slice->m_colRefIdx = 0;
174
     }
175
 
176
+    slice->m_bTemporalMvp = slice->m_sps->bTemporalMVPEnabled;
177
+#if ENABLE_SCC_EXT
178
+    bool bGPBcheck = false;
179
+    if (slice->m_sliceType == B_SLICE)
180
+    {
181
+        if (slice->m_param->bEnableSCC)
182
+        {
183
+            if (slice->m_numRefIdx[0] - 1 == slice->m_numRefIdx[1])
184
+            {
185
+                bGPBcheck = true;
186
+                for (int i = 0; i < slice->m_numRefIdx[1]; i++)
187
+                {
188
+                    if (slice->m_refPOCList[1][i] != slice->m_refPOCList[0][i])
189
+                    {
190
+                        bGPBcheck = false;
191
+                        break;
192
+                    }
193
+                }
194
+            }
195
+        }
196
+        else if (slice->m_numRefIdx[0] == slice->m_numRefIdx[1])
197
+        {
198
+            bGPBcheck = true;
199
+            int i;
200
+            for (i = 0; i < slice->m_numRefIdx[1]; i++)
201
x265_3.6.tar.gz/source/encoder/dpb.h -> x265_4.0.tar.gz/source/encoder/dpb.h Changed
21
 
1
@@ -79,13 +79,13 @@
2
 
3
 protected:
4
 
5
-    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
6
+    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer, int sLayerId);
7
 
8
-    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
9
-    bool getTemporalLayerNonReferenceFlag();
10
-    void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
11
-    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
12
-    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
13
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture, int sLayerId);
14
+    bool getTemporalLayerNonReferenceFlag(int sLayerId);
15
+    void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType, int sLayerId);
16
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId, int sLayerId);
17
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId, int sLayerId);
18
 
19
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
20
 };
21
x265_3.6.tar.gz/source/encoder/encoder.cpp -> x265_4.0.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -134,7 +134,6 @@
2
     m_lookahead = NULL;
3
     m_rateControl = NULL;
4
     m_dpb = NULL;
5
-    m_exportedPic = NULL;
6
     m_numDelayedPic = 0;
7
     m_outputCount = 0;
8
     m_param = NULL;
9
@@ -150,6 +149,8 @@
10
     m_rpsInSpsCount = 0;
11
     m_cB = 1.0;
12
     m_cR = 1.0;
13
+    for (int i = 0; i < MAX_LAYERS; i++)
14
+        m_exportedPic[i] = NULL;
15
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
16
         m_frameEncoder[i] = NULL;
17
     for (uint32_t i = 0; i < DUP_BUFFER; i++)
18
@@ -597,9 +598,9 @@
19
     }
20
 }
21
 
22
-int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut)
23
+int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer)
24
 {
25
-    Frame *FramePtr = m_dpb->m_picList.getCurFrame();
26
+    Frame *FramePtr = m_dpb->m_picList.getCurFrame(sLayer);
27
     if (FramePtr != NULL)
28
     {
29
         *slicetype = FramePtr->m_lowres.sliceType;
30
@@ -618,31 +619,36 @@
31
 {
32
     if (!(IS_X265_TYPE_I(sliceType)))
33
     {
34
-        Frame *framePtr = m_dpb->m_picList.getPOC(poc);
35
+        Frame *framePtr = m_dpb->m_picList.getPOC(poc, 0);
36
         if (framePtr != NULL)
37
         {
38
             for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[0]; j++)    // check only for --ref=n number of frames.
39
             {
40
-                if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic != NULL)
41
+                if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic[0] != NULL)
42
                 {
43
                     int l0POC = framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc;
44
                     pocL0[j] = l0POC;
45
-                    Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC);
46
-                    while (l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].get() == 0)
47
-                        l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
48
-                    l0[j] = l0Fp->m_reconPic;
49
+                    Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC, 0);
50
+#if ENABLE_SCC_EXT
51
+                    if (l0POC != poc)
52
+#endif
53
+                    {
54
+                        while (l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].get() == 0)
55
+                            l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
56
+                    }
57
+                    l0[j] = l0Fp->m_reconPic[0];
58
                 }
59
             }
60
             for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[1]; j++)    // check only for --ref=n number of frames.
61
             {
62
-                if (framePtr->m_encData->m_slice->m_refFrameList[1][j] && framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_reconPic != NULL)
63
+                if (framePtr->m_encData->m_slice->m_refFrameList[1][j] && framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_reconPic[0] != NULL)
64
                 {
65
                     int l1POC = framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_poc;
66
                     pocL1[j] = l1POC;
67
-                    Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC);
68
+                    Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC, 0);
69
                     while (l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].get() == 0)
70
                         l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */
71
-                    l1[j] = l1Fp->m_reconPic;
72
+                    l1[j] = l1Fp->m_reconPic[0];
73
                 }
74
             }
75
         }
76
@@ -762,7 +768,7 @@
77
     uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
78
     uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
79
 
80
-    Frame* curFrame = m_dpb->m_picList.getPOC(poc);
81
+    Frame* curFrame = m_dpb->m_picList.getPOC(poc, 0);
82
     if (curFrame != NULL)
83
     {
84
         curFrame->m_analysisData = (*analysis_data);
85
@@ -861,10 +867,13 @@
86
         X265_FREE(m_rdCost);
87
         X265_FREE(m_trainingCount);
88
     }
89
-    if (m_exportedPic)
90
+    for (int layer = 0; layer < m_param->numLayers; layer++)
91
     {
92
-        ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
93
-        m_exportedPic = NULL;
94
+        if (m_exportedPic[layer])
95
+        {
96
+            ATOMIC_DEC(&m_exportedPic[layer]->m_countRefEncoders);
97
+            m_exportedPic[layer] = NULL;
98
+        }
99
     }
100
 
101
     if (m_param->bEnableFrameDuplication)
102
@@ -1359,6 +1368,10 @@
103
     memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char));
104
     dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height;
105
     dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1);
106
+#if ENABLE_ALPHA
107
+    if(m_param->bEnableAlpha)
108
+        dest->planes3 = (char*)dest->planes2 + src->stride2 * (src->height >> x265_cli_cspssrc->colorSpace.height2);
109
+#endif
110
 }
111
 
112
 bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
113
@@ -1458,7 +1471,7 @@
114
  * returns 0 if no frames are currently available for output
115
  *         1 if frame was output, m_nalList contains access unit
116
  *         negative on malloc error or abort */
117
-int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)
118
+int Encoder::encode(const x265_picture* pic_in, x265_picture** pic_out)
119
 {
120
 #if CHECKED_BUILD || _DEBUG
121
     if (g_checkFailures)
122
@@ -1470,19 +1483,21 @@
123
     if (m_aborted)
124
         return -1;
125
 
126
-    const x265_picture* inputPic = NULL;
127
+    const x265_picture* inputPic[MAX_VIEWS] = { NULL };
128
     static int written = 0, read = 0;
129
     bool dontRead = false;
130
     bool dropflag = false;
131
 
132
-    if (m_exportedPic)
133
+    if (*m_exportedPic)
134
     {
135
         if (!m_param->bUseAnalysisFile && m_param->analysisSave)
136
-            x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
137
-
138
-        ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
139
+            x265_free_analysis_data(m_param, &m_exportedPic[0]->m_analysisData);
140
 
141
-        m_exportedPic = NULL;
142
+        for (int i = 0; i < m_param->numLayers; i++)
143
+        {
144
+            ATOMIC_DEC(&m_exportedPic[i]->m_countRefEncoders);
145
+            m_exportedPic[i] = NULL;
146
+        }
147
         m_dpb->recycleUnreferenced();
148
 
149
         if (m_param->bEnableTemporalFilter)
150
@@ -1566,143 +1581,194 @@
151
 
152
             if (read < written)
153
             {
154
-                inputPic = m_dupBuffer[0]->dupPic;
155
+                inputPic[0] = m_dupBuffer[0]->dupPic;
156
                 read++;
157
             }
158
         }
159
         else
160
-            inputPic = pic_in;
161
+        {
162
+            for (int view = 0; view < m_param->numViews; view++)
163
+                inputPic[view] = pic_in + view;
164
+        }
165
 
166
-        Frame *inFrame;
167
-        x265_param *p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
168
-        if (m_dpb->m_freeList.empty())
169
-        {
170
-            inFrame = new Frame;
171
-            inFrame->m_encodeStartTime = x265_mdate();
172
-            if (inFrame->create(p, inputPic->quantOffsets))
173
-            {
174
-                /* the first PicYuv created is asked to generate the CU and block unit offset
175
-                 * arrays which are then shared with all subsequent PicYuv (orig and recon) 
176
-                 * allocated by this top level encoder */
177
-                if (m_sps.cuOffsetY)
178
-                {
179
-                    inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
180
-                    inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
181
-                    if (m_param->internalCsp != X265_CSP_I400)
182
-                    {
183
-                        inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
184
-                        inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
185
-                    }
186
-                }
187
-                else
188
+        x265_param* p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
189
+        Frame* inFrame[MAX_LAYERS];
190
+        for (int layer = 0; layer < m_param->numLayers; layer++)
191
+        {
192
+            if (m_dpb->m_freeList.empty())
193
+            {
194
+                inFrame[layer] = new Frame;
195
+                inFrame[layer]->m_encodeStartTime = x265_mdate();
196
+#if ENABLE_MULTIVIEW
197
+                inFrame[layer]->m_viewId = m_param->numViews > 1 ? layer : 0;
198
+#endif
199
+#if ENABLE_ALPHA
200
+                inFrame[layer]->m_sLayerId = m_param->numScalableLayers > 1 ? layer : 0;
201
x265_3.6.tar.gz/source/encoder/encoder.h -> x265_4.0.tar.gz/source/encoder/encoder.h Changed
61
 
1
@@ -202,7 +202,7 @@
2
     ThreadPool*        m_threadPool;
3
     FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
4
     DPB*               m_dpb;
5
-    Frame*             m_exportedPic;
6
+    Frame*             m_exportedPic[MAX_LAYERS];
7
     FILE*              m_analysisFileIn;
8
     FILE*              m_analysisFileOut;
9
     FILE*              m_naluFile;
10
@@ -217,10 +217,10 @@
11
 
12
     bool               m_externalFlush;
13
     /* Collect statistics globally */
14
-    EncStats           m_analyzeAll;
15
-    EncStats           m_analyzeI;
16
-    EncStats           m_analyzeP;
17
-    EncStats           m_analyzeB;
18
+    EncStats           m_analyzeAll[MAX_LAYERS];
19
+    EncStats           m_analyzeI[MAX_LAYERS];
20
+    EncStats           m_analyzeP[MAX_LAYERS];
21
+    EncStats           m_analyzeB[MAX_LAYERS];
22
     VPS                m_vps;
23
     SPS                m_sps;
24
     PPS                m_pps;
25
@@ -300,7 +300,7 @@
26
     void stopJobs();
27
     void destroy();
28
 
29
-    int encode(const x265_picture* pic, x265_picture *pic_out);
30
+    int encode(const x265_picture* pic, x265_picture **pic_out);
31
 
32
     int reconfigureParam(x265_param* encParam, x265_param* param);
33
 
34
@@ -308,7 +308,7 @@
35
 
36
     void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc);
37
 
38
-    int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut);
39
+    int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer);
40
 
41
     int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, int* pocL0, int* pocL1);
42
 
43
@@ -320,7 +320,7 @@
44
 
45
     void getEndNalUnits(NALList& list, Bitstream& bs);
46
 
47
-    void fetchStats(x265_stats* stats, size_t statsSizeBytes);
48
+    void fetchStats(x265_stats* stats, size_t statsSizeBytes, int layer = 0);
49
 
50
     void printSummary();
51
 
52
@@ -352,7 +352,7 @@
53
 
54
     void copyDistortionData(x265_analysis_data* analysis, FrameData &curEncData);
55
 
56
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
57
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc, int layer);
58
 
59
     int validateAnalysisData(x265_analysis_validate* param, int readWriteFlag);
60
 
61
x265_3.6.tar.gz/source/encoder/entropy.cpp -> x265_4.0.tar.gz/source/encoder/entropy.cpp Changed
201
 
1
@@ -230,11 +230,12 @@
2
     X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n");
3
 }
4
 
5
-void Entropy::codeVPS(const VPS& vps)
6
+void Entropy::codeVPS(const VPS& vps, const SPS& sps)
7
 {
8
+    int maxLayers = (vps.m_numLayers > 1 || vps.m_numViews > 1) + 1;
9
     WRITE_CODE(0,       4, "vps_video_parameter_set_id");
10
     WRITE_CODE(3,       2, "vps_reserved_three_2bits");
11
-    WRITE_CODE(0,       6, "vps_reserved_zero_6bits");
12
+    WRITE_CODE(maxLayers - 1, 6, "vps_reserved_zero_6bits");
13
     WRITE_CODE(vps.maxTempSubLayers - 1, 3, "vps_max_sub_layers_minus1");
14
     WRITE_FLAG(vps.maxTempSubLayers == 1,   "vps_temporal_id_nesting_flag");
15
     WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits");
16
@@ -250,50 +251,320 @@
17
         WRITE_UVLC(vps.maxLatencyIncrease[i] + 1, "vps_max_latency_increase_plus1[i]");
18
     }
19
 
20
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
21
+    if (vps.m_numLayers > 1 || vps.m_numViews > 1)
22
+    {
23
+        WRITE_CODE(maxLayers - 1, 6, "vps_max_nuh_reserved_zero_layer_id");
24
+        WRITE_UVLC(vps.m_vpsNumLayerSetsMinus1, "vps_num_layer_sets_minus1");
25
+        for (int i = 1; i <= vps.m_vpsNumLayerSetsMinus1; i++)
26
+        {
27
+#if ENABLE_MULTIVIEW
28
+            if (vps.m_numViews > 1)
29
+            {
30
+                for (int j = 0; j < vps.m_numViews; j++)
31
+                {
32
+                    WRITE_FLAG(1, "layer_id_included_flag[opsIdx][i]");
33
+                }
34
+            }
35
+#endif
36
+#if ENABLE_ALPHA
37
+            if (vps.m_numLayers > 1)
38
+            {
39
+                for (int j = 0; j < vps.m_numLayers; j++)
40
+                {
41
+                    WRITE_FLAG(1, "layer_id_included_flag[opsIdx][i]");
42
+                }
43
+            }
44
+#endif
45
+        }
46
+    }
47
+    else
48
+    {
49
+        WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
50
+        WRITE_UVLC(0, "vps_max_op_sets_minus1");
51
+    }
52
+#else
53
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
54
-    WRITE_UVLC(0,    "vps_max_op_sets_minus1");
55
+    WRITE_UVLC(0, "vps_max_op_sets_minus1");
56
+#endif
57
+
58
     WRITE_FLAG(0,    "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */
59
-    WRITE_FLAG(0,    "vps_extension_flag");
60
+
61
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
62
+    if (vps.m_numLayers > 1 || vps.m_numViews > 1)
63
+    {
64
+        WRITE_FLAG(vps.vps_extension_flag, "vps_extension_flag");
65
+
66
+        if (vps.vps_extension_flag)
67
+        {
68
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
69
+            {
70
+                WRITE_FLAG(1, "vps_extension_alignment_bit_equal_to_one");
71
+            }
72
+
73
+            WRITE_CODE(vps.ptl.levelIdc, 8, "general_level_idc");
74
+            if (vps.maxTempSubLayers > 1)
75
+            {
76
+                for (int i = 0; i < vps.maxTempSubLayers - 1; i++)
77
+                {
78
+                    WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
79
+                    WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
80
+                }
81
+                for (int i = vps.maxTempSubLayers - 1; i < 8; i++)
82
+                    WRITE_CODE(0, 2, "reserved_zero_2bits");
83
+            }
84
+
85
+            WRITE_FLAG(vps.splitting_flag, "splitting flag");
86
+            for (int i = 0; i < MAX_VPS_NUM_SCALABILITY_TYPES; i++)
87
+            {
88
+                WRITE_FLAG(vps.m_scalabilityMask[i], "scalability_mask[i]");
89
+            }
90
+            for (int i = 0; i < vps.scalabilityTypes - vps.splitting_flag; i++)
91
+            {
92
+                WRITE_CODE(vps.m_dimensionIdLen[i] - 1, 3, "dimension_id_len_minus1[i]");
93
+            }
94
+            WRITE_FLAG(vps.m_nuhLayerIdPresentFlag, "vps_nuh_layer_id_present_flag");
95
+            for (int i = 1; i < maxLayers; i++)
96
+            {
97
+                if (vps.m_nuhLayerIdPresentFlag)
98
+                    WRITE_CODE(vps.m_layerIdInNuh[i], 6, "layer_id_in_nuh[i]");
99
+
100
+                if (!vps.splitting_flag)
101
+                {
102
+                    for (int j = 0; j < vps.scalabilityTypes; j++)
103
+                    {
104
+                        uint8_t bits = vps.m_dimensionIdLen[j];
105
+                        WRITE_CODE(vps.m_dimensionId[i][j], bits, "dimension_id[i][j]");
106
+                    }
107
+                }
108
+            }
109
+            WRITE_CODE(vps.m_viewIdLen, 4, "view_id_len");
110
+
111
+#if ENABLE_ALPHA
112
+            if (vps.m_numLayers > 1)
113
+            {
114
+                WRITE_FLAG(0, "direct_dependency_flag[1][0]");
115
+                WRITE_UVLC(0, "num_add_layer_sets");
116
+                WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag");
117
+                WRITE_FLAG(0, "max_tid_ref_present_flag");
118
+                WRITE_FLAG(0, "default_ref_layers_active_flag");
119
+                WRITE_UVLC(2, "vps_num_profile_tier_level_minus1");
120
+                WRITE_FLAG(1, "vps_profile_present_flag");
121
+                codeProfileTier(vps.ptl, vps.maxTempSubLayers, 1);
122
+
123
+                WRITE_UVLC(0, "num_add_olss");
124
+                WRITE_CODE(0, 2, "default_output_layer_idc");
125
+                WRITE_CODE(1, 2, "profile_tier_level_idx[ i ][ j ]");
126
+                WRITE_CODE(2, 2, "profile_tier_level_idx[ i ][ j ]");
127
+
128
+                WRITE_UVLC(0, "vps_num_rep_formats_minus1");
129
+
130
+                WRITE_CODE(sps.picWidthInLumaSamples, 16, "pic_width_vps_in_luma_samples");
131
+                WRITE_CODE(sps.picHeightInLumaSamples, 16, "pic_height_vps_in_luma_samples");
132
+                WRITE_FLAG(1, "chroma_and_bit_depth_vps_present_flag");
133
+
134
+                WRITE_CODE(sps.chromaFormatIdc, 2, "chroma_format_vps_idc");
135
+
136
+                if (sps.chromaFormatIdc == X265_CSP_I444)
137
+                    WRITE_FLAG(0, "separate_colour_plane_vps_flag");
138
+
139
+                WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_luma_minus8");
140
+                WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_chroma_minus8");
141
+
142
+                const Window& conf = sps.conformanceWindow;
143
+                WRITE_FLAG(conf.bEnabled, "conformance_window_vps_flag");
144
+                if (conf.bEnabled)
145
+                {
146
+                    int hShift = CHROMA_H_SHIFT(sps.chromaFormatIdc), vShift = CHROMA_V_SHIFT(sps.chromaFormatIdc);
147
+                    WRITE_UVLC(conf.leftOffset >> hShift, "conf_win_vps_left_offset");
148
+                    WRITE_UVLC(conf.rightOffset >> hShift, "conf_win_vps_right_offset");
149
+                    WRITE_UVLC(conf.topOffset >> vShift, "conf_win_vps_top_offset");
150
+                    WRITE_UVLC(conf.bottomOffset >> vShift, "conf_win_vps_bottom_offset");
151
+                }
152
+
153
+                WRITE_FLAG(1, "max_one_active_ref_layer_flag");
154
+                WRITE_FLAG(0, "vps_poc_lsb_aligned_flag");
155
+                WRITE_FLAG(1, "poc_lsb_not_present_flag");
156
+
157
+                for (int i = 1; i < vps.m_vpsNumLayerSetsMinus1 + 1; i++)
158
+                {
159
+                    WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_flag_info_present_flag");
160
+                    for (int j = 0; j < vps.maxTempSubLayers ; j++)
161
+                    {
162
+                        if(j > 0)
163
+                        WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_dpb_info_present_flag");
164
+
165
+                        for(int k = 0; k < vps.m_numLayersInIdList[i]; k++)
167
+                            WRITE_UVLC(vps.maxDecPicBuffering[j] - 1, "vps_max_dec_pic_buffering_minus1[i]");
168
+
169
+                        WRITE_UVLC(vps.numReorderPics[0], "vps_num_reorder_pics[i]");
170
+                        WRITE_UVLC(vps.maxLatencyIncrease[0] + 1, "vps_max_latency_increase_plus1[i]");
170
+                    }
171
+                }
172
+
173
+                WRITE_UVLC(0, "direct_dep_type_len_minus2");
174
+
175
+                WRITE_FLAG(0, "default_direct_dependency_flag");
176
+                WRITE_UVLC(0, "vps_non_vui_extension_length");
177
+                WRITE_FLAG(0, "vps_vui_present_flag");
178
+                WRITE_FLAG(0, "vps_extension2_flag");
179
+        }
180
+#endif
181
+
182
+#if ENABLE_MULTIVIEW
183
+            if (vps.m_numViews > 1)
184
+            {
185
+                for (uint8_t i = 0; i < vps.m_numViews; i++)
186
+                    WRITE_CODE(i, vps.m_viewIdLen, "view_id_val[i]");
187
+
188
+                for (int i = 1; i < vps.m_numViews; i++)
189
+                {
190
+                    for (int j = 0; j < i; j++)
191
+                    {
192
+                        if (j == 0)
193
+                            WRITE_FLAG(1, "direct_dependency_flag[1][0]");
194
+                        else
195
+                            WRITE_FLAG(0, "direct_dependency_flag[1][0]");
196
+                    }
197
+                }
198
+                WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag");
199
+                WRITE_FLAG(0, "max_tid_ref_present_flag");
200
+                WRITE_FLAG(1, "default_ref_layers_active_flag");
201
x265_3.6.tar.gz/source/encoder/entropy.h -> x265_4.0.tar.gz/source/encoder/entropy.h Changed
30
 
1
@@ -141,14 +141,14 @@
2
     void loadIntraDirModeLuma(const Entropy& src);
3
     void copyState(const Entropy& other);
4
 
5
-    void codeVPS(const VPS& vps);
6
-    void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
7
-    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 );
8
-    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo);
9
+    void codeVPS(const VPS& vps, const SPS& sps);
10
+    void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl, int layer = 0);
11
+    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26, int layer = 0);
12
+    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo, int layer = 0);
13
     void codeAUD(const Slice& slice);
14
     void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
15
 
16
-    void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
17
+    void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp, int layer = 0);
18
     void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
19
     void codeShortTermRefPicSet(const RPS& rps, int idx);
20
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
21
@@ -234,7 +234,7 @@
22
     void writeEpExGolomb(uint32_t symbol, uint32_t count);
23
     void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice);
24
 
25
-    void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers);
26
+    void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers, int layer = 0);
27
     void codeScalingList(const ScalingList&);
28
     void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId);
29
 
30
x265_3.6.tar.gz/source/encoder/frameencoder.cpp -> x265_4.0.tar.gz/source/encoder/frameencoder.cpp Changed
201
 
1
@@ -41,11 +41,9 @@
2
 
3
 FrameEncoder::FrameEncoder()
4
 {
5
-    m_prevOutputTime = x265_mdate();
6
     m_reconfigure = false;
7
     m_isFrameEncoder = true;
8
     m_threadActive = true;
9
-    m_slicetypeWaitTime = 0;
10
     m_activeWorkerCount = 0;
11
     m_completionCount = 0;
12
     m_outStreams = NULL;
13
@@ -56,11 +54,16 @@
14
     m_rows = NULL;
15
     m_top = NULL;
16
     m_param = NULL;
17
-    m_frame = NULL;
18
     m_cuGeoms = NULL;
19
     m_ctuGeomMap = NULL;
20
     m_localTldIdx = 0;
21
     memset(&m_rce, 0, sizeof(RateControlEntry));
22
+    for (int layer = 0; layer < MAX_LAYERS; layer++)
23
+    {
24
+        m_prevOutputTime[layer] = x265_mdate();
25
+        m_slicetypeWaitTime[layer] = 0;
26
+        m_frame[layer] = NULL;
27
+    }
28
 }
29
 
30
 void FrameEncoder::destroy()
31
@@ -94,6 +97,7 @@
32
     X265_FREE(m_ctuGeomMap);
33
     X265_FREE(m_substreamSizes);
34
     X265_FREE(m_nr);
35
+    X265_FREE(m_retFrameBuffer);
36
 
37
     m_frameFilter.destroy();
38
 
39
@@ -216,6 +220,9 @@
40
             ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefList[i], m_param);
41
     }
42
 
43
+    m_retFrameBuffer = X265_MALLOC(Frame*, m_param->numLayers);
44
+    for (int layer = 0; layer < m_param->numLayers; layer++)
45
+        m_retFrameBuffer[layer] = NULL;
46
     return ok;
47
 }
48
 
49
@@ -282,14 +289,17 @@
50
     return true;
51
 }
52
 
53
-bool FrameEncoder::startCompressFrame(Frame* curFrame)
54
+bool FrameEncoder::startCompressFrame(Frame* curFrame[MAX_LAYERS])
55
 {
56
-    m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
57
-    m_frame = curFrame;
58
-    m_sliceType = curFrame->m_lowres.sliceType;
59
-    curFrame->m_encData->m_frameEncoderID = m_jpId;
60
-    curFrame->m_encData->m_jobProvider = this;
61
-    curFrame->m_encData->m_slice->m_mref = m_mref;
62
+    for (int layer = 0; layer < m_param->numLayers; layer++)
63
+    {
64
+        m_slicetypeWaitTimelayer = x265_mdate() - m_prevOutputTimelayer;
65
+        m_framelayer = curFramelayer;
66
+        curFramelayer->m_encData->m_frameEncoderID = m_jpId;
67
+        curFramelayer->m_encData->m_jobProvider = this;
68
+        curFramelayer->m_encData->m_slice->m_mref = m_mref;
69
+    }
70
+    m_sliceType = curFrame0->m_lowres.sliceType;
71
 
72
     if (!m_cuGeoms)
73
     {
74
@@ -355,15 +365,17 @@
75
     {
76
         if (m_param->bCTUInfo)
77
         {
78
-            while (!m_frame->m_ctuInfo)
79
-                m_frame->m_copied.wait();
80
+            while (!m_frame0->m_ctuInfo)
81
+                m_frame0->m_copied.wait();
82
         }
83
-        if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
84
+        if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame0->m_lowres.sliceType)))
85
         {
86
-            while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
87
-                m_frame->m_copyMVType.wait();
88
+            while (((m_frame0->m_analysisData.interData == NULL && m_frame0->m_analysisData.intraData == NULL) || (uint32_t)m_frame0->m_poc != m_frame0->m_analysisData.poc))
89
+                m_frame0->m_copyMVType.wait();
90
         }
91
-        compressFrame();
92
+
93
+        for (int layer = 0; layer < m_param->numLayers; layer++)
94
+            compressFrame(layer);
95
         m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
96
         m_enable.wait();
97
     }
98
@@ -371,7 +383,7 @@
99
 
100
 void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */)
101
 {
102
-    Frame* frame = master.m_frame;
103
+    Frame* frame = master.m_framemaster.m_sLayerId;
104
     weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
105
 }
106
 
107
@@ -411,13 +423,13 @@
108
         memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
109
     }
110
 
111
-    bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
112
+    bool isIDR = m_frame0->m_lowres.sliceType == X265_TYPE_IDR;
113
     return (payloadChange || isIDR);
114
 }
115
 
116
-void FrameEncoder::writeTrailingSEIMessages()
117
+void FrameEncoder::writeTrailingSEIMessages(int layer)
118
 {
119
-    Slice* slice = m_frame->m_encData->m_slice;
120
+    Slice* slice = m_framelayer->m_encData->m_slice;
121
     int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
122
     int32_t payloadSize = 0;
123
 
124
@@ -444,21 +456,21 @@
125
     }
126
 
127
     m_seiReconPictureDigest.setSize(payloadSize);
128
-    m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
129
+    m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false, layer);
130
 }
131
 
132
-void FrameEncoder::compressFrame()
133
+void FrameEncoder::compressFrame(int layer)
134
 {
135
     ProfileScopeEvent(frameThread);
136
 
137
-    m_startCompressTime = x265_mdate();
138
+    m_startCompressTimelayer = x265_mdate();
139
     m_totalActiveWorkerCount = 0;
140
     m_activeWorkerCountSamples = 0;
141
-    m_totalWorkerElapsedTime = 0;
142
-    m_totalNoWorkerTime = 0;
143
+    m_totalWorkerElapsedTimelayer = 0;
144
+    m_totalNoWorkerTimelayer = 0;
145
     m_countRowBlocks = 0;
146
-    m_allRowsAvailableTime = 0;
147
-    m_stallStartTime = 0;
148
+    m_allRowsAvailableTimelayer = 0;
149
+    m_stallStartTimelayer = 0;
150
 
151
     m_completionCount = 0;
152
     memset((void*)m_bAllRowsStop, 0, sizeof(bool) * m_param->maxSlices);
153
@@ -466,18 +478,19 @@
154
     m_rowSliceTotalBits0 = 0;
155
     m_rowSliceTotalBits1 = 0;
156
 
157
-    m_SSDY = m_SSDU = m_SSDV = 0;
158
-    m_ssim = 0;
159
-    m_ssimCnt = 0;
160
-    memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
161
+    m_SSDYlayer = m_SSDUlayer = m_SSDVlayer = 0;
162
+    m_ssimlayer = 0;
163
+    m_ssimCntlayer = 0;
164
+    memset(&(m_framelayer->m_encData->m_frameStats), 0, sizeof(m_framelayer->m_encData->m_frameStats));
165
+    m_sLayerId = layer;
166
 
167
     if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
168
     {
169
-        int height = m_frame->m_fencPic->m_picHeight;
170
-        int width = m_frame->m_fencPic->m_picWidth;
171
-        intptr_t stride = m_frame->m_fencPic->m_stride;
172
+        int height = m_framelayer->m_fencPic->m_picHeight;
173
+        int width = m_framelayer->m_fencPic->m_picWidth;
174
+        intptr_t stride = m_framelayer->m_fencPic->m_stride;
175
 
176
-        if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1))
177
+        if (!computeEdge(m_framelayer->m_edgeBitPic, m_framelayer->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1))
178
         {
179
             x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
180
         }
181
@@ -486,15 +499,15 @@
182
     /* Emit access unit delimiter unless this is the first frame and the user is
183
      * not repeating headers (since AUD is supposed to be the first NAL in the access
184
      * unit) */
185
-    Slice* slice = m_frame->m_encData->m_slice;
186
+    Slice* slice = m_framelayer->m_encData->m_slice;
187
 
188
-    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
189
+    if (m_param->bEnableEndOfSequence && m_framelayer->m_lowres.sliceType == X265_TYPE_IDR && m_framelayer->m_poc)
190
     {
191
         m_bs.resetBits();
192
         m_nalList.serialize(NAL_UNIT_EOS, m_bs);
193
     }
194
 
195
-    if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
196
+    if (m_param->bEnableAccessUnitDelimiters && (m_framelayer->m_poc || m_param->bRepeatHeaders))
197
     {
198
         m_bs.resetBits();
199
         m_entropyCoder.setBitstream(&m_bs);
200
@@ -504,7 +517,7 @@
201
x265_3.6.tar.gz/source/encoder/frameencoder.h -> x265_4.0.tar.gz/source/encoder/frameencoder.h Changed
116
 
1
@@ -156,12 +156,12 @@
2
     void destroy();
3
 
4
     /* triggers encode of a new frame by the worker thread */
5
-    bool startCompressFrame(Frame* curFrame);
6
+    bool startCompressFrame(Frame* curFrameMAX_LAYERS);
7
 
8
     /* blocks until worker thread is done, returns access unit */
9
-    Frame *getEncodedPicture(NALList& list);
10
+    Frame **getEncodedPicture(NALList& list);
11
 
12
-    void initDecodedPictureHashSEI(int row, int cuAddr, int height);
13
+    void initDecodedPictureHashSEI(int row, int cuAddr, int height, int layer);
14
 
15
     Event                    m_enable;
16
     Event                    m_done;
17
@@ -190,34 +190,35 @@
18
     RateControlEntry         m_rce;
19
     SEIDecodedPictureHash    m_seiReconPictureDigest;
20
 
21
-    uint64_t                 m_SSDY;
22
-    uint64_t                 m_SSDU;
23
-    uint64_t                 m_SSDV;
24
-    double                   m_ssim;
25
-    uint64_t                 m_accessUnitBits;
26
-    uint32_t                 m_ssimCnt;
27
+    uint64_t                 m_SSDYMAX_LAYERS;
28
+    uint64_t                 m_SSDUMAX_LAYERS;
29
+    uint64_t                 m_SSDVMAX_LAYERS;
30
+    double                   m_ssimMAX_LAYERS;
31
+    uint64_t                 m_accessUnitBitsMAX_LAYERS;
32
+    uint32_t                 m_ssimCntMAX_LAYERS;
33
 
34
     volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
35
     volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
36
     volatile int             m_activeWorkerCountSamples; // count of times m_activeWorkerCount was sampled (think vbv restarts)
37
     volatile int             m_countRowBlocks;           // count of workers forced to abandon a row because of top dependency
38
-    int64_t                  m_startCompressTime;        // timestamp when frame encoder is given a frame
39
-    int64_t                  m_row0WaitTime;             // timestamp when row 0 is allowed to start
40
-    int64_t                  m_allRowsAvailableTime;     // timestamp when all reference dependencies are resolved
41
-    int64_t                  m_endCompressTime;          // timestamp after all CTUs are compressed
42
-    int64_t                  m_endFrameTime;             // timestamp after RCEnd, NR updates, etc
43
-    int64_t                  m_stallStartTime;           // timestamp when worker count becomes 0
44
-    int64_t                  m_prevOutputTime;           // timestamp when prev frame was retrieved by API thread
45
-    int64_t                  m_slicetypeWaitTime;        // total elapsed time waiting for decided frame
46
-    int64_t                  m_totalWorkerElapsedTime;   // total elapsed time spent by worker threads processing CTUs
47
-    int64_t                  m_totalNoWorkerTime;        // total elapsed time without any active worker threads
48
+    int64_t                  m_startCompressTimeMAX_LAYERS;        // timestamp when frame encoder is given a frame
49
+    int64_t                  m_row0WaitTimeMAX_LAYERS;             // timestamp when row 0 is allowed to start
50
+    int64_t                  m_allRowsAvailableTimeMAX_LAYERS;     // timestamp when all reference dependencies are resolved
51
+    int64_t                  m_endCompressTimeMAX_LAYERS;          // timestamp after all CTUs are compressed
52
+    int64_t                  m_endFrameTimeMAX_LAYERS;             // timestamp after RCEnd, NR updates, etc
53
+    int64_t                  m_stallStartTimeMAX_LAYERS;           // timestamp when worker count becomes 0
54
+    int64_t                  m_prevOutputTimeMAX_LAYERS;           // timestamp when prev frame was retrieved by API thread
55
+    int64_t                  m_slicetypeWaitTimeMAX_LAYERS;        // total elapsed time waiting for decided frame
56
+    int64_t                  m_totalWorkerElapsedTimeMAX_LAYERS;   // total elapsed time spent by worker threads processing CTUs
57
+    int64_t                  m_totalNoWorkerTimeMAX_LAYERS;        // total elapsed time without any active worker threads
58
 #if DETAILED_CU_STATS
59
     CUStats                  m_cuStats;
60
 #endif
61
 
62
     Encoder*                 m_top;
63
     x265_param*              m_param;
64
-    Frame*                   m_frame;
65
+    Frame*                   m_frameMAX_LAYERS;
66
+    Frame**                  m_retFrameBuffer;
67
     NoiseReduction*          m_nr;
68
     ThreadLocalData*         m_tld; /* for --no-wpp */
69
     Bitstream*               m_outStreams;
70
@@ -238,6 +239,8 @@
71
     TemporalFilter*          m_frameEncTF;
72
     TemporalFilterRefPicInfo m_mcstfRefListMAX_MCSTF_TEMPORAL_WINDOW_LENGTH;
73
 
74
+    int                      m_sLayerId;
75
+
76
     class WeightAnalysis : public BondedTaskGroup
77
     {
78
     public:
79
@@ -258,20 +261,20 @@
80
     bool initializeGeoms();
81
 
82
     /* analyze / compress frame, can be run in parallel within reference constraints */
83
-    void compressFrame();
84
+    void compressFrame(int layer);
85
 
86
     /* called by compressFrame to generate final per-row bitstreams */
87
-    void encodeSlice(uint32_t sliceAddr);
88
+    void encodeSlice(uint32_t sliceAddr, int layer);
89
 
90
     void threadMain();
91
     int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
92
     void noiseReductionUpdate();
93
-    void writeTrailingSEIMessages();
94
+    void writeTrailingSEIMessages(int layer);
95
     bool writeToneMapInfo(x265_sei_payload *payload);
96
 
97
     /* Called by WaveFront::findJob() */
98
-    virtual void processRow(int row, int threadId);
99
-    virtual void processRowEncoder(int row, ThreadLocalData& tld);
100
+    virtual void processRow(int row, int threadId, int layer);
101
+    virtual void processRowEncoder(int row, ThreadLocalData& tld, int layer);
102
 
103
     void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
104
     void enqueueRowFilter(int row)  { WaveFront::enqueueRow(row * 2 + 1); }
105
@@ -280,8 +283,8 @@
106
 #if ENABLE_LIBVMAF
107
     void vmafFrameLevelScore();
108
 #endif
109
-    void collectDynDataFrame();
110
-    void computeAvgTrainingData();
111
+    void collectDynDataFrame(int layer);
112
+    void computeAvgTrainingData(int layer);
113
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
114
     void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
115
 };
116
x265_3.6.tar.gz/source/encoder/framefilter.cpp -> x265_4.0.tar.gz/source/encoder/framefilter.cpp Changed
137
 
1
@@ -256,7 +256,7 @@
2
     const int size = cu->m_log2CUSizeabsPartIdx - 2;
3
     const uint32_t cuAddr = cu->m_cuAddr;
4
 
5
-    PicYuv* reconPic = frame.m_reconPic;
6
+    PicYuv* reconPic = frame.m_reconPic0;
7
     PicYuv* fencPic  = frame.m_fencPic;
8
 
9
     pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
10
@@ -337,7 +337,7 @@
11
 
12
         uint32_t cuAddr = m_rowAddr + col;
13
         const CUData* ctu = m_encData->getPicCTU(cuAddr);
14
-        assert(m_frameFilter->m_frame->m_reconPic == m_encData->m_reconPic);
15
+        assert(m_frameFilter->m_frame->m_reconPic0 == m_encData->m_reconPic0);
16
         origCUSampleRestoration(ctu, cuGeomsctuGeomMapcuAddr, *m_frameFilter->m_frame);
17
     }
18
 }
19
@@ -352,7 +352,7 @@
20
     if ((col != 0) & (col != m_frameFilter->m_numCols - 1) & (m_row != 0) & (m_row != m_frameFilter->m_numRows - 1))
21
         return;
22
 
23
-    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;
24
+    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic0;
25
     const uint32_t lineStartCUAddr = m_rowAddr + col;
26
     const int realH = getCUHeight();
27
     const int realW = m_frameFilter->getCUWidth(col);
28
@@ -441,7 +441,7 @@
29
     SAOParam* saoParam = m_encData->m_saoParam;
30
     const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
31
     const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
32
-    PicYuv* reconPic = m_encData->m_reconPic;
33
+    PicYuv* reconPic = m_encData->m_reconPic0;
34
     const int colStart = m_lastCol.get();
35
     const int numCols = m_frameFilter->m_numCols;
36
     // TODO: Waiting previous row finish or simple clip on it?
37
@@ -561,7 +561,7 @@
38
     }
39
 }
40
 
41
-void FrameFilter::processRow(int row)
42
+void FrameFilter::processRow(int row, int layer)
43
 {
44
     ProfileScopeEvent(filterCTURow);
45
 
46
@@ -572,7 +572,7 @@
47
 
48
     if (!m_param->bEnableLoopFilter && !m_useSao)
49
     {
50
-        processPostRow(row);
51
+        processPostRow(row, layer);
52
         return;
53
     }
54
     FrameData& encData = *m_frame->m_encData;
55
@@ -616,7 +616,7 @@
56
 
57
     // this row of CTUs has been encoded
58
     if (!ctu->m_bFirstRowInSlice)
59
-        processPostRow(row - 1);
60
+        processPostRow(row - 1, layer);
61
 
62
     // NOTE: slices parallelism will be execute out-of-order
63
     int numRowFinished = 0;
64
@@ -648,12 +648,12 @@
65
     }
66
 
67
     if (ctu->m_bLastRowInSlice)
68
-        processPostRow(row);
69
+        processPostRow(row, layer);
70
 }
71
 
72
-void FrameFilter::processPostRow(int row)
73
+void FrameFilter::processPostRow(int row, int layer)
74
 {
75
-    PicYuv *reconPic = m_frame->m_reconPic;
76
+    PicYuv *reconPic = m_frame->m_reconPic0;
77
     const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
78
     const uint32_t lineStartCUAddr = row * numCols;
79
 
80
@@ -673,7 +673,7 @@
81
         uint32_t height = m_parallelFilterrow.getCUHeight();
82
 
83
         uint64_t ssdY = m_frameEncoder->m_top->computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height, m_param);
84
-        m_frameEncoder->m_SSDY += ssdY;
85
+        m_frameEncoder->m_SSDYlayer += ssdY;
86
 
87
         if (m_param->internalCsp != X265_CSP_I400)
88
         {
89
@@ -684,8 +684,8 @@
90
             uint64_t ssdU = m_frameEncoder->m_top->computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height, m_param);
91
             uint64_t ssdV = m_frameEncoder->m_top->computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height, m_param);
92
 
93
-            m_frameEncoder->m_SSDU += ssdU;
94
-            m_frameEncoder->m_SSDV += ssdV;
95
+            m_frameEncoder->m_SSDUlayer += ssdU;
96
+            m_frameEncoder->m_SSDVlayer += ssdV;
97
         }
98
     }
99
 
100
@@ -705,15 +705,15 @@
101
         /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
102
         * to avoid alignment of ssim blocks with DCT blocks. */
103
         minPixY += bStart ? 2 : -6;
104
-        m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
105
+        m_frameEncoder->m_ssimlayer += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
106
                                                 m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
107
-        m_frameEncoder->m_ssimCnt += ssim_cnt;
108
+        m_frameEncoder->m_ssimCntlayer += ssim_cnt;
109
     }
110
 
111
     if (m_param->maxSlices == 1)
112
     {
113
         uint32_t height = m_parallelFilterrow.getCUHeight();
114
-        m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height);
115
+        m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height, layer);
116
     } // end of (m_param->maxSlices == 1)
117
 
118
     if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
119
@@ -737,7 +737,7 @@
120
             }
121
         }
122
 
123
-        int stride = (int)m_frame->m_reconPic->m_stride;
124
+        int stride = (int)m_frame->m_reconPic0->m_stride;
125
         int padX = m_param->maxCUSize + 32;
126
         int padY = m_param->maxCUSize + 16;
127
         int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight;
128
@@ -763,7 +763,7 @@
129
 
130
         for (int y = startRow; y < height; y++)
131
         {
132
-            pixel    *pix = m_frame->m_reconPic->m_picOrg0 + y * stride - padX;
133
+            pixel    *pix = m_frame->m_reconPic0->m_picOrg0 + y * stride - padX;
134
             uint32_t *sum32x32 = m_frame->m_encData->m_meIntegral0 + (y + 1) * stride - padX;
135
             uint32_t *sum32x24 = m_frame->m_encData->m_meIntegral1 + (y + 1) * stride - padX;
136
             uint32_t *sum32x8 = m_frame->m_encData->m_meIntegral2 + (y + 1) * stride - padX;
137
x265_3.6.tar.gz/source/encoder/framefilter.h -> x265_4.0.tar.gz/source/encoder/framefilter.h Changed
12
 
1
@@ -128,8 +128,8 @@
2
 
3
     void start(Frame *pic, Entropy& initState);
4
 
5
-    void processRow(int row);
6
-    void processPostRow(int row);
7
+    void processRow(int row, int layer);
8
+    void processPostRow(int row, int layer);
9
     void computeMEIntegral(int row);
10
 };
11
 }
12
x265_3.6.tar.gz/source/encoder/level.cpp -> x265_4.0.tar.gz/source/encoder/level.cpp Changed
201
 
1
@@ -60,6 +60,42 @@
2
     { MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 },
3
 };
4
 
5
+#if ENABLE_SCC_EXT
6
+enum SCCProfileName
7
+{
8
+    NONE = 0,
9
+    // The following are SCC profiles, which would map to the MAINSCC profile idc.
10
+    // The enumeration indicates the bit-depth constraint in the bottom 2 digits
11
+    //                           the chroma format in the next digit
12
+    //                           the intra constraint in the next digit
13
+    //                           If it is a SCC profile there is a '2' for the next digit.
14
+    //                           If it is a highthroughput , there is a '2' for the top digit else '1' for the top digit
15
+    SCC_MAIN = 121108,
16
+    SCC_MAIN_10 = 121110,
17
+    SCC_MAIN_444 = 121308,
18
+    SCC_MAIN_444_10 = 121310,
19
+};
20
+
21
+static const SCCProfileName validSCCProfileNames14/* bit depth constraint 8=0, 10=1, 12=2, 14=3*/4/*chroma format*/ =
22
+{
23
+   {
24
+        { NONE,         SCC_MAIN,      NONE,      SCC_MAIN_444                     }, // 8-bit  intra for 400, 420, 422 and 444
25
+        { NONE,         SCC_MAIN_10,   NONE,      SCC_MAIN_444_10                  }, // 10-bit intra for 400, 420, 422 and 444
26
+        { NONE,         NONE,          NONE,      NONE                             }, // 12-bit intra for 400, 420, 422 and 444
27
+        { NONE,         NONE,          NONE,      NONE                             }  // 16-bit intra for 400, 420, 422 and 444
28
+    },
29
+};
30
+#endif
31
+
32
+static inline int _confirm(x265_param* param, bool bflag, const char* message)
33
+{
34
+    if (!bflag)
35
+        return 0;
36
+
37
+    x265_log(param, X265_LOG_ERROR, "%s\n", message);
38
+    return 1;
39
+}
40
+
41
 /* determine minimum decoder level required to decode the described video */
42
 void determineLevel(const x265_param &param, VPS& vps)
43
 {
44
@@ -80,45 +116,74 @@
45
         if (param.internalBitDepth <= 8)
46
         {
47
             if (vps.ptl.onePictureOnlyConstraintFlag)
48
-                vps.ptl.profileIdc = Profile::MAINSTILLPICTURE;
49
+                vps.ptl.profileIdc0 = Profile::MAINSTILLPICTURE;
50
             else if (vps.ptl.intraConstraintFlag)
51
-                vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */
52
+                vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main Intra */
53
             else 
54
-                vps.ptl.profileIdc = Profile::MAIN;
55
+                vps.ptl.profileIdc0 = Profile::MAIN;
56
+
57
+#if ENABLE_ALPHA
58
+            if (param.numScalableLayers == 2)
59
+                vps.ptl.profileIdc1 = Profile::SCALABLEMAIN;
60
+#endif
61
         }
62
         else if (param.internalBitDepth <= 10)
63
         {
64
             /* note there is no 10bit still picture profile */
65
             if (vps.ptl.intraConstraintFlag)
66
-                vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */
67
+                vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main10 Intra */
68
             else
69
-                vps.ptl.profileIdc = Profile::MAIN10;
70
+                vps.ptl.profileIdc0 = Profile::MAIN10;
71
+
72
+#if ENABLE_ALPHA
73
+            if (param.numScalableLayers == 2)
74
+                vps.ptl.profileIdc1 = Profile::SCALABLEMAIN10;
75
+#endif
76
         }
77
     }
78
     else
79
-        vps.ptl.profileIdc = Profile::MAINREXT;
80
+        vps.ptl.profileIdc0 = Profile::MAINREXT;
81
+
82
+#if ENABLE_MULTIVIEW
83
+    if (param.numViews == 2)
84
+        vps.ptl.profileIdc1 = Profile::MULTIVIEWMAIN;
85
+#endif
86
+
87
+#if ENABLE_SCC_EXT
88
+    if (param.bEnableSCC)
89
+        vps.ptl.profileIdc0 = Profile::MAINSCC;
90
 
91
     /* determine which profiles are compatible with this stream */
92
+    if (vps.ptl.profileIdc0 == Profile::MAINSCC)
93
+    {
94
+        vps.ptl.onePictureOnlyConstraintFlag = false;
95
+        vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag;
96
+    }
97
+#endif
98
 
99
     memset(vps.ptl.profileCompatibilityFlag, 0, sizeof(vps.ptl.profileCompatibilityFlag));
100
-    vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc = true;
101
-    if (vps.ptl.profileIdc == Profile::MAIN10 && param.internalBitDepth == 8)
102
+    vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc0 = true;
103
+    if (vps.ptl.profileIdc0 == Profile::MAIN10 && param.internalBitDepth == 8)
104
         vps.ptl.profileCompatibilityFlagProfile::MAIN = true;
105
-    else if (vps.ptl.profileIdc == Profile::MAIN)
106
+    else if (vps.ptl.profileIdc0 == Profile::MAIN)
107
         vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true;
108
-    else if (vps.ptl.profileIdc == Profile::MAINSTILLPICTURE)
109
+    else if (vps.ptl.profileIdc0 == Profile::MAINSTILLPICTURE)
110
     {
111
         vps.ptl.profileCompatibilityFlagProfile::MAIN = true;
112
         vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true;
113
     }
114
-    else if (vps.ptl.profileIdc == Profile::MAINREXT)
115
+    else if (vps.ptl.profileIdc0 == Profile::MAINREXT)
116
         vps.ptl.profileCompatibilityFlagProfile::MAINREXT = true;
117
+#if ENABLE_SCC_EXT
118
+    else if (vps.ptl.profileIdc0 == Profile::MAINSCC)
119
+        vps.ptl.profileCompatibilityFlagProfile::MAINSCC = true;
120
+#endif
121
 
122
     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
123
     uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
124
     uint32_t bitrate = param.rc.vbvMaxBitrate ? param.rc.vbvMaxBitrate : param.rc.bitrate;
125
 
126
-    const uint32_t MaxDpbPicBuf = 6;
127
+    const uint32_t MaxDpbPicBuf = param.bEnableSCC ? 7 : 6;
128
     vps.ptl.levelIdc = Level::NONE;
129
     vps.ptl.tierFlag = Level::MAIN;
130
 
131
@@ -174,7 +239,7 @@
132
         if (levelsi.levelEnum >= Level::LEVEL5 && param.maxCUSize < 32)
133
         {
134
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but CTU size 16 is non-compliant\n", levelsi.name);
135
-            vps.ptl.profileIdc = Profile::NONE;
136
+            vps.ptl.profileIdc0 = Profile::NONE;
137
             vps.ptl.levelIdc = Level::NONE;
138
             vps.ptl.tierFlag = Level::MAIN;
139
             x265_log(&param, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
140
@@ -186,7 +251,7 @@
141
         if (numPocTotalCurr > 10)
142
         {
143
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levelsi.name);
144
-            vps.ptl.profileIdc = Profile::NONE;
145
+            vps.ptl.profileIdc0 = Profile::NONE;
146
             vps.ptl.levelIdc = Level::NONE;
147
             vps.ptl.tierFlag = Level::MAIN;
148
             x265_log(&param, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
149
@@ -217,14 +282,32 @@
150
         break;
151
     }
152
 
153
-    static const char *profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt" };
154
+#if ENABLE_SCC_EXT
155
+    x265_param m_param = param;
156
+#define CHECK(expr, msg) check_failed |= _confirm(&m_param, expr, msg)
157
+    int check_failed = 0; /* abort if there is a fatal configuration problem */
158
+
159
+    if (vps.ptl.profileIdc0 == Profile::MAINSCC)
160
+    {
161
+        CHECK(vps.ptl.lowerBitRateConstraintFlag == false && vps.ptl.intraConstraintFlag == false, "The lowerBitRateConstraint flag cannot be false when intraConstraintFlag is false");
162
+        CHECK(param.bEnableSCC && !(vps.ptl.profileIdc0 == Profile::MAINSCC), "UseIntraBlockCopy must not be enabled unless the SCC profile is being used.");
163
+        CHECK(vps.ptl.intraConstraintFlag, "intra constraint flag must be 0 for SCC profiles");
164
+        CHECK(vps.ptl.onePictureOnlyConstraintFlag, "one-picture-only constraint flag shall be 0 for SCC profiles");
165
+        const uint32_t bitDepthIdx = (vps.ptl.bitDepthConstraint == 8 ? 0 : (vps.ptl.bitDepthConstraint == 10 ? 1 : (vps.ptl.bitDepthConstraint == 12 ? 2 : (vps.ptl.bitDepthConstraint == 16 ? 3 : 4))));
166
+        const uint32_t chromaFormatIdx = uint32_t(vps.ptl.chromaFormatConstraint);
167
+        const bool bValidProfile = (bitDepthIdx > 2 || chromaFormatIdx > 3) ? false : (validSCCProfileNames0bitDepthIdxchromaFormatIdx != NONE);
168
+        CHECK(!bValidProfile, "Invalid intra constraint flag, bit depth constraint flag and chroma format constraint flag combination for a RExt profile");
169
+    }
170
+#endif
171
+
172
+    static const char* profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt", "", "", "", "", "Main Scc" };
173
     static const char *tiers    = { "Main", "High" };
174
 
175
     char profbuf64;
176
-    strcpy(profbuf, profilesvps.ptl.profileIdc);
177
+    strcpy(profbuf, profilesvps.ptl.profileIdc0);
178
 
179
     bool bStillPicture = false;
180
-    if (vps.ptl.profileIdc == Profile::MAINREXT)
181
+    if (vps.ptl.profileIdc0 == Profile::MAINREXT)
182
     {
183
         if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag)
184
         {
185
@@ -277,6 +360,27 @@
186
         if (vps.ptl.intraConstraintFlag && !bStillPicture)
187
             strcat(profbuf, " Intra");
188
     }
189
+
190
+#if ENABLE_SCC_EXT
191
+    if (vps.ptl.profileIdc0 == Profile::MAINSCC)
192
+    {
193
+        if (param.internalCsp == X265_CSP_I420)
194
+        {
195
+            if (vps.ptl.bitDepthConstraint <= 8)
196
+                strcpy(profbuf, "Main Scc");
197
+            else if (vps.ptl.bitDepthConstraint <= 10)
198
+                strcpy(profbuf, "Main 10 Scc");
199
+        }
200
+        else if (param.internalCsp == X265_CSP_I444)
201
x265_3.6.tar.gz/source/encoder/motion.cpp -> x265_4.0.tar.gz/source/encoder/motion.cpp Changed
23
 
1
@@ -770,6 +770,7 @@
2
                                    int              merange,
3
                                    MV &             outQMv,
4
                                    uint32_t         maxSlices,
5
+                                    bool            m_vertRestriction,
6
                                    pixel *          srcReferencePlane)
7
 {
8
     ALIGN_VAR_16(int, costs16);
9
@@ -794,6 +795,13 @@
10
 
11
     // measure SAD cost at clipped QPEL MVP
12
     MV pmv = qmvp.clipped(qmvmin, qmvmax);
13
+    if (m_vertRestriction)
14
+    {
15
+        if (pmv.y > mvmax.y << 2)
16
+        {
17
+            pmv.y = (mvmax.y << 2);
18
+        }
19
+    }
20
     MV bestpre = pmv;
21
     int bprecost;
22
 
23
x265_3.6.tar.gz/source/encoder/motion.h -> x265_4.0.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -95,7 +95,7 @@
2
     }
3
 
4
     void refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, const MV& qmvp, MV& outQMv);
5
-    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, pixel *srcReferencePlane = 0);
6
+    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, bool m_vertRestriction, pixel *srcReferencePlane = 0);
7
 
8
     int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
9
 
10
x265_3.6.tar.gz/source/encoder/nal.cpp -> x265_4.0.tar.gz/source/encoder/nal.cpp Changed
19
 
1
@@ -57,7 +57,7 @@
2
     other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
 }
4
 
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId, uint8_t temporalID)
7
 {
8
     static const char startCodePrefix = { 0, 0, 0, 1 };
9
 
10
@@ -114,7 +114,7 @@
11
      * nuh_reserved_zero_6bits  6-bits
12
      * nuh_temporal_id_plus1    3-bits */
13
     outbytes++ = (uint8_t)nalUnitType << 1;
14
-    outbytes++ = temporalID;
15
+    outbytes++ = (layerId << 3) | (temporalID);
16
 
17
     /* 7.4.1 ...
18
      * Within the NAL unit, the following three-byte sequences shall not occur at
19
x265_3.6.tar.gz/source/encoder/nal.h -> x265_4.0.tar.gz/source/encoder/nal.h Changed
22
 
1
@@ -35,7 +35,11 @@
2
 class NALList
3
 {
4
 public:
5
+#if ENABLE_MULTIVIEW || ENABLE_ALPHA
6
+    static const int MAX_NAL_UNITS = 32;
7
+#else
8
     static const int MAX_NAL_UNITS = 16;
9
+#endif
10
 
11
 public:
12
 
13
@@ -56,7 +60,7 @@
14
 
15
     void takeContents(NALList& other);
16
 
17
-    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
18
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId = 0, uint8_t temporalID = 1);
19
 
20
     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
21
 };
22
x265_3.6.tar.gz/source/encoder/ratecontrol.cpp -> x265_4.0.tar.gz/source/encoder/ratecontrol.cpp Changed
21
 
1
@@ -1349,6 +1349,10 @@
2
     FrameData& curEncData = *curFrame->m_encData;
3
     m_curSlice = curEncData.m_slice;
4
     m_sliceType = m_curSlice->m_sliceType;
5
+#if ENABLE_SCC_EXT
6
+    if(m_param->bEnableSCC)
7
+        m_sliceType = m_curSlice->m_origSliceType;
8
+#endif
9
     rce->sliceType = m_sliceType;
10
     if (!m_2pass)
11
         rce->keptAsRef = IS_REFERENCED(curFrame);
12
@@ -1466,7 +1470,7 @@
13
 
14
         int mincr = enc->m_vps.ptl.minCrForLevel;
15
         /* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */
16
-        if (enc->m_vps.ptl.profileIdc > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE)
17
+        if (enc->m_vps.ptl.profileIdc0 > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE)
18
             rce->frameSizeMaximum = 1e9;
19
         else
20
         {
21
x265_3.6.tar.gz/source/encoder/sao.cpp -> x265_4.0.tar.gz/source/encoder/sao.cpp Changed
201
 
1
@@ -36,12 +36,6 @@
2
     return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
3
 }
4
 
5
-/* get the sign of input variable (TODO: this is a dup, make common) */
6
-inline int8_t signOf(int x)
7
-{
8
-    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
9
-}
10
-
11
 inline int signOf2(const int a, const int b)
12
 {
13
     // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
14
@@ -273,7 +267,7 @@
15
 // CTU-based SAO process without slice granularity
16
 void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
17
 {
18
-    PicYuv* reconPic = m_frame->m_reconPic;
19
+    PicYuv* reconPic = m_frame->m_reconPic0;
20
     pixel* rec = reconPic->getPlaneAddr(plane, addr);
21
     intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
22
     uint32_t picWidth  = m_param->sourceWidth;
23
@@ -328,10 +322,10 @@
24
         {
25
             for (int y = 0; y < ctuHeight; y++, rec += stride)
26
             {
27
-                int signLeft = signOf(recstartX - tmpLy);
28
+                int signLeft = x265_signOf(recstartX - tmpLy);
29
                 for (int x = startX; x < endX; x++)
30
                 {
31
-                    int signRight = signOf(recx - recx + 1);
32
+                    int signRight = x265_signOf(recx - recx + 1);
33
                     int edgeType = signRight + signLeft + 2;
34
                     signLeft = -signRight;
35
 
36
@@ -343,8 +337,8 @@
37
         {
38
             for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
39
             {
40
-                signLeft10 = signOf(recstartX - tmpLy);
41
-                signLeft11 = signOf(recstride + startX - tmpLy + 1);
42
+                signLeft10 = x265_signOf(recstartX - tmpLy);
43
+                signLeft11 = x265_signOf(recstride + startX - tmpLy + 1);
44
 
45
                 if (!lpelx)
46
                 {
47
@@ -385,13 +379,13 @@
48
         if (ctuWidth & 15)
49
         {
50
             for (int x = 0; x < ctuWidth; x++)
51
-                upBuff1x = signOf(recx - tmpUx);
52
+                upBuff1x = x265_signOf(recx - tmpUx);
53
 
54
             for (int y = startY; y < endY; y++, rec += stride)
55
             {
56
                 for (int x = 0; x < ctuWidth; x++)
57
                 {
58
-                    int8_t signDown = signOf(recx - recx + stride);
59
+                    int8_t signDown = x265_signOf(recx - recx + stride);
60
                     int edgeType = signDown + upBuff1x + 2;
61
                     upBuff1x = -signDown;
62
 
63
@@ -445,17 +439,17 @@
64
         else
65
         {
66
             for (int x = startX; x < endX; x++)
67
-                upBuff1x = signOf(recx - tmpUx - 1);
68
+                upBuff1x = x265_signOf(recx - tmpUx - 1);
69
         }
70
 
71
         if (ctuWidth & 15)
72
         {
73
              for (int y = startY; y < endY; y++, rec += stride)
74
              {
75
-                 upBufftstartX = signOf(recstride + startX - tmpLy);
76
+                 upBufftstartX = x265_signOf(recstride + startX - tmpLy);
77
                  for (int x = startX; x < endX; x++)
78
                  {
79
-                     int8_t signDown = signOf(recx - recx + stride + 1);
80
+                     int8_t signDown = x265_signOf(recx - recx + stride + 1);
81
                      int edgeType = signDown + upBuff1x + 2;
82
                      upBufftx + 1 = -signDown;
83
                      recx = m_clipTablerecx + offsetEoedgeType;
84
@@ -468,7 +462,7 @@
85
         {
86
             for (int y = startY; y < endY; y++, rec += stride)
87
             {
88
-                int8_t iSignDown2 = signOf(recstride + startX - tmpLy);
89
+                int8_t iSignDown2 = x265_signOf(recstride + startX - tmpLy);
90
 
91
                 primitives.saoCuOrgE2endX > 16(rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
92
 
93
@@ -493,25 +487,25 @@
94
         if (ctuWidth & 15)
95
         {
96
             for (int x = startX - 1; x < endX; x++)
97
-                upBuff1x = signOf(recx - tmpUx + 1);
98
+                upBuff1x = x265_signOf(recx - tmpUx + 1);
99
 
100
             for (int y = startY; y < endY; y++, rec += stride)
101
             {
102
                 int x = startX;
103
-                int8_t signDown = signOf(recx - tmpLy + 1);
104
+                int8_t signDown = x265_signOf(recx - tmpLy + 1);
105
                 int edgeType = signDown + upBuff1x + 2;
106
                 upBuff1x - 1 = -signDown;
107
                 recx = m_clipTablerecx + offsetEoedgeType;
108
 
109
                 for (x = startX + 1; x < endX; x++)
110
                 {
111
-                    signDown = signOf(recx - recx + stride - 1);
112
+                    signDown = x265_signOf(recx - recx + stride - 1);
113
                     edgeType = signDown + upBuff1x + 2;
114
                     upBuff1x - 1 = -signDown;
115
                     recx = m_clipTablerecx + offsetEoedgeType;
116
                 }
117
 
118
-                upBuff1endX - 1 = signOf(recendX - 1 + stride - recendX);
119
+                upBuff1endX - 1 = x265_signOf(recendX - 1 + stride - recendX);
120
             }
121
         }
122
         else
123
@@ -519,7 +513,7 @@
124
             int8_t firstSign, lastSign;
125
 
126
             if (lpelx)
127
-                firstSign = signOf(rec-1 - tmpU0);
128
+                firstSign = x265_signOf(rec-1 - tmpU0);
129
             if (rpelx == picWidth)
130
                 lastSign = upBuff1ctuWidth - 1;
131
 
132
@@ -533,14 +527,14 @@
133
             for (int y = startY; y < endY; y++, rec += stride)
134
             {
135
                 int x = startX;
136
-                int8_t signDown = signOf(recx - tmpLy + 1);
137
+                int8_t signDown = x265_signOf(recx - tmpLy + 1);
138
                 int edgeType = signDown + upBuff1x + 2;
139
                 upBuff1x - 1 = -signDown;
140
                 recx = m_clipTablerecx + offsetEoedgeType;
141
 
142
                 primitives.saoCuOrgE3endX > 16(rec, upBuff1, offsetEo, stride - 1, startX, endX);
143
 
144
-                upBuff1endX - 1 = signOf(recendX - 1 + stride - recendX);
145
+                upBuff1endX - 1 = x265_signOf(recendX - 1 + stride - recendX);
146
             }
147
         }
148
 
149
@@ -571,7 +565,7 @@
150
 /* Process SAO unit */
151
 void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
152
 {
153
-    PicYuv* reconPic = m_frame->m_reconPic;
154
+    PicYuv* reconPic = m_frame->m_reconPic0;
155
     intptr_t stride = reconPic->m_stride;
156
     int ctuWidth = m_param->maxCUSize;
157
     int ctuHeight = m_param->maxCUSize;
158
@@ -631,7 +625,7 @@
159
 /* Process SAO unit (Chroma only) */
160
 void SAO::generateChromaOffsets(SaoCtuParam* ctuParam3, int idxY, int idxX)
161
 {
162
-    PicYuv* reconPic = m_frame->m_reconPic;
163
+    PicYuv* reconPic = m_frame->m_reconPic0;
164
     intptr_t stride = reconPic->m_strideC;
165
     int ctuWidth  = m_param->maxCUSize;
166
     int ctuHeight = m_param->maxCUSize;
167
@@ -735,7 +729,7 @@
168
 void SAO::calcSaoStatsCTU(int addr, int plane)
169
 {
170
     Slice* slice = m_frame->m_encData->m_slice;
171
-    const PicYuv* reconPic = m_frame->m_reconPic;
172
+    const PicYuv* reconPic = m_frame->m_reconPic0;
173
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
174
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
175
     const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
176
@@ -922,7 +916,7 @@
177
 
178
     int x, y;
179
     const CUData* cu = frame->m_encData->getPicCTU(addr);
180
-    const PicYuv* reconPic = m_frame->m_reconPic;
181
+    const PicYuv* reconPic = m_frame->m_reconPic0;
182
     const pixel* fenc;
183
     const pixel* rec;
184
     intptr_t stride = reconPic->m_stride;
185
@@ -1030,10 +1024,10 @@
186
             for (y = 0; y < ctuHeight; y++)
187
             {
188
                 x = (y < startY ? startX : firstX);
189
-                int signLeft = signOf(recx - recx - 1);
190
+                int signLeft = x265_signOf(recx - recx - 1);
191
                 for (; x < endX; x++)
192
                 {
193
-                    int signRight = signOf(recx - recx + 1);
194
+                    int signRight = x265_signOf(recx - recx + 1);
195
                     int edgeType = signRight + signLeft + 2;
196
                     signLeft = -signRight;
197
 
198
@@ -1069,13 +1063,13 @@
199
             }
200
 
201
x265_3.6.tar.gz/source/encoder/search.cpp -> x265_4.0.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -76,6 +76,9 @@
2
     m_param = &param;
3
     m_bFrameParallel = param.frameNumThreads > 1;
4
     m_numLayers = g_log2Sizeparam.maxCUSize - 2;
5
+#if ENABLE_SCC_EXT
6
+    m_ibcEnabled = param.bEnableSCC;
7
+#endif
8
 
9
     m_rdCost.setPsyRdScale(param.psyRd);
10
     m_rdCost.setSsimRd(param.bSsimRd);
11
@@ -171,6 +174,11 @@
12
     CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
13
     CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
14
 
15
+#if ENABLE_SCC_EXT
16
+    m_numBVs = 0;
17
+    m_numBV16s = 0;
18
+#endif
19
+
20
     return ok;
21
 
22
 fail:
23
@@ -496,7 +504,7 @@
24
     }
25
 
26
     // set reconstruction for next intra prediction blocks if full TU prediction won
27
-    PicYuv*  reconPic = m_frame->m_reconPic;
28
+    PicYuv*  reconPic = m_frame->m_reconPic0;
29
     pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
30
     intptr_t picStride = reconPic->m_stride;
31
     primitives.cusizeIdx.copy_pp(picReconY, picStride, reconQt, reconQtStride);
32
@@ -672,7 +680,7 @@
33
     }
34
 
35
     // set reconstruction for next intra prediction blocks
36
-    PicYuv*  reconPic = m_frame->m_reconPic;
37
+    PicYuv*  reconPic = m_frame->m_reconPic0;
38
     pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
39
     intptr_t picStride = reconPic->m_stride;
40
     primitives.cusizeIdx.copy_pp(picReconY, picStride, reconQt, reconQtStride);
41
@@ -723,7 +731,7 @@
42
         uint32_t sizeIdx   = log2TrSize - 2;
43
         primitives.cusizeIdx.calcresidualstride % 64 == 0(fenc, pred, residual, stride);
44
 
45
-        PicYuv*  reconPic = m_frame->m_reconPic;
46
+        PicYuv*  reconPic = m_frame->m_reconPic0;
47
         pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
48
         intptr_t picStride = reconPic->m_stride;
49
 
50
@@ -887,7 +895,7 @@
51
             coeff_t* coeffC        = m_rqtqtLayer.coeffRQTchromaId + coeffOffsetC;
52
             pixel*   reconQt       = m_rqtqtLayer.reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
53
             uint32_t reconQtStride = m_rqtqtLayer.reconQtYuv.m_csize;
54
-            PicYuv*  reconPic = m_frame->m_reconPic;
55
+            PicYuv*  reconPic = m_frame->m_reconPic0;
56
             pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
57
             intptr_t picStride = reconPic->m_strideC;
58
 
59
@@ -1078,7 +1086,7 @@
60
             cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
61
             cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
62
 
63
-            PicYuv*  reconPic = m_frame->m_reconPic;
64
+            PicYuv*  reconPic = m_frame->m_reconPic0;
65
             pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
66
             intptr_t picStride = reconPic->m_strideC;
67
             primitives.cusizeIdxC.copy_pp(reconPicC, picStride, reconQt, reconQtStride);
68
@@ -1185,7 +1193,7 @@
69
             int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
70
             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
71
             coeff_t* coeffC        = cu.m_trCoeffttype + coeffOffsetC;
72
-            PicYuv*  reconPic = m_frame->m_reconPic;
73
+            PicYuv*  reconPic = m_frame->m_reconPic0;
74
             pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
75
             intptr_t picStride = reconPic->m_strideC;
76
 
77
@@ -1284,6 +1292,11 @@
78
 
79
     updateModeCost(intraMode);
80
     checkDQP(intraMode, cuGeom);
81
+
82
+#if ENABLE_SCC_EXT
83
+    if (m_param->bEnableSCC)
84
+        intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic1, cu.m_cuAddr, cuGeom.absPartIdx);
85
+#endif
86
 }
87
 
88
 /* Note that this function does not save the best intra prediction, it must
89
@@ -1671,7 +1684,7 @@
90
              * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
91
              * it is not updating m_rdContextsdepth.cur for the later PUs which I suspect is slightly wrong. I think
92
              * that the contexts should be tracked through each PU */
93
-            PicYuv*  reconPic = m_frame->m_reconPic;
94
+            PicYuv*  reconPic = m_frame->m_reconPic0;
95
             pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
96
             uint32_t dststride = reconPic->m_stride;
97
             const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
98
@@ -1844,7 +1857,7 @@
99
         if (!tuIterator.isLastSection())
100
         {
101
             uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
102
-            PicYuv*  reconPic  = m_frame->m_reconPic;
103
+            PicYuv*  reconPic  = m_frame->m_reconPic0;
104
             uint32_t dststride = reconPic->m_strideC;
105
             const pixel* src;
106
             pixel* dst;
107
@@ -1895,7 +1908,9 @@
108
     MVField  candMvFieldMRG_MAX_NUM_CANDS2;
109
     uint8_t  candDirMRG_MAX_NUM_CANDS;
110
     uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
111
-
112
+#if ENABLE_SCC_EXT
113
+    restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
114
+#else
115
     if (cu.isBipredRestriction())
116
     {
117
         /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
118
@@ -1908,6 +1923,7 @@
119
             }
120
         }
121
     }
122
+#endif
123
 
124
     Yuv& tempYuv = m_rqtcuGeom.depth.tmpPredYuv;
125
 
126
@@ -1936,6 +1952,12 @@
127
                 continue;
128
         }
129
 
130
+#if ENABLE_SCC_EXT
131
+        if ((candDirmergeCand == 1 || candDirmergeCand == 3) && (m_slice->m_refPOCList0candMvFieldmergeCand0.refIdx == m_slice->m_poc))
132
+        {
133
+            continue;
134
+        }
135
+#endif
136
         cu.m_mv0pu.puAbsPartIdx = candMvFieldmergeCand0.mv;
137
         cu.m_refIdx0pu.puAbsPartIdx = (int8_t)candMvFieldmergeCand0.refIdx;
138
         cu.m_mv1pu.puAbsPartIdx = candMvFieldmergeCand1.mv;
139
@@ -2015,7 +2037,12 @@
140
                 continue;
141
         }
142
         cu.clipMv(mvCand);
143
-        predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicListlistref, mvCand);
144
+#if ENABLE_SCC_EXT
145
+        if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx0 - 1)
146
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameListlistref->m_reconPic1, mvCand);
147
+        else
148
+#endif
149
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicListlistref, mvCand);
150
         costsi = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
151
     }
152
 
153
@@ -2086,13 +2113,18 @@
154
 void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
155
 {
156
     uint32_t bits = master.m_listSelBitslist + MVP_IDX_BITS;
157
-    bits += getTUBits(ref, m_slice->m_numRefIdxlist);
158
+    int numIdx = m_slice->m_numRefIdxlist;
159
+#if ENABLE_SCC_EXT
160
+    if (!list && m_ibcEnabled)
161
+        numIdx--;
162
+#endif
163
+    bits += getTUBits(ref, numIdx);
164
 
165
     MotionData* bestME = interMode.bestMEpart;
166
 
167
     // 12 mv candidates including lowresMV
168
     MV  mvc(MD_ABOVE_LEFT + 1) * 2 + 2;
169
-    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCandlistref, mvc);
170
+    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCandlistref, mvc, 0, pu.puAbsPartIdx);
171
 
172
     const MV* amvp = interMode.amvpCandlistref;
173
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
174
@@ -2102,22 +2134,24 @@
175
     if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
176
     {
177
         MV lmv = getLowresMV(interMode.cu, pu, list, ref);
178
-        if (lmv.notZero())
179
+        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
180
+        if (lmv.notZero() && !layer)
181
             mvcnumMvc++ = lmv;
182
         if (m_param->bEnableHME)
183
             mvp_lowres = lmv;
184
     }
185
 
186
+    m_vertRestriction = interMode.cu.m_slice->m_refPOCListlistref == interMode.cu.m_slice->m_poc;
187
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
188
 
189
-    int satdCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
190
+    int satdCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
191
       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameListlistref->m_fencPic->getLumaAddr(0) : 0);
192
 
193
     if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
194
     {
195
         MV outmv_lowres;
196
         setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
197
-        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
198
+        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
199
             m_param->bSourceReferenceEstimation ? m_slice->m_refFrameListlistref->m_fencPic->getLumaAddr(0) : 0);
200
         if (lowresMvCost < satdCost)
201
x265_3.6.tar.gz/source/encoder/search.h -> x265_4.0.tar.gz/source/encoder/search.h Changed
53
 
1
@@ -286,6 +286,16 @@
2
     int32_t         m_sliceMaxY;
3
     int32_t         m_sliceMinY;
4
 
5
+    bool            m_vertRestriction;
6
+
7
+#if ENABLE_SCC_EXT
8
+    int             m_ibcEnabled;
9
+    int             m_numBVs;
10
+    int             m_numBV16s;
11
+    MV              m_BVs64;
12
+    uint32_t        m_lastCandCost;
13
+#endif
14
+
15
 #if DETAILED_CU_STATS
16
     /* Accumulate CU statistics separately for each frame encoder */
17
     CUStats         m_statsX265_MAX_FRAME_THREADS;
18
@@ -309,7 +319,7 @@
19
     void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
20
 
21
     // estimation inter prediction (non-skip)
22
-    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks2);
23
+    void      predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks2, MV* iMVCandList = NULL);
24
     void     searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp3, int numMvc, MV* mvc);
25
     // encode residual and compute rd-cost for inter mode
26
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
27
@@ -329,6 +339,25 @@
28
 
29
     MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
30
 
31
+#if ENABLE_SCC_EXT
32
+    bool      predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
33
+    void      intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
34
+    void      setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB);
35
+    void      intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
36
+        MV& mv, uint32_t& cost, int roiwidth, int roiheight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc);
37
+    bool      isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset);
38
+    bool      isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* pcCU,
39
+        int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize);
40
+    void      intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* cMVCand);
41
+    void      updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc);
42
+    int       intraBCSearchMVChromaRefine(Mode& intraBCMode, const CUGeom& cuGeom, int roiWidth, int roiHeight, int cuPelX, int cuPelY, uint32_t* sadBestCand, MV* cMVCand,
43
+        uint32_t partOffset, int puIdx);
44
+    static    uint32_t mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel);
45
+    uint32_t  getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height);
46
+    bool      predMixedIntraBCInterSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMVCandList);
47
+    void      restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)2, uint8_t* interDirNeighbours, uint32_t numValidMergeCand);
48
+#endif
49
+
50
     class PME : public BondedTaskGroup
51
     {
52
     public:
53
x265_3.6.tar.gz/source/encoder/sei.cpp -> x265_4.0.tar.gz/source/encoder/sei.cpp Changed
19
 
1
@@ -36,7 +36,7 @@
2
 
3
 /* marshal a single SEI message sei, storing the marshalled representation
4
 * in bitstream bs */
5
-void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
6
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layer)
7
 {
8
     if (!isNested)
9
         bs.resetBits();
10
@@ -68,7 +68,7 @@
11
     {
12
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
13
             bs.writeByteAlignment();
14
-        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
15
+        list.serialize(nalUnitType, bs, layer, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
16
     }
17
 }
18
 
19
x265_3.6.tar.gz/source/encoder/sei.h -> x265_4.0.tar.gz/source/encoder/sei.h Changed
201
 
1
@@ -38,7 +38,7 @@
2
 public:
3
     /* SEI users call writeSEImessages() to marshal an SEI to a bitstream.
4
     * The writeSEImessages() method calls writeSEI() which encodes the header */
5
-    void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested);
6
+    void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layerId = 0);
7
     void setSize(uint32_t size);
8
     static char* base64Decode(char encodedString[], int base64EncodeLength);
9
     virtual ~SEI() {}
10
@@ -189,6 +189,228 @@
11
     }
12
 };
13
 
14
+#if ENABLE_ALPHA
15
+class SEIAlphaChannelInfo : public SEI
16
+{
17
+public:
18
+    SEIAlphaChannelInfo()
19
+    {
20
+        m_payloadType = ALPHA_CHANNEL_INFO;
21
+        m_payloadSize = 0;
22
+    }
23
+
24
+    bool alpha_channel_cancel_flag;
25
+    void writeSEI(const SPS&)
26
+    {
27
+        WRITE_CODE(alpha_channel_cancel_flag, 1, "alpha_channel_cancel_flag");
28
+        if (!alpha_channel_cancel_flag)
29
+        {
30
+            WRITE_CODE(0, 3, "alpha_channel_use_idc");
31
+            WRITE_CODE(0, 3, "alpha_channel_bit_depth_minus8");
32
+            WRITE_CODE(0, 9, "alpha_transparent_value");
33
+            WRITE_CODE(255, 9, "alpha_opaque_value");
34
+            WRITE_CODE(0, 1, "alpha_channel_incr_flag");
35
+            WRITE_CODE(0, 1, "alpha_channel_clip_flag");
36
+        }
37
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
38
+        {
39
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
40
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
41
+            {
42
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
43
+            }
44
+        }
45
+    }
46
+};
47
+#endif
48
+
49
+#if ENABLE_MULTIVIEW
50
+class SEIThreeDimensionalReferenceDisplaysInfo : public SEI
51
+{
52
+public:
53
+    SEIThreeDimensionalReferenceDisplaysInfo()
54
+    {
55
+        m_payloadType = THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO;
56
+        m_payloadSize = 0;
57
+    }
58
+
59
+    int  m_numRefDisplaysMinus1 = 0;
60
+    bool m_refViewingDistanceFlag = false;
61
+    bool m_additionalShiftPresentFlag = false;
62
+    void writeSEI(const SPS&)
63
+    {
64
+        WRITE_UVLC(31, "prec_ref_display_width");
65
+        WRITE_FLAG(m_refViewingDistanceFlag, "ref_viewing_distance_flag");
66
+        if (m_refViewingDistanceFlag)
67
+        {
68
+            WRITE_UVLC(0, "prec_ref_viewing_dist");
69
+        }
70
+        WRITE_UVLC(0, "num_ref_displays_minus1");
71
+        for (int i = 0; i <= m_numRefDisplaysMinus1; i++)
72
+        {
73
+            WRITE_UVLC(0, "left_view_id");
74
+            WRITE_UVLC(1, "right_view_id");
75
+            WRITE_CODE(0, 6, "exponent_ref_display_width");
76
+            WRITE_CODE(0, 2, "mantissa_ref_display_width");
77
+            if (m_refViewingDistanceFlag)
78
+            {
79
+                WRITE_CODE(0, 6, "exponent_ref_viewing_distance");
80
+                WRITE_CODE(0, 1, "mantissa_ref_viewing_distance");
81
+            }
82
+            WRITE_FLAG(m_additionalShiftPresentFlag, "additional_shift_present_flag");
83
+            if (m_additionalShiftPresentFlag)
84
+            {
85
+                WRITE_CODE(0, 10, "num_sample_shift_plus512");
86
+            }
87
+        }
88
+        WRITE_FLAG(0, "three_dimensional_reference_displays_extension_flag");
89
+
90
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
91
+        {
92
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
93
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
94
+            {
95
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
96
+            }
97
+        }
98
+    }
99
+
100
+};
101
+
102
+class SEIMultiviewSceneInfo : public SEI
103
+{
104
+public:
105
+    SEIMultiviewSceneInfo()
106
+    {
107
+        m_payloadType = MULTIVIEW_SCENE_INFO;
108
+        m_payloadSize = 0;
109
+    }
110
+    void writeSEI(const SPS&)
111
+    {
112
+        WRITE_SVLC(-333, "min_disparity");
113
+        WRITE_UVLC(2047, "max_disparity_range");
114
+
115
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
116
+        {
117
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
118
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
119
+            {
120
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
121
+            }
122
+        }
123
+    }
124
+};
125
+
126
+class SEIMultiviewAcquisitionInfo : public SEI
127
+{
128
+public:
129
+    SEIMultiviewAcquisitionInfo()
130
+    {
131
+        m_payloadType = MULTIVIEW_ACQUISITION_INFO;
132
+        m_payloadSize = 0;
133
+    }
134
+
135
+    int sign_r[3][3] = { {0,1,0},{1,0,0},{0,1,1} };
136
+    int exponent_r[3][3] = { {10,20,11},{10,5,11},{2,20,11} };
137
+    int mantissa_r[3][3] = { {4,9,1},{0,3,4},{3,3,7} };
138
+    int sign_t[1][3] = { 0,1,0 };
139
+    int exponent_t[1][3] = { 0,10,5 };
140
+    int mantissa_t[1][3] = { 1,8,9 };
141
+    int lenght_mantissa_r[3][3] = { {10,20,11},{10,5,11},{2,20,11} };
142
+    int length_mantissa_t[1][3] = { 1,10,5 };
143
+    bool m_intrinsicParamFlag = true;
144
+    bool m_extrinsicParamFlag = true;
145
+    bool m_intrinsicParamsEqualFlag = true;
146
+    void writeSEI(const SPS& sps)
147
+    {
148
+        WRITE_FLAG(m_intrinsicParamFlag, "intrinsic_param_flag");
149
+        WRITE_FLAG(m_extrinsicParamFlag, "extrinsic_param_flag");
150
+        if (m_intrinsicParamFlag)
151
+        {
152
+            WRITE_FLAG(m_intrinsicParamsEqualFlag, "intrinsic_params_equal_flag");
153
+            WRITE_UVLC(31, "prec_focal_length");
154
+            WRITE_UVLC(31, "prec_principal_point");
155
+            WRITE_UVLC(31, "prec_skew_factor");
156
+
157
+            for (int i = 0; i <= (m_intrinsicParamsEqualFlag ? 0 : sps.maxViews - 1); i++)
158
+            {
159
+                WRITE_FLAG(0, "sign_focal_length_x");
160
+                WRITE_CODE(0, 6, "exponent_focal_length_x");
161
+                WRITE_CODE(0, 1, "mantissa_focal_length_x");
162
+                WRITE_FLAG(0, "sign_focal_length_y");
163
+                WRITE_CODE(0, 6, "exponent_focal_length_y");
164
+                WRITE_CODE(0, 1, "mantissa_focal_length_y");
165
+                WRITE_FLAG(0, "sign_principal_point_x");
166
+                WRITE_CODE(0, 6, "exponent_principal_point_x");
167
+                WRITE_CODE(0, 1, "mantissa_principal_point_x");
168
+                WRITE_FLAG(0, "sign_principal_point_y");
169
+                WRITE_CODE(0, 6, "exponent_principal_point_y");
170
+                WRITE_CODE(0, 1, "mantissa_principal_point_y");
171
+                WRITE_FLAG(0, "sign_skew_factor");
172
+                WRITE_CODE(0, 6, "exponent_skew_factor");
173
+                WRITE_CODE(0, 1, "mantissa_skew_factor");
174
+            }
175
+        }
176
+
177
+        if (m_extrinsicParamFlag)
178
+        {
179
+            WRITE_UVLC(31, "prec_rotation_param");
180
+            WRITE_UVLC(31, "prec_translation_param");
181
+            for (int i = 0; i <= 0; i++)
182
+            {
183
+                for (int j = 0; j <= 2; j++)  /* row */
184
+                {
185
+                    for (int k = 0; k <= 2; k++)  /* column */
186
+                    {
187
+                        WRITE_FLAG(sign_r[j][k], "sign_r");
188
+                        WRITE_CODE(exponent_r[j][k], 6, "exponent_r");
189
+                        WRITE_CODE(mantissa_r[j][k], lenght_mantissa_r[j][k], "mantissa_r");
190
+                    }
191
+                    WRITE_FLAG(sign_t[i][j], "sign_t");
192
+                    WRITE_CODE(exponent_t[i][j], 6, "exponent_t");
193
+                    WRITE_CODE(mantissa_t[i][j], length_mantissa_t[i][j], "mantissa_t");
194
+                }
195
+            }
196
+        }
197
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
198
+        {
199
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
200
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
201
x265_3.6.tar.gz/source/encoder/slicetype.cpp -> x265_4.0.tar.gz/source/encoder/slicetype.cpp Changed
22
 
1
@@ -1324,7 +1324,7 @@
2
     int l0poc = slice->m_rps.numberOfNegativePictures ? slice->m_refPOCList[0][0] : -1;
3
     int l1poc = slice->m_refPOCList[1][0];
4
 
5
-    switch (slice->m_sliceType)
6
+    switch (slice->m_origSliceType)
7
     {
8
     case I_SLICE:
9
         framesp0 = &curFrame->m_lowres;
10
@@ -4160,9 +4160,9 @@
11
         /* ME will never return a cost larger than the cost @MVP, so we do not
12
          * have to check that ME cost is more than the estimated merge cost */
13
         if(!hme)
14
-            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices);
15
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0);
16
         else
17
-            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
18
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0, fref->lowerResPlane[0]);
19
         if (skipCost < 64 && skipCost < fencCost && bBidir)
20
         {
21
             fencCost = skipCost;
22
x265_3.6.tar.gz/source/encoder/weightPrediction.cpp -> x265_4.0.tar.gz/source/encoder/weightPrediction.cpp Changed
17
 
1
@@ -491,8 +491,14 @@
2
         lumaDenom = weights0.log2WeightDenom;
3
         chromaDenom = weights1.log2WeightDenom;
4
 
5
+        int numIdx = slice.m_numRefIdx[list];
6
+#if ENABLE_SCC_EXT
7
+        if (!list && param.bEnableSCC)
8
+            numIdx--;
9
+#endif
10
+
11
         /* reset weight states */
12
-        for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
13
+        for (int ref = 1; ref < numIdx; ref++)
14
         {
15
             SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
16
             SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
17
x265_3.6.tar.gz/source/input/input.cpp -> x265_4.0.tar.gz/source/input/input.cpp Changed
17
 
1
@@ -27,12 +27,12 @@
2
 
3
 using namespace X265_NS;
4
 
5
-InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m)
6
+InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m, bool alpha, int format)
7
 {
8
     const char * s = strrchr(info.filename, '.');
9
 
10
     if (bForceY4m || (s && !strcmp(s, ".y4m")))
11
-        return new Y4MInput(info);
12
+        return new Y4MInput(info, alpha, format);
13
     else
14
-        return new YUVInput(info);
15
+        return new YUVInput(info, alpha, format);
16
 }
17
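For orientation, the widened InputFile::open() signature above threads the alpha flag and the frame-packing format down to the Y4M/YUV readers. A minimal calling sketch, with illustrative values that are assumptions rather than part of this diff:

    InputFileInfo info;          // filename, depth, csp, geometry, fps, sar filled in by the caller
    bool forceY4m = false;
    bool hasAlpha = true;        // e.g. when the encode uses more than one scalable layer
    int  format   = 0;           // 0: normal, 1: side-by-side, 2: over-under packing
    InputFile* in = InputFile::open(info, forceY4m, hasAlpha, format);
    if (!in || in->isFail())
        { /* report the open failure and abort */ }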
x265_3.6.tar.gz/source/input/input.h -> x265_4.0.tar.gz/source/input/input.h Changed
10
 
1
@@ -66,7 +66,7 @@
2
 
3
     InputFile()           {}
4
 
5
-    static InputFile* open(InputFileInfo& info, bool bForceY4m);
6
+    static InputFile* open(InputFileInfo& info, bool bForceY4m, bool alpha, int format);
7
 
8
     virtual void startReader() = 0;
9
 
10
x265_3.6.tar.gz/source/input/y4m.cpp -> x265_4.0.tar.gz/source/input/y4m.cpp Changed
57
 
1
@@ -40,13 +40,14 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 static const char header[] = {'F','R','A','M','E'};
5
-Y4MInput::Y4MInput(InputFileInfo& info)
6
+Y4MInput::Y4MInput(InputFileInfo& info, bool alpha, int format)
7
 {
8
     for (int i = 0; i < QUEUE_SIZE; i++)
9
         buf[i] = NULL;
10
 
11
     threadActive = false;
12
     colorSpace = info.csp;
13
+    alphaAvailable = alpha;
14
     sarWidth = info.sarWidth;
15
     sarHeight = info.sarHeight;
16
     width = info.width;
17
@@ -68,11 +69,13 @@
18
         ifs = x265_fopen(info.filename, "rb");
19
     if (ifs && !ferror(ifs) && parseHeader())
20
     {
21
+        if (format == 1) width /= 2;
22
+        if (format == 2) height /= 2;
23
         int pixelbytes = depth > 8 ? 2 : 1;
24
-        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
25
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
26
         {
27
-            int stride = (width >> x265_cli_csps[colorSpace].width[i]) * pixelbytes;
28
-            framesize += (stride * (height >> x265_cli_csps[colorSpace].height[i]));
29
+            int stride = ((width * (format == 1 ? 2 : 1)) >> x265_cli_csps[colorSpace].width[i]) * pixelbytes;
30
+            framesize += (stride * ((height * (format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[i]));
31
         }
32
 
33
         threadActive = true;
34
@@ -390,12 +393,19 @@
35
         pic.height = height;
36
         pic.width = width;
37
         pic.colorSpace = colorSpace;
38
-        pic.stride[0] = width * pixelbytes;
39
+        pic.stride[0] = width * pixelbytes * (pic.format == 1 ? 2 : 1);
40
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
41
         pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
42
         pic.planes[0] = buf[read % QUEUE_SIZE];
43
-        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height;
44
-        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]);
45
+        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * (height * (pic.format == 2 ? 2 : 1));
46
+        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[1]);
47
+#if ENABLE_ALPHA
48
+        if (alphaAvailable)
49
+        {
50
+            pic.stride[3] = pic.stride[0] >> x265_cli_csps[colorSpace].width[3];
51
+            pic.planes[3] = (char*)pic.planes[2] + pic.stride[2] * (height >> x265_cli_csps[colorSpace].height[2]);
52
+        }
53
+#endif
54
         readCount.incr();
55
         return true;
56
     }
57
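To make the format handling in this hunk concrete: for a 1920x1080 side-by-side source (format 1), parseHeader() still reports 1920x1080, the per-view width is then halved to 960, and the stride/framesize terms multiply the width back by 2 so a full packed row of 1920 samples is read per line; for over-under (format 2) the height is halved instead and the plane offsets scale the height back by 2. (Worked example only; the resolution is illustrative.)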
x265_3.6.tar.gz/source/input/y4m.h -> x265_4.0.tar.gz/source/input/y4m.h Changed
19
 
1
@@ -55,6 +55,8 @@
2
 
3
     int colorSpace;
4
 
5
+    bool alphaAvailable;
6
+
7
     bool threadActive;
8
 
9
     ThreadSafeInteger readCount;
10
@@ -69,7 +71,7 @@
11
 
12
 public:
13
 
14
-    Y4MInput(InputFileInfo& info);
15
+    Y4MInput(InputFileInfo& info, bool alpha, int format);
16
 
17
     virtual ~Y4MInput();
18
     void release();
19
x265_3.6.tar.gz/source/input/yuv.cpp -> x265_4.0.tar.gz/source/input/yuv.cpp Changed
53
 
1
@@ -40,7 +40,7 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 
5
-YUVInput::YUVInput(InputFileInfo& info)
6
+YUVInput::YUVInput(InputFileInfo& info, bool alpha, int format)
7
 {
8
     for (int i = 0; i < QUEUE_SIZE; i++)
9
         buf[i] = NULL;
10
@@ -49,15 +49,16 @@
11
     width = info.width;
12
     height = info.height;
13
     colorSpace = info.csp;
14
+    alphaAvailable = alpha;
15
     threadActive = false;
16
     ifs = NULL;
17
 
18
     uint32_t pixelbytes = depth > 8 ? 2 : 1;
19
     framesize = 0;
20
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
21
+    for (int i = 0; i < x265_cli_csps[colorSpace].planes + alphaAvailable; i++)
22
     {
23
-        uint32_t w = width >> x265_cli_csps[colorSpace].width[i];
24
-        uint32_t h = height >> x265_cli_csps[colorSpace].height[i];
25
+        int32_t w = (width * (format == 1 ? 2 : 1)) >> x265_cli_csps[colorSpace].width[i];
26
+        uint32_t h = (height * (format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[i];
27
         framesize += w * h * pixelbytes;
28
     }
29
 
30
@@ -205,12 +206,19 @@
31
         pic.framesize = framesize;
32
         pic.height = height;
33
         pic.width = width;
34
-        pic.stride[0] = width * pixelbytes;
35
+        pic.stride[0] = width * pixelbytes * (pic.format == 1 ? 2 : 1);
36
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
37
         pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
38
         pic.planes[0] = buf[read % QUEUE_SIZE];
39
-        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height;
40
-        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]);
41
+        pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * (height * (pic.format == 2 ? 2 : 1));
42
+        pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_csps[colorSpace].height[1]);
43
+#if ENABLE_ALPHA
44
+        if (alphaAvailable)
45
+        {
46
+            pic.stride[3] = pic.stride[0] >> x265_cli_csps[colorSpace].width[3];
47
+            pic.planes[3] = (char*)pic.planes[2] + pic.stride[2] * (height >> x265_cli_csps[colorSpace].height[2]);
48
+        }
49
+#endif
50
         readCount.incr();
51
         return true;
52
     }
53
x265_3.6.tar.gz/source/input/yuv.h -> x265_4.0.tar.gz/source/input/yuv.h Changed
19
 
1
@@ -47,6 +47,8 @@
2
 
3
     uint32_t framesize;
4
 
5
+    bool alphaAvailable;
6
+
7
     bool threadActive;
8
 
9
     ThreadSafeInteger readCount;
10
@@ -61,7 +63,7 @@
11
 
12
 public:
13
 
14
-    YUVInput(InputFileInfo& info);
15
+    YUVInput(InputFileInfo& info, bool alpha, int format);
16
 
17
     virtual ~YUVInput();
18
     void release();
19
x265_3.6.tar.gz/source/test/ipfilterharness.cpp -> x265_4.0.tar.gz/source/test/ipfilterharness.cpp Changed
103
 
1
@@ -67,7 +67,7 @@
2
     {
3
         int index = i % TEST_CASES;
4
 
5
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
6
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
7
         {
8
             rand_srcStride = rand() % 100 + 2;
9
             rand_dstStride = rand() % 100 + 64;
10
@@ -102,7 +102,7 @@
11
     {
12
         int index = i % TEST_CASES;
13
 
14
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
15
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
16
         {
17
             rand_srcStride = rand() % 100;
18
             rand_dstStride = rand() % 100 + 64;
19
@@ -144,7 +144,7 @@
20
     {
21
         int index = i % TEST_CASES;
22
 
23
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
24
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
25
         {
26
             // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
27
             for (int isRowExt = 0; isRowExt < 2; isRowExt++)
28
@@ -185,7 +185,7 @@
29
     {
30
         int index = i % TEST_CASES;
31
 
32
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
33
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
34
         {
35
             rand_srcStride = rand() % 100;
36
             rand_dstStride = rand() % 100 + 64;
37
@@ -220,7 +220,7 @@
38
     {
39
         int index = i % TEST_CASES;
40
 
41
-        for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
42
+        for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++)
43
         {
44
             rand_srcStride = rand() % 100;
45
             rand_dstStride = rand() % 100 + 64;
46
@@ -255,7 +255,7 @@
47
     {
48
         int index = i % TEST_CASES;
49
 
50
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
51
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
52
         {
53
             rand_srcStride = rand() % 100;
54
             rand_dstStride = rand() % 100 + 64;
55
@@ -290,7 +290,7 @@
56
     {
57
         int index = i % TEST_CASES;
58
 
59
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
60
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
61
         {
62
             rand_srcStride = rand() % 100;
63
             rand_dstStride = rand() % 100 + 64;
64
@@ -325,7 +325,7 @@
65
     {
66
         int index = i % TEST_CASES;
67
 
68
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
69
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
70
         {
71
             // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
72
             for (int isRowExt = 0; isRowExt < 2; isRowExt++)
73
@@ -366,7 +366,7 @@
74
     {
75
         int index = i % TEST_CASES;
76
 
77
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
78
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
79
         {
80
             rand_srcStride = rand() % 100;
81
             rand_dstStride = rand() % 100 + 64;
82
@@ -401,7 +401,7 @@
83
     {
84
         int index = i % TEST_CASES;
85
 
86
-        for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++)
87
+        for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++)
88
         {
89
             rand_srcStride = rand() % 100;
90
             rand_dstStride = rand() % 100 + 64;
91
@@ -436,9 +436,9 @@
92
     {
93
         int index = i % TEST_CASES;
94
 
95
-        for (int coeffIdxX = 0; coeffIdxX < 4; coeffIdxX++)
96
+        for (int coeffIdxX = 1; coeffIdxX < 4; coeffIdxX++)
97
         {
98
-            for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++)
99
+            for (int coeffIdxY = 1; coeffIdxY < 4; coeffIdxY++)
100
             {
101
                 rand_srcStride = rand() % 100;
102
                 rand_dstStride = rand() % 100 + 64;
103
x265_3.6.tar.gz/source/test/mbdstharness.cpp -> x265_4.0.tar.gz/source/test/mbdstharness.cpp Changed
18
 
1
@@ -260,8 +260,14 @@
2
         uint32_t optReturnValue = 0;
3
         uint32_t refReturnValue = 0;
4
 
5
-        int bits = rand() % 32;
6
-        int valueToAdd = rand() % (1 << bits);
7
+        int log2TrSize = rand() % 4 + 2;
8
+        const int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
9
+        const int per = qp / 6;
10
+        const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
11
+
12
+        /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
13
+        int bits = QUANT_SHIFT + per + transformShift;
14
+        int valueToAdd = (1 << (bits - 1));
15
         int cmp_size = sizeof(short) * height * width;
16
         int numCoeff = height * width;
17
 
18
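The replacement above derives the rounding offset the same way the non-RDOQ quantizer does instead of drawing a random bit count. As a worked example, assuming the usual HEVC-style constants QUANT_SHIFT = 14 and MAX_TR_DYNAMIC_RANGE = 15 (assumptions, not shown in this hunk): in an 8-bit build with log2TrSize = 3 and qp = 30, per = 5 and transformShift = 15 - 8 - 3 = 4, so bits = 14 + 5 + 4 = 23 and valueToAdd = 1 << 22.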
x265_3.6.tar.gz/source/test/pixelharness.cpp -> x265_4.0.tar.gz/source/test/pixelharness.cpp Changed
33
 
1
@@ -1373,8 +1373,7 @@
2
         ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
3
         checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
4
 
5
-        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
6
-            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
7
+        if (   memcmp(stats_ref, stats_vec, sizeof(stats_ref))
8
             || memcmp(count_ref, count_vec, sizeof(count_ref)))
9
             return false;
10
 
11
@@ -1425,10 +1424,7 @@
12
         ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
13
         checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
14
 
15
-        // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
16
-        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
17
-            || memcmp(_upBufft_ref, _upBufft_vec, sizeof(_upBufft_ref))
18
-            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
19
+        if (   memcmp(stats_ref, stats_vec, sizeof(stats_ref))
20
             || memcmp(count_ref, count_vec, sizeof(count_ref)))
21
             return false;
22
 
23
@@ -1476,8 +1472,7 @@
24
         ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
25
         checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
26
 
27
-        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
28
-            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
29
+        if (   memcmp(stats_ref, stats_vec, sizeof(stats_ref))
30
             || memcmp(count_ref, count_vec, sizeof(count_ref)))
31
             return false;
32
 
33
x265_3.6.tar.gz/source/test/testbench.cpp -> x265_4.0.tar.gz/source/test/testbench.cpp Changed
54
 
1
@@ -159,10 +159,11 @@
2
 
3
     struct test_arch_t
4
     {
5
-        char name[12];
6
+        char name[13];
7
         int flag;
8
     } test_arch[] =
9
     {
10
+#if X265_ARCH_X86
11
         { "SSE2", X265_CPU_SSE2 },
12
         { "SSE3", X265_CPU_SSE3 },
13
         { "SSSE3", X265_CPU_SSSE3 },
14
@@ -172,11 +173,15 @@
15
         { "AVX2", X265_CPU_AVX2 },
16
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
17
         { "AVX512", X265_CPU_AVX512 },
18
+#else
19
         { "ARMv6", X265_CPU_ARMV6 },
20
         { "NEON", X265_CPU_NEON },
21
         { "SVE2", X265_CPU_SVE2 },
22
         { "SVE", X265_CPU_SVE },
23
+        { "Neon_DotProd", X265_CPU_NEON_DOTPROD },
24
+        { "Neon_I8MM", X265_CPU_NEON_I8MM },
25
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
26
+#endif
27
         { "", 0 },
28
     };
29
 
30
@@ -190,10 +195,10 @@
31
         else
32
             continue;
33
 
34
-#if X265_ARCH_X86
35
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
36
         EncoderPrimitives vecprim;
37
         memset(&vecprim, 0, sizeof(vecprim));
38
-        setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
39
+        setupIntrinsicPrimitives(vecprim, test_arch[i].flag);
40
         setupAliasPrimitives(vecprim);
41
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
42
         {
43
@@ -231,8 +236,8 @@
44
 
45
     EncoderPrimitives optprim;
46
     memset(&optprim, 0, sizeof(optprim));
47
-#if X265_ARCH_X86
48
-    setupInstrinsicPrimitives(optprim, cpuid);
49
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
50
+    setupIntrinsicPrimitives(optprim, cpuid);
51
 #endif
52
 
53
     setupAssemblyPrimitives(optprim, cpuid);
54
x265_3.6.tar.gz/source/test/testharness.h -> x265_4.0.tar.gz/source/test/testharness.h Changed
9
 
1
@@ -88,6 +88,7 @@
2
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
3
     a = clock();
4
 #elif  X265_ARCH_ARM64
5
+    asm volatile("isb" : : : "memory");
6
     asm volatile("mrs %0, cntvct_el0" : "=r"(a));
7
 #endif
8
     return a;
9
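The isb added before the counter read acts as an instruction barrier, so cntvct_el0 is not sampled ahead of the code being timed. A self-contained sketch of the same idiom (hypothetical helper name, assuming an AArch64 target and a GCC/Clang-style compiler):

    #include <cstdint>

    // Read the AArch64 virtual counter after an instruction barrier so the
    // sample is ordered against the instructions that precede it.
    static inline uint64_t read_virtual_counter()
    {
    #if defined(__aarch64__)
        uint64_t cnt;
        asm volatile("isb" ::: "memory");
        asm volatile("mrs %0, cntvct_el0" : "=r"(cnt));
        return cnt;
    #else
        return 0; // fallback for other architectures
    #endif
    }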
x265_3.6.tar.gz/source/x265.h -> x265_4.0.tar.gz/source/x265.h Changed
187
 
1
@@ -371,6 +371,11 @@
2
     MASTERING_DISPLAY_INFO               = 137,
3
     CONTENT_LIGHT_LEVEL_INFO             = 144,
4
     ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
5
+    ALPHA_CHANNEL_INFO                   = 165,
6
+    THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO = 176,
7
+    MULTIVIEW_SCENE_INFO                 = 178,
8
+    MULTIVIEW_ACQUISITION_INFO           = 179,
9
+    MULTIVIEW_VIEW_POSITION              = 180
10
 } SEIPayloadType;
11
 
12
 typedef struct x265_sei_payload
13
@@ -410,10 +415,10 @@
14
 
15
     /* Must be specified on input pictures, the number of planes is determined
16
      * by the colorSpace value */
17
-    void*   planes[3];
18
+    void*   planes[4];
19
 
20
     /* Stride is the number of bytes between row starts */
21
-    int     stride[3];
22
+    int     stride[4];
23
 
24
     /* Must be specified on input pictures. x265_picture_init() will set it to
25
      * the encoder's internal bit depth, but this field must describe the depth
26
@@ -487,6 +492,9 @@
27
     uint32_t picStruct;
28
 
29
     int    width;
30
+
31
+    int   layerID;
32
+    int    format;
33
 } x265_picture;
34
 
35
 typedef enum
36
@@ -536,11 +544,13 @@
37
 #define X265_CPU_SLOW_PALIGNR    (1 << 25)  /* such as on the AMD Bobcat */
38
 
39
 /* ARM */
40
-#define X265_CPU_ARMV6           0x0000001
41
-#define X265_CPU_NEON            0x0000002  /* ARM NEON */
42
-#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
43
-#define X265_CPU_SVE             0x0000010  /* ARM SVE2 */
44
-#define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
45
+#define X265_CPU_ARMV6           (1 << 0)
46
+#define X265_CPU_NEON            (1 << 1)   /* ARM NEON */
47
+#define X265_CPU_FAST_NEON_MRC   (1 << 2)   /* Transfer from NEON to ARM register is fast (Cortex-A9) */
48
+#define X265_CPU_SVE2            (1 << 3)   /* AArch64 SVE2 */
49
+#define X265_CPU_SVE             (1 << 4)   /* AArch64 SVE2 */
50
+#define X265_CPU_NEON_DOTPROD    (1 << 5)   /* AArch64 Neon DotProd */
51
+#define X265_CPU_NEON_I8MM       (1 << 6)   /* AArch64 Neon I8MM */
52
 
53
 /* IBM Power8 */
54
 #define X265_CPU_ALTIVEC         0x0000001
55
@@ -623,13 +633,49 @@
56
 #define X265_MAX_GOP_LENGTH 16
57
 #define MAX_T_LAYERS 7
58
 
59
+#if ENABLE_MULTIVIEW
60
+#define MAX_VIEWS 2
61
+#define MAX_VPS_NUM_SCALABILITY_TYPES     16
62
+#define MAX_VPS_LAYER_ID_PLUS1            MAX_VIEWS
63
+#define MULTIVIEW_SCALABILITY_IDX         1
64
+#else
65
+#define MAX_VIEWS 1
66
+#endif
67
+
68
+#if ENABLE_ALPHA
69
+#define MAX_SCALABLE_LAYERS     2
70
+#define MAX_VPS_NUM_SCALABILITY_TYPES     16
71
+#define MAX_VPS_LAYER_ID_PLUS1            MAX_SCALABLE_LAYERS
72
+#else
73
+#define MAX_SCALABLE_LAYERS     1
74
+#endif
75
+
76
+#if ENABLE_ALPHA || ENABLE_MULTIVIEW
77
+#define MAX_LAYERS              2
78
+#else
79
+#define MAX_LAYERS              1
80
+#endif
81
+
82
+#if ENABLE_SCC_EXT
83
+/* SCC Extension Options */
84
+#define SCC_EXT_IDX               3
85
+#define NUM_EXTENSION_FLAGS       8
86
+#define SCM_S0067_NUM_CANDIDATES  64
87
+#define CHROMA_REFINEMENT_CANDIDATES  8
88
+#define SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU  2 ///< Do full horizontal/vertical search for Nx2N
89
+#define SCM_S0067_MAX_CAND_SIZE  32 ///< 32 or 64, 16 by default
90
+#define NUM_RECON_VERSION          2
91
+#else
92
+#define NUM_RECON_VERSION          1
93
+#endif
94
+
95
 #define X265_IPRATIO_STRENGTH   1.43
96
 
97
 typedef struct x265_cli_csp
98
 {
99
     int planes;
100
-    int width[3];
101
-    int height[3];
102
+    int width[4];
103
+    int height[4];
104
 } x265_cli_csp;
105
 
106
 static const x265_cli_csp x265_cli_csps[] =
107
@@ -754,10 +800,9 @@
108
     char *pool;
109
     int thread;
110
     int subsample;
111
-    int enable_conf_interval;
112
 }x265_vmaf_commondata;
113
 
114
-static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
115
+static x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.json", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1} };
116
 
117
 typedef struct x265_temporal_layer {
118
     int poc_offset;      /* POC offset */
119
@@ -2268,6 +2313,20 @@
120
 
121
     /*SBRC*/
122
     int      bEnableSBRC;
123
+    int mcstfFrameRange;
124
+
125
+    /*Alpha channel encoding*/
126
+    int      bEnableAlpha;
127
+    int      numScalableLayers;
128
+
129
+    /*Multi View Encoding*/
130
+    int      numViews;
131
+    int      format;
132
+
133
+    int      numLayers;
134
+
135
+    /*Screen Content Coding*/
136
+    int     bEnableSCC;
137
 } x265_param;
138
 
139
 /* x265_param_alloc:
140
@@ -2320,6 +2379,10 @@
141
     "main444-12", "main444-12-intra",
142
 
143
     "main444-16-intra", "main444-16-stillpicture", /* Not Supported! */
144
+
145
+#if ENABLE_SCC_EXT
146
+    "main-scc", "main10-scc", "main444-scc", "main444-10-scc", /* Screen content coding */
147
+#endif
148
     0
149
 };
150
 
151
@@ -2430,7 +2493,7 @@
152
  *      the payloads of all output NALs are guaranteed to be sequential in memory.
153
  *      To flush the encoder and retrieve delayed output pictures, pass pic_in as NULL.
154
  *      Once flushing has begun, all subsequent calls must pass pic_in as NULL. */
155
-int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out);
156
+int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out);
157
 
158
 /* x265_encoder_reconfig:
159
  *      various parameters from x265_param are copied.
160
@@ -2537,7 +2600,7 @@
161
 
162
 /* x265_calculate_vmaf_framelevelscore:
163
  *    returns VMAF score for each frame in a given input video. */
164
-double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
165
+double x265_calculate_vmaf_framelevelscore(x265_param*, x265_vmaf_framedata*);
166
 /* x265_vmaf_encoder_log:
167
  *       write a line to the configured CSV file.  If a CSV filename was not
168
  *       configured, or file open failed, this function will perform no write.
169
@@ -2584,7 +2647,7 @@
170
     int           (*encoder_reconfig)(x265_encoder*, x265_param*);
171
     int           (*encoder_reconfig_zone)(x265_encoder*, x265_zone*);
172
     int           (*encoder_headers)(x265_encoder*, x265_nal**, uint32_t*);
173
-    int           (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture*);
174
+    int           (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture**);
175
     void          (*encoder_get_stats)(x265_encoder*, x265_stats*, uint32_t);
176
     void          (*encoder_log)(x265_encoder*, int, char**);
177
     void          (*encoder_close)(x265_encoder*);
178
@@ -2602,7 +2665,7 @@
179
     int           (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
180
 #if ENABLE_LIBVMAF
181
     double        (*calculate_vmafscore)(x265_param *, x265_vmaf_data *);
182
-    double        (*calculate_vmaf_framelevelscore)(x265_vmaf_framedata *);
183
+    double        (*calculate_vmaf_framelevelscore)(x265_param *, x265_vmaf_framedata *);
184
     void          (*vmaf_encoder_log)(x265_encoder*, int, char**, x265_param *, x265_vmaf_data *);
185
 #endif
186
     int           (*zone_param_parse)(x265_param*, const char*, const char*);
187
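The x265_encoder_encode() change means the reconstructed-picture argument is now passed as x265_picture** (one slot per output layer). A minimal single-layer calling sketch, assuming enc and param were created elsewhere and ignoring error handling (all names here are illustrative):

    x265_picture in_pic, recon_pic;
    x265_picture* recon[1] = { &recon_pic };   /* one entry per layer; a single layer is assumed here */
    x265_nal* nals = NULL;
    uint32_t nal_count = 0;

    x265_picture_init(param, &in_pic);
    x265_picture_init(param, &recon_pic);
    /* ... fill in_pic.planes[] / in_pic.stride[] with the source frame ... */
    int frames_out = x265_encoder_encode(enc, &nals, &nal_count, &in_pic, recon);
    /* flush: keep passing pic_in == NULL until no more output is returned */
    while (x265_encoder_encode(enc, &nals, &nal_count, NULL, recon) > 0)
        ;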
x265_3.6.tar.gz/source/x265cli.cpp -> x265_4.0.tar.gz/source/x265cli.cpp Changed
201
 
1
@@ -374,6 +374,17 @@
2
         H0("   --no-frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
3
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
4
         H0("   --no-mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
5
+#if ENABLE_ALPHA
6
+        H0("   --alpha                       Enable alpha channel support. Default %d\n", param->bEnableAlpha);
7
+#endif
8
+#if ENABLE_MULTIVIEW
9
+        H0("   --num-views                   Number of Views for Multiview Encoding. Default %d\n", param->numViews);
10
+        H0("   --format                      Format of the input video 0 : normal, 1 : side-by-side, 2 : over-under  Default %d\n", param->format);
11
+        H0("   --multiview-config            Configuration file for Multiview Encoding\n");
12
+#endif
13
+#if ENABLE_SCC_EXT
14
+        H0("   --scc <integer>               Enable screen content coding. 0: Diabled, 1:Intrablockcopy fast search with 1x2 CTUs search range, 2: Intrablockcopy Full search. Default %d\n", param->bEnableSCC);
15
+#endif
16
 #ifdef SVT_HEVC
17
         H0("   --nosvt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
18
         H0("   --no-svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
19
@@ -416,12 +427,18 @@
20
             free(argString);
21
         }
22
 
23
-        if (input)
24
-            input->release();
25
-        input = NULL;
26
-        if (recon)
27
-            recon->release();
28
-        recon = NULL;
29
+        for (int i = 0; i < MAX_VIEWS; i++)
30
+        {
31
+            if (input[i])
32
+                input[i]->release();
33
+            input[i] = NULL;
34
+        }
35
+        for (int i = 0; i < MAX_LAYERS; i++)
36
+        {
37
+            if (recon[i])
38
+                recon[i]->release();
39
+            recon[i] = NULL;
40
+        }
41
         if (qpfile)
42
             fclose(qpfile);
43
         qpfile = NULL;
44
@@ -577,8 +594,12 @@
45
         int inputBitDepth = 8;
46
         int outputBitDepth = 0;
47
         int reconFileBitDepth = 0;
48
-        const char *inputfn = NULL;
49
-        const char *reconfn = NULL;
50
+        char* inputfn[MAX_VIEWS] = { NULL };
51
+        for (int view = 0; view < MAX_VIEWS; view++)
52
+        {
53
+            inputfn[view] = X265_MALLOC(char, sizeof(char) * 1024);
54
+        }
55
+        const char* reconfn[MAX_LAYERS] = { NULL };
56
         const char *outputfn = NULL;
57
         const char *preset = NULL;
58
         const char *tune = NULL;
59
@@ -717,8 +738,8 @@
60
                 OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
61
                 OPT("no-progress") this->bProgress = false;
62
                 OPT("output") outputfn = optarg;
63
-                OPT("input") inputfn = optarg;
64
-                OPT("recon") reconfn = optarg;
65
+                OPT("input") strcpy(inputfn[0], optarg);
66
+                OPT("recon") reconfn[0] = optarg;
67
                 OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError);
68
                 OPT("dither") this->bDither = true;
69
                 OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
70
@@ -750,6 +771,14 @@
71
                     if (!this->scenecutAwareQpConfig)
72
                         x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
73
                 }
74
+#if ENABLE_MULTIVIEW
75
+                OPT("multiview-config")
76
+                {
77
+                    this->multiViewConfig = x265_fopen(optarg, "rb");
78
+                    if (!this->multiViewConfig)
79
+                        x265_log_file(param, X265_LOG_ERROR, "%s Multiview config file not found or error in opening config file\n", optarg);
80
+                }
81
+#endif
82
                 OPT("zonefile")
83
                 {
84
                     this->zoneFile = x265_fopen(optarg, "rb");
85
@@ -776,8 +805,10 @@
86
             }
87
         }
88
 
89
-        if (optind < argc && !inputfn)
90
-            inputfn = argv[optind++];
91
+#if !ENABLE_MULTIVIEW
92
+        if (optind < argc && !inputfn[0])
93
+            inputfn[0] = argv[optind++];
94
+#endif
95
         if (optind < argc && !outputfn)
96
             outputfn = argv[optind++];
97
         if (optind < argc)
98
@@ -793,9 +824,29 @@
99
             showHelp(param);
100
         }
101
 
102
-        if (!inputfn || !outputfn)
103
+#if ENABLE_MULTIVIEW
104
+        if (this->multiViewConfig)
105
+        {
106
+            if (!this->parseMultiViewConfig(inputfn))
107
+            {
108
+                x265_log(NULL, X265_LOG_ERROR, "Unable to parse multiview config file \n");
109
+                fclose(this->multiViewConfig);
110
+                this->multiViewConfig = NULL;
111
+            }
112
+        }
113
+#endif
114
+        param->numLayers = param->numViews > 1 ? param->numViews : (param->numScalableLayers > 1) ? param->numScalableLayers : 1;
115
+        if (!outputfn)
116
         {
117
             x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
118
+            for (int view = 0; view < param->numViews; view++)
119
+            {
120
+                if (!inputfn[view])
121
+                {
122
+                    x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
123
+                    return true;
124
+                }
125
+            }
126
             return true;
127
         }
128
 
129
@@ -816,51 +867,53 @@
130
             svtParam->encoderBitDepth = inputBitDepth;
131
         }
132
 #endif
133
-
134
-        InputFileInfo info;
135
-        info.filename = inputfn;
136
-        info.depth = inputBitDepth;
137
-        info.csp = param->internalCsp;
138
-        info.width = param->sourceWidth;
139
-        info.height = param->sourceHeight;
140
-        info.fpsNum = param->fpsNum;
141
-        info.fpsDenom = param->fpsDenom;
142
-        info.sarWidth = param->vui.sarWidth;
143
-        info.sarHeight = param->vui.sarHeight;
144
-        info.skipFrames = seek;
145
-        info.frameCount = 0;
146
-        getParamAspectRatio(param, info.sarWidth, info.sarHeight);
147
-
148
-
149
-        this->input = InputFile::open(info, this->bForceY4m);
150
-        if (!this->input || this->input->isFail())
151
+        InputFileInfo info[MAX_VIEWS];
152
+        for (int i = 0; i < param->numViews - !!param->format; i++)
153
         {
154
-            x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
155
-            return true;
156
-        }
157
+            info[i].filename = inputfn[i];
158
+            info[i].depth = inputBitDepth;
159
+            info[i].csp = param->internalCsp;
160
+            info[i].width = param->sourceWidth;
161
+            info[i].height = param->sourceHeight;
162
+            info[i].fpsNum = param->fpsNum;
163
+            info[i].fpsDenom = param->fpsDenom;
164
+            info[i].sarWidth = param->vui.sarWidth;
165
+            info[i].sarHeight = param->vui.sarHeight;
166
+            info[i].skipFrames = seek;
167
+            info[i].frameCount = 0;
168
+            getParamAspectRatio(param, info[i].sarWidth, info[i].sarHeight);
169
+
170
+            this->input[i] = InputFile::open(info[i], this->bForceY4m, param->numScalableLayers > 1, param->format);
171
+            if (!this->input[i] || this->input[i]->isFail())
172
+            {
173
+                x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn[i]);
174
+                return true;
175
+            }
176
 
177
-        if (info.depth < 8 || info.depth > 16)
178
-        {
179
-            x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
180
-            return true;
181
+            if (info[i].depth < 8 || info[i].depth > 16)
182
+            {
183
+                x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
184
+                return true;
185
+            }
186
         }
187
 
188
+            //TODO:Validate info params of both the views to equal values
189
         /* Unconditionally accept height/width/csp/bitDepth from file info */
190
-        param->sourceWidth = info.width;
191
-        param->sourceHeight = info.height;
192
-        param->internalCsp = info.csp;
193
-        param->sourceBitDepth = info.depth;
194
+            param->sourceWidth = info[0].width;
195
+            param->sourceHeight = info[0].height;
196
+            param->internalCsp = info[0].csp;
197
+            param->sourceBitDepth = info[0].depth;
198
 
199
         /* Accept fps and sar from file info if not specified by user */
200
         if (param->fpsDenom == 0 || param->fpsNum == 0)
201
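Putting the new command-line switches from this hunk together, some illustrative invocations (file names are placeholders, and each option is only available when the matching ENABLE_ALPHA / ENABLE_MULTIVIEW / ENABLE_SCC_EXT build flag is set):

    x265 --input rgba_source.y4m --alpha --output out.hevc
    x265 --num-views 2 --format 1 --multiview-config multiview_config.txt --output out.hevc
    x265 --input screen_capture.y4m --scc 1 --output out.hevc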
x265_3.6.tar.gz/source/x265cli.h -> x265_4.0.tar.gz/source/x265cli.h Changed
69
 
1
@@ -358,6 +358,17 @@
2
     { "dup-threshold", required_argument, NULL, 0 },
3
     { "mcstf",                 no_argument, NULL, 0 },
4
     { "no-mcstf",              no_argument, NULL, 0 },
5
+#if ENABLE_ALPHA
6
+    { "alpha",                 no_argument, NULL, 0 },
7
+#endif
8
+#if ENABLE_MULTIVIEW
9
+    { "num-views", required_argument, NULL, 0 },
10
+    { "multiview-config", required_argument, NULL, 0 },
11
+    { "format", required_argument, NULL, 0 },
12
+#endif
13
+#if ENABLE_SCC_EXT
14
+    { "scc",        required_argument, NULL, 0 },
15
+#endif
16
 #ifdef SVT_HEVC
17
     { "svt",     no_argument, NULL, 0 },
18
     { "no-svt",  no_argument, NULL, 0 },
19
@@ -393,13 +404,16 @@
20
 
21
     struct CLIOptions
22
     {
23
-        InputFile* input;
24
-        ReconFile* recon;
25
+        InputFile* input[MAX_VIEWS];
26
+        ReconFile* recon[MAX_LAYERS];
27
         OutputFile* output;
28
         FILE*       qpfile;
29
         FILE*       zoneFile;
30
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
31
         FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
32
+#if ENABLE_MULTIVIEW
33
+        FILE* multiViewConfig; /* File containing multi-view related CLI options */
34
+#endif
35
         const char* reconPlayCmd;
36
         const x265_api* api;
37
         x265_param* param;
38
@@ -431,13 +445,18 @@
39
         static const int UPDATE_INTERVAL = 250000;
40
         CLIOptions()
41
         {
42
-            input = NULL;
43
-            recon = NULL;
44
+            for (int i = 0; i < MAX_VIEWS; i++)
45
+                input[i] = NULL;
46
+            for (int i = 0; i < MAX_LAYERS; i++)
47
+                recon[i] = NULL;
48
             output = NULL;
49
             qpfile = NULL;
50
             zoneFile = NULL;
51
             dolbyVisionRpu = NULL;
52
             scenecutAwareQpConfig = NULL;
53
+#if ENABLE_MULTIVIEW
54
+            multiViewConfig = NULL;
55
+#endif
56
             reconPlayCmd = NULL;
57
             api = NULL;
58
             param = NULL;
59
@@ -470,6 +489,9 @@
60
         int rpuParser(x265_picture * pic);
61
         bool parseScenecutAwareQpConfig();
62
         bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
63
+#if ENABLE_MULTIVIEW
64
+        bool parseMultiViewConfig(char** fn);
65
+#endif
66
     };
67
 #ifdef __cplusplus
68
 }
69
x265_3.6.tar.gz/x265Version.txt -> x265_4.0.tar.gz/x265Version.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 #Attribute:         Values
3
-repositorychangeset: aa7f602f7
4
+repositorychangeset: 6318f22
5
 releasetagdistance: 1
6
-releasetag: 3.6
7
+releasetag: 4.0
8