Changes of Revision 42

x265.changes Changed
x
 
1
@@ -1,4 +1,53 @@
2
 -------------------------------------------------------------------
3
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
4
+
5
+- Update to version 3.6
6
+  New features:
7
+  * Segment based Ratecontrol (SBRC) feature
8
+  * Motion-Compensated Spatio-Temporal Filtering
9
+  * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
10
+    Quantization)
11
+  * Histogram-Based Scene Change Detection
12
+  * Film-Grain characteristics as a SEI message to support Film
13
+    Grain Synthesis(FGS)
14
+  * Add temporal layer implementation(Hierarchical B-frame
15
+    implementation)
16
+  Enhancements to existing features:
17
+  * Added Dolby Vision 8.4 Profile Support
18
+  API changes:
19
+  * Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
20
+  * Add command line parameter for mcstf feature: "--no-mcstf".
21
+  * Add command line parameters for the scene cut aware qp
22
+    feature: "--scenecut-aware-qp" and "--masking-strength".
23
+  * Add command line parameters for Histogram-Based Scene Change
24
+    Detection: "--hist-scenecut".
25
+  * Add film grain characteristics as a SEI message to the
26
+    bitstream: "--film-grain <filename>"
27
+  * cli: add new option --cra-nal (Force nal type to CRA to all
28
+    frames except for the first frame, works only with keyint 1)
29
+  Optimizations:
30
+  * ARM64 NEON optimizations:- Several time-consuming C
31
+    functions have been optimized for the targeted platform -
32
+    aarch64. The overall performance increased by around 20%.
33
+  * SVE/SVE2 optimizations
34
+  Bug fixes:
35
+  * Linux bug to utilize all the cores
36
+  * Crash with hist-scenecut build when source resolution is not
37
+    multiple of minCuSize
38
+  * 32bit and 64bit builds generation for ARM
39
+  * bugs in zonefile feature (Reflect Zonefile Parameters inside
40
+    Lookahead, extra IDR issue, Avg I Slice QP value issue etc.)
41
+  * Add x86 ASM implementation for subsampling luma
42
+  * Fix for abrladder segfault with load reuse level 1
43
+  * Reorder miniGOP based on temporal layer hierarchy and add
44
+    support for more B frames
45
+  * Add MacOS aarch64 build support
46
+  * Fix boundary condition issue for Gaussian filter
47
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
48
+  and 0004-Do-not-build-with-assembly-support-on-arm.patch
49
+  (courtesy of Debian)
50
+
51
+-------------------------------------------------------------------
52
 Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
53
 
54
 - Build libx265_main10 and libx265_main12 unconditionally and
55
x265.spec Changed
46
 
1
@@ -1,7 +1,7 @@
2
 #
3
 # spec file for package x265
4
 #
5
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
6
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
7
 # Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
8
 #
9
 # All modifications and additions to the file contributed by third parties
10
@@ -17,21 +17,22 @@
11
 #
12
 
13
 
14
-%define sover   199
15
+%define sover   209
16
 %define libname lib%{name}
17
 %define libsoname %{libname}-%{sover}
18
-%define uver    3_5
19
+%define uver    3_6
20
 Name:           x265
21
-Version:        3.5
22
+Version:        3.6
23
 Release:        0
24
 Summary:        A free h265/HEVC encoder - encoder binary
25
 License:        GPL-2.0-or-later
26
 Group:          Productivity/Multimedia/Video/Editors and Convertors
27
 URL:            https://bitbucket.org/multicoreware/x265_git
28
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
29
-Patch0:         arm.patch
30
 Patch1:         x265.pkgconfig.patch
31
 Patch2:         x265-fix_enable512.patch
32
+Patch3:         0001-Fix-arm-flags.patch
33
+Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
34
 BuildRequires:  cmake >= 2.8.8
35
 BuildRequires:  gcc-c++
36
 BuildRequires:  nasm >= 2.13
37
@@ -130,6 +131,8 @@
38
 %cmake_install
39
 find %{buildroot} -type f -name "*.a" -delete -print0
40
 
41
+%check
42
+
43
 %post -n %{libsoname} -p /sbin/ldconfig
44
 %postun -n %{libsoname} -p /sbin/ldconfig
45
 
46
0001-Fix-arm-flags.patch Added
41
 
1
@@ -0,0 +1,39 @@
2
+From: Sebastian Ramacher <sramacher@debian.org>
3
+Date: Sun, 21 Jun 2020 17:54:56 +0200
4
+Subject: Fix arm* flags
5
+
6
+---
7
+ source/CMakeLists.txt | 7 ++-----
8
+ 1 file changed, 2 insertions(+), 5 deletions(-)
9
+
10
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
11
+index ab5ddfe..eb9b19b 100755
12
+--- a/source/CMakeLists.txt
13
++++ b/source/CMakeLists.txt
14
+@@ -253,10 +253,7 @@ if(GCC)
15
+     elseif(ARM)
16
+         find_package(Neon)
17
+         if(CPU_HAS_NEON)
18
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
19
+             add_definitions(-DHAVE_NEON)
20
+-        else()
21
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
22
+         endif()
23
+     endif()
24
+   if(ARM64 OR CROSS_COMPILE_ARM64)
25
+@@ -265,13 +262,13 @@ if(GCC)
26
+         find_package(SVE2)
27
+         if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
28
+             message(STATUS "Found SVE2")
29
+-          set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
30
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
31
+             add_definitions(-DHAVE_SVE2)
32
+             add_definitions(-DHAVE_SVE)
33
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
34
+         elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
35
+             message(STATUS "Found SVE")
36
+-          set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
37
++          set(ARM_ARGS -fPIC -flax-vector-conversions)
38
+             add_definitions(-DHAVE_SVE)
39
+             add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
40
+         elseif(CPU_HAS_NEON)
41
0004-Do-not-build-with-assembly-support-on-arm.patch Added
30
 
1
@@ -0,0 +1,28 @@
2
+From: Sebastian Ramacher <sramacher@debian.org>
3
+Date: Fri, 31 May 2024 23:38:23 +0200
4
+Subject: Do not build with assembly support on arm*
5
+
6
+---
7
+ source/CMakeLists.txt | 9 ---------
8
+ 1 file changed, 9 deletions(-)
9
+
10
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
11
+index 672cc2d..f112330 100755
12
+--- a/source/CMakeLists.txt
13
++++ b/source/CMakeLists.txt
14
+@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
15
+         add_definitions(-DPPC64=1)
16
+         message(STATUS "Detected POWER PPC64 target processor")
17
+     endif()
18
+-elseif(ARMMATCH GREATER "-1")
19
+-    if(CROSS_COMPILE_ARM)
20
+-        message(STATUS "Cross compiling for ARM arch")
21
+-    else()
22
+-        set(CROSS_COMPILE_ARM 0)
23
+-    endif()
24
+-  message(STATUS "Detected ARM target processor")
25
+-    set(ARM 1)
26
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
27
+ elseif(ARM64MATCH GREATER "-1")
28
+     #if(CROSS_COMPILE_ARM64)
29
+         #message(STATUS "Cross compiling for ARM64 arch")
30
arm.patch Deleted
110
 
1
@@ -1,108 +0,0 @@
2
-Index: x265_3.4/source/CMakeLists.txt
3
-===================================================================
4
---- x265_3.4.orig/source/CMakeLists.txt
5
-+++ x265_3.4/source/CMakeLists.txt
6
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
7
-         add_definitions(-DPPC64=1)
8
-         message(STATUS "Detected POWER PPC64 target processor")
9
-     endif()
10
--elseif(ARMMATCH GREATER "-1")
11
--    if(CROSS_COMPILE_ARM)
12
--        message(STATUS "Cross compiling for ARM arch")
13
--    else()
14
--        set(CROSS_COMPILE_ARM 0)
15
--    endif()
16
--    set(ARM 1)
17
--    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
--        message(STATUS "Detected ARM64 target processor")
19
--        set(ARM64 1)
20
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
21
--    else()
22
--        message(STATUS "Detected ARM target processor")
23
--        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
24
--    endif()
25
-+elseif(${SYSPROC} MATCHES "armv5.*")
26
-+    message(STATUS "Detected ARMV5 system processor")
27
-+    set(ARMV5 1)
28
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
29
-+elseif(${SYSPROC} STREQUAL "armv6l")
30
-+    message(STATUS "Detected ARMV6 system processor")
31
-+    set(ARMV6 1)
32
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
33
-+elseif(${SYSPROC} STREQUAL "armv7l")
34
-+    message(STATUS "Detected ARMV7 system processor")
35
-+    set(ARMV7 1)
36
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
37
-+elseif(${SYSPROC} STREQUAL "aarch64")
38
-+    message(STATUS "Detected AArch64 system processor")
39
-+    set(ARMV7 1)
40
-+    add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
41
- else()
42
-     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
43
-     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
44
- endif()
45
--
46
- if(UNIX)
47
-     list(APPEND PLATFORM_LIBS pthread)
48
-     find_library(LIBRT rt)
49
-@@ -238,28 +238,9 @@ if(GCC)
50
-             endif()
51
-         endif()
52
-     endif()
53
--    if(ARM AND CROSS_COMPILE_ARM)
54
--        if(ARM64)
55
--            set(ARM_ARGS -fPIC)
56
--        else()
57
--            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
58
--        endif()
59
--        message(STATUS "cross compile arm")
60
--    elseif(ARM)
61
--        if(ARM64)
62
--            set(ARM_ARGS -fPIC)
63
--            add_definitions(-DHAVE_NEON)
64
--        else()
65
--            find_package(Neon)
66
--            if(CPU_HAS_NEON)
67
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
68
--                add_definitions(-DHAVE_NEON)
69
--            else()
70
--                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
71
--            endif()
72
--        endif()
73
-+    if(ARMV7)
74
-+        add_definitions(-fPIC)
75
-     endif()
76
--    add_definitions(${ARM_ARGS})
77
-     if(FPROFILE_GENERATE)
78
-         if(INTEL_CXX)
79
-             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
80
-Index: x265_3.4/source/common/cpu.cpp
81
-===================================================================
82
---- x265_3.4.orig/source/common/cpu.cpp
83
-+++ x265_3.4/source/common/cpu.cpp
84
-@@ -39,7 +39,7 @@
85
- #include <machine/cpu.h>
86
- #endif
87
- 
88
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
89
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
90
- #include <signal.h>
91
- #include <setjmp.h>
92
- static sigjmp_buf jmpbuf;
93
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
94
-     }
95
- 
96
-     canjump = 1;
97
--    PFX(cpu_neon_test)();
98
-     canjump = 0;
99
-     signal(SIGILL, oldsig);
100
- #endif // if !HAVE_NEON
101
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
102
-     // which may result in incorrect detection and the counters stuck enabled.
103
-     // right now Apple does not seem to support performance counters for this test
104
- #ifndef __MACH__
105
--    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
106
-+    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
107
- #endif
108
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
109
- #elif X265_ARCH_ARM64
110
baselibs.conf Changed
4
 
1
@@ -1,1 +1,1 @@
2
-libx265-199
3
+libx265-209
4
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.S Deleted
201
 
1
@@ -1,414 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm.S"
26
-
27
-.section .rodata
28
-
29
-.align 4
30
-
31
-.text
32
-
33
-
34
-
35
-.macro qpel_filter_0_32b
36
-    movi            v24.8h, #64
37
-    uxtl            v19.8h, v5.8b
38
-    smull           v17.4s, v19.4h, v24.4h
39
-    smull2          v18.4s, v19.8h, v24.8h
40
-.endm
41
-
42
-.macro qpel_filter_1_32b
43
-    movi            v16.8h, #58
44
-    uxtl            v19.8h, v5.8b
45
-    smull           v17.4s, v19.4h, v16.4h
46
-    smull2          v18.4s, v19.8h, v16.8h
47
-
48
-    movi            v24.8h, #10
49
-    uxtl            v21.8h, v1.8b
50
-    smull           v19.4s, v21.4h, v24.4h
51
-    smull2          v20.4s, v21.8h, v24.8h
52
-
53
-    movi            v16.8h, #17
54
-    uxtl            v23.8h, v2.8b
55
-    smull           v21.4s, v23.4h, v16.4h
56
-    smull2          v22.4s, v23.8h, v16.8h
57
-
58
-    movi            v24.8h, #5
59
-    uxtl            v1.8h, v6.8b
60
-    smull           v23.4s, v1.4h, v24.4h
61
-    smull2          v16.4s, v1.8h, v24.8h
62
-
63
-    sub             v17.4s, v17.4s, v19.4s
64
-    sub             v18.4s, v18.4s, v20.4s
65
-
66
-    uxtl            v1.8h, v4.8b
67
-    sshll           v19.4s, v1.4h, #2
68
-    sshll2          v20.4s, v1.8h, #2
69
-
70
-    add             v17.4s, v17.4s, v21.4s
71
-    add             v18.4s, v18.4s, v22.4s
72
-
73
-    uxtl            v1.8h, v0.8b
74
-    uxtl            v2.8h, v3.8b
75
-    ssubl           v21.4s, v2.4h, v1.4h
76
-    ssubl2          v22.4s, v2.8h, v1.8h
77
-
78
-    add             v17.4s, v17.4s, v19.4s
79
-    add             v18.4s, v18.4s, v20.4s
80
-    sub             v21.4s, v21.4s, v23.4s
81
-    sub             v22.4s, v22.4s, v16.4s
82
-    add             v17.4s, v17.4s, v21.4s
83
-    add             v18.4s, v18.4s, v22.4s
84
-.endm
85
-
86
-.macro qpel_filter_2_32b
87
-    movi            v16.4s, #11
88
-    uxtl            v19.8h, v5.8b
89
-    uxtl            v20.8h, v2.8b
90
-    saddl           v17.4s, v19.4h, v20.4h
91
-    saddl2          v18.4s, v19.8h, v20.8h
92
-
93
-    uxtl            v21.8h, v1.8b
94
-    uxtl            v22.8h, v6.8b
95
-    saddl           v19.4s, v21.4h, v22.4h
96
-    saddl2          v20.4s, v21.8h, v22.8h
97
-
98
-    mul             v19.4s, v19.4s, v16.4s
99
-    mul             v20.4s, v20.4s, v16.4s
100
-
101
-    movi            v16.4s, #40
102
-    mul             v17.4s, v17.4s, v16.4s
103
-    mul             v18.4s, v18.4s, v16.4s
104
-
105
-    uxtl            v21.8h, v4.8b
106
-    uxtl            v22.8h, v3.8b
107
-    saddl           v23.4s, v21.4h, v22.4h
108
-    saddl2          v16.4s, v21.8h, v22.8h
109
-
110
-    uxtl            v1.8h, v0.8b
111
-    uxtl            v2.8h, v7.8b
112
-    saddl           v21.4s, v1.4h, v2.4h
113
-    saddl2          v22.4s, v1.8h, v2.8h
114
-
115
-    shl             v23.4s, v23.4s, #2
116
-    shl             v16.4s, v16.4s, #2
117
-
118
-    add             v19.4s, v19.4s, v21.4s
119
-    add             v20.4s, v20.4s, v22.4s
120
-    add             v17.4s, v17.4s, v23.4s
121
-    add             v18.4s, v18.4s, v16.4s
122
-    sub             v17.4s, v17.4s, v19.4s
123
-    sub             v18.4s, v18.4s, v20.4s
124
-.endm
125
-
126
-.macro qpel_filter_3_32b
127
-    movi            v16.8h, #17
128
-    movi            v24.8h, #5
129
-
130
-    uxtl            v19.8h, v5.8b
131
-    smull           v17.4s, v19.4h, v16.4h
132
-    smull2          v18.4s, v19.8h, v16.8h
133
-
134
-    uxtl            v21.8h, v1.8b
135
-    smull           v19.4s, v21.4h, v24.4h
136
-    smull2          v20.4s, v21.8h, v24.8h
137
-
138
-    movi            v16.8h, #58
139
-    uxtl            v23.8h, v2.8b
140
-    smull           v21.4s, v23.4h, v16.4h
141
-    smull2          v22.4s, v23.8h, v16.8h
142
-
143
-    movi            v24.8h, #10
144
-    uxtl            v1.8h, v6.8b
145
-    smull           v23.4s, v1.4h, v24.4h
146
-    smull2          v16.4s, v1.8h, v24.8h
147
-
148
-    sub             v17.4s, v17.4s, v19.4s
149
-    sub             v18.4s, v18.4s, v20.4s
150
-
151
-    uxtl            v1.8h, v3.8b
152
-    sshll           v19.4s, v1.4h, #2
153
-    sshll2          v20.4s, v1.8h, #2
154
-
155
-    add             v17.4s, v17.4s, v21.4s
156
-    add             v18.4s, v18.4s, v22.4s
157
-
158
-    uxtl            v1.8h, v4.8b
159
-    uxtl            v2.8h, v7.8b
160
-    ssubl           v21.4s, v1.4h, v2.4h
161
-    ssubl2          v22.4s, v1.8h, v2.8h
162
-
163
-    add             v17.4s, v17.4s, v19.4s
164
-    add             v18.4s, v18.4s, v20.4s
165
-    sub             v21.4s, v21.4s, v23.4s
166
-    sub             v22.4s, v22.4s, v16.4s
167
-    add             v17.4s, v17.4s, v21.4s
168
-    add             v18.4s, v18.4s, v22.4s
169
-.endm
170
-
171
-
172
-
173
-
174
-.macro vextin8
175
-    ld1             {v3.16b}, x11, #16
176
-    mov             v7.d0, v3.d1
177
-    ext             v0.8b, v3.8b, v7.8b, #1
178
-    ext             v4.8b, v3.8b, v7.8b, #2
179
-    ext             v1.8b, v3.8b, v7.8b, #3
180
-    ext             v5.8b, v3.8b, v7.8b, #4
181
-    ext             v2.8b, v3.8b, v7.8b, #5
182
-    ext             v6.8b, v3.8b, v7.8b, #6
183
-    ext             v3.8b, v3.8b, v7.8b, #7
184
-.endm
185
-
186
-
187
-
188
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
189
-.macro HPS_FILTER a b filterhps
190
-    mov             w12, #8192
191
-    mov             w6, w10
192
-    sub             x3, x3, #\a
193
-    lsl             x3, x3, #1
194
-    mov             w9, #\a
195
-    cmp             w9, #4
196
-    b.eq            14f
197
-    cmp             w9, #12
198
-    b.eq            15f
199
-    b               7f
200
-14:
201
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.h Deleted
57
 
1
@@ -1,55 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_IPFILTER8_AARCH64_H
26
-#define X265_IPFILTER8_AARCH64_H
27
-
28
-
29
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
30
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
31
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
32
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
33
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
34
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
35
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
36
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
37
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
38
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
39
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
40
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
41
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
42
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
43
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
44
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
45
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
46
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
47
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
48
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
49
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
50
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
51
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
52
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
53
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
54
-
55
-
56
-#endif // ifndef X265_IPFILTER8_AARCH64_H
57
x265_3.5.tar.gz/source/common/aarch64/pixel-util.h Deleted
42
 
1
@@ -1,40 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *          Hongbin Liu <liuhongbin1@huawei.com>
7
- *
8
- * This program is free software; you can redistribute it and/or modify
9
- * it under the terms of the GNU General Public License as published by
10
- * the Free Software Foundation; either version 2 of the License, or
11
- * (at your option) any later version.
12
- *
13
- * This program is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- * GNU General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU General Public License
19
- * along with this program; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
- *
22
- * This program is also available under a commercial proprietary license.
23
- * For more information, contact us at license @ x265.com.
24
- *****************************************************************************/
25
-
26
-#ifndef X265_PIXEL_UTIL_AARCH64_H
27
-#define X265_PIXEL_UTIL_AARCH64_H
28
-
29
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
30
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
31
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
32
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
33
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
34
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
35
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
36
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
37
-
38
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
39
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
40
-
41
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
42
x265_3.5.tar.gz/source/common/aarch64/pixel.h Deleted
107
 
1
@@ -1,105 +0,0 @@
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_I386_PIXEL_AARCH64_H
26
-#define X265_I386_PIXEL_AARCH64_H
27
-
28
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
29
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
30
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
31
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
32
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
33
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
34
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
35
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
36
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
37
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
38
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
39
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
40
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
41
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
42
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
43
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
44
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
45
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
46
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
47
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
48
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
49
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
50
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
51
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
52
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
53
-
54
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
55
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
56
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
57
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
58
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
59
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
60
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
61
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
62
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
63
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
64
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
65
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
66
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
67
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
68
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
69
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
70
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
71
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
72
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
73
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
74
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
75
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
76
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
77
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
78
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
79
-
80
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
81
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
82
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
83
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
84
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
85
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
86
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
87
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
88
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
89
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
90
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
91
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
92
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
93
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
94
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
95
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
96
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
97
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
98
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
99
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
100
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
101
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
102
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
103
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
104
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
105
-
106
-#endif // ifndef X265_I386_PIXEL_AARCH64_H
107
x265_3.6.tar.gz/.gitignore Added
38
 
1
@@ -0,0 +1,36 @@
2
+# Prerequisites
3
+*.d
4
+
5
+# Compiled Object files
6
+*.slo
7
+*.lo
8
+*.o
9
+*.obj
10
+
11
+# Precompiled Headers
12
+*.gch
13
+*.pch
14
+
15
+# Compiled Dynamic libraries
16
+*.so
17
+*.dylib
18
+*.dll
19
+
20
+# Fortran module files
21
+*.mod
22
+*.smod
23
+
24
+# Compiled Static libraries
25
+*.lai
26
+*.la
27
+*.a
28
+*.lib
29
+
30
+# Executables
31
+*.exe
32
+*.out
33
+*.app
34
+
35
+# Build directory
36
+build/
37
+
38
x265_3.5.tar.gz/build/README.txt -> x265_3.6.tar.gz/build/README.txt Changed
37
 
1
@@ -6,6 +6,9 @@
2
 
3
 Note: MSVC12 requires cmake 2.8.11 or later
4
 
5
+Note: When the SVE/SVE2 instruction set of the Arm AArch64 architecture is to be used, GCC 10.x or later must
6
+      be installed in order to compile x265.
7
+
8
 
9
 = Optional Prerequisites =
10
 
11
@@ -88,3 +91,25 @@
12
 building out of a Mercurial source repository.  If you are building out of
13
 a release source package, the version will not change.  If Mercurial is not
14
 found, the version will be "unknown".
15
+
16
+= Build Instructions for cross-compilation for Arm AArch64 Targets =
17
+
18
+When the target platform is based on Arm AArch64 architecture, the x265 can be
19
+built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
20
+environment variables should be set to point to the cross compilers of the
21
+appropriate gcc. For example:
22
+
23
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
24
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
25
+
26
+The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
27
+Then, the normal building process can be followed.
28
+
29
+Moreover, if the target platform supports SVE or SVE2 instruction set, the
30
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
31
+to true, respectively. For example:
32
+
33
+1. export CROSS_COMPILE_SVE2=true
34
+2. export CROSS_COMPILE_SVE=true
35
+
36
+Then, the normal building process can be followed.
37
x265_3.6.tar.gz/build/aarch64-darwin Added
2
 
1
+(directory)
2
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake Added
25
 
1
@@ -0,0 +1,23 @@
2
+# CMake toolchain file for cross compiling x265 for aarch64
3
+# This feature is only supported as experimental. Use with caution.
4
+# Please report bugs on bitbucket
5
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+
7
+set(CROSS_COMPILE_ARM64 1)
8
+set(CMAKE_SYSTEM_NAME Darwin)
9
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
10
+
11
+# specify the cross compiler
12
+set(CMAKE_C_COMPILER gcc-12)
13
+set(CMAKE_CXX_COMPILER g++-12)
14
+
15
+# specify the target environment
16
+SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
17
+
18
+# specify whether SVE/SVE2 is supported by the target platform
19
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
20
+    set(CROSS_COMPILE_SVE2 1)
21
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
22
+    set(CROSS_COMPILE_SVE 1)
23
+endif()
24
+
25
x265_3.6.tar.gz/build/aarch64-darwin/make-Makefiles.bash Added
6
 
1
@@ -0,0 +1,4 @@
2
+#!/bin/bash
3
+# Run this from within a bash shell
4
+
5
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
6
x265_3.5.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake Changed
34
 
1
@@ -3,13 +3,29 @@
2
 # Please report bugs on bitbucket
3
 # Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
4
 
5
-set(CROSS_COMPILE_ARM 1)
6
+set(CROSS_COMPILE_ARM64 1)
7
 set(CMAKE_SYSTEM_NAME Linux)
8
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
9
 
10
 # specify the cross compiler
11
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
12
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
13
+if(DEFINED ENV{CMAKE_C_COMPILER})
14
+    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
15
+else()
16
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
17
+endif()
18
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
19
+    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
20
+else()
21
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
22
+endif()
23
 
24
 # specify the target environment
25
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
26
+
27
+# specify whether SVE/SVE2 is supported by the target platform
28
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
29
+    set(CROSS_COMPILE_SVE2 1)
30
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
31
+    set(CROSS_COMPILE_SVE 1)
32
+endif()
33
+
34
x265_3.5.tar.gz/build/arm-linux/make-Makefiles.bash -> x265_3.6.tar.gz/build/arm-linux/make-Makefiles.bash Changed
7
 
1
@@ -1,4 +1,4 @@
2
 #!/bin/bash
3
 # Run this from within a bash shell
4
 
5
-cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
7
x265_3.5.tar.gz/doc/reST/cli.rst -> x265_3.6.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -632,9 +632,8 @@
2
    auto-detection by the encoder. If specified, the encoder will
3
    attempt to bring the encode specifications within that specified
4
    level. If the encoder is unable to reach the level it issues a
5
-   warning and aborts the encode. If the requested requirement level is
6
-   higher than the actual level, the actual requirement level is
7
-   signaled.
8
+   warning and aborts the encode. The requested level will be signaled 
9
+   in the bitstream even if it is higher than the actual level.
10
 
11
    Beware, specifying a decoder level will force the encoder to enable
12
    VBV for constant rate factor encodes, which may introduce
13
@@ -714,11 +713,8 @@
14
    (main, main10, etc). Second, an encoder is created from this
15
    x265_param instance and the :option:`--level-idc` and
16
    :option:`--high-tier` parameters are used to reduce bitrate or other
17
-   features in order to enforce the target level. Finally, the encoder
18
-   re-examines the final set of parameters and detects the actual
19
-   minimum decoder requirement level and this is what is signaled in
20
-   the bitstream headers. The detected decoder level will only use High
21
-   tier if the user specified a High tier level.
22
+   features in order to enforce the target level. The detected decoder level
23
+   will only use High tier if the user specified a High tier level.
24
 
25
    The signaled profile will be determined by the encoder's internal
26
    bitdepth and input color space. If :option:`--keyint` is 0 or 1,
27
@@ -961,21 +957,21 @@
28
    Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
29
    with :option:`--analysis-save` and :option:`--analysis-load` respectively.
30
 
31
-   +--------------+------------------------------------------+
32
-   | Level        | Description                              |
33
-   +==============+==========================================+
34
-   | 1            | Lookahead information                    |
35
-   +--------------+------------------------------------------+
36
-   | 2 to 4       | Level 1 + intra/inter modes, ref's       |
37
-   +--------------+------------------------------------------+
38
-   | 5 and 6      | Level 2 + rect-amp                       |
39
-   +--------------+------------------------------------------+
40
-   | 7            | Level 5 + AVC size CU refinement         |
41
-   +--------------+------------------------------------------+
42
-   | 8 and 9      | Level 5 + AVC size Full CU analysis-info |
43
-   +--------------+------------------------------------------+
44
-   | 10           | Level 5 + Full CU analysis-info          |
45
-   +--------------+------------------------------------------+
46
+   +--------------+---------------------------------------------------+
47
+   | Level        | Description                                       |
48
+   +==============+===================================================+
49
+   | 1            | Lookahead information                             |
50
+   +--------------+---------------------------------------------------+
51
+   | 2 to 4       | Level 1 + intra/inter modes, depth, ref's, cutree |
52
+   +--------------+---------------------------------------------------+
53
+   | 5 and 6      | Level 2 + rect-amp                                |
54
+   +--------------+---------------------------------------------------+
55
+   | 7            | Level 5 + AVC size CU refinement                  |
56
+   +--------------+---------------------------------------------------+
57
+   | 8 and 9      | Level 5 + AVC size Full CU analysis-info          |
58
+   +--------------+---------------------------------------------------+
59
+   | 10           | Level 5 + Full CU analysis-info                   |
60
+   +--------------+---------------------------------------------------+
61
 
62
 .. option:: --refine-mv-type <string>
63
 
64
@@ -1332,6 +1328,11 @@
65
    Search range for HME level 0, 1 and 2.
66
    The Search Range for each HME level must be between 0 and 32768(excluding).
67
    Default search range is 16,32,48 for level 0,1,2 respectively.
68
+   
69
+.. option:: --mcstf, --no-mcstf
70
+
71
+   Enable Motion Compensated Temporal filtering.
72
+   Default: disabled
73
 
74
 Spatial/intra options
75
 =====================
76
@@ -1473,17 +1474,9 @@
77
 
78
 .. option:: --hist-scenecut, --no-hist-scenecut
79
 
80
-   Indicates that scenecuts need to be detected using luma edge and chroma histograms.
81
-   :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
82
-   :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
83
-   
84
-.. option:: --hist-threshold <0.0..1.0>
85
-
86
-   This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
87
-   This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value 
88
-   greater than 0.2 against the previous frame as scenecut. 
89
-   Increasing the threshold reduces the number of scenecuts detected.
90
-   Default 0.03.
91
+   Scenecuts detected based on histogram, intensity and variance of the picture.
92
+   :option:`--hist-scenecut` enables or :option:`--no-hist-scenecut` disables scenecut detection based on
93
+   histogram.
94
    
95
 .. option:: --radl <integer>
96
    
97
@@ -1766,6 +1759,12 @@
98
    Default 1.0.
99
    **Range of values:** 0.0 to 3.0
100
 
101
+.. option:: --sbrc, --no-sbrc
102
+
103
+   Enable or disable segment based rate control. Segment duration depends on the
104
+   keyframe interval specified. If unspecified, the default keyframe interval will be used.
105
+   Default: disabled.
106
+
107
 .. option:: --hevc-aq
108
 
109
    Enable adaptive quantization
110
@@ -1976,12 +1975,18 @@
111
    
112
    **CLI ONLY**
113
 
114
+.. option:: --scenecut-qp-config <filename>
115
+
116
+   Specify a text file which contains the scenecut aware QP options.
117
+   The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`
118
+
119
+   **CLI ONLY**
120
+
121
 .. option:: --scenecut-aware-qp <integer>
122
 
123
    It reduces the bits spent on the inter-frames within the scenecut window
124
    before and after a scenecut by increasing their QP in ratecontrol pass2 algorithm
125
-   without any deterioration in visual quality. If a scenecut falls within the window,
126
-   the QP of the inter-frames after this scenecut will not be modified.
127
+   without any deterioration in visual quality.
128
    :option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
129
 
130
    +-------+---------------------------------------------------------------+
131
@@ -2006,48 +2011,83 @@
132
    for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
133
    is enabled.
134
 
135
-   When :option:`--scenecut-aware-qp` is::
136
+   When :option:`--scenecut-aware-qp` is:
137
+
138
    * 1 (Forward masking):
139
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
140
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
141
+   or 
142
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
143
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
144
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
145
    * 2 (Backward masking):
146
-   --masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
147
+   --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
148
+   or 
149
+   --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
150
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
151
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
152
    * 3 (Bi-directional masking):
153
-   --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
154
+   --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
155
+   or 
156
+   --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
157
+                       fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
158
+                       fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
159
+                       bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
160
+                       bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
161
+                       bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
162
 
163
    +-----------------+---------------------------------------------------------------+
164
    | Parameter       | Description                                                   |
165
    +=================+===============================================================+
166
-   | fwdWindow       | The duration(in milliseconds) for which there is a reduction  |
167
-   |                 | in the bits spent on the inter-frames after a scenecut by     |
168
-   |                 | increasing their QP. Default 500ms.                           |
169
-   |                 | **Range of values:** 0 to 1000                                |
170
+   | fwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
171
+   |                 | reduction in the bits spent on the inter-frames after a       |
172
+   |                 | scenecut by increasing their QP. Default 500ms.               |
173
+   |                 | **Range of values:** 0 to 2000                                |
174
+   +-----------------+---------------------------------------------------------------+
175
+   | fwdWindow       | The duration of a sub-window(in milliseconds) for which there |
176
+   |                 | is a reduction in the bits spent on the inter-frames after a  |
177
+   |                 | scenecut by increasing their QP. Default 500ms.               |
178
+   |                 | **Range of values:** 0 to 2000                                |
179
    +-----------------+---------------------------------------------------------------+
180
    | fwdRefQPDelta   | The offset by which QP is incremented for inter-frames        |
181
    |                 | after a scenecut. Default 5.                                  |
182
-   |                 | **Range of values:** 0 to 10                                  |
183
+   |                 | **Range of values:** 0 to 20                                  |
184
    +-----------------+---------------------------------------------------------------+
185
    | fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced      |
186
    |                 | inter-frames after a scenecut. The offset is computed from    |
187
    |                 | fwdRefQPDelta when it is not explicitly specified.            |
188
-   |                 | **Range of values:** 0 to 10                                  |
189
+   |                 | **Range of values:** 0 to 20                                  |
190
+   +-----------------+---------------------------------------------------------------+
191
+   | bwdMaxWindow    | The maximum duration(in milliseconds) for which there is a    |
192
+   |                 | reduction in the bits spent on the inter-frames before a      |
193
+   |                 | scenecut by increasing their QP. Default 100ms.               |
194
+   |                 | **Range of values:** 0 to 2000                                |
195
    +-----------------+---------------------------------------------------------------+
196
-   | bwdWindow       | The duration(in milliseconds) for which there is a reduction  |
197
-   |                 | in the bits spent on the inter-frames before a scenecut by    |
198
-   |                 | increasing their QP. Default 100ms.                           |
199
-   |                 | **Range of values:** 0 to 1000                                |
200
+   | bwdWindow       | The duration of a sub-window(in milliseconds) for which there |
201
x265_3.5.tar.gz/doc/reST/introduction.rst -> x265_3.6.tar.gz/doc/reST/introduction.rst Changed
9
 
1
@@ -77,6 +77,6 @@
2
 to start is with the `Motion Picture Experts Group - Licensing Authority
3
 - HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.
4
 
5
-x265 is a registered trademark of MulticoreWare, Inc.  The x265 logo is
6
+x265 is a registered trademark of MulticoreWare, Inc.  The X265 logo is
7
 a trademark of MulticoreWare, and may only be used with explicit written
8
 permission.  All rights reserved.
9
x265_3.5.tar.gz/doc/reST/releasenotes.rst -> x265_3.6.tar.gz/doc/reST/releasenotes.rst Changed
55
 
1
@@ -2,6 +2,53 @@
2
 Release Notes
3
 *************
4
 
5
+Version 3.6
6
+===========
7
+
8
+Release date - 4th April, 2024.
9
+
10
+New feature
11
+-----------
12
+1. Segment based Ratecontrol (SBRC) feature
13
+2. Motion-Compensated Spatio-Temporal Filtering
14
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
15
+4. Histogram-Based Scene Change Detection
16
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
17
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
18
+ 
19
+Enhancements to existing features
20
+---------------------------------
21
+1. Added Dolby Vision 8.4 Profile Support
22
+
23
+
24
+API changes
25
+-----------
26
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
27
+2. Add command line parameter for mcstf feature: "--no-mcstf".
28
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
29
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
30
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
31
+6. cli: add new option --cra-nal (Force nal type to CRA to all frames except for the first frame, works only with keyint 1)
32
+
33
+Optimizations
34
+---------------------
35
+ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
36
+SVE/SVE2 optimizations
37
+
38
+
39
+Bug fixes
40
+---------
41
+1. Linux bug to utilize all the cores
42
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
43
+3. 32bit and 64bit builds generation for ARM
44
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
45
+5. Add x86 ASM implementation for subsampling luma 
46
+6. Fix for abrladder segfault with load reuse level 1 
47
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame 
48
+8. Add MacOS aarch64 build support 
49
+9. Fix boundary condition issue for Gaussian filter
50
+
51
+
52
 Version 3.5
53
 ===========
54
 
55
x265_3.5.tar.gz/readme.rst -> x265_3.6.tar.gz/readme.rst Changed
10
 
1
@@ -2,7 +2,7 @@
2
 x265 HEVC Encoder
3
 =================
4
 
5
-| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
6
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/master/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265_git/wiki/>`_
7
 | **Download:** | `releases <http://ftp.videolan.org/pub/videolan/x265/>`_ 
8
 | **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
9
 
10
x265_3.5.tar.gz/source/CMakeLists.txt -> x265_3.6.tar.gz/source/CMakeLists.txt Changed
201
 
1
@@ -29,7 +29,7 @@
2
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 199)
6
+set(X265_BUILD 209)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -38,14 +38,20 @@
11
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
12
 
13
 # System architecture detection
14
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
15
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
16
+    string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
17
+else()
18
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
19
+endif()
20
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
21
-set(ARM_ALIASES armv6l armv7l aarch64)
22
+set(ARM_ALIASES armv6l armv7l)
23
+set(ARM64_ALIASES arm64 arm64e aarch64)
24
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
25
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
26
-set(POWER_ALIASES ppc64 ppc64le)
27
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
28
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
29
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
30
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
31
+if(X86MATCH GREATER "-1")
32
     set(X86 1)
33
     add_definitions(-DX265_ARCH_X86=1)
34
     if(CMAKE_CXX_FLAGS STREQUAL "-m32")
35
@@ -70,15 +76,18 @@
36
     else()
37
         set(CROSS_COMPILE_ARM 0)
38
     endif()
39
+   message(STATUS "Detected ARM target processor")
40
     set(ARM 1)
41
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
42
-        message(STATUS "Detected ARM64 target processor")
43
-        set(ARM64 1)
44
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
45
-    else()
46
-        message(STATUS "Detected ARM target processor")
47
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
48
-    endif()
49
+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
50
+elseif(ARM64MATCH GREATER "-1")
51
+    #if(CROSS_COMPILE_ARM64)
52
+        #message(STATUS "Cross compiling for ARM64 arch")
53
+    #else()
54
+        #set(CROSS_COMPILE_ARM64 0)
55
+    #endif()
56
+    message(STATUS "Detected ARM64 target processor")
57
+    set(ARM64 1)
58
+    add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
59
 else()
60
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
61
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
62
@@ -239,26 +248,43 @@
63
         endif()
64
     endif()
65
     if(ARM AND CROSS_COMPILE_ARM)
66
-        if(ARM64)
67
-            set(ARM_ARGS -fPIC)
68
-        else()
69
-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
70
-        endif()
71
         message(STATUS "cross compile arm")
72
+       set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
73
     elseif(ARM)
74
-        if(ARM64)
75
-            set(ARM_ARGS -fPIC)
76
+        find_package(Neon)
77
+        if(CPU_HAS_NEON)
78
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
79
             add_definitions(-DHAVE_NEON)
80
         else()
81
-            find_package(Neon)
82
-            if(CPU_HAS_NEON)
83
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
84
-                add_definitions(-DHAVE_NEON)
85
-            else()
86
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
87
-            endif()
88
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
89
         endif()
90
     endif()
91
+   if(ARM64 OR CROSS_COMPILE_ARM64)
92
+        find_package(Neon)
93
+        find_package(SVE)
94
+        find_package(SVE2)
95
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
96
+            message(STATUS "Found SVE2")
97
+           set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
98
+            add_definitions(-DHAVE_SVE2)
99
+            add_definitions(-DHAVE_SVE)
100
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
101
+        elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
102
+            message(STATUS "Found SVE")
103
+           set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
104
+            add_definitions(-DHAVE_SVE)
105
+            add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
106
+        elseif(CPU_HAS_NEON)
107
+            message(STATUS "Found NEON")
108
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
109
+            add_definitions(-DHAVE_NEON)
110
+        else()
111
+            set(ARM_ARGS -fPIC -flax-vector-conversions)
112
+        endif()        
113
+    endif()
114
+   if(ENABLE_PIC)
115
+   list(APPEND ARM_ARGS -DPIC)
116
+   endif()
117
     add_definitions(${ARM_ARGS})
118
     if(FPROFILE_GENERATE)
119
         if(INTEL_CXX)
120
@@ -350,7 +376,7 @@
121
 endif(GCC)
122
 
123
 find_package(Nasm)
124
-if(ARM OR CROSS_COMPILE_ARM)
125
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
126
     option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
127
 elseif(NASM_FOUND AND X86)
128
     if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
129
@@ -384,7 +410,7 @@
130
 endif(EXTRA_LIB)
131
 mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
132
 
133
-if(X64)
134
+if(X64 OR ARM64 OR PPC64)
135
     # NOTE: We only officially support high-bit-depth compiles of x265
136
     # on 64bit architectures. Main10 plus large resolution plus slow
137
     # preset plus 32bit address space usually means malloc failure.  You
138
@@ -393,7 +419,7 @@
139
     # license" so to speak.  If it breaks you get to keep both halves.
140
     # You will need to disable assembly manually.
141
     option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
142
-endif(X64)
143
+endif(X64 OR ARM64 OR PPC64)
144
 if(HIGH_BIT_DEPTH)
145
     option(MAIN12 "Support Main12 instead of Main10" OFF)
146
     if(MAIN12)
147
@@ -440,6 +466,18 @@
148
 endif()
149
 add_definitions(-DX265_NS=${X265_NS})
150
 
151
+if(ARM64)
152
+  if(HIGH_BIT_DEPTH)
153
+    if(MAIN12)
154
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
155
+    else()
156
+      list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
157
+    endif()
158
+  else()
159
+    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
160
+  endif()
161
+endif(ARM64)
162
+
163
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
164
 if(WARNINGS_AS_ERRORS)
165
     if(GCC)
166
@@ -536,11 +574,7 @@
167
     # compile ARM arch asm files here
168
         enable_language(ASM)
169
         foreach(ASM ${ARM_ASMS})
170
-            if(ARM64)
171
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
172
-            else()
173
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
174
-            endif()
175
+           set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
176
             list(APPEND ASM_SRCS ${ASM_SRC})
177
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
178
             add_custom_command(
179
@@ -549,6 +583,52 @@
180
                 ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
181
                 DEPENDS ${ASM_SRC})
182
         endforeach()
183
+   elseif(ARM64 OR CROSS_COMPILE_ARM64)
184
+    # compile ARM64 arch asm files here
185
+        enable_language(ASM)
186
+        foreach(ASM ${ARM_ASMS})
187
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
188
+            list(APPEND ASM_SRCS ${ASM_SRC})
189
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
190
+            add_custom_command(
191
+                OUTPUT ${ASM}.${SUFFIX}
192
+                COMMAND ${CMAKE_CXX_COMPILER}
193
+                ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
194
+                DEPENDS ${ASM_SRC})
195
+        endforeach()
196
+        if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
197
+            foreach(ASM ${ARM_ASMS_SVE})
198
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
199
+                list(APPEND ASM_SRCS ${ASM_SRC})
200
+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
201
x265_3.5.tar.gz/source/abrEncApp.cpp -> x265_3.6.tar.gz/source/abrEncApp.cpp Changed
201
 
1
@@ -1,1111 +1,1111 @@
2
-/*****************************************************************************
3
-* Copyright (C) 2013-2020 MulticoreWare, Inc
4
-*
5
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
6
-*          Aruna Matheswaran <aruna@multicorewareinc.com>
7
-*
8
-* This program is free software; you can redistribute it and/or modify
9
-* it under the terms of the GNU General Public License as published by
10
-* the Free Software Foundation; either version 2 of the License, or
11
-* (at your option) any later version.
12
-*
13
-* This program is distributed in the hope that it will be useful,
14
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
-* GNU General Public License for more details.
17
-*
18
-* You should have received a copy of the GNU General Public License
19
-* along with this program; if not, write to the Free Software
20
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
-*
22
-* This program is also available under a commercial proprietary license.
23
-* For more information, contact us at license @ x265.com.
24
-*****************************************************************************/
25
-
26
-#include "abrEncApp.h"
27
-#include "mv.h"
28
-#include "slice.h"
29
-#include "param.h"
30
-
31
-#include <signal.h>
32
-#include <errno.h>
33
-
34
-#include <queue>
35
-
36
-using namespace X265_NS;
37
-
38
-/* Ctrl-C handler */
39
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
40
-static void sigint_handler(int)
41
-{
42
-    b_ctrl_c = 1;
43
-}
44
-
45
-namespace X265_NS {
46
-    // private namespace
47
-#define X265_INPUT_QUEUE_SIZE 250
48
-
49
-    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret)
50
-    {
51
-        m_numEncodes = numEncodes;
52
-        m_numActiveEncodes.set(numEncodes);
53
-        m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
54
-        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
55
-
56
-        for (uint8_t i = 0; i < m_numEncodes; i++)
57
-        {
58
-            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
59
-            if (!m_passEnc[i])
60
-            {
61
-                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
62
-                ret = 4;
63
-            }
64
-            m_passEnc[i]->init(ret);
65
-        }
66
-
67
-        if (!allocBuffers())
68
-        {
69
-            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
70
-            ret = 4;
71
-        }
72
-
73
-        /* start passEncoder worker threads */
74
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
75
-            m_passEnc[pass]->startThreads();
76
-    }
77
-
78
-    bool AbrEncoder::allocBuffers()
79
-    {
80
-        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
81
-        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
82
-
83
-        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
84
-        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
85
-        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
86
-        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
87
-
88
-        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
89
-        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
90
-        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
91
-        m_readFlag = X265_MALLOC(int*, m_numEncodes);
92
-
93
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
94
-        {
95
-            m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
96
-            for (uint32_t idx = 0; idx < m_queueSize; idx++)
97
-            {
98
-                m_inputPicBuffer[pass][idx] = x265_picture_alloc();
99
-                x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
100
-            }
101
-
102
-            CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
103
-            m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
104
-            m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
105
-            m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
106
-            m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
107
-        }
108
-        return true;
109
-    fail:
110
-        return false;
111
-    }
112
-
113
-    void AbrEncoder::destroy()
114
-    {
115
-        x265_cleanup(); /* Free library singletons */
116
-        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
117
-        {
118
-            for (uint32_t index = 0; index < m_queueSize; index++)
119
-            {
120
-                X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
121
-                x265_picture_free(m_inputPicBuffer[pass][index]);
122
-            }
123
-
124
-            X265_FREE(m_inputPicBuffer[pass]);
125
-            X265_FREE(m_analysisBuffer[pass]);
126
-            X265_FREE(m_readFlag[pass]);
127
-            delete[] m_picIdxReadCnt[pass];
128
-            delete[] m_analysisWrite[pass];
129
-            delete[] m_analysisRead[pass];
130
-            m_passEnc[pass]->destroy();
131
-            delete m_passEnc[pass];
132
-        }
133
-        X265_FREE(m_inputPicBuffer);
134
-        X265_FREE(m_analysisBuffer);
135
-        X265_FREE(m_readFlag);
136
-
137
-        delete[] m_picWriteCnt;
138
-        delete[] m_picReadCnt;
139
-        delete[] m_analysisWriteCnt;
140
-        delete[] m_analysisReadCnt;
141
-
142
-        X265_FREE(m_picIdxReadCnt);
143
-        X265_FREE(m_analysisWrite);
144
-        X265_FREE(m_analysisRead);
145
-
146
-        X265_FREE(m_passEnc);
147
-    }
148
-
149
-    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
150
-    {
151
-        m_id = id;
152
-        m_cliopt = cliopt;
153
-        m_parent = parent;
154
-        if(!(m_cliopt.enableScaler && m_id))
155
-            m_input = m_cliopt.input;
156
-        m_param = cliopt.param;
157
-        m_inputOver = false;
158
-        m_lastIdx = -1;
159
-        m_encoder = NULL;
160
-        m_scaler = NULL;
161
-        m_reader = NULL;
162
-        m_ret = 0;
163
-    }
164
-
165
-    int PassEncoder::init(int &result)
166
-    {
167
-        if (m_parent->m_numEncodes > 1)
168
-            setReuseLevel();
169
-                
170
-        if (!(m_cliopt.enableScaler && m_id))
171
-            m_reader = new Reader(m_id, this);
172
-        else
173
-        {
174
-            VideoDesc *src = NULL, *dst = NULL;
175
-            dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
176
-            int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
177
-            int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
178
-            src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
179
-            if (src != NULL && dst != NULL)
180
-            {
181
-                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
182
-                if (!m_scaler)
183
-                {
184
-                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
185
-                    result = 4;
186
-                }
187
-            }
188
-        }
189
-
190
-        /* note: we could try to acquire a different libx265 API here based on
191
-        * the profile found during option parsing, but it must be done before
192
-        * opening an encoder */
193
-
194
-        if (m_param)
195
-            m_encoder = m_cliopt.api->encoder_open(m_param);
196
-        if (!m_encoder)
197
-        {
198
-            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
199
-            m_ret = 2;
200
-            return -1;
201
x265_3.5.tar.gz/source/abrEncApp.h -> x265_3.6.tar.gz/source/abrEncApp.h Changed
9
 
1
@@ -91,6 +91,7 @@
2
         FILE*    m_qpfile;
3
         FILE*    m_zoneFile;
4
         FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
5
+        FILE*    m_scenecutAwareQpConfig;
6
 
7
         int m_ret;
8
 
9
x265_3.5.tar.gz/source/cmake/FindNeon.cmake -> x265_3.6.tar.gz/source/cmake/FindNeon.cmake Changed
27
 
1
@@ -1,10 +1,21 @@
2
 include(FindPackageHandleStandardArgs)
3
 
4
 # Check the version of neon supported by the ARM CPU
5
-execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
6
-                OUTPUT_VARIABLE neon_version
7
-                ERROR_QUIET
8
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
9
+if(APPLE)
10
+    execute_process(COMMAND sysctl -a
11
+                    COMMAND grep "hw.optional.neon: 1"
12
+                    OUTPUT_VARIABLE neon_version
13
+                    ERROR_QUIET
14
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
15
+else()
16
+    execute_process(COMMAND cat /proc/cpuinfo
17
+                    COMMAND grep Features
18
+                    COMMAND grep neon
19
+                    OUTPUT_VARIABLE neon_version
20
+                    ERROR_QUIET
21
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
22
+endif()
23
+
24
 if(neon_version)
25
     set(CPU_HAS_NEON 1)
26
 endif()
27
x265_3.6.tar.gz/source/cmake/FindSVE.cmake Added
23
 
1
@@ -0,0 +1,21 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE supported by the ARM CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.sve: 1"
8
+                    OUTPUT_VARIABLE sve_version
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep -e "sve$" -e "sve[[:space:]]"
15
+                    OUTPUT_VARIABLE sve_version
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve_version)
21
+    set(CPU_HAS_SVE 1)
22
+endif()
23
x265_3.6.tar.gz/source/cmake/FindSVE2.cmake Added
24
 
1
@@ -0,0 +1,22 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE2 supported by the ARM CPU
5
+if(APPLE)
6
+    execute_process(COMMAND sysctl -a
7
+                    COMMAND grep "hw.optional.sve2: 1"
8
+                    OUTPUT_VARIABLE sve2_version
9
+                    ERROR_QUIET
10
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+    execute_process(COMMAND cat /proc/cpuinfo
13
+                    COMMAND grep Features
14
+                    COMMAND grep sve2
15
+                    OUTPUT_VARIABLE sve2_version
16
+                    ERROR_QUIET
17
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve2_version)
21
+    set(CPU_HAS_SVE 1)
22
+    set(CPU_HAS_SVE2 1)
23
+endif()
24
x265_3.5.tar.gz/source/common/CMakeLists.txt -> x265_3.6.tar.gz/source/common/CMakeLists.txt Changed
76
 
1
@@ -84,35 +84,42 @@
2
 endif(ENABLE_ASSEMBLY AND X86)
3
 
4
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
5
-    if(ARM64)
6
-        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
7
-            message(STATUS "Detected CXX compiler using -O3 optimization level")
8
-            add_definitions(-DAUTO_VECTORIZE=1)
9
-        endif()
10
-        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
11
-
12
-        # add ARM assembly/intrinsic files here
13
-        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
14
-        set(VEC_PRIMITIVES)
15
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
16
 
17
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
18
-        foreach(SRC ${C_SRCS})
19
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
20
-        endforeach()
21
-    else()
22
-        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
23
+    # add ARM assembly/intrinsic files here
24
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
25
+    set(VEC_PRIMITIVES)
26
 
27
-        # add ARM assembly/intrinsic files here
28
-        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
29
-        set(VEC_PRIMITIVES)
30
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
31
+    foreach(SRC ${C_SRCS})
32
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
33
+    endforeach()
34
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
35
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
36
 
37
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
38
-        foreach(SRC ${C_SRCS})
39
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
40
-        endforeach()
41
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
42
+    if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
43
+        message(STATUS "Detected CXX compiler using -O3 optimization level")
44
+        add_definitions(-DAUTO_VECTORIZE=1)
45
     endif()
46
+
47
+    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
48
+    enable_language(ASM)
49
+
50
+    # add ARM assembly/intrinsic files here
51
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
52
+    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
53
+    set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
54
+    set(VEC_PRIMITIVES)
55
+
56
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
57
+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
58
+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
59
+    foreach(SRC ${C_SRCS})
60
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
61
+    endforeach()
62
     source_group(Assembly FILES ${ASM_PRIMITIVES})
63
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
64
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
65
 
66
 if(POWER)
67
     set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
68
@@ -169,4 +176,6 @@
69
     scalinglist.cpp scalinglist.h
70
     quant.cpp quant.h contexts.h
71
     deblock.cpp deblock.h
72
-    scaler.cpp scaler.h)
73
+    scaler.cpp scaler.h
74
+    ringmem.cpp ringmem.h
75
+    temporalfilter.cpp temporalfilter.h)
76
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp Added
201
 
1
@@ -0,0 +1,300 @@
2
+#include "common.h"
3
+#include "x265.h"
4
+#include "arm64-utils.h"
5
+#include <arm_neon.h>
6
+
7
+#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
8
+namespace X265_NS
9
+{
10
+
11
+
12
+
13
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
14
+{
15
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
16
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
17
+
18
+    a0 = *(uint8x8_t *)(src + 0 * sstride);
19
+    a1 = *(uint8x8_t *)(src + 1 * sstride);
20
+    a2 = *(uint8x8_t *)(src + 2 * sstride);
21
+    a3 = *(uint8x8_t *)(src + 3 * sstride);
22
+    a4 = *(uint8x8_t *)(src + 4 * sstride);
23
+    a5 = *(uint8x8_t *)(src + 5 * sstride);
24
+    a6 = *(uint8x8_t *)(src + 6 * sstride);
25
+    a7 = *(uint8x8_t *)(src + 7 * sstride);
26
+
27
+    b0 = vtrn1_u32(a0, a4);
28
+    b1 = vtrn1_u32(a1, a5);
29
+    b2 = vtrn1_u32(a2, a6);
30
+    b3 = vtrn1_u32(a3, a7);
31
+    b4 = vtrn2_u32(a0, a4);
32
+    b5 = vtrn2_u32(a1, a5);
33
+    b6 = vtrn2_u32(a2, a6);
34
+    b7 = vtrn2_u32(a3, a7);
35
+
36
+    a0 = vtrn1_u16(b0, b2);
37
+    a1 = vtrn1_u16(b1, b3);
38
+    a2 = vtrn2_u16(b0, b2);
39
+    a3 = vtrn2_u16(b1, b3);
40
+    a4 = vtrn1_u16(b4, b6);
41
+    a5 = vtrn1_u16(b5, b7);
42
+    a6 = vtrn2_u16(b4, b6);
43
+    a7 = vtrn2_u16(b5, b7);
44
+
45
+    b0 = vtrn1_u8(a0, a1);
46
+    b1 = vtrn2_u8(a0, a1);
47
+    b2 = vtrn1_u8(a2, a3);
48
+    b3 = vtrn2_u8(a2, a3);
49
+    b4 = vtrn1_u8(a4, a5);
50
+    b5 = vtrn2_u8(a4, a5);
51
+    b6 = vtrn1_u8(a6, a7);
52
+    b7 = vtrn2_u8(a6, a7);
53
+
54
+    *(uint8x8_t *)(dst + 0 * dstride) = b0;
55
+    *(uint8x8_t *)(dst + 1 * dstride) = b1;
56
+    *(uint8x8_t *)(dst + 2 * dstride) = b2;
57
+    *(uint8x8_t *)(dst + 3 * dstride) = b3;
58
+    *(uint8x8_t *)(dst + 4 * dstride) = b4;
59
+    *(uint8x8_t *)(dst + 5 * dstride) = b5;
60
+    *(uint8x8_t *)(dst + 6 * dstride) = b6;
61
+    *(uint8x8_t *)(dst + 7 * dstride) = b7;
62
+}
63
+
64
+
65
+
66
+
67
+
68
+
69
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
70
+{
71
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
72
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
73
+    uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
74
+    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
75
+
76
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
77
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
78
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
79
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
80
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
81
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
82
+    a6 = *(uint16x8_t *)(src + 6 * sstride);
83
+    a7 = *(uint16x8_t *)(src + 7 * sstride);
84
+    a8 = *(uint16x8_t *)(src + 8 * sstride);
85
+    a9 = *(uint16x8_t *)(src + 9 * sstride);
86
+    aA = *(uint16x8_t *)(src + 10 * sstride);
87
+    aB = *(uint16x8_t *)(src + 11 * sstride);
88
+    aC = *(uint16x8_t *)(src + 12 * sstride);
89
+    aD = *(uint16x8_t *)(src + 13 * sstride);
90
+    aE = *(uint16x8_t *)(src + 14 * sstride);
91
+    aF = *(uint16x8_t *)(src + 15 * sstride);
92
+
93
+    b0 = vtrn1q_u64(a0, a8);
94
+    b1 = vtrn1q_u64(a1, a9);
95
+    b2 = vtrn1q_u64(a2, aA);
96
+    b3 = vtrn1q_u64(a3, aB);
97
+    b4 = vtrn1q_u64(a4, aC);
98
+    b5 = vtrn1q_u64(a5, aD);
99
+    b6 = vtrn1q_u64(a6, aE);
100
+    b7 = vtrn1q_u64(a7, aF);
101
+    b8 = vtrn2q_u64(a0, a8);
102
+    b9 = vtrn2q_u64(a1, a9);
103
+    bA = vtrn2q_u64(a2, aA);
104
+    bB = vtrn2q_u64(a3, aB);
105
+    bC = vtrn2q_u64(a4, aC);
106
+    bD = vtrn2q_u64(a5, aD);
107
+    bE = vtrn2q_u64(a6, aE);
108
+    bF = vtrn2q_u64(a7, aF);
109
+
110
+    c0 = vtrn1q_u32(b0, b4);
111
+    c1 = vtrn1q_u32(b1, b5);
112
+    c2 = vtrn1q_u32(b2, b6);
113
+    c3 = vtrn1q_u32(b3, b7);
114
+    c4 = vtrn2q_u32(b0, b4);
115
+    c5 = vtrn2q_u32(b1, b5);
116
+    c6 = vtrn2q_u32(b2, b6);
117
+    c7 = vtrn2q_u32(b3, b7);
118
+    c8 = vtrn1q_u32(b8, bC);
119
+    c9 = vtrn1q_u32(b9, bD);
120
+    cA = vtrn1q_u32(bA, bE);
121
+    cB = vtrn1q_u32(bB, bF);
122
+    cC = vtrn2q_u32(b8, bC);
123
+    cD = vtrn2q_u32(b9, bD);
124
+    cE = vtrn2q_u32(bA, bE);
125
+    cF = vtrn2q_u32(bB, bF);
126
+
127
+    d0 = vtrn1q_u16(c0, c2);
128
+    d1 = vtrn1q_u16(c1, c3);
129
+    d2 = vtrn2q_u16(c0, c2);
130
+    d3 = vtrn2q_u16(c1, c3);
131
+    d4 = vtrn1q_u16(c4, c6);
132
+    d5 = vtrn1q_u16(c5, c7);
133
+    d6 = vtrn2q_u16(c4, c6);
134
+    d7 = vtrn2q_u16(c5, c7);
135
+    d8 = vtrn1q_u16(c8, cA);
136
+    d9 = vtrn1q_u16(c9, cB);
137
+    dA = vtrn2q_u16(c8, cA);
138
+    dB = vtrn2q_u16(c9, cB);
139
+    dC = vtrn1q_u16(cC, cE);
140
+    dD = vtrn1q_u16(cD, cF);
141
+    dE = vtrn2q_u16(cC, cE);
142
+    dF = vtrn2q_u16(cD, cF);
143
+
144
+    *(uint16x8_t *)(dst + 0 * dstride)  = vtrn1q_u8(d0, d1);
145
+    *(uint16x8_t *)(dst + 1 * dstride)  = vtrn2q_u8(d0, d1);
146
+    *(uint16x8_t *)(dst + 2 * dstride)  = vtrn1q_u8(d2, d3);
147
+    *(uint16x8_t *)(dst + 3 * dstride)  = vtrn2q_u8(d2, d3);
148
+    *(uint16x8_t *)(dst + 4 * dstride)  = vtrn1q_u8(d4, d5);
149
+    *(uint16x8_t *)(dst + 5 * dstride)  = vtrn2q_u8(d4, d5);
150
+    *(uint16x8_t *)(dst + 6 * dstride)  = vtrn1q_u8(d6, d7);
151
+    *(uint16x8_t *)(dst + 7 * dstride)  = vtrn2q_u8(d6, d7);
152
+    *(uint16x8_t *)(dst + 8 * dstride)  = vtrn1q_u8(d8, d9);
153
+    *(uint16x8_t *)(dst + 9 * dstride)  = vtrn2q_u8(d8, d9);
154
+    *(uint16x8_t *)(dst + 10 * dstride)  = vtrn1q_u8(dA, dB);
155
+    *(uint16x8_t *)(dst + 11 * dstride)  = vtrn2q_u8(dA, dB);
156
+    *(uint16x8_t *)(dst + 12 * dstride)  = vtrn1q_u8(dC, dD);
157
+    *(uint16x8_t *)(dst + 13 * dstride)  = vtrn2q_u8(dC, dD);
158
+    *(uint16x8_t *)(dst + 14 * dstride)  = vtrn1q_u8(dE, dF);
159
+    *(uint16x8_t *)(dst + 15 * dstride)  = vtrn2q_u8(dE, dF);
160
+
161
+
162
+}
163
+
164
+
165
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
166
+{
167
+    //assumption: there is no partial overlap
168
+    transpose16x16(dst, src, dstride, sstride);
169
+    transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
170
+    if (dst == src)
171
+    {
172
+        uint8_t tmp[16 * 16] __attribute__((aligned(64)));
173
+        transpose16x16(tmp, src + 16, 16, sstride);
174
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
175
+        for (int i = 0; i < 16; i++)
176
+        {
177
+            COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
178
+        }
179
+    }
180
+    else
181
+    {
182
+        transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
183
+        transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
184
+    }
185
+
186
+}
187
+
188
+
189
+
190
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
191
+{
192
+    uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
193
+    uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
194
+
195
+    a0 = *(uint16x8_t *)(src + 0 * sstride);
196
+    a1 = *(uint16x8_t *)(src + 1 * sstride);
197
+    a2 = *(uint16x8_t *)(src + 2 * sstride);
198
+    a3 = *(uint16x8_t *)(src + 3 * sstride);
199
+    a4 = *(uint16x8_t *)(src + 4 * sstride);
200
+    a5 = *(uint16x8_t *)(src + 5 * sstride);
201
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h Added
17
 
1
@@ -0,0 +1,15 @@
2
+#ifndef __ARM64_UTILS_H__
3
+#define __ARM64_UTILS_H__
4
+
5
+
6
+namespace X265_NS
7
+{
8
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
9
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
10
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
11
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
12
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
13
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
14
+}
15
+
16
+#endif
17
x265_3.5.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp Changed
201
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
4
  *          Yimeng Su <yimeng.su@huawei.com>
5
+ *          Sebastian Pop <spop@amazon.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -22,11 +23,659 @@
10
  * For more information, contact us at license @ x265.com.
11
  *****************************************************************************/
12
 
13
+
14
 #include "common.h"
15
 #include "primitives.h"
16
 #include "x265.h"
17
 #include "cpu.h"
18
 
19
+extern "C" {
20
+#include "fun-decls.h"
21
+}
22
+
23
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
24
+    p.cuBLOCK_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
25
+    p.cuBLOCK_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
26
+    p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
27
+    p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
28
+    p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
29
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
30
+    p.cuBLOCK_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
31
+    p.cuBLOCK_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
32
+    p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
33
+    p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
34
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
35
+    p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
36
+#define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
37
+#define LUMA_TU_NEON(prim, fname)      LUMA_TU_TYPED_NEON(prim, , fname)
38
+#define LUMA_TU_CAN_USE_SVE(prim, fname)      LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
39
+
40
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
41
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
42
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
43
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
44
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
45
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
46
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
47
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
48
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
49
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
50
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
51
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
52
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
53
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
54
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
55
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
56
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
57
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
58
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
59
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
60
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
61
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
62
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
63
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
64
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
65
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
66
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
67
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
68
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
69
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## cpu)
70
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
71
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
72
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
73
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
74
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
75
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
76
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
77
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
78
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
79
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
80
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
81
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
82
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
83
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
84
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
85
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
86
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
87
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
88
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
89
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
90
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
91
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
92
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
93
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
94
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
95
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
96
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
97
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
98
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
99
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
100
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
101
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
102
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
103
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
104
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
105
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
106
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
107
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
108
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
109
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
110
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
111
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
112
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
113
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve); \
114
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve); \
115
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
116
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve); \
117
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve); \
118
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
119
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
120
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
121
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
122
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## neon); \
123
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
124
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## neon); \
125
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## neon); \
126
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## neon); \
127
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
128
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
129
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
130
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## neon); \
131
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon); \
132
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## neon); \
133
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
134
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
135
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
136
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
137
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
138
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
139
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
140
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
141
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
142
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
143
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
144
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
145
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
146
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
147
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
148
+    p.puLUMA_4x4.prim   = fncdef PFX(fname ## _4x4_ ## neon); \
149
+    p.puLUMA_4x8.prim   = fncdef PFX(fname ## _4x8_ ## neon); \
150
+    p.puLUMA_4x16.prim  = fncdef PFX(fname ## _4x16_ ## neon)
151
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
152
+    p.puLUMA_8x8.prim   = fncdef PFX(fname ## _8x8_ ## sve2); \
153
+    p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
154
+    p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
155
+    p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve2); \
156
+    p.puLUMA_8x4.prim   = fncdef PFX(fname ## _8x4_ ## sve2); \
157
+    p.puLUMA_16x8.prim  = fncdef PFX(fname ## _16x8_ ## sve2); \
158
+    p.puLUMA_8x16.prim  = fncdef PFX(fname ## _8x16_ ## sve2); \
159
+    p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
160
+    p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
161
+    p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve2); \
162
+    p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
163
+    p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## sve2); \
164
+    p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## sve2); \
165
+    p.puLUMA_16x4.prim  = fncdef PFX(fname ## _16x4_ ## sve2); \
166
+    p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve2); \
167
+    p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## sve2); \
168
+    p.puLUMA_32x8.prim  = fncdef PFX(fname ## _32x8_ ## sve2); \
169
+    p.puLUMA_8x32.prim  = fncdef PFX(fname ## _8x32_ ## sve2); \
170
+    p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve2); \
171
+    p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
172
+    p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
173
+    p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
174
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
175
+    p.puLUMA_4x4.prim   = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
176
+    p.puLUMA_8x8.prim   = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
177
+    p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
178
+    p.puLUMA_8x4.prim   = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
179
+    p.puLUMA_4x8.prim   = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
180
+    p.puLUMA_16x8.prim  = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
181
+    p.puLUMA_8x16.prim  = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
182
+    p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
183
+    p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
184
+    p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
185
+    p.puLUMA_16x4.prim  = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
186
+    p.puLUMA_4x16.prim  = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
187
+    p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
188
+    p.puLUMA_8x32.prim  = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
189
+    p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
190
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
191
+    p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
192
+    p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
193
+    p.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
194
+    p.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
195
+    p.puLUMA_32x8.prim  = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
196
+    p.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
197
+    p.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
198
+    p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
199
+    p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
200
+    p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
201
x265_3.6.tar.gz/source/common/aarch64/asm-sve.S Added
41
 
1
@@ -0,0 +1,39 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+.macro ABS2_SVE a b c
30
+    abs             \a, \c\()/m, \a
31
+    abs             \b, \c\()/m, \b
32
+.endm
33
+
34
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
35
+    ABS2_SVE        \z0, \z1, p0
36
+    ABS2_SVE        \z2, \z3, p0
37
+    ABS2_SVE        \z4, \z5, p0
38
+    ABS2_SVE        \z6, \z7, p0
39
+.endm
40
+
41
x265_3.5.tar.gz/source/common/aarch64/asm.S -> x265_3.6.tar.gz/source/common/aarch64/asm.S Changed
173
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -21,34 +22,74 @@
12
  * For more information, contact us at license @ x265.com.
13
  *****************************************************************************/
14
 
15
+#ifndef ASM_S_  // #include guards
16
+#define ASM_S_
17
+
18
 .arch           armv8-a
19
 
20
+#define PFX3(prefix, name) prefix ## _ ## name
21
+#define PFX2(prefix, name) PFX3(prefix, name)
22
+#define PFX(name)          PFX2(X265_NS, name)
23
+
24
+#ifdef __APPLE__
25
+#define PREFIX 1
26
+#endif
27
+
28
 #ifdef PREFIX
29
 #define EXTERN_ASM _
30
+#define HAVE_AS_FUNC 0
31
+#elif defined __clang__
32
+#define EXTERN_ASM
33
+#define HAVE_AS_FUNC 0
34
+#define PREFIX 1
35
 #else
36
 #define EXTERN_ASM
37
+#define HAVE_AS_FUNC 1
38
 #endif
39
 
40
 #ifdef __ELF__
41
 #define ELF
42
 #else
43
+#ifdef PREFIX
44
+#define ELF #
45
+#else
46
 #define ELF @
47
 #endif
48
-
49
-#define HAVE_AS_FUNC 1
50
+#endif
51
 
52
 #if HAVE_AS_FUNC
53
 #define FUNC
54
 #else
55
+#ifdef PREFIX
56
+#define FUNC #
57
+#else
58
 #define FUNC @
59
 #endif
60
+#endif
61
+
62
+#define GLUE(a, b) a ## b
63
+#define JOIN(a, b) GLUE(a, b)
64
+
65
+#define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
66
+
67
+#ifdef __APPLE__
68
+.macro endfunc
69
+ELF .size \name, . - \name
70
+FUNC .endfunc
71
+.endm
72
+#endif
73
 
74
 .macro function name, export=1
75
+#ifdef __APPLE__
76
+    .global \name
77
+    endfunc
78
+#else
79
     .macro endfunc
80
 ELF     .size   \name, . - \name
81
 FUNC    .endfunc
82
         .purgem endfunc
83
     .endm
84
+#endif
85
         .align  2
86
 .if \export == 1
87
         .global EXTERN_ASM\name
88
@@ -64,6 +105,83 @@
89
 .endif
90
 .endm
91
 
92
+.macro  const   name, align=2
93
+    .macro endconst
94
+ELF     .size   \name, . - \name
95
+        .purgem endconst
96
+    .endm
97
+#ifdef __MACH__
98
+    .const_data
99
+#else
100
+    .section .rodata
101
+#endif
102
+    .align          \align
103
+\name:
104
+.endm
105
+
106
+.macro  movrel rd, val, offset=0
107
+#if defined(__APPLE__)
108
+  .if \offset < 0
109
+        adrp            \rd, \val@PAGE
110
+        add             \rd, \rd, \val@PAGEOFF
111
+        sub             \rd, \rd, -(\offset)
112
+  .else
113
+        adrp            \rd, \val+(\offset)@PAGE
114
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
115
+  .endif
116
+#elif defined(PIC) && defined(_WIN32)
117
+  .if \offset < 0
118
+        adrp            \rd, \val
119
+        add             \rd, \rd, :lo12:\val
120
+        sub             \rd, \rd, -(\offset)
121
+  .else
122
+        adrp            \rd, \val+(\offset)
123
+        add             \rd, \rd, :lo12:\val+(\offset)
124
+  .endif
125
+#else
126
+        adrp            \rd, \val+(\offset)
127
+        add             \rd, \rd, :lo12:\val+(\offset)
128
+#endif
129
+.endm
130
 
131
 #define FENC_STRIDE 64
132
 #define FDEC_STRIDE 32
133
+
134
+.macro SUMSUB_AB sum, diff, a, b
135
+    add             \sum,  \a, \b
136
+    sub             \diff, \a, \b
137
+.endm
138
+
139
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
140
+    SUMSUB_AB       \s1, \d1, \a, \b
141
+    SUMSUB_AB       \s2, \d2, \c, \d
142
+.endm
143
+
144
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
145
+    SUMSUB_ABCD     \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
146
+    SUMSUB_ABCD     \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
147
+.endm
148
+
149
+.macro ABS2 a b
150
+    abs             \a, \a
151
+    abs             \b, \b
152
+.endm
153
+
154
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
155
+    ABS2            \v0, \v1
156
+    ABS2            \v2, \v3
157
+    ABS2            \v4, \v5
158
+    ABS2            \v6, \v7
159
+.endm
160
+
161
+.macro vtrn t1, t2, s1, s2
162
+    trn1            \t1, \s1, \s2
163
+    trn2            \t2, \s1, \s2
164
+.endm
165
+
166
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
167
+    vtrn            \t1, \t2, \s1, \s2
168
+    vtrn            \t3, \t4, \s3, \s4
169
+.endm
170
+
171
+#endif
172
\ No newline at end of file
173
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-common.S Added
56
 
1
@@ -0,0 +1,54 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
33
+.macro cpy1Dto2D_shr_start
34
+    add             x2, x2, x2
35
+    dup             v0.8h, w3
36
+    cmeq            v1.8h, v1.8h, v1.8h
37
+    sshl            v1.8h, v1.8h, v0.8h
38
+    sri             v1.8h, v1.8h, #1
39
+    neg             v0.8h, v0.8h
40
+.endm
41
+
42
+.macro cpy2Dto1D_shr_start
43
+    add             x2, x2, x2
44
+    dup             v0.8h, w3
45
+    cmeq            v1.8h, v1.8h, v1.8h
46
+    sshl            v1.8h, v1.8h, v0.8h
47
+    sri             v1.8h, v1.8h, #1
48
+    neg             v0.8h, v0.8h
49
+.endm
50
+
51
+const xtn_xtn2_table, align=4
52
+.byte    0, 2, 4, 6, 8, 10, 12, 14
53
+.byte    16, 18, 20, 22, 24, 26, 28, 30
54
+endconst
55
+
56
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S Added
201
 
1
@@ -0,0 +1,1416 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "blockcopy8-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
41
+ *
42
+ * r0   - a
43
+ * r1   - stridea
44
+ * r2   - b
45
+ * r3   - strideb */
46
+
47
+function PFX(blockcopy_sp_4x4_sve)
48
+    ptrue           p0.h, vl4
49
+.rept 2
50
+    ld1h            {z0.h}, p0/z, x2
51
+    add             x2, x2, x3, lsl #1
52
+    st1b            {z0.h}, p0, x0
53
+    add             x0, x0, x1
54
+    ld1h            {z1.h}, p0/z, x2
55
+    add             x2, x2, x3, lsl #1
56
+    st1b            {z1.h}, p0, x0
57
+    add             x0, x0, x1
58
+.endr
59
+    ret
60
+endfunc
61
+
62
+function PFX(blockcopy_sp_8x8_sve)
63
+    ptrue           p0.h, vl8
64
+.rept 4
65
+    ld1h            {z0.h}, p0/z, x2
66
+    add             x2, x2, x3, lsl #1
67
+    st1b            {z0.h}, p0, x0
68
+    add            x0, x0, x1
69
+    ld1h            {z1.h}, p0/z, x2
70
+    add             x2, x2, x3, lsl #1
71
+    st1b            {z1.h}, p0, x0
72
+    add            x0, x0, x1
73
+.endr
74
+    ret
75
+endfunc
76
+
77
+function PFX(blockcopy_sp_16x16_sve)
78
+    rdvl            x9, #1
79
+    cmp             x9, #16
80
+    bgt             .vl_gt_16_blockcopy_sp_16_16
81
+    lsl             x3, x3, #1
82
+    movrel          x11, xtn_xtn2_table
83
+    ld1             {v31.16b}, x11
84
+.rept 8
85
+    ld1             {v0.8h-v1.8h}, x2, x3
86
+    ld1             {v2.8h-v3.8h}, x2, x3
87
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
88
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
89
+    st1             {v0.16b}, x0, x1
90
+    st1             {v1.16b}, x0, x1
91
+.endr
92
+    ret
93
+.vl_gt_16_blockcopy_sp_16_16:
94
+    ptrue           p0.h, vl16
95
+.rept 8
96
+    ld1h            {z0.h}, p0/z, x2
97
+    st1b            {z0.h}, p0, x0
98
+    add             x2, x2, x3, lsl #1
99
+    add             x0, x0, x1
100
+    ld1h            {z1.h}, p0/z, x2
101
+    st1b            {z1.h}, p0, x0
102
+    add             x2, x2, x3, lsl #1
103
+    add             x0, x0, x1
104
+.endr
105
+    ret
106
+endfunc
107
+
108
+function PFX(blockcopy_sp_32x32_sve)
109
+    mov             w12, #4
110
+    rdvl            x9, #1
111
+    cmp             x9, #16
112
+    bgt             .vl_gt_16_blockcopy_sp_32_32
113
+    lsl             x3, x3, #1
114
+    movrel          x11, xtn_xtn2_table
115
+    ld1             {v31.16b}, x11
116
+.loop_csp32_sve:
117
+    sub             w12, w12, #1
118
+.rept 4
119
+    ld1             {v0.8h-v3.8h}, x2, x3
120
+    ld1             {v4.8h-v7.8h}, x2, x3
121
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
122
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
123
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
124
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
125
+    st1             {v0.16b-v1.16b}, x0, x1
126
+    st1             {v2.16b-v3.16b}, x0, x1
127
+.endr
128
+    cbnz            w12, .loop_csp32_sve
129
+    ret
130
+.vl_gt_16_blockcopy_sp_32_32:
131
+    cmp             x9, #48
132
+    bgt             .vl_gt_48_blockcopy_sp_32_32
133
+    ptrue           p0.h, vl16
134
+.vl_gt_16_loop_csp32_sve:
135
+    sub             w12, w12, #1
136
+.rept 4
137
+    ld1h            {z0.h}, p0/z, x2
138
+    ld1h            {z1.h}, p0/z, x2, #1, mul vl
139
+    st1b            {z0.h}, p0, x0
140
+    st1b            {z1.h}, p0, x0, #1, mul vl
141
+    add             x2, x2, x3, lsl #1
142
+    add             x0, x0, x1
143
+    ld1h            {z2.h}, p0/z, x2
144
+    ld1h            {z3.h}, p0/z, x2, #1, mul vl
145
+    st1b            {z2.h}, p0, x0
146
+    st1b            {z3.h}, p0, x0, #1, mul vl
147
+    add             x2, x2, x3, lsl #1
148
+    add             x0, x0, x1
149
+.endr
150
+    cbnz            w12, .vl_gt_16_loop_csp32_sve
151
+    ret
152
+.vl_gt_48_blockcopy_sp_32_32:
153
+    ptrue           p0.h, vl32
154
+.vl_gt_48_loop_csp32_sve:
155
+    sub             w12, w12, #1
156
+.rept 4
157
+    ld1h            {z0.h}, p0/z, x2
158
+    st1b            {z0.h}, p0, x0
159
+    add             x2, x2, x3, lsl #1
160
+    add             x0, x0, x1
161
+    ld1h            {z1.h}, p0/z, x2
162
+    st1b            {z1.h}, p0, x0
163
+    add             x2, x2, x3, lsl #1
164
+    add             x0, x0, x1
165
+.endr
166
+    cbnz            w12, .vl_gt_48_loop_csp32_sve
167
+    ret
168
+endfunc
169
+
170
+function PFX(blockcopy_ps_16x16_sve)
171
+    rdvl            x9, #1
172
+    cmp             x9, #16
173
+    bgt             .vl_gt_16_blockcopy_ps_16_16
174
+    lsl             x1, x1, #1
175
+.rept 8
176
+    ld1             {v4.16b}, x2, x3
177
+    ld1             {v5.16b}, x2, x3
178
+    uxtl            v0.8h, v4.8b
179
+    uxtl2           v1.8h, v4.16b
180
+    uxtl            v2.8h, v5.8b
181
+    uxtl2           v3.8h, v5.16b
182
+    st1             {v0.8h-v1.8h}, x0, x1
183
+    st1             {v2.8h-v3.8h}, x0, x1
184
+.endr
185
+    ret
186
+.vl_gt_16_blockcopy_ps_16_16:
187
+    ptrue           p0.b, vl32
188
+.rept 16
189
+    ld1b            {z1.h}, p0/z, x2
190
+    st1h            {z1.h}, p0, x0
191
+    add             x0, x0, x1, lsl #1
192
+    add             x2, x2, x3
193
+.endr
194
+    ret
195
+endfunc
196
+
197
+function PFX(blockcopy_ps_32x32_sve)
198
+    rdvl            x9, #1
199
+    cmp             x9, #16
200
+    bgt             .vl_gt_16_blockcopy_ps_32_32
201
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S Added
201
 
1
@@ -0,0 +1,1299 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "blockcopy8-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
39
+ *
40
+ * r0   - a
41
+ * r1   - stridea
42
+ * r2   - b
43
+ * r3   - strideb */
44
+function PFX(blockcopy_sp_4x4_neon)
45
+    lsl             x3, x3, #1
46
+.rept 2
47
+    ld1             {v0.8h}, x2, x3
48
+    ld1             {v1.8h}, x2, x3
49
+    xtn             v0.8b, v0.8h
50
+    xtn             v1.8b, v1.8h
51
+    st1             {v0.s}0, x0, x1
52
+    st1             {v1.s}0, x0, x1
53
+.endr
54
+    ret
55
+endfunc
56
+
57
+function PFX(blockcopy_sp_8x8_neon)
58
+    lsl             x3, x3, #1
59
+.rept 4
60
+    ld1             {v0.8h}, x2, x3
61
+    ld1             {v1.8h}, x2, x3
62
+    xtn             v0.8b, v0.8h
63
+    xtn             v1.8b, v1.8h
64
+    st1             {v0.d}0, x0, x1
65
+    st1             {v1.d}0, x0, x1
66
+.endr
67
+    ret
68
+endfunc
69
+
70
+function PFX(blockcopy_sp_16x16_neon)
71
+    lsl             x3, x3, #1
72
+    movrel          x11, xtn_xtn2_table
73
+    ld1             {v31.16b}, x11
74
+.rept 8
75
+    ld1             {v0.8h-v1.8h}, x2, x3
76
+    ld1             {v2.8h-v3.8h}, x2, x3
77
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
78
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
79
+    st1             {v0.16b}, x0, x1
80
+    st1             {v1.16b}, x0, x1
81
+.endr
82
+    ret
83
+endfunc
84
+
85
+function PFX(blockcopy_sp_32x32_neon)
86
+    mov             w12, #4
87
+    lsl             x3, x3, #1
88
+    movrel          x11, xtn_xtn2_table
89
+    ld1             {v31.16b}, x11
90
+.loop_csp32:
91
+    sub             w12, w12, #1
92
+.rept 4
93
+    ld1             {v0.8h-v3.8h}, x2, x3
94
+    ld1             {v4.8h-v7.8h}, x2, x3
95
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
96
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
97
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
98
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
99
+    st1             {v0.16b-v1.16b}, x0, x1
100
+    st1             {v2.16b-v3.16b}, x0, x1
101
+.endr
102
+    cbnz            w12, .loop_csp32
103
+    ret
104
+endfunc
105
+
106
+function PFX(blockcopy_sp_64x64_neon)
107
+    mov             w12, #16
108
+    lsl             x3, x3, #1
109
+    sub             x3, x3, #64
110
+    movrel          x11, xtn_xtn2_table
111
+    ld1             {v31.16b}, x11
112
+.loop_csp64:
113
+    sub             w12, w12, #1
114
+.rept 4
115
+    ld1             {v0.8h-v3.8h}, x2, #64
116
+    ld1             {v4.8h-v7.8h}, x2, x3
117
+    tbl             v0.16b, {v0.16b,v1.16b}, v31.16b
118
+    tbl             v1.16b, {v2.16b,v3.16b}, v31.16b
119
+    tbl             v2.16b, {v4.16b,v5.16b}, v31.16b
120
+    tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
121
+    st1             {v0.16b-v3.16b}, x0, x1
122
+.endr
123
+    cbnz            w12, .loop_csp64
124
+    ret
125
+endfunc
126
+
127
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
128
+function PFX(blockcopy_ps_4x4_neon)
129
+    lsl             x1, x1, #1
130
+.rept 2
131
+    ld1             {v0.8b}, x2, x3
132
+    ld1             {v1.8b}, x2, x3
133
+    uxtl            v0.8h, v0.8b
134
+    uxtl            v1.8h, v1.8b
135
+    st1             {v0.4h}, x0, x1
136
+    st1             {v1.4h}, x0, x1
137
+.endr
138
+    ret
139
+endfunc
140
+
141
+function PFX(blockcopy_ps_8x8_neon)
142
+    lsl             x1, x1, #1
143
+.rept 4
144
+    ld1             {v0.8b}, x2, x3
145
+    ld1             {v1.8b}, x2, x3
146
+    uxtl            v0.8h, v0.8b
147
+    uxtl            v1.8h, v1.8b
148
+    st1             {v0.8h}, x0, x1
149
+    st1             {v1.8h}, x0, x1
150
+.endr
151
+    ret
152
+endfunc
153
+
154
+function PFX(blockcopy_ps_16x16_neon)
155
+    lsl             x1, x1, #1
156
+.rept 8
157
+    ld1             {v4.16b}, x2, x3
158
+    ld1             {v5.16b}, x2, x3
159
+    uxtl            v0.8h, v4.8b
160
+    uxtl2           v1.8h, v4.16b
161
+    uxtl            v2.8h, v5.8b
162
+    uxtl2           v3.8h, v5.16b
163
+    st1             {v0.8h-v1.8h}, x0, x1
164
+    st1             {v2.8h-v3.8h}, x0, x1
165
+.endr
166
+    ret
167
+endfunc
168
+
169
+function PFX(blockcopy_ps_32x32_neon)
170
+    lsl             x1, x1, #1
171
+    mov             w12, #4
172
+.loop_cps32:
173
+    sub             w12, w12, #1
174
+.rept 4
175
+    ld1             {v16.16b-v17.16b}, x2, x3
176
+    ld1             {v18.16b-v19.16b}, x2, x3
177
+    uxtl            v0.8h, v16.8b
178
+    uxtl2           v1.8h, v16.16b
179
+    uxtl            v2.8h, v17.8b
180
+    uxtl2           v3.8h, v17.16b
181
+    uxtl            v4.8h, v18.8b
182
+    uxtl2           v5.8h, v18.16b
183
+    uxtl            v6.8h, v19.8b
184
+    uxtl2           v7.8h, v19.16b
185
+    st1             {v0.8h-v3.8h}, x0, x1
186
+    st1             {v4.8h-v7.8h}, x0, x1
187
+.endr
188
+    cbnz            w12, .loop_cps32
189
+    ret
190
+endfunc
191
+
192
+function PFX(blockcopy_ps_64x64_neon)
193
+    lsl             x1, x1, #1
194
+    sub             x1, x1, #64
195
+    mov             w12, #16
196
+.loop_cps64:
197
+    sub             w12, w12, #1
198
+.rept 4
199
+    ld1             {v16.16b-v19.16b}, x2, x3
200
+    uxtl            v0.8h, v16.8b
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp Added
201
 
1
@@ -0,0 +1,948 @@
2
+#include "dct-prim.h"
3
+
4
+
5
+#if HAVE_NEON
6
+
7
+#include <arm_neon.h>
8
+
9
+
10
+namespace
11
+{
12
+using namespace X265_NS;
13
+
14
+
15
+static int16x8_t rev16(const int16x8_t a)
16
+{
17
+    static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
18
+    return vqtbx1q_u8(a, a, tbl);
19
+}
20
+
21
+static int32x4_t rev32(const int32x4_t a)
22
+{
23
+    static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
24
+    return vqtbx1q_u8(a, a, tbl);
25
+}
26
+
27
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
28
+{
29
+    int16x4_t s0, s1, s2, s3;
30
+    s0 = vtrn1_s32(x0, x2);
31
+    s1 = vtrn1_s32(x1, x3);
32
+    s2 = vtrn2_s32(x0, x2);
33
+    s3 = vtrn2_s32(x1, x3);
34
+
35
+    x0 = vtrn1_s16(s0, s1);
36
+    x1 = vtrn2_s16(s0, s1);
37
+    x2 = vtrn1_s16(s2, s3);
38
+    x3 = vtrn2_s16(s2, s3);
39
+}
40
+
41
+
42
+
43
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
44
+                           uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
45
+{
46
+
47
+    // This is an optimized function for scanPosLast, which removes the rmw dependency, once integrated into mainline x265, should replace reference implementation
48
+    // For clarity, left the original reference code in comments
49
+    int scanPosLast = 0;
50
+
51
+    uint16_t cSign = 0;
52
+    uint16_t cFlag = 0;
53
+    uint8_t cNum = 0;
54
+
55
+    uint32_t prevcgIdx = 0;
56
+    do
57
+    {
58
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
59
+
60
+        const uint32_t posLast = scan[scanPosLast];
61
+
62
+        const int curCoeff = coeff[posLast];
63
+        const uint32_t isNZCoeff = (curCoeff != 0);
64
+        /*
65
+        NOTE: the new algorithm is complicated, so I keep reference code here
66
+        uint32_t posy   = posLast >> log2TrSize;
67
+        uint32_t posx   = posLast - (posy << log2TrSize);
68
+        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
69
+        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
70
+        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
71
+        */
72
+
73
+        // get L1 sig map
74
+        numSig -= isNZCoeff;
75
+
76
+        if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
77
+        {
78
+            coeffSign[prevcgIdx] = cSign;
79
+            coeffFlag[prevcgIdx] = cFlag;
80
+            coeffNum[prevcgIdx] = cNum;
81
+            cSign = 0;
82
+            cFlag = 0;
83
+            cNum = 0;
84
+        }
85
+        // TODO: optimize by instruction BTS
86
+        cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
87
+        cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
88
+        cNum += (uint8_t)isNZCoeff;
89
+        prevcgIdx = cgIdx;
90
+        scanPosLast++;
91
+    }
92
+    while (numSig > 0);
93
+
94
+    coeffSign[prevcgIdx] = cSign;
95
+    coeffFlag[prevcgIdx] = cFlag;
96
+    coeffNum[prevcgIdx] = cNum;
97
+    return scanPosLast - 1;
98
+}
99
+
100
+
101
+#if (MLS_CG_SIZE == 4)
102
+template<int log2TrSize>
103
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
104
+                                int64_t *totalRdCost, uint32_t blkPos)
105
+{
106
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
107
+                               log2TrSize; /* Represents scaling through forward transform */
108
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
109
+    const uint32_t trSize = 1 << log2TrSize;
110
+
111
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
112
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
113
+    for (int y = 0; y < MLS_CG_SIZE; y++)
114
+    {
115
+        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
116
+        int32x4_t mul = vmull_s16(in, in);
117
+        int64x2_t cost0, cost1;
118
+        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
119
+        cost1 = vshll_high_n_s32(mul, scaleBits);
120
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
121
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
122
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
123
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
124
+        blkPos += trSize;
125
+    }
126
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
127
+    *totalUncodedCost += sum;
128
+    *totalRdCost += sum;
129
+}
130
+
131
+template<int log2TrSize>
132
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
133
+                             int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
134
+{
135
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
136
+                               log2TrSize; /* Represents scaling through forward transform */
137
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
138
+    const uint32_t trSize = 1 << log2TrSize;
139
+    //using preprocessor to bypass clang bug
140
+    const int max = X265_MAX(0, (2 * transformShift + 1));
141
+
142
+    int64x2_t vcost_sum_0 = vdupq_n_s64(0);
143
+    int64x2_t vcost_sum_1 = vdupq_n_s64(0);
144
+    int32x4_t vpsy = vdupq_n_s32(*psyScale);
145
+    for (int y = 0; y < MLS_CG_SIZE; y++)
146
+    {
147
+        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
148
+        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
149
+        int64x2_t cost0, cost1;
150
+        cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
151
+        cost1 = vmull_high_s32(signCoef, signCoef);
152
+        cost0 = vshlq_n_s64(cost0, scaleBits);
153
+        cost1 = vshlq_n_s64(cost1, scaleBits);
154
+        int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
155
+        int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
156
+        if (max > 0)
157
+        {
158
+            int64x2_t shift = vdupq_n_s64(-max);
159
+            neg0 = vshlq_s64(neg0, shift);
160
+            neg1 = vshlq_s64(neg1, shift);
161
+        }
162
+        cost0 = vsubq_s64(cost0, neg0);
163
+        cost1 = vsubq_s64(cost1, neg1);
164
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
165
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
166
+        vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
167
+        vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
168
+
169
+        blkPos += trSize;
170
+    }
171
+    int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
172
+    *totalUncodedCost += sum;
173
+    *totalRdCost += sum;
174
+}
175
+
176
+#else
177
+#error "MLS_CG_SIZE must be 4 for neon version"
178
+#endif
179
+
180
+
181
+
182
+template<int trSize>
183
+int  count_nonzero_neon(const int16_t *quantCoeff)
184
+{
185
+    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
186
+    int count = 0;
187
+    int16x8_t vcount = vdupq_n_s16(0);
188
+    const int numCoeff = trSize * trSize;
189
+    int i = 0;
190
+    for (; (i + 8) <= numCoeff; i += 8)
191
+    {
192
+        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
193
+        vcount = vaddq_s16(vcount, vtstq_s16(in, in));
194
+    }
195
+    for (; i < numCoeff; i++)
196
+    {
197
+        count += quantCoeff[i] != 0;
198
+    }
199
+
200
+    return count - vaddvq_s16(vcount);
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h Added
21
 
1
@@ -0,0 +1,19 @@
2
+#ifndef __DCT_PRIM_NEON_H__
3
+#define __DCT_PRIM_NEON_H__
4
+
5
+
6
+#include "common.h"
7
+#include "primitives.h"
8
+#include "contexts.h"   // costCoeffNxN_c
9
+#include "threading.h"  // CLZ
10
+
11
+namespace X265_NS
12
+{
13
+// x265 private namespace
14
+void setupDCTPrimitives_neon(EncoderPrimitives &p);
15
+};
16
+
17
+
18
+
19
+#endif
20
+
21
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp Added
201
 
1
@@ -0,0 +1,995 @@
2
+#if HAVE_NEON
3
+
4
+#include "filter-prim.h"
5
+#include <arm_neon.h>
6
+
7
+namespace
8
+{
9
+
10
+using namespace X265_NS;
11
+
12
+
13
+template<int width, int height>
14
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
15
+{
16
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
17
+    int row, col;
18
+    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
19
+    for (row = 0; row < height; row++)
20
+    {
21
+
22
+        for (col = 0; col < width; col += 8)
23
+        {
24
+            int16x8_t in;
25
+
26
+#if HIGH_BIT_DEPTH
27
+            in = *(int16x8_t *)&src[col];
28
+#else
29
+            in = vmovl_u8(*(uint8x8_t *)&src[col]);
30
+#endif
31
+
32
+            int16x8_t tmp = vshlq_n_s16(in, shift);
33
+            tmp = vsubq_s16(tmp, off);
34
+            *(int16x8_t *)&dst[col] = tmp;
35
+
36
+        }
37
+
38
+        src += srcStride;
39
+        dst += dstStride;
40
+    }
41
+}
42
+
43
+
44
+template<int N, int width, int height>
45
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
46
+{
47
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
48
+    int headRoom = IF_FILTER_PREC;
49
+    int offset = (1 << (headRoom - 1));
50
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
51
+    int cStride = 1;
52
+
53
+    src -= (N / 2 - 1) * cStride;
54
+    int16x8_t vc;
55
+    vc = *(int16x8_t *)coeff;
56
+    int16x4_t low_vc = vget_low_s16(vc);
57
+    int16x4_t high_vc = vget_high_s16(vc);
58
+
59
+    const int32x4_t voffset = vdupq_n_s32(offset);
60
+    const int32x4_t vhr = vdupq_n_s32(-headRoom);
61
+
62
+    int row, col;
63
+    for (row = 0; row < height; row++)
64
+    {
65
+        for (col = 0; col < width; col += 8)
66
+        {
67
+            int32x4_t vsum1, vsum2;
68
+
69
+            int16x8_t input[N];
70
+
71
+            for (int i = 0; i < N; i++)
72
+            {
73
+#if HIGH_BIT_DEPTH
74
+                input[i] = *(int16x8_t *)&src[col + i];
75
+#else
76
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
77
+#endif
78
+            }
79
+            vsum1 = voffset;
80
+            vsum2 = voffset;
81
+
82
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
83
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
84
+
85
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
86
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
87
+
88
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
89
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
90
+
91
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
92
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
93
+
94
+            if (N == 8)
95
+            {
96
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
97
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
98
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
99
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
100
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
101
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
102
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
103
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
104
+
105
+            }
106
+
107
+            vsum1 = vshlq_s32(vsum1, vhr);
108
+            vsum2 = vshlq_s32(vsum2, vhr);
109
+
110
+            int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
111
+            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
112
+            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
113
+#if HIGH_BIT_DEPTH
114
+            *(int16x8_t *)&dst[col] = vsum;
115
+#else
116
+            uint8x16_t usum = vuzp1q_u8(vsum, vsum);
117
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
118
+#endif
119
+
120
+        }
121
+
122
+        src += srcStride;
123
+        dst += dstStride;
124
+    }
125
+}
126
+
127
+#if HIGH_BIT_DEPTH
128
+
129
+template<int N, int width, int height>
130
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
131
+                          int isRowExt)
132
+{
133
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
134
+    const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
135
+    const int shift = IF_FILTER_PREC - headRoom;
136
+    const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
137
+
138
+    int blkheight = height;
139
+    src -= N / 2 - 1;
140
+
141
+    if (isRowExt)
142
+    {
143
+        src -= (N / 2 - 1) * srcStride;
144
+        blkheight += N - 1;
145
+    }
146
+    int16x8_t vc3 = vld1q_s16(coeff);
147
+    const int32x4_t voffset = vdupq_n_s32(offset);
148
+    const int32x4_t vhr = vdupq_n_s32(-shift);
149
+
150
+    int row, col;
151
+    for (row = 0; row < blkheight; row++)
152
+    {
153
+        for (col = 0; col < width; col += 8)
154
+        {
155
+            int32x4_t vsum, vsum2;
156
+
157
+            int16x8_t input[N];
158
+            for (int i = 0; i < N; i++)
159
+            {
160
+                input[i] = vld1q_s16((int16_t *)&src[col + i]);
161
+            }
162
+
163
+            vsum = voffset;
164
+            vsum2 = voffset;
165
+
166
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[0]), vget_low_s16(vc3), 0);
167
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
168
+
169
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[1]), vget_low_s16(vc3), 1);
170
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
171
+
172
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[2]), vget_low_s16(vc3), 2);
173
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
174
+
175
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[3]), vget_low_s16(vc3), 3);
176
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
177
+
178
+            if (N == 8)
179
+            {
180
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[4]), vget_high_s16(vc3), 0);
181
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], vget_high_s16(vc3), 0);
182
+
183
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[5]), vget_high_s16(vc3), 1);
184
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], vget_high_s16(vc3), 1);
185
+
186
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[6]), vget_high_s16(vc3), 2);
187
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], vget_high_s16(vc3), 2);
188
+
189
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[7]), vget_high_s16(vc3), 3);
190
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], vget_high_s16(vc3), 3);
191
+            }
192
+
193
+            vsum = vshlq_s32(vsum, vhr);
194
+            vsum2 = vshlq_s32(vsum2, vhr);
195
+            *(int16x4_t *)&dst[col] = vmovn_u32(vsum);
196
+            *(int16x4_t *)&dst[col+4] = vmovn_u32(vsum2);
197
+        }
198
+
199
+        src += srcStride;
200
+        dst += dstStride;
201
x265_3.6.tar.gz/source/common/aarch64/filter-prim.h Added
23
 
1
@@ -0,0 +1,21 @@
2
+#ifndef _FILTER_PRIM_ARM64_H__
3
+#define _FILTER_PRIM_ARM64_H__
4
+
5
+
6
+#include "common.h"
7
+#include "slicetype.h"      // LOWRES_COST_MASK
8
+#include "primitives.h"
9
+#include "x265.h"
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+void setupFilterPrimitives_neon(EncoderPrimitives &p);
17
+
18
+};
19
+
20
+
21
+#endif
22
+
23
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h Added
201
 
1
@@ -0,0 +1,256 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#define FUNCDEF_TU(ret, name, cpu, ...) \
26
+    ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
27
+    ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
28
+    ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
29
+    ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
30
+    ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
31
+
32
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
33
+    ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
34
+    ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
35
+    ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
36
+    ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
37
+    ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
38
+
39
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
40
+    ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
41
+    ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
42
+    ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
43
+    ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
44
+    ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
45
+
46
+#define FUNCDEF_PU(ret, name, cpu, ...) \
47
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
48
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
49
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
50
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
51
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
52
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
53
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
54
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
55
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
56
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
57
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
58
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
59
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
60
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
61
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
62
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
63
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
64
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
65
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
66
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
67
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
68
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
69
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
70
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
71
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
72
+
73
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
74
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
75
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
76
+    ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
77
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
78
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
79
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
80
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
81
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
82
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
83
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
84
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
85
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
86
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
87
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
88
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
89
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
90
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
91
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
92
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
93
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
94
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
95
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
96
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
97
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
98
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
99
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
100
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
101
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
102
+
103
+#define DECLS(cpu) \
104
+    FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
105
+    FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
106
+    FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
107
+    FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
108
+    FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
109
+    FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
110
+    FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
111
+    FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
112
+    FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
113
+    FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
114
+    FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
115
+    FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
116
+    FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
117
+    FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
118
+    FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
119
+    FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
120
+    FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
121
+    FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
122
+    FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
123
+    FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
124
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
125
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
126
+    FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
127
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
128
+    FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
129
+    FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
130
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
131
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
132
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
133
+    FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
134
+    FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
135
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
136
+    FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
137
+    FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
138
+    FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
139
+    FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
140
+    FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
141
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
142
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
143
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
144
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
145
+    FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
146
+    FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
147
+    FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
148
+    FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
149
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
150
+    FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
151
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
152
+    FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
153
+
154
+DECLS(neon);
155
+DECLS(sve);
156
+DECLS(sve2);
157
+
158
+
159
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
160
+
161
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
162
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
163
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
164
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
165
+
166
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
167
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
168
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
169
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
170
+
171
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
172
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
173
+
174
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
175
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
176
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
177
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
178
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
179
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
180
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
181
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
182
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
183
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
184
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
185
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
186
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
187
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
188
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
189
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
190
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
191
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
192
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
193
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
194
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
195
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
196
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
197
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
198
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
199
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp Added
201
 
1
@@ -0,0 +1,265 @@
2
+#include "common.h"
3
+#include "primitives.h"
4
+
5
+
6
+#if 1
7
+#include "arm64-utils.h"
8
+#include <arm_neon.h>
9
+
10
+using namespace X265_NS;
11
+
12
+namespace
13
+{
14
+
15
+
16
+
17
+template<int width>
18
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
19
+{
20
+    int width2 = width << 1;
21
+    // Flip the neighbours in the horizontal case.
22
+    int horMode = dirMode < 18;
23
+    pixel neighbourBuf129;
24
+    const pixel *srcPix = srcPix0;
25
+
26
+    if (horMode)
27
+    {
28
+        neighbourBuf0 = srcPix0;
29
+        //for (int i = 0; i < width << 1; i++)
30
+        //{
31
+        //    neighbourBuf1 + i = srcPixwidth2 + 1 + i;
32
+        //    neighbourBufwidth2 + 1 + i = srcPix1 + i;
33
+        //}
34
+        memcpy(&neighbourBuf1, &srcPixwidth2 + 1, sizeof(pixel) * (width << 1));
35
+        memcpy(&neighbourBufwidth2 + 1, &srcPix1, sizeof(pixel) * (width << 1));
36
+        srcPix = neighbourBuf;
37
+    }
38
+
39
+    // Intra prediction angle and inverse angle tables.
40
+    const int8_t angleTable17 = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
41
+    const int16_t invAngleTable8 = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
42
+
43
+    // Get the prediction angle.
44
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
45
+    int angle = angleTable8 + angleOffset;
46
+
47
+    // Vertical Prediction.
48
+    if (!angle)
49
+    {
50
+        for (int y = 0; y < width; y++)
51
+        {
52
+            memcpy(&dsty * dstStride, srcPix + 1, sizeof(pixel)*width);
53
+        }
54
+        if (bFilter)
55
+        {
56
+            int topLeft = srcPix0, top = srcPix1;
57
+            for (int y = 0; y < width; y++)
58
+            {
59
+                dsty * dstStride = x265_clip((int16_t)(top + ((srcPixwidth2 + 1 + y - topLeft) >> 1)));
60
+            }
61
+        }
62
+    }
63
+    else // Angular prediction.
64
+    {
65
+        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
66
+        pixel refBuf64;
67
+        const pixel *ref;
68
+
69
+        // Use the projected left neighbours and the top neighbours.
70
+        if (angle < 0)
71
+        {
72
+            // Number of neighbours projected.
73
+            int nbProjected = -((width * angle) >> 5) - 1;
74
+            pixel *ref_pix = refBuf + nbProjected + 1;
75
+
76
+            // Project the neighbours.
77
+            int invAngle = invAngleTable- angleOffset - 1;
78
+            int invAngleSum = 128;
79
+            for (int i = 0; i < nbProjected; i++)
80
+            {
81
+                invAngleSum += invAngle;
82
+                ref_pix- 2 - i = srcPixwidth2 + (invAngleSum >> 8);
83
+            }
84
+
85
+            // Copy the top-left and top pixels.
86
+            //for (int i = 0; i < width + 1; i++)
87
+            //ref_pix-1 + i = srcPixi;
88
+
89
+            memcpy(&ref_pix-1, srcPix, (width + 1)*sizeof(pixel));
90
+            ref = ref_pix;
91
+        }
92
+        else // Use the top and top-right neighbours.
93
+        {
94
+            ref = srcPix + 1;
95
+        }
96
+
97
+        // Pass every row.
98
+        int angleSum = 0;
99
+        for (int y = 0; y < width; y++)
100
+        {
101
+            angleSum += angle;
102
+            int offset = angleSum >> 5;
103
+            int fraction = angleSum & 31;
104
+
105
+            if (fraction) // Interpolate
106
+            {
107
+                if (width >= 8 && sizeof(pixel) == 1)
108
+                {
109
+                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
110
+                    const int16x8_t f1 = vdupq_n_s16(fraction);
111
+                    for (int x = 0; x < width; x += 8)
112
+                    {
113
+                        uint8x8_t in0 = *(uint8x8_t *)&refoffset + x;
114
+                        uint8x8_t in1 = *(uint8x8_t *)&refoffset + x + 1;
115
+                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
116
+                        lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
117
+                        lo = vshrq_n_s16(lo, 5);
118
+                        *(uint8x8_t *)&dsty * dstStride + x = vmovn_u16(lo);
119
+                    }
120
+                }
121
+                else if (width >= 4 && sizeof(pixel) == 2)
122
+                {
123
+                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
124
+                    const int32x4_t f1 = vdupq_n_s32(fraction);
125
+                    for (int x = 0; x < width; x += 4)
126
+                    {
127
+                        uint16x4_t in0 = *(uint16x4_t *)&refoffset + x;
128
+                        uint16x4_t in1 = *(uint16x4_t *)&refoffset + x + 1;
129
+                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
130
+                        lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
131
+                        lo = vshrq_n_s32(lo, 5);
132
+                        *(uint16x4_t *)&dsty * dstStride + x = vmovn_u32(lo);
133
+                    }
134
+                }
135
+                else
136
+                {
137
+                    for (int x = 0; x < width; x++)
138
+                    {
139
+                        dsty * dstStride + x = (pixel)(((32 - fraction) * refoffset + x + fraction * refoffset + x + 1 + 16) >> 5);
140
+                    }
141
+                }
142
+            }
143
+            else // Copy.
144
+            {
145
+                memcpy(&dsty * dstStride, &refoffset, sizeof(pixel)*width);
146
+            }
147
+        }
148
+    }
149
+
150
+    // Flip for horizontal.
151
+    if (horMode)
152
+    {
153
+        if (width == 8)
154
+        {
155
+            transpose8x8(dst, dst, dstStride, dstStride);
156
+        }
157
+        else if (width == 16)
158
+        {
159
+            transpose16x16(dst, dst, dstStride, dstStride);
160
+        }
161
+        else if (width == 32)
162
+        {
163
+            transpose32x32(dst, dst, dstStride, dstStride);
164
+        }
165
+        else
166
+        {
167
+            for (int y = 0; y < width - 1; y++)
168
+            {
169
+                for (int x = y + 1; x < width; x++)
170
+                {
171
+                    pixel tmp              = dsty * dstStride + x;
172
+                    dsty * dstStride + x = dstx * dstStride + y;
173
+                    dstx * dstStride + y = tmp;
174
+                }
175
+            }
176
+        }
177
+    }
178
+}
179
+
180
+template<int log2Size>
181
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
182
+{
183
+    const int size = 1 << log2Size;
184
+    for (int mode = 2; mode <= 34; mode++)
185
+    {
186
+        pixel *srcPix  = (g_intraFilterFlagsmode & size ? filtPix  : refPix);
187
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
188
+
189
+        intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
190
+
191
+        // Optimize code don't flip buffer
192
+        bool modeHor = (mode < 18);
193
+
194
+        // transpose the block if this is a horizontal mode
195
+        if (modeHor)
196
+        {
197
+            if (size == 8)
198
+            {
199
+                transpose8x8(out, out, size, size);
200
+            }
201
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.h Added
17
 
1
@@ -0,0 +1,15 @@
2
+#ifndef INTRAPRED_PRIM_H__ // include guard: skip the whole header if it was already seen
3
+#define INTRAPRED_PRIM_H__ // define the guard so that repeated inclusion is actually suppressed
4
+#if defined(__aarch64__) // declarations below exist only for AArch64 builds
5
+
6
+namespace X265_NS
7
+{
8
+// x265 private namespace
9
+
10
+void setupIntraPrimitives_neon(EncoderPrimitives &p); // declaration only; takes an EncoderPrimitives by non-const reference
11
+}
12
+
13
+#endif // defined(__aarch64__)
14
+
15
+#endif // INTRAPRED_PRIM_H__
16
+
17
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S Added
201
 
1
@@ -0,0 +1,1436 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+// Macros below follow these conventions:
29
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
+// - constants in registers: v24, v25, v26, v27, v31
31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
+// - _32b macros output a result in v17.4s
33
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
+
35
+#include "asm.S"
36
+
37
+.arch           armv8-a
38
+
39
+#ifdef __APPLE__
40
+.section __RODATA,__rodata
41
+#else
42
+.section .rodata
43
+#endif
44
+
45
+.align 4
46
+
47
+.macro vextin8 v // build the byte-shifted input vectors for filter variant v from the pair d6:d7
48
+    ldp             d6, d7, x11, #16 // NOTE(review): operand shows as "x11, #16" in this listing; presumably a post-indexed load from x11 - confirm against the original file
49
+.if \v == 0
50
+    // qpel_filter_0 only uses values in v3
51
+    ext             v3.8b, v6.8b, v7.8b, #4
52
+.else
53
+.if \v != 3 // qpel_filter_3 does not use values in v0
54
+    ext             v0.8b, v6.8b, v7.8b, #1
55
+.endif
56
+    ext             v1.8b, v6.8b, v7.8b, #2
57
+    ext             v2.8b, v6.8b, v7.8b, #3
58
+    ext             v3.8b, v6.8b, v7.8b, #4
59
+    ext             v4.8b, v6.8b, v7.8b, #5
60
+    ext             v5.8b, v6.8b, v7.8b, #6
61
+    ext             v6.8b, v6.8b, v7.8b, #7 // v6 is overwritten last because every ext above still reads the original v6
62
+.endif
63
+.endm
64
+
65
+.macro vextin8_64 v
66
+    ldp             q6, q7, x11, #32
67
+.if \v == 0
68
+    // qpel_filter_0 only uses values in v3
69
+    ext             v3.16b, v6.16b, v7.16b, #4
70
+.else
71
+.if \v != 3
72
+    // qpel_filter_3 does not use values in v0
73
+    ext             v0.16b, v6.16b, v7.16b, #1
74
+.endif
75
+    ext             v1.16b, v6.16b, v7.16b, #2
76
+    ext             v2.16b, v6.16b, v7.16b, #3
77
+    ext             v3.16b, v6.16b, v7.16b, #4
78
+    ext             v4.16b, v6.16b, v7.16b, #5
79
+    ext             v5.16b, v6.16b, v7.16b, #6
80
+.if \v == 1
81
+    ext             v6.16b, v6.16b, v7.16b, #7
82
+    // qpel_filter_1 does not use v7
83
+.else
84
+    ext             v16.16b, v6.16b, v7.16b, #7
85
+    ext             v7.16b, v6.16b, v7.16b, #8
86
+    mov             v6.16b, v16.16b
87
+.endif
88
+.endif
89
+.endm
90
+
91
+.macro vextin8_chroma v
92
+    ldp             d6, d7, x11, #16
93
+.if \v == 0
94
+    // qpel_filter_chroma_0 only uses values in v1
95
+    ext             v1.8b, v6.8b, v7.8b, #2
96
+.else
97
+    ext             v0.8b, v6.8b, v7.8b, #1
98
+    ext             v1.8b, v6.8b, v7.8b, #2
99
+    ext             v2.8b, v6.8b, v7.8b, #3
100
+    ext             v3.8b, v6.8b, v7.8b, #4
101
+.endif
102
+.endm
103
+
104
+.macro vextin8_chroma_64 v
105
+    ldp             q16, q17, x11, #32
106
+.if \v == 0
107
+    // qpel_filter_chroma_0 only uses values in v1
108
+    ext             v1.16b, v16.16b, v17.16b, #2
109
+.else
110
+    ext             v0.16b, v16.16b, v17.16b, #1
111
+    ext             v1.16b, v16.16b, v17.16b, #2
112
+    ext             v2.16b, v16.16b, v17.16b, #3
113
+    ext             v3.16b, v16.16b, v17.16b, #4
114
+.endif
115
+.endm
116
+
117
+.macro qpel_load_32b v
118
+.if \v == 0
119
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
120
+    ld1             {v3.8b}, x6, x1
121
+.elseif \v == 1 || \v == 2 || \v == 3
122
+.if \v != 3                           // not used in qpel_filter_3
123
+    ld1             {v0.8b}, x6, x1
124
+.else
125
+    add             x6, x6, x1
126
+.endif
127
+    ld1             {v1.8b}, x6, x1
128
+    ld1             {v2.8b}, x6, x1
129
+    ld1             {v3.8b}, x6, x1
130
+    ld1             {v4.8b}, x6, x1
131
+    ld1             {v5.8b}, x6, x1
132
+.if \v != 1                           // not used in qpel_filter_1
133
+    ld1             {v6.8b}, x6, x1
134
+    ld1             {v7.8b}, x6
135
+.else
136
+    ld1             {v6.8b}, x6
137
+.endif
138
+.endif
139
+.endm
140
+
141
+.macro qpel_load_64b v
142
+.if \v == 0
143
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
144
+    ld1             {v3.16b}, x6, x1
145
+.elseif \v == 1 || \v == 2 || \v == 3
146
+.if \v != 3                           // not used in qpel_filter_3
147
+    ld1             {v0.16b}, x6, x1
148
+.else
149
+    add             x6, x6, x1
150
+.endif
151
+    ld1             {v1.16b}, x6, x1
152
+    ld1             {v2.16b}, x6, x1
153
+    ld1             {v3.16b}, x6, x1
154
+    ld1             {v4.16b}, x6, x1
155
+    ld1             {v5.16b}, x6, x1
156
+.if \v != 1                           // not used in qpel_filter_1
157
+    ld1             {v6.16b}, x6, x1
158
+    ld1             {v7.16b}, x6
159
+.else
160
+    ld1             {v6.16b}, x6
161
+.endif
162
+.endif
163
+.endm
164
+
165
+.macro qpel_chroma_load_32b v
166
+.if \v == 0
167
+    // qpel_filter_chroma_0 only uses values in v1
168
+    add             x6, x6, x1
169
+    ldr             d1, x6
170
+.else
171
+    ld1             {v0.8b}, x6, x1
172
+    ld1             {v1.8b}, x6, x1
173
+    ld1             {v2.8b}, x6, x1
174
+    ld1             {v3.8b}, x6
175
+.endif
176
+.endm
177
+
178
+.macro qpel_chroma_load_64b v
179
+.if \v == 0
180
+    // qpel_filter_chroma_0 only uses values in v1
181
+    add             x6, x6, x1
182
+    ldr             q1, x6
183
+.else
184
+    ld1             {v0.16b}, x6, x1
185
+    ld1             {v1.16b}, x6, x1
186
+    ld1             {v2.16b}, x6, x1
187
+    ld1             {v3.16b}, x6
188
+.endif
189
+.endm
190
+
191
+//          a, b,   c,  d,  e,   f, g,  h
192
+// .hword   0, 0,   0, 64,  0,   0, 0,  0
193
+.macro qpel_start_0
194
+    movi            v24.16b, #64
195
+.endm
196
+
197
+.macro qpel_filter_0_32b
198
+    umull           v17.8h, v3.8b, v24.8b    // 64*d
199
+.endm
200
+
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S Added
201
 
1
@@ -0,0 +1,1282 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm-sve.S"
40
+#include "ipfilter-common.S"
41
+
42
+.arch armv8-a+sve2
43
+
44
+#ifdef __APPLE__
45
+.section __RODATA,__rodata
46
+#else
47
+.section .rodata
48
+#endif
49
+
50
+.align 4
51
+
52
+.text
53
+
54
+.macro qpel_load_32b_sve2 v
55
+.if \v == 0
56
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
57
+    ld1b            {z3.h}, p0/z, x6
58
+    add             x6, x6, x1
59
+.elseif \v == 1 || \v == 2 || \v == 3
60
+.if \v != 3                           // not used in qpel_filter_3
61
+    ld1b            {z0.h}, p0/z, x6
62
+    add             x6, x6, x1
63
+.else
64
+    add             x6, x6, x1
65
+.endif
66
+    ld1b            {z1.h}, p0/z, x6
67
+    add             x6, x6, x1
68
+    ld1b            {z2.h}, p0/z, x6
69
+    add             x6, x6, x1
70
+    ld1b            {z3.h}, p0/z, x6
71
+    add             x6, x6, x1
72
+    ld1b            {z4.h}, p0/z, x6
73
+    add             x6, x6, x1
74
+    ld1b            {z5.h}, p0/z, x6
75
+    add             x6, x6, x1
76
+.if \v != 1                           // not used in qpel_filter_1
77
+    ld1b            {z6.h}, p0/z, x6
78
+    add             x6, x6, x1
79
+    ld1b            {z7.h}, p0/z, x6
80
+.else
81
+    ld1b            {z6.h}, p0/z, x6
82
+.endif
83
+.endif
84
+.endm
85
+
86
+.macro qpel_load_64b_sve2_gt_16 v
87
+.if \v == 0
88
+    add             x6, x6, x11       // do not load 3 values that are not used in qpel_filter_0
89
+    ld1b            {z3.h}, p2/z, x6
90
+    add             x6, x6, x1
91
+.elseif \v == 1 || \v == 2 || \v == 3
92
+.if \v != 3                           // not used in qpel_filter_3
93
+    ld1b            {z0.h}, p2/z, x6
94
+    add             x6, x6, x1
95
+.else
96
+    add             x6, x6, x1
97
+.endif
98
+    ld1b            {z1.h}, p2/z, x6
99
+    add             x6, x6, x1
100
+    ld1b            {z2.h}, p2/z, x6
101
+    add             x6, x6, x1
102
+    ld1b            {z3.h}, p2/z, x6
103
+    add             x6, x6, x1
104
+    ld1b            {z4.h}, p2/z, x6
105
+    add             x6, x6, x1
106
+    ld1b            {z5.h}, p2/z, x6
107
+    add             x6, x6, x1
108
+.if \v != 1                           // not used in qpel_filter_1
109
+    ld1b            {z6.h}, p2/z, x6
110
+    add             x6, x6, x1
111
+    ld1b            {z7.h}, p2/z, x6
112
+.else
113
+    ld1b            {z6.h}, p2/z, x6
114
+.endif
115
+.endif
116
+.endm
117
+
118
+.macro qpel_chroma_load_32b_sve2 v
119
+.if \v == 0
120
+    // qpel_filter_chroma_0 only uses values in v1
121
+    add             x6, x6, x1
122
+    ld1b            {z1.h}, p0/z, x6
123
+.else
124
+    ld1b            {z0.h}, p0/z, x6
125
+    add             x6, x6, x1
126
+    ld1b            {z1.h}, p0/z, x6
127
+    add             x6, x6, x1
128
+    ld1b            {z2.h}, p0/z, x6
129
+    add             x6, x6, x1
130
+    ld1b            {z3.h}, p0/z, x6
131
+.endif
132
+.endm
133
+
134
+.macro qpel_start_sve2_0
135
+    mov             z24.h, #64
136
+.endm
137
+
138
+.macro qpel_filter_sve2_0_32b
139
+    mul             z17.h, z3.h, z24.h    // 64*d
140
+.endm
141
+
142
+.macro qpel_filter_sve2_0_64b
143
+    qpel_filter_sve2_0_32b
144
+    mul             z18.h, z11.h, z24.h
145
+.endm
146
+
147
+.macro qpel_start_sve2_1
148
+    mov             z24.h, #58
149
+    mov             z25.h, #10
150
+    mov             z26.h, #17
151
+    mov             z27.h, #5
152
+.endm
153
+
154
+.macro qpel_filter_sve2_1_32b
155
+    mul             z19.h, z2.h, z25.h  // c*10
156
+    mul             z17.h, z3.h, z24.h  // d*58
157
+    mul             z21.h, z4.h, z26.h  // e*17
158
+    mul             z23.h, z5.h, z27.h  // f*5
159
+    sub             z17.h, z17.h, z19.h // d*58 - c*10
160
+    lsl             z18.h, z1.h, #2      // b*4
161
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
+    sub             z21.h, z6.h, z0.h   // g - a
163
+    add             z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
+    sub             z21.h, z21.h, z23.h // g - a - f*5
165
+    add             z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
+.endm
167
+
168
+.macro qpel_filter_sve2_1_64b
169
+    qpel_filter_sve2_1_32b
170
+    mul             z20.h, z10.h, z25.h  // c*10
171
+    mul             z18.h, z11.h, z24.h  // d*58
172
+    mul             z21.h, z12.h, z26.h  // e*17
173
+    mul             z23.h, z13.h, z27.h  // f*5
174
+    sub             z18.h, z18.h, z20.h   // d*58 - c*10
175
+    lsl             z28.h, z30.h, #2       // b*4
176
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17
177
+    sub             z21.h, z14.h, z29.h   // g - a
178
+    add             z18.h, z18.h, z28.h   // d*58 - c*10 + e*17 + b*4
179
+    sub             z21.h, z21.h, z23.h   // g - a - f*5
180
+    add             z18.h, z18.h, z21.h   // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
+.endm
182
+
183
+.macro qpel_start_sve2_2
184
+    mov             z24.h, #11
185
+    mov             z25.h, #40
186
+.endm
187
+
188
+.macro qpel_filter_sve2_2_32b
189
+    add             z17.h, z3.h, z4.h     // d + e
190
+    add             z19.h, z2.h, z5.h     // c + f
191
+    add             z23.h, z1.h, z6.h     // b + g
192
+    add             z21.h, z0.h, z7.h     // a + h
193
+    mul             z17.h, z17.h, z25.h   // 40 * (d + e)
194
+    mul             z19.h, z19.h, z24.h   // 11 * (c + f)
195
+    lsl             z23.h, z23.h, #2       // (b + g) * 4
196
+    add             z19.h, z19.h, z21.h   // 11 * (c + f) + a + h
197
+    add             z17.h, z17.h, z23.h   // 40 * (d + e) + (b + g) * 4
198
+    sub             z17.h, z17.h, z19.h   // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
+.endm
200
+
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S Added
201
 
1
@@ -0,0 +1,1054 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm.S"
40
+#include "ipfilter-common.S"
41
+
42
+#ifdef __APPLE__
43
+.section __RODATA,__rodata
44
+#else
45
+.section .rodata
46
+#endif
47
+
48
+.align 4
49
+
50
+.text
51
+
52
+// ***** luma_vpp *****
53
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
+.macro LUMA_VPP_4xN h
55
+function x265_interp_8tap_vert_pp_4x\h\()_neon
56
+    movrel          x10, g_luma_s16
57
+    sub             x0, x0, x1
58
+    sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
59
+    lsl             x4, x4, #4
60
+    ldr             q0, x10, x4              // q0 = luma interpolate coeff
61
+    dup             v24.8h, v0.h0
62
+    dup             v25.8h, v0.h1
63
+    trn1            v24.2d, v24.2d, v25.2d
64
+    dup             v26.8h, v0.h2
65
+    dup             v27.8h, v0.h3
66
+    trn1            v26.2d, v26.2d, v27.2d
67
+    dup             v28.8h, v0.h4
68
+    dup             v29.8h, v0.h5
69
+    trn1            v28.2d, v28.2d, v29.2d
70
+    dup             v30.8h, v0.h6
71
+    dup             v31.8h, v0.h7
72
+    trn1            v30.2d, v30.2d, v31.2d
73
+
74
+    // prepare to load 8 lines
75
+    ld1             {v0.s}0, x0, x1
76
+    ld1             {v0.s}1, x0, x1
77
+    ushll           v0.8h, v0.8b, #0
78
+    ld1             {v1.s}0, x0, x1
79
+    ld1             {v1.s}1, x0, x1
80
+    ushll           v1.8h, v1.8b, #0
81
+    ld1             {v2.s}0, x0, x1
82
+    ld1             {v2.s}1, x0, x1
83
+    ushll           v2.8h, v2.8b, #0
84
+    ld1             {v3.s}0, x0, x1
85
+    ld1             {v3.s}1, x0, x1
86
+    ushll           v3.8h, v3.8b, #0
87
+
88
+    mov             x9, #\h
89
+.loop_4x\h:
90
+    ld1             {v4.s}0, x0, x1
91
+    ld1             {v4.s}1, x0, x1
92
+    ushll           v4.8h, v4.8b, #0
93
+
94
+    // row0-1
95
+    mul             v16.8h, v0.8h, v24.8h
96
+    ext             v21.16b, v0.16b, v1.16b, #8
97
+    mul             v17.8h, v21.8h, v24.8h
98
+    mov             v0.16b, v1.16b
99
+
100
+    // row2-3
101
+    mla             v16.8h, v1.8h, v26.8h
102
+    ext             v21.16b, v1.16b, v2.16b, #8
103
+    mla             v17.8h, v21.8h, v26.8h
104
+    mov             v1.16b, v2.16b
105
+
106
+    // row4-5
107
+    mla             v16.8h, v2.8h, v28.8h
108
+    ext             v21.16b, v2.16b, v3.16b, #8
109
+    mla             v17.8h, v21.8h, v28.8h
110
+    mov             v2.16b, v3.16b
111
+
112
+    // row6-7
113
+    mla             v16.8h, v3.8h, v30.8h
114
+    ext             v21.16b, v3.16b, v4.16b, #8
115
+    mla             v17.8h, v21.8h, v30.8h
116
+    mov             v3.16b, v4.16b
117
+
118
+    // sum row0-7
119
+    trn1            v20.2d, v16.2d, v17.2d
120
+    trn2            v21.2d, v16.2d, v17.2d
121
+    add             v16.8h, v20.8h, v21.8h
122
+
123
+    sqrshrun        v16.8b,  v16.8h,  #6
124
+    st1             {v16.s}0, x2, x3
125
+    st1             {v16.s}1, x2, x3
126
+
127
+    sub             x9, x9, #2
128
+    cbnz            x9, .loop_4x\h
129
+    ret
130
+endfunc
131
+.endm
132
+
133
+LUMA_VPP_4xN 4
134
+LUMA_VPP_4xN 8
135
+LUMA_VPP_4xN 16
136
+
137
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
+.macro LUMA_VPP w, h // emits one w-by-h entry point that dispatches on the value in x4 to one of four filter expansions
139
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
+    cmp             x4, #0 // x4 is presumably coeffIdx (fifth parameter of the prototype above) - confirm
141
+    b.eq            0f
142
+    cmp             x4, #1 // NOTE(review): coeffIdx is an int but the full 64-bit x4 is compared; this relies on the caller leaving the upper 32 bits zero - confirm against the ABI in use
143
+    b.eq            1f
144
+    cmp             x4, #2
145
+    b.eq            2f
146
+    cmp             x4, #3
147
+    b.eq            3f
148
+0: // also reached by fall-through when x4 matched none of 0..3
149
+    FILTER_LUMA_VPP \w, \h, 0
150
+1: // NOTE(review): FILTER_LUMA_VPP is not visible here; each expansion presumably ends with its own return, otherwise case 0 would run on into case 1 - confirm
151
+    FILTER_LUMA_VPP \w, \h, 1
152
+2:
153
+    FILTER_LUMA_VPP \w, \h, 2
154
+3:
155
+    FILTER_LUMA_VPP \w, \h, 3
156
+endfunc
157
+.endm
158
+
159
+LUMA_VPP 8, 4
160
+LUMA_VPP 8, 8
161
+LUMA_VPP 8, 16
162
+LUMA_VPP 8, 32
163
+LUMA_VPP 12, 16
164
+LUMA_VPP 16, 4
165
+LUMA_VPP 16, 8
166
+LUMA_VPP 16, 16
167
+LUMA_VPP 16, 32
168
+LUMA_VPP 16, 64
169
+LUMA_VPP 16, 12
170
+LUMA_VPP 24, 32
171
+LUMA_VPP 32, 8
172
+LUMA_VPP 32, 16
173
+LUMA_VPP 32, 32
174
+LUMA_VPP 32, 64
175
+LUMA_VPP 32, 24
176
+LUMA_VPP 48, 64
177
+LUMA_VPP 64, 16
178
+LUMA_VPP 64, 32
179
+LUMA_VPP 64, 64
180
+LUMA_VPP 64, 48
181
+
182
+// ***** luma_vps *****
183
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
+.macro LUMA_VPS_4xN h
185
+function x265_interp_8tap_vert_ps_4x\h\()_neon
186
+    lsl             x3, x3, #1
187
+    lsl             x5, x4, #6
188
+    lsl             x4, x1, #2
189
+    sub             x4, x4, x1
190
+    sub             x0, x0, x4
191
+
192
+    mov             w6, #8192
193
+    dup             v28.4s, w6
194
+    mov             x4, #\h
195
+    movrel          x12, g_lumaFilter
196
+    add             x12, x12, x5
197
+    ld1r            {v16.2d}, x12, #8
198
+    ld1r            {v17.2d}, x12, #8
199
+    ld1r            {v18.2d}, x12, #8
200
+    ld1r            {v19.2d}, x12, #8
201
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp Added
201
 
1
@@ -0,0 +1,291 @@
2
+#include "loopfilter-prim.h"
3
+
4
+#define PIXEL_MIN 0
5
+
6
+
7
+
8
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
9
+#include<arm_neon.h>
10
+
11
+namespace
12
+{
13
+
14
+
15
+/* get the sign of input variable (TODO: this is a dup, make common) */
16
+static inline int8_t signOf(int x)
17
+{
18
+    return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));  // -1 for x < 0, 1 for x > 0, 0 for x == 0; relies on 32-bit int and an arithmetic >> of negative values
19
+}
20
+
21
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
22
+{   // Per-lane sign of (in0 - in1): each output lane is -1, 0 or +1.
23
+    int16x8_t in = vreinterpretq_s16_u16(vsubl_u8(in0, in1));   // vsubl_u8 returns uint16x8_t; its wrapped bits are the signed difference
24
+    return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));   // clamp to [-1, 1], then narrow to 8 bits
25
+}
26
+
27
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
28
+{   // Writes dst[x] = sign(src1[x] - src2[x]) for every x in [0, endX).
29
+    int x = 0;
30
+    for (; (x + 8) <= endX; x += 8)   // eight pixels per iteration
31
+    {
32
+        *(int8x8_t *)&dst[x]  = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
33
+    }
34
+
35
+    for (; x < endX; x++)   // scalar tail for the last endX % 8 pixels
36
+    {
37
+        dst[x] = signOf(src1[x] - src2[x]);
38
+    }
39
+}
40
+
41
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
42
+{
43
+    // Processes two rows: each pixel is classified against its left and right neighbour and offsetEo[edgeType] is added in place.
44
+
45
+    int y;
46
+    int8_t signRight, signLeft0;
47
+    int8_t edgeType;
48
+
49
+    for (y = 0; y < 2; y++)
50
+    {
51
+        signLeft0 = signLeft[y];   // sign carried in from the left of this row
52
+        int x = 0;
53
+
54
+        if (width >= 8)
55
+        {
56
+            int8x8_t vsignRight;
57
+            int8x8x2_t shifter;
58
+            shifter.val[1][0] = signLeft0;
59
+            static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};   // table index 8 selects shifter.val[1][0]
60
+            int8x8_t tbl = *(int8x8_t *)offsetEo;   // reads 8 bytes starting at offsetEo
61
+            for (; (x + 8) <= width; x += 8)
62
+            {
63
+                uint8x8_t in = *(uint8x8_t *)&rec[x];
64
+                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
65
+                shifter.val[0] = vneg_s8(vsignRight);
66
+                int8x8_t tmp = shifter.val[0];
67
+                int8x8_t edge = vtbl2_s8(shifter, index);   // left-neighbour signs: the carried sign followed by lanes 0..6 of -vsignRight
68
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
69
+                shifter.val[1][0] = tmp[7];   // carry the last lane into the next group of eight
70
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
71
+                t1 = vaddw_u8(t1, in);
72
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
73
+                t1 = vminq_s16(t1, vdupq_n_s16(255));   // clamp to the 8-bit pixel range before narrowing
74
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
75
+            }
76
+            signLeft0 = shifter.val[1][0];
77
+        }
78
+        for (; x < width; x++)   // scalar tail
79
+        {
80
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
81
+            edgeType = signRight + signLeft0 + 2;
82
+            signLeft0 = -signRight;
83
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
84
+        }
85
+        rec += stride;
86
+    }
87
+}
88
+
89
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
90
+{   // One row: edgeType = sign(rec[x] - rec[x + stride]) + upBuff1[x] + 2; upBuff1[x] is then replaced by the negated sign.
91
+    int x = 0;
92
+    int8_t signDown;
93
+    int edgeType;
94
+
95
+    if (width >= 8)
96
+    {
97
+        int8x8_t tbl = *(int8x8_t *)offsetEo;   // reads 8 bytes starting at offsetEo
98
+        for (; (x + 8) <= width; x += 8)
99
+        {
100
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
101
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
102
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
103
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
104
+            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
105
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
106
+            t1 = vaddw_u8(t1, in0);
107
+            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);   // saturating narrow clamps to [0, 255]
108
+        }
109
+    }
110
+    for (; x < width; x++)   // scalar tail
111
+    {
112
+        signDown = signOf(rec[x] - rec[x + stride]);
113
+        edgeType = signDown + upBuff1[x] + 2;
114
+        upBuff1[x] = -signDown;
115
+        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
116
+    }
117
+}
118
+
119
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
120
+{   // Same per-pixel update as the single-row variant, applied to two consecutive rows; upBuff1 carries the signs between them.
121
+    int y;
122
+    int8_t signDown;
123
+    int edgeType;
124
+
125
+    for (y = 0; y < 2; y++)
126
+    {
127
+        int x = 0;
128
+        if (width >= 8)
129
+        {
130
+            int8x8_t tbl = *(int8x8_t *)offsetEo;   // reads 8 bytes starting at offsetEo
131
+            for (; (x + 8) <= width; x += 8)
132
+            {
133
+                uint8x8_t in0 = *(uint8x8_t *)&rec[x];
134
+                uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
135
+                int8x8_t vsignDown = sign_diff_neon(in0, in1);
136
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
137
+                *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
138
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
139
+                t1 = vaddw_u8(t1, in0);
140
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
141
+                t1 = vminq_s16(t1, vdupq_n_s16(255));   // clamp to the 8-bit pixel range before narrowing
142
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
143
+
144
+            }
145
+        }
146
+        for (; x < width; x++)   // scalar tail
147
+        {
148
+            signDown = signOf(rec[x] - rec[x + stride]);
149
+            edgeType = signDown + upBuff1[x] + 2;
150
+            upBuff1[x] = -signDown;
151
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
152
+        }
153
+        rec += stride;
154
+    }
155
+}
156
+
157
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
158
+{   // One row against the down-right neighbour rec[x + stride + 1]; reads signs from buff1[x] and writes the negated sign to bufft[x + 1].
159
+    int x;
160
+
161
+    if (abs(buff1 - bufft) < 16)   // buffers less than 16 bytes apart: scalar only (presumably so the 8-byte store to bufft cannot clobber unread buff1 entries -- confirm)
162
+    {
163
+        for (x = 0; x < width; x++)
164
+        {
165
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
166
+            int edgeType = signDown + buff1[x] + 2;
167
+            bufft[x + 1] = -signDown;
168
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
169
+        }
170
+    }
171
+    else
172
+    {
173
+        int8x8_t tbl = *(int8x8_t *)offsetEo;   // reads 8 bytes starting at offsetEo
174
+        x = 0;
175
+        for (; (x + 8) <= width; x += 8)
176
+        {
177
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
178
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
179
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
180
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
181
+            *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
182
+            int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
183
+            t1 = vaddw_u8(t1, in0);
184
+            t1 = vmaxq_s16(t1, vdupq_n_s16(0));
185
+            t1 = vminq_s16(t1, vdupq_n_s16(255));   // clamp to the 8-bit pixel range before narrowing
186
+            *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
187
+        }
188
+        for (; x < width; x++)   // scalar tail
189
+        {
190
+            int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
191
+            int edgeType = signDown + buff1[x] + 2;
192
+            bufft[x + 1] = -signDown;
193
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
194
+        }
195
+
196
+    }
197
+}
198
+
199
+
200
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
201
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.h Added
18
 
1
@@ -0,0 +1,16 @@
2
+#ifndef _LOOPFILTER_NEON_H__  // include guard
3
+#define _LOOPFILTER_NEON_H__
4
+
5
+#include "common.h"
6
+#include "primitives.h"
7
+
8
+#define PIXEL_MIN 0  // loopfilter-prim.cpp defines PIXEL_MIN to the same value right after including this header
9
+
10
+namespace X265_NS
11
+{
12
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);  // presumably installs the NEON loop-filter/SAO routines into p -- definition not visible here
13
+
14
+};
15
+
16
+
17
+#endif
18
x265_3.6.tar.gz/source/common/aarch64/mc-a-common.S Added
50
 
1
@@ -0,0 +1,48 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.macro addAvg_start    // shared prologue: scales x3/x4 and loads the constant that addavg_1 adds
37
+    lsl             x3, x3, #1    // x3 *= 2 (presumably an int16_t stride converted from elements to bytes -- confirm against callers)
38
+    lsl             x4, x4, #1    // x4 *= 2 (same assumption)
39
+    mov             w11, #0x40
40
+    dup             v30.16b, w11    // every byte of v30 = 0x40, so each 16-bit lane reads as 0x4040
41
+.endm
42
+
43
+.macro addavg_1 v0, v1    // \v0 = (\v0 + \v1 + v30) >> 7 per 16-bit lane; uses v16 and v17 as scratch
44
+    add             \v0\().8h, \v0\().8h, \v1\().8h    // 16-bit lane sum, kept in \v0
45
+    saddl           v16.4s, \v0\().4h, v30.4h    // low four lanes: sign-extend to 32 bits and add v30's 16-bit lanes
46
+    saddl2          v17.4s, \v0\().8h, v30.8h    // high four lanes likewise
47
+    shrn            \v0\().4h, v16.4s, #7    // shift right by 7 and narrow back to 16 bits
48
+    shrn2           \v0\().8h, v17.4s, #7
49
+.endm
50
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S Added
201
 
1
@@ -0,0 +1,924 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "mc-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_avg_pp_12x16_sve2)    // 16 rows of 12 bytes: [x0] = rounding average of [x2] and [x4]
41
+    sub             x1, x1, #4    // each stride register is reduced by the 4 bytes added separately per row
42
+    sub             x3, x3, #4
43
+    sub             x5, x5, #4
44
+    ptrue           p0.s, vl1    // one 32-bit lane: bytes 0..3 of a row
45
+    ptrue           p1.b, vl8    // eight byte lanes: bytes 4..11 of a row
46
+    mov             x11, #4
47
+.rept 16
48
+    ld1w            {z0.s}, p0/z, [x2]
49
+    ld1b            {z1.b}, p1/z, [x2, x11]
50
+    ld1w            {z2.s}, p0/z, [x4]
51
+    ld1b            {z3.b}, p1/z, [x4, x11]
52
+    add             x2, x2, #4
53
+    add             x2, x2, x3
54
+    add             x4, x4, #4
55
+    add             x4, x4, x5
56
+    urhadd          z0.b, p1/m, z0.b, z2.b    // unsigned rounding halving add
57
+    urhadd          z1.b, p1/m, z1.b, z3.b
58
+    st1b            {z0.b}, p1, [x0]    // writes 8 bytes; bytes 4..7 are overwritten by the next store
59
+    st1b            {z1.b}, p1, [x0, x11]
60
+    add             x0, x0, #4
61
+    add             x0, x0, x1
62
+.endr
63
+    ret
64
+endfunc
65
+
66
+function PFX(pixel_avg_pp_24x32_sve2)    // 32 rows of 24 bytes: [x0] = rounding average of [x2] and [x4]
66
+    mov             w12, #4    // 4 outer iterations x 8 unrolled rows
67
+    rdvl            x9, #1    // x9 = SVE vector length in bytes
68
+    cmp             x9, #16
69
+    bgt             .vl_gt_16_pixel_avg_pp_24x32
70
+    sub             x1, x1, #16    // 128-bit path: strides compensate for the 16-byte post-increment
71
+    sub             x3, x3, #16
72
+    sub             x5, x5, #16
73
+.lpavg_24x32_sve2:
74
+    sub             w12, w12, #1
75
+.rept 8
76
+    ld1             {v0.16b}, [x2], #16
77
+    ld1             {v1.8b}, [x2], x3
78
+    ld1             {v2.16b}, [x4], #16
79
+    ld1             {v3.8b}, [x4], x5
80
+    urhadd          v0.16b, v0.16b, v2.16b
81
+    urhadd          v1.8b, v1.8b, v3.8b
82
+    st1             {v0.16b}, [x0], #16
83
+    st1             {v1.8b}, [x0], x1
84
+.endr
85
+    cbnz            w12, .lpavg_24x32_sve2
86
+    ret
87
+.vl_gt_16_pixel_avg_pp_24x32:
88
+    mov             x10, #24
89
+    mov             x11, #0
90
+    whilelt         p0.b, x11, x10    // first 24 byte lanes active
91
+.vl_gt_16_loop_pixel_avg_pp_24x32:
92
+    sub             w12, w12, #1
93
+.rept 8
94
+    ld1b            {z0.b}, p0/z, [x2]
95
+    ld1b            {z2.b}, p0/z, [x4]
96
+    add             x2, x2, x3
97
+    add             x4, x4, x5
98
+    urhadd          z0.b, p0/m, z0.b, z2.b
99
+    st1b            {z0.b}, p0, [x0]
100
+    add             x0, x0, x1
101
+.endr
102
+    cbnz            w12, .vl_gt_16_loop_pixel_avg_pp_24x32
103
+    ret
104
+endfunc
106
+
107
+.macro pixel_avg_pp_32xN_sve2 h    // fully unrolled 32-byte-wide rounding average over h rows
108
+function PFX(pixel_avg_pp_32x\h\()_sve2)
109
+    rdvl            x9, #1    // x9 = SVE vector length in bytes
110
+    cmp             x9, #16
111
+    bgt             .vl_gt_16_pixel_avg_pp_32_\h
112
+.rept \h
113
+    ld1             {v0.16b-v1.16b}, [x2], x3
114
+    ld1             {v2.16b-v3.16b}, [x4], x5
115
+    urhadd          v0.16b, v0.16b, v2.16b
116
+    urhadd          v1.16b, v1.16b, v3.16b
117
+    st1             {v0.16b-v1.16b}, [x0], x1
118
+.endr
119
+    ret
120
+.vl_gt_16_pixel_avg_pp_32_\h:
121
+    ptrue           p0.b, vl32    // one predicated vector covers the 32-byte row
122
+.rept \h
123
+    ld1b            {z0.b}, p0/z, [x2]
124
+    ld1b            {z2.b}, p0/z, [x4]
125
+    add             x2, x2, x3
126
+    add             x4, x4, x5
127
+    urhadd          z0.b, p0/m, z0.b, z2.b
128
+    st1b            {z0.b}, p0, [x0]
129
+    add             x0, x0, x1
130
+.endr
131
+    ret
132
+endfunc
133
+.endm
134
+
135
+pixel_avg_pp_32xN_sve2 8
136
+pixel_avg_pp_32xN_sve2 16
137
+pixel_avg_pp_32xN_sve2 24
138
+
139
+.macro pixel_avg_pp_32xN1_sve2 h    // looped variant: h/8 iterations of 8 unrolled 32-byte rows
140
+function PFX(pixel_avg_pp_32x\h\()_sve2)
141
+    rdvl            x9, #1    // x9 = SVE vector length in bytes
142
+    cmp             x9, #16
143
+    bgt             .vl_gt_16_pixel_avg_pp_32xN1_\h
144
+    mov             w12, #\h / 8
145
+.lpavg_sve2_32x\h\():
146
+    sub             w12, w12, #1
147
+.rept 8
148
+    ld1             {v0.16b-v1.16b}, [x2], x3
149
+    ld1             {v2.16b-v3.16b}, [x4], x5
150
+    urhadd          v0.16b, v0.16b, v2.16b
151
+    urhadd          v1.16b, v1.16b, v3.16b
152
+    st1             {v0.16b-v1.16b}, [x0], x1
153
+.endr
154
+    cbnz            w12, .lpavg_sve2_32x\h
155
+    ret
156
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
157
+    ptrue           p0.b, vl32    // one predicated vector covers the 32-byte row
158
+    mov             w12, #\h / 8
159
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
160
+    sub             w12, w12, #1
161
+.rept 8
162
+    ld1b            {z0.b}, p0/z, [x2]
163
+    ld1b            {z2.b}, p0/z, [x4]
164
+    add             x2, x2, x3
165
+    add             x4, x4, x5
166
+    urhadd          z0.b, p0/m, z0.b, z2.b
167
+    st1b            {z0.b}, p0, [x0]
168
+    add             x0, x0, x1
169
+.endr
170
+    cbnz            w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
171
+    ret
172
+endfunc
173
+.endm
174
+
175
+pixel_avg_pp_32xN1_sve2 32
176
+pixel_avg_pp_32xN1_sve2 64
177
+
178
+function PFX(pixel_avg_pp_48x64_sve2)
179
+    rdvl            x9, #1
180
+    cmp             x9, #16
181
+    bgt             .vl_gt_16_pixel_avg_pp_48x64
182
+    mov             w12, #8
183
+.lpavg_48x64_sve2:
184
+    sub             w12, w12, #1
185
+.rept 8
186
+    ld1             {v0.16b-v2.16b}, x2, x3
187
+    ld1             {v3.16b-v5.16b}, x4, x5
188
+    urhadd          v0.16b, v0.16b, v3.16b
189
+    urhadd          v1.16b, v1.16b, v4.16b
190
+    urhadd          v2.16b, v2.16b, v5.16b
191
+    st1             {v0.16b-v2.16b}, x0, x1
192
+.endr
193
+    cbnz            w12, .lpavg_48x64_sve2
194
+    ret
195
+.vl_gt_16_pixel_avg_pp_48x64:
196
+    cmp             x9, #32
197
+    bgt             .vl_gt_32_pixel_avg_pp_48x64
198
+    ptrue           p0.b, vl32
199
+    ptrue           p1.b, vl16
200
+    mov             w12, #8
201
x265_3.5.tar.gz/source/common/aarch64/mc-a.S -> x265_3.6.tar.gz/source/common/aarch64/mc-a.S Changed
201
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -22,15 +23,20 @@
12
  *****************************************************************************/
13
 
14
 #include "asm.S"
15
+#include "mc-a-common.S"
16
 
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
 .section .rodata
21
+#endif
22
 
23
 .align 4
24
 
25
 .text
26
 
27
 .macro pixel_avg_pp_4xN_neon h
28
-function x265_pixel_avg_pp_4x\h\()_neon
29
+function PFX(pixel_avg_pp_4x\h\()_neon)
30
 .rept \h
31
     ld1             {v0.s}0, x2, x3
32
     ld1             {v1.s}0, x4, x5
33
@@ -46,7 +52,7 @@
34
 pixel_avg_pp_4xN_neon 16
35
 
36
 .macro pixel_avg_pp_8xN_neon h
37
-function x265_pixel_avg_pp_8x\h\()_neon
38
+function PFX(pixel_avg_pp_8x\h\()_neon)
39
 .rept \h
40
     ld1             {v0.8b}, x2, x3
41
     ld1             {v1.8b}, x4, x5
42
@@ -61,3 +67,491 @@
43
 pixel_avg_pp_8xN_neon 8
44
 pixel_avg_pp_8xN_neon 16
45
 pixel_avg_pp_8xN_neon 32
46
+
47
+function PFX(pixel_avg_pp_12x16_neon)    // 16 rows of 12 bytes: [x0] = rounding average of [x2] and [x4]
48
+    sub             x1, x1, #4    // strides compensate for the 4-byte post-increment of the first access in each row
49
+    sub             x3, x3, #4
50
+    sub             x5, x5, #4
51
+.rept 16
52
+    ld1             {v0.s}[0], [x2], #4
53
+    ld1             {v1.8b}, [x2], x3
54
+    ld1             {v2.s}[0], [x4], #4
55
+    ld1             {v3.8b}, [x4], x5
56
+    urhadd          v4.8b, v0.8b, v2.8b    // unsigned rounding halving add
57
+    urhadd          v5.8b, v1.8b, v3.8b
58
+    st1             {v4.s}[0], [x0], #4
59
+    st1             {v5.8b}, [x0], x1
60
+.endr
61
+    ret
62
+endfunc
63
+
64
+.macro pixel_avg_pp_16xN_neon h    // fully unrolled 16-byte-wide rounding average over h rows
65
+function PFX(pixel_avg_pp_16x\h\()_neon)
66
+.rept \h
67
+    ld1             {v0.16b}, [x2], x3
68
+    ld1             {v1.16b}, [x4], x5
69
+    urhadd          v2.16b, v0.16b, v1.16b    // unsigned rounding halving add
70
+    st1             {v2.16b}, [x0], x1
71
+.endr
72
+    ret
73
+endfunc
74
+.endm
75
+
76
+pixel_avg_pp_16xN_neon 4
77
+pixel_avg_pp_16xN_neon 8
78
+pixel_avg_pp_16xN_neon 12
79
+pixel_avg_pp_16xN_neon 16
80
+pixel_avg_pp_16xN_neon 32
81
+
82
+function PFX(pixel_avg_pp_16x64_neon)    // 64 rows of 16 bytes: 8 iterations of 8 unrolled rows
83
+    mov             w12, #8
84
+.lpavg_16x64:
85
+    sub             w12, w12, #1
86
+.rept 8
87
+    ld1             {v0.16b}, [x2], x3
88
+    ld1             {v1.16b}, [x4], x5
89
+    urhadd          v2.16b, v0.16b, v1.16b    // unsigned rounding halving add
90
+    st1             {v2.16b}, [x0], x1
91
+.endr
92
+    cbnz            w12, .lpavg_16x64
93
+    ret
94
+endfunc
95
+
96
+function PFX(pixel_avg_pp_24x32_neon)    // 32 rows of 24 bytes: 4 iterations of 8 unrolled rows
97
+    sub             x1, x1, #16    // strides compensate for the 16-byte post-increment of the first access in each row
98
+    sub             x3, x3, #16
99
+    sub             x5, x5, #16
100
+    mov             w12, #4
101
+.lpavg_24x32:
102
+    sub             w12, w12, #1
103
+.rept 8
104
+    ld1             {v0.16b}, [x2], #16
105
+    ld1             {v1.8b}, [x2], x3
106
+    ld1             {v2.16b}, [x4], #16
107
+    ld1             {v3.8b}, [x4], x5
108
+    urhadd          v0.16b, v0.16b, v2.16b
109
+    urhadd          v1.8b, v1.8b, v3.8b
110
+    st1             {v0.16b}, [x0], #16
111
+    st1             {v1.8b}, [x0], x1
112
+.endr
113
+    cbnz            w12, .lpavg_24x32
114
+    ret
115
+endfunc
116
+
117
+.macro pixel_avg_pp_32xN_neon h    // fully unrolled 32-byte-wide rounding average over h rows
118
+function PFX(pixel_avg_pp_32x\h\()_neon)
119
+.rept \h
120
+    ld1             {v0.16b-v1.16b}, [x2], x3
121
+    ld1             {v2.16b-v3.16b}, [x4], x5
122
+    urhadd          v0.16b, v0.16b, v2.16b
123
+    urhadd          v1.16b, v1.16b, v3.16b
124
+    st1             {v0.16b-v1.16b}, [x0], x1
125
+.endr
126
+    ret
127
+endfunc
128
+.endm
129
+
130
+pixel_avg_pp_32xN_neon 8
131
+pixel_avg_pp_32xN_neon 16
132
+pixel_avg_pp_32xN_neon 24
133
+
134
+.macro pixel_avg_pp_32xN1_neon h    // looped variant: h/8 iterations of 8 unrolled 32-byte rows
135
+function PFX(pixel_avg_pp_32x\h\()_neon)
136
+    mov             w12, #\h / 8
137
+.lpavg_32x\h\():
138
+    sub             w12, w12, #1
139
+.rept 8
140
+    ld1             {v0.16b-v1.16b}, [x2], x3
141
+    ld1             {v2.16b-v3.16b}, [x4], x5
142
+    urhadd          v0.16b, v0.16b, v2.16b
143
+    urhadd          v1.16b, v1.16b, v3.16b
144
+    st1             {v0.16b-v1.16b}, [x0], x1
145
+.endr
146
+    cbnz            w12, .lpavg_32x\h
147
+    ret
148
+endfunc
149
+.endm
150
+
151
+pixel_avg_pp_32xN1_neon 32
152
+pixel_avg_pp_32xN1_neon 64
153
+
154
+function PFX(pixel_avg_pp_48x64_neon)    // 64 rows of 48 bytes: 8 iterations of 8 unrolled rows
155
+    mov             w12, #8
156
+.lpavg_48x64:
157
+    sub             w12, w12, #1
158
+.rept 8
159
+    ld1             {v0.16b-v2.16b}, [x2], x3
160
+    ld1             {v3.16b-v5.16b}, [x4], x5
161
+    urhadd          v0.16b, v0.16b, v3.16b
162
+    urhadd          v1.16b, v1.16b, v4.16b
163
+    urhadd          v2.16b, v2.16b, v5.16b
164
+    st1             {v0.16b-v2.16b}, [x0], x1
165
+.endr
166
+    cbnz            w12, .lpavg_48x64
167
+    ret
168
+endfunc
169
+
170
+.macro pixel_avg_pp_64xN_neon h    // h/4 iterations of 4 unrolled 64-byte rows
171
+function PFX(pixel_avg_pp_64x\h\()_neon)
172
+    mov             w12, #\h / 4
173
+.lpavg_64x\h\():
174
+    sub             w12, w12, #1
175
+.rept 4
176
+    ld1             {v0.16b-v3.16b}, [x2], x3
177
+    ld1             {v4.16b-v7.16b}, [x4], x5
178
+    urhadd          v0.16b, v0.16b, v4.16b
179
+    urhadd          v1.16b, v1.16b, v5.16b
180
+    urhadd          v2.16b, v2.16b, v6.16b
181
+    urhadd          v3.16b, v3.16b, v7.16b
182
+    st1             {v0.16b-v3.16b}, [x0], x1
183
+.endr
184
+    cbnz            w12, .lpavg_64x\h
185
+    ret
186
+endfunc
187
+.endm
188
+
189
+pixel_avg_pp_64xN_neon 16
190
+pixel_avg_pp_64xN_neon 32
191
+pixel_avg_pp_64xN_neon 48
192
+pixel_avg_pp_64xN_neon 64
193
+
194
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
195
+.macro addAvg_2xN h
196
+function PFX(addAvg_2x\h\()_neon)
197
+    addAvg_start
198
+.rept \h / 2
199
+    ldr             w10, x0
200
+    ldr             w11, x1
201
x265_3.6.tar.gz/source/common/aarch64/p2s-common.S Added
104
 
1
@@ -0,0 +1,102 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+#if HIGH_BIT_DEPTH
39
+# if BIT_DEPTH == 10
40
+#  define P2S_SHIFT 4
41
+# elif BIT_DEPTH == 12
42
+#  define P2S_SHIFT 2
43
+# endif
44
+.macro p2s_start    // high-bit-depth prologue: doubles both x3 and x1
45
+    add             x3, x3, x3    // x3 *= 2
46
+    add             x1, x1, x1    // x1 *= 2
47
+    movi            v31.8h, #0xe0, lsl #8    // every 16-bit lane of v31 = 0xe000 (-8192 as signed 16-bit)
48
+.endm
49
+
50
+#else // if !HIGH_BIT_DEPTH
51
+# define P2S_SHIFT 6
52
+.macro p2s_start    // 8-bit prologue: only x3 is doubled
53
+    add             x3, x3, x3    // x3 *= 2
54
+    movi            v31.8h, #0xe0, lsl #8    // every 16-bit lane of v31 = 0xe000 (-8192 as signed 16-bit)
55
+.endm
56
+#endif // HIGH_BIT_DEPTH
57
+
58
+.macro p2s_2x2    // two rows of two pixels: (pixel << P2S_SHIFT) + v31, stored as 16-bit values
59
+#if HIGH_BIT_DEPTH
60
+    ld1             {v0.s}[0], [x0], x1
61
+    ld1             {v0.s}[1], [x0], x1
62
+    shl             v3.8h, v0.8h, #P2S_SHIFT
63
+#else
64
+    ldrh            w10, [x0]    // row 0: two 8-bit pixels
65
+    add             x0, x0, x1
66
+    ldrh            w11, [x0]    // row 1
67
+    orr             w10, w10, w11, lsl #16    // pack both rows into one 32-bit word
68
+    add             x0, x0, x1
69
+    dup             v0.4s, w10
70
+    ushll           v3.8h, v0.8b, #P2S_SHIFT    // zero-extend to 16 bits and shift
71
+#endif
72
+    add             v3.8h, v3.8h, v31.8h
73
+    st1             {v3.s}[0], [x2], x3
74
+    st1             {v3.s}[1], [x2], x3
75
+.endm
76
+
77
+.macro p2s_6x2    // two rows of six pixels: (pixel << P2S_SHIFT) + v31, stored as 16-bit values (4 + 2 per row)
78
+#if HIGH_BIT_DEPTH
79
+    ld1             {v0.d}[0], [x0], #8
80
+    ld1             {v1.s}[0], [x0], x1
81
+    ld1             {v0.d}[1], [x0], #8
82
+    ld1             {v1.s}[1], [x0], x1
83
+    shl             v3.8h, v0.8h, #P2S_SHIFT
84
+    shl             v4.8h, v1.8h, #P2S_SHIFT
85
+#else
86
+    ldr             s0, [x0]    // row 0, pixels 0..3
87
+    ldrh            w10, [x0, #4]    // row 0, pixels 4..5
88
+    add             x0, x0, x1
89
+    ld1             {v0.s}[1], [x0]    // row 1, pixels 0..3
90
+    ldrh            w11, [x0, #4]    // row 1, pixels 4..5
91
+    add             x0, x0, x1
92
+    orr             w10, w10, w11, lsl #16    // pack pixels 4..5 of both rows into one word
93
+    dup             v1.4s, w10
94
+    ushll           v3.8h, v0.8b, #P2S_SHIFT
95
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
96
+#endif
97
+    add             v3.8h, v3.8h, v31.8h
98
+    add             v4.8h, v4.8h, v31.8h
99
+    st1             {v3.d}[0], [x2], #8
100
+    st1             {v4.s}[0], [x2], x3
101
+    st1             {v3.d}[1], [x2], #8
102
+    st1             {v4.s}[1], [x2], x3
103
+.endm
104
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S Added
201
 
1
@@ -0,0 +1,445 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "p2s-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+#if HIGH_BIT_DEPTH // NOTE(review): p2s-common.S, included above, already defines P2S_SHIFT with the same values
41
+# if BIT_DEPTH == 10
42
+#  define P2S_SHIFT 4
43
+# elif BIT_DEPTH == 12
44
+#  define P2S_SHIFT 2
45
+# endif
46
+
47
+.macro p2s_start_sve    // high-bit-depth prologue: doubles both x3 and x1
48
+    add             x3, x3, x3    // x3 *= 2
49
+    add             x1, x1, x1    // x1 *= 2
50
+    mov             z31.h, #0xe0, lsl #8    // every 16-bit lane of z31 = 0xe000 (-8192 as signed 16-bit)
51
+.endm
52
+
53
+#else // if !HIGH_BIT_DEPTH
54
+# define P2S_SHIFT 6
55
+.macro p2s_start_sve    // 8-bit prologue: only x3 is doubled
56
+    add             x3, x3, x3    // x3 *= 2
57
+    mov             z31.h, #0xe0, lsl #8    // every 16-bit lane of z31 = 0xe000 (-8192 as signed 16-bit)
58
+.endm
59
+
60
+#endif // HIGH_BIT_DEPTH
61
+
62
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
63
+.macro p2s_2xN_sve h    // emits filterPixelToShort_2x<h>_sve
64
+function PFX(filterPixelToShort_2x\h\()_sve)
65
+    p2s_start_sve    // sets z31; the NEON p2s_2x2 below reads v31 (presumably the low 128 bits of z31 -- confirm)
66
+.rept \h / 2
67
+    p2s_2x2    // NEON macro from p2s-common.S, two rows per expansion
68
+.endr
69
+    ret
70
+endfunc
71
+.endm
72
+
73
+p2s_2xN_sve 4
74
+p2s_2xN_sve 8
75
+p2s_2xN_sve 16
76
+
77
+.macro p2s_6xN_sve h    // emits filterPixelToShort_6x<h>_sve
78
+function PFX(filterPixelToShort_6x\h\()_sve)
79
+    p2s_start_sve
80
+    sub             x3, x3, #8    // p2s_6x2 advances x2 by 8 before its stride step
81
+#if HIGH_BIT_DEPTH
82
+    sub             x1, x1, #8    // likewise for x0 in the high-bit-depth loads
83
+#endif
84
+.rept \h / 2
85
+    p2s_6x2    // NEON macro from p2s-common.S, two rows per expansion
86
+.endr
87
+    ret
88
+endfunc
89
+.endm
90
+
91
+p2s_6xN_sve 8
92
+p2s_6xN_sve 16
93
+
94
+function PFX(filterPixelToShort_4x2_sve)    // two rows of four pixels: (pixel << P2S_SHIFT) + z31
95
+    p2s_start_sve
96
+#if HIGH_BIT_DEPTH
97
+    ptrue           p0.h, vl8
98
+    index           z1.d, #0, x1    // per-lane byte offsets 0, x1, ... for the gather
99
+    index           z2.d, #0, x3    // per-lane byte offsets 0, x3, ... for the scatter
100
+    ld1d            {z3.d}, p0/z, [x0, z1.d]
101
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
102
+    add             z3.h, p0/m, z3.h, z31.h
103
+    st1d            {z3.d}, p0, [x2, z2.d]
104
+#else
105
+    ptrue           p0.h, vl4
106
+    ld1b            {z0.h}, p0/z, [x0]    // four bytes, each widened into a 16-bit lane
107
+    add             x0, x0, x1
108
+    ld1b            {z1.h}, p0/z, [x0]
109
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
110
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
111
+    add             z0.h, p0/m, z0.h, z31.h
112
+    add             z1.h, p0/m, z1.h, z31.h
113
+    st1h            {z0.h}, p0, [x2]
114
+    add             x2, x2, x3
115
+    st1h            {z1.h}, p0, [x2]
116
+#endif
117
+    ret
118
+endfunc
119
+
120
+
121
+.macro p2s_8xN_sve h    // emits filterPixelToShort_8x<h>_sve: per row, (pixel << P2S_SHIFT) + z31 for eight pixels
122
+function PFX(filterPixelToShort_8x\h\()_sve)
123
+    p2s_start_sve
124
+    ptrue           p0.h, vl8
125
+.rept \h
126
+#if HIGH_BIT_DEPTH
127
+    ld1d            {z0.d}, p0/z, [x0]
128
+    add             x0, x0, x1
129
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
130
+    add             z0.h, p0/m, z0.h, z31.h
131
+    st1h            {z0.h}, p0, [x2]
132
+    add             x2, x2, x3
133
+#else
134
+    ld1b            {z0.h}, p0/z, [x0]    // eight bytes, each widened into a 16-bit lane
135
+    add             x0, x0, x1
136
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
137
+    add             z0.h, p0/m, z0.h, z31.h
138
+    st1h            {z0.h}, p0, [x2]
139
+    add             x2, x2, x3
140
+#endif
141
+.endr
142
+    ret
143
+endfunc
144
+.endm
145
+
146
+p2s_8xN_sve 2
147
+
148
+.macro p2s_32xN_sve h
149
+function PFX(filterPixelToShort_32x\h\()_sve)
150
+#if HIGH_BIT_DEPTH
151
+    p2s_start_sve
152
+    rdvl            x9, #1
153
+    cmp             x9, #16
154
+    bgt             .vl_gt_16_filterPixelToShort_high_32x\h
155
+    ptrue           p0.h, vl8
156
+.rept \h
157
+    ld1h            {z0.h}, p0/z, x0
158
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
159
+    ld1h            {z2.h}, p0/z, x0, #2, mul vl
160
+    ld1h            {z3.h}, p0/z, x0, #3, mul vl
161
+    add             x0, x0, x1
162
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
163
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
164
+    lsl             z2.h, p0/m, z2.h, #P2S_SHIFT
165
+    lsl             z3.h, p0/m, z3.h, #P2S_SHIFT
166
+    add             z0.h, p0/m, z0.h, z31.h
167
+    add             z1.h, p0/m, z1.h, z31.h
168
+    add             z2.h, p0/m, z2.h, z31.h
169
+    add             z3.h, p0/m, z3.h, z31.h
170
+    st1h            {z0.h}, p0, x2
171
+    st1h            {z1.h}, p0, x2, #1, mul vl
172
+    st1h            {z2.h}, p0, x2, #2, mul vl
173
+    st1h            {z3.h}, p0, x2, #3, mul vl
174
+    add             x2, x2, x3
175
+.endr
176
+    ret
177
+.vl_gt_16_filterPixelToShort_high_32x\h\():
178
+    cmp             x9, #48
179
+    bgt             .vl_gt_48_filterPixelToShort_high_32x\h
180
+    ptrue           p0.h, vl16
181
+.rept \h
182
+    ld1h            {z0.h}, p0/z, x0
183
+    ld1h            {z1.h}, p0/z, x0, #1, mul vl
184
+    add             x0, x0, x1
185
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
186
+    lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
187
+    add             z0.h, p0/m, z0.h, z31.h
188
+    add             z1.h, p0/m, z1.h, z31.h
189
+    st1h            {z0.h}, p0, x2
190
+    st1h            {z1.h}, p0, x2, #1, mul vl
191
+    add             x2, x2, x3
192
+.endr
193
+    ret
194
+.vl_gt_48_filterPixelToShort_high_32x\h\():
195
+    ptrue           p0.h, vl32
196
+.rept \h
197
+    ld1h            {z0.h}, p0/z, x0
198
+    add             x0, x0, x1
199
+    lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
200
+    add             z0.h, p0/m, z0.h, z31.h
201
x265_3.6.tar.gz/source/common/aarch64/p2s.S Added
201
 
1
@@ -0,0 +1,386 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "p2s-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
39
+.macro p2s_2xN h                  // emits filterPixelToShort_2x<h>_neon; h is the height suffix and repeat count source
40
+function PFX(filterPixelToShort_2x\h\()_neon)
41
+    p2s_start                     // prologue macro defined outside this file (p2s-common.S is included above); presumably prepares v31 and the strides -- confirm there
42
+.rept \h / 2                      // the p2s_2x2 helper is expanded h/2 times (by its name it handles two rows per expansion -- confirm)
43
+    p2s_2x2
44
+.endr
45
+    ret
46
+endfunc
47
+.endm
48
+
49
+p2s_2xN 4
50
+p2s_2xN 8
51
+p2s_2xN 16
52
+
53
+.macro p2s_6xN h                  // emits filterPixelToShort_6x<h>_neon; h is the height suffix and repeat count source
54
+function PFX(filterPixelToShort_6x\h\()_neon)
55
+    p2s_start                     // prologue macro defined outside this file; presumably prepares v31 and the strides -- confirm
56
+    sub             x3, x3, #8    // dst stride reduced by 8 bytes; presumably p2s_6x2 advances x2 by 8 inside each row -- confirm in p2s-common.S
57
+#if HIGH_BIT_DEPTH
58
+    sub             x1, x1, #8    // same adjustment for the src stride, only in the high bit depth build
59
+#endif
60
+.rept \h / 2                      // p2s_6x2 is expanded h/2 times
61
+    p2s_6x2
62
+.endr
63
+    ret
64
+endfunc
65
+.endm
66
+
67
+p2s_6xN 8
68
+p2s_6xN 16
69
+
70
+function PFX(filterPixelToShort_4x2_neon)   // 4x2 pixel-to-short: x0 = src, x1 = srcStride, x2 = dst, x3 = dstStride (per the prototype comment earlier in this file)
71
+    p2s_start                     // prologue macro defined outside this file; v31 is used below but not set here, so presumably it is loaded by this macro -- confirm
72
+#if HIGH_BIT_DEPTH
73
+    ld1             {v0.d}0, x0, x1   // row 0 (4 x 16-bit pixels) into the low 64-bit lane; NOTE(review): the brackets of lane index and address look stripped in this listing
74
+    ld1             {v0.d}1, x0, x1   // row 1 into the high 64-bit lane
75
+    shl             v3.8h, v0.8h, #P2S_SHIFT   // scale both rows at once
76
+#else
77
+    ld1             {v0.s}0, x0, x1   // row 0 (4 x 8-bit pixels) into 32-bit lane 0
78
+    ld1             {v0.s}1, x0, x1   // row 1 into 32-bit lane 1
79
+    ushll           v3.8h, v0.8b, #P2S_SHIFT   // widen the 8 bytes to 16 bits and scale
80
+#endif
81
+    add             v3.8h, v3.8h, v31.8h   // add the per-lane constant held in v31
82
+    st1             {v3.d}0, x2, x3   // store row 0 (low 64 bits), then advance dst by x3
83
+    st1             {v3.d}1, x2, x3   // store row 1 (high 64 bits)
84
+    ret
85
+endfunc
86
+
87
+function PFX(filterPixelToShort_4x4_neon)   // 4x4 pixel-to-short, done as two 4x2 passes (rows 0-1 via v0/v3, rows 2-3 via v1/v4)
88
+    p2s_start                     // prologue macro defined outside this file; presumably loads v31 -- confirm
89
+#if HIGH_BIT_DEPTH
90
+    ld1             {v0.d}0, x0, x1   // row 0 into the low 64-bit lane
91
+    ld1             {v0.d}1, x0, x1   // row 1 into the high 64-bit lane
92
+    shl             v3.8h, v0.8h, #P2S_SHIFT
93
+#else
94
+    ld1             {v0.s}0, x0, x1   // row 0 into 32-bit lane 0
95
+    ld1             {v0.s}1, x0, x1   // row 1 into 32-bit lane 1
96
+    ushll           v3.8h, v0.8b, #P2S_SHIFT   // widen to 16 bits and scale
97
+#endif
98
+    add             v3.8h, v3.8h, v31.8h   // add the per-lane constant held in v31
99
+    st1             {v3.d}0, x2, x3   // store row 0
100
+    st1             {v3.d}1, x2, x3   // store row 1
101
+#if HIGH_BIT_DEPTH
102
+    ld1             {v1.d}0, x0, x1   // row 2
103
+    ld1             {v1.d}1, x0, x1   // row 3
104
+    shl             v4.8h, v1.8h, #P2S_SHIFT
105
+#else
106
+    ld1             {v1.s}0, x0, x1   // row 2
107
+    ld1             {v1.s}1, x0, x1   // row 3
108
+    ushll           v4.8h, v1.8b, #P2S_SHIFT
109
+#endif
110
+    add             v4.8h, v4.8h, v31.8h
111
+    st1             {v4.d}0, x2, x3   // store row 2
112
+    st1             {v4.d}1, x2, x3   // store row 3
113
+    ret
114
+endfunc
115
+
116
+.macro p2s_4xN h                  // emits filterPixelToShort_4x<h>_neon; the body below handles two rows and is repeated h/2 times
117
+function PFX(filterPixelToShort_4x\h\()_neon)
118
+    p2s_start                     // prologue macro defined outside this file; presumably loads v31 -- confirm
119
+.rept \h / 2
120
+#if HIGH_BIT_DEPTH
121
+    ld1             {v0.16b}, x0, x1   // loads a full 16-byte vector, although only the low 4 halfwords are used below
122
+    shl             v0.8h, v0.8h, #P2S_SHIFT
123
+#else
124
+    ld1             {v0.8b}, x0, x1    // loads 8 bytes, although only the low 4 are used below
125
+    ushll           v0.8h, v0.8b, #P2S_SHIFT   // widen to 16 bits and scale
126
+#endif
127
+    add             v2.4h, v0.4h, v31.4h   // only the low four lanes are offset and stored
128
+    st1             {v2.4h}, x2, x3
129
+#if HIGH_BIT_DEPTH
130
+    ld1             {v1.16b}, x0, x1   // second row of the pair, same pattern with v1/v3
131
+    shl             v1.8h, v1.8h, #P2S_SHIFT
132
+#else
133
+    ld1             {v1.8b}, x0, x1
134
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
135
+#endif
136
+    add             v3.4h, v1.4h, v31.4h
137
+    st1             {v3.4h}, x2, x3
138
+.endr
139
+    ret
140
+endfunc
141
+.endm
142
+
143
+p2s_4xN 8
144
+p2s_4xN 16
145
+p2s_4xN 32
146
+
147
+.macro p2s_8xN h                  // emits filterPixelToShort_8x<h>_neon; the body below handles two rows and is repeated h/2 times
148
+function PFX(filterPixelToShort_8x\h\()_neon)
149
+    p2s_start                     // prologue macro defined outside this file; presumably loads v31 -- confirm
150
+.rept \h / 2
151
+#if HIGH_BIT_DEPTH
152
+    ld1             {v0.16b}, x0, x1   // row A: 8 x 16-bit pixels
153
+    ld1             {v1.16b}, x0, x1   // row B
154
+    shl             v0.8h, v0.8h, #P2S_SHIFT
155
+    shl             v1.8h, v1.8h, #P2S_SHIFT
156
+#else
157
+    ld1             {v0.8b}, x0, x1    // row A: 8 x 8-bit pixels
158
+    ld1             {v1.8b}, x0, x1    // row B
159
+    ushll           v0.8h, v0.8b, #P2S_SHIFT   // widen to 16 bits and scale
160
+    ushll           v1.8h, v1.8b, #P2S_SHIFT
161
+#endif
162
+    add             v2.8h, v0.8h, v31.8h   // add the per-lane constant held in v31
163
+    st1             {v2.8h}, x2, x3
164
+    add             v3.8h, v1.8h, v31.8h
165
+    st1             {v3.8h}, x2, x3
166
+.endr
167
+    ret
168
+endfunc
169
+.endm
170
+
171
+p2s_8xN 2
172
+p2s_8xN 4
173
+p2s_8xN 6
174
+p2s_8xN 8
175
+p2s_8xN 12
176
+p2s_8xN 16
177
+p2s_8xN 32
178
+p2s_8xN 64
179
+
180
+.macro p2s_12xN h
181
+function PFX(filterPixelToShort_12x\h\()_neon)
182
+    p2s_start
183
+    sub             x3, x3, #16
184
+.rept \h
185
+#if HIGH_BIT_DEPTH
186
+    ld1             {v0.16b-v1.16b}, x0, x1
187
+    shl             v2.8h, v0.8h, #P2S_SHIFT
188
+    shl             v3.8h, v1.8h, #P2S_SHIFT
189
+#else
190
+    ld1             {v0.16b}, x0, x1
191
+    ushll           v2.8h, v0.8b,  #P2S_SHIFT
192
+    ushll2          v3.8h, v0.16b, #P2S_SHIFT
193
+#endif
194
+    add             v2.8h, v2.8h, v31.8h
195
+    add             v3.8h, v3.8h, v31.8h
196
+    st1             {v2.16b}, x2, #16
197
+    st1             {v3.8b}, x2, x3
198
+.endr
199
+    ret
200
+endfunc
201
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp Added
201
 
1
@@ -0,0 +1,2059 @@
2
+#include "common.h"
3
+#include "slicetype.h"      // LOWRES_COST_MASK
4
+#include "primitives.h"
5
+#include "x265.h"
6
+
7
+#include "pixel-prim.h"
8
+#include "arm64-utils.h"
9
+#if HAVE_NEON
10
+
11
+#include <arm_neon.h>
12
+
13
+using namespace X265_NS;
14
+
15
+
16
+
17
+namespace
18
+{
19
+
20
+
21
+/* SATD SA8D variants - based on x264 */
22
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)  // butterfly step: writes a + b to sum and a - b to sub
23
+{
24
+    sum = vaddq_s16(a, b);  // lane-wise 16-bit add
25
+    sub = vsubq_s16(a, b);  // lane-wise 16-bit subtract
26
+}
27
+
28
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)  // 16-bit transpose step on a pair of vectors
29
+{
30
+    t1 = vtrn1q_s16(s1, s2);  // even-indexed 16-bit lanes of s1 and s2, interleaved
31
+    t2 = vtrn2q_s16(s1, s2);  // odd-indexed 16-bit lanes of s1 and s2, interleaved
32
+}
33
+
34
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)  // transpose step on 32-bit groups (pairs of 16-bit lanes)
35
+{
36
+    t1 = vtrn1q_s32(s1, s2);  // NOTE(review): a 32-bit intrinsic is applied to int16x8_t values with no explicit reinterpret; presumably relies on lax vector conversions -- confirm build flags
37
+    t2 = vtrn2q_s32(s1, s2);
38
+}
39
+
40
+#if (X265_DEPTH <= 10)
41
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)  // transpose step on 64-bit halves; this overload is compiled only when X265_DEPTH <= 10
42
+{
43
+    t1 = vtrn1q_s64(s1, s2);  // low 64-bit halves of s1 and s2; NOTE(review): 64-bit intrinsic on int16x8_t without a reinterpret, presumably lax vector conversions
44
+    t2 = vtrn2q_s64(s1, s2);  // high 64-bit halves of s1 and s2
45
+}
46
+#endif
47
+
48
+
49
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,  // two independent butterflies in one call
50
+                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
51
+{
52
+    SUMSUB_AB(s1, d1, a, b);  // s1 = a + b, d1 = a - b
53
+    SUMSUB_AB(s2, d2, c, d);  // s2 = c + d, d2 = c - d
54
+}
55
+
56
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,  // two butterfly stages over r1..r4, updated in place
57
+                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)  // t1..t4 are caller-provided scratch and are overwritten
58
+{
59
+    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);  // stage 1: t1 = r1 + r2, t2 = r1 - r2, t3 = r3 + r4, t4 = r3 - r4
60
+    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);  // stage 2: r1 = t1 + t3, r3 = t1 - t3, r2 = t2 + t4, r4 = t2 - t4
61
+}
62
+
63
+
64
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)  // butterflies and transposes over v0..v3, then returns one summed scalar
65
+
66
+{
67
+
68
+    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
69
+
70
+
71
+    SUMSUB_AB(v16, v17, v0,  v1);  // first butterfly stage across the input vectors
72
+    SUMSUB_AB(v18, v19, v2,  v3);
73
+
74
+    SUMSUB_AB(v4 , v6 , v16, v18);  // second butterfly stage
75
+    SUMSUB_AB(v5 , v7 , v17, v19);
76
+
77
+    v0 = vtrn1q_s16(v4, v5);  // interleave 16-bit lanes so the next butterfly combines neighbouring lanes
78
+    v1 = vtrn2q_s16(v4, v5);
79
+    v2 = vtrn1q_s16(v6, v7);
80
+    v3 = vtrn2q_s16(v6, v7);
81
+
82
+    SUMSUB_AB(v16, v17, v0,  v1);  // butterfly stage within each vector
83
+    SUMSUB_AB(v18, v19, v2,  v3);
84
+
85
+    v0 = vtrn1q_s32(v16, v18);  // interleave 32-bit groups; NOTE(review): 32-bit intrinsic on int16x8_t without a reinterpret
86
+    v1 = vtrn2q_s32(v16, v18);
87
+    v2 = vtrn1q_s32(v17, v19);
88
+    v3 = vtrn2q_s32(v17, v19);
89
+
90
+    v0 = vabsq_s16(v0);  // absolute values of all lanes
91
+    v1 = vabsq_s16(v1);
92
+    v2 = vabsq_s16(v2);
93
+    v3 = vabsq_s16(v3);
94
+
95
+    v0 = vmaxq_u16(v0, v1);  // lane-wise max of the paired absolute values; presumably stands in for the last butterfly via max(|a|,|b|) -- confirm against the x264 original
96
+    v1 = vmaxq_u16(v2, v3);  // NOTE(review): unsigned intrinsics are fed int16x8_t values here
97
+
98
+    v0 = vaddq_u16(v0, v1);
99
+    return vaddlvq_u16(v0);  // widening sum of all eight lanes
100
+}
101
+
102
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)  // butterflies and transposes over two vectors, then returns one summed scalar
103
+{
104
+    int16x8_t v2, v3;
105
+    SUMSUB_AB(v2,  v3,  v0,  v1);  // first butterfly between the two input vectors
106
+
107
+    v0 = vzip1q_s64(v2, v3);  // low 64-bit halves of v2 and v3; NOTE(review): 64-bit intrinsic on int16x8_t without a reinterpret
108
+    v1 = vzip2q_s64(v2, v3);  // high 64-bit halves of v2 and v3
109
+    SUMSUB_AB(v2,  v3,  v0,  v1);  // second butterfly, now between the regrouped halves
110
+
111
+    v0 = vtrn1q_s16(v2, v3);  // interleave 16-bit lanes
112
+    v1 = vtrn2q_s16(v2, v3);
113
+    SUMSUB_AB(v2,  v3,  v0,  v1);  // third butterfly
114
+
115
+    v0 = vtrn1q_s32(v2, v3);  // interleave 32-bit groups
116
+    v1 = vtrn2q_s32(v2, v3);
117
+
118
+    v0 = vabsq_s16(v0);
119
+    v1 = vabsq_s16(v1);
120
+    v0 = vmaxq_u16(v0, v1);  // lane-wise max of the absolute values; presumably stands in for the last butterfly -- confirm against the x264 original
121
+
122
+    return vaddlvq_s16(v0);  // widening sum of all lanes; note the sibling _satd_4x8_8x4_end_neon uses the unsigned vaddlvq_u16 here
123
+}
124
+
125
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,  // all eight arguments are in/out; on return v0..v3 hold the lane-wise maxima computed at the end
126
+                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)  // v20..v23 are also overwritten along the way
127
+{
128
+    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
129
+
130
+    SUMSUB_AB(v16, v18, v0,  v2);  // v16 = v0 + v2, v18 = v0 - v2
131
+    SUMSUB_AB(v17, v19, v1,  v3);  // v17 = v1 + v3, v19 = v1 - v3
132
+
133
+    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);  // v0..v3 were consumed above and are reused here as scratch
134
+
135
+    transpose_8h(v0,  v1,  v16, v17);  // interleave 16-bit lanes of each pair
136
+    transpose_8h(v2,  v3,  v18, v19);
137
+    transpose_8h(v4,  v5,  v20, v21);
138
+    transpose_8h(v6,  v7,  v22, v23);
139
+
140
+    SUMSUB_AB(v16, v17, v0,  v1);  // butterfly between neighbouring lanes
141
+    SUMSUB_AB(v18, v19, v2,  v3);
142
+    SUMSUB_AB(v20, v21, v4,  v5);
143
+    SUMSUB_AB(v22, v23, v6,  v7);
144
+
145
+    transpose_4s(v0,  v2,  v16, v18);  // interleave 32-bit groups of each pair
146
+    transpose_4s(v1,  v3,  v17, v19);
147
+    transpose_4s(v4,  v6,  v20, v22);
148
+    transpose_4s(v5,  v7,  v21, v23);
149
+
150
+    v0 = vabsq_s16(v0);  // absolute values of all lanes
151
+    v1 = vabsq_s16(v1);
152
+    v2 = vabsq_s16(v2);
153
+    v3 = vabsq_s16(v3);
154
+    v4 = vabsq_s16(v4);
155
+    v5 = vabsq_s16(v5);
156
+    v6 = vabsq_s16(v6);
157
+    v7 = vabsq_s16(v7);
158
+
159
+    v0 = vmaxq_u16(v0, v2);  // lane-wise maxima; NOTE(review): unsigned intrinsics are fed int16x8_t values
160
+    v1 = vmaxq_u16(v1, v3);
161
+    v2 = vmaxq_u16(v4, v6);
162
+    v3 = vmaxq_u16(v5, v7);
163
+
164
+}
165
+
166
+#if HIGH_BIT_DEPTH
167
+
168
+#if (X265_DEPTH > 10)
169
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)  // 64-bit transpose step for 32-bit vectors; compiled only for HIGH_BIT_DEPTH with X265_DEPTH > 10
170
+{
171
+    t1 = vtrn1q_s64(s1, s2);  // low 64-bit halves of s1 and s2; NOTE(review): 64-bit intrinsic on int32x4_t without a reinterpret
172
+    t2 = vtrn2q_s64(s1, s2);  // high 64-bit halves of s1 and s2
173
+}
174
+
175
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)  // 32-bit counterpart of SUMSUB_AB: sum = a + b, sub = a - b
176
+{
177
+    sum = vaddq_s32(a, b);  // lane-wise 32-bit add
178
+    sub = vsubq_s32(a, b);  // lane-wise 32-bit subtract
179
+}
180
+
181
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,  // widening butterfly: 16-bit inputs, 32-bit outputs split into low and high halves
182
+        const int16x8_t a, const int16x8_t b)
183
+{
184
+    suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));  // lanes 0-3: a + b widened to 32 bits
185
+    sumh = vaddl_high_s16(a, b);                         // lanes 4-7: a + b widened to 32 bits
186
+    subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));  // lanes 0-3: a - b widened to 32 bits
187
+    subh = vsubl_high_s16(a, b);                         // lanes 4-7: a - b widened to 32 bits
188
+}
189
+
190
+#endif
191
+
192
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
193
+                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
194
+                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
195
+{
196
+    uint16x8_t r0, r1, r2, r3;
197
+    uint16x8_t t0, t1, t2, t3;
198
+    int16x8_t v16, v17;
199
+    int16x8_t v18, v19;
200
+
201
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.h Added
25
 
1
@@ -0,0 +1,23 @@
2
+#ifndef PIXEL_PRIM_NEON_H__  // include guard for this header
3
+#define PIXEL_PRIM_NEON_H__
4
+
5
+#include "common.h"
6
+#include "slicetype.h"      // LOWRES_COST_MASK
7
+#include "primitives.h"
8
+#include "x265.h"
9
+
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+
17
+void setupPixelPrimitives_neon(EncoderPrimitives &p);  // declared here, defined elsewhere; by its name it presumably installs the NEON pixel primitives into p -- confirm in pixel-prim.cpp
18
+
19
+
20
+}
21
+
22
+
23
+#endif  // PIXEL_PRIM_NEON_H__
24
+
25
x265_3.6.tar.gz/source/common/aarch64/pixel-util-common.S Added
86
 
1
@@ -0,0 +1,84 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch           armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.macro pixel_var_start            // zeroes the four accumulators used by pixel_var_1 and pixel_var_end
39
+    movi            v0.16b, #0    // v0: 16-bit sums of the low 8 bytes of each input vector
40
+    movi            v1.16b, #0    // v1: 16-bit sums of the high 8 bytes
41
+    movi            v2.16b, #0    // v2: 32-bit sums of squares, low 8 bytes
42
+    movi            v3.16b, #0    // v3: 32-bit sums of squares, high 8 bytes
43
+.endm
44
+
45
+.macro pixel_var_1 v              // accumulates one 16-byte vector (register name passed as v) into v0..v3; clobbers v30 and v31
46
+    uaddw           v0.8h, v0.8h, \v\().8b     // v0 += low 8 bytes, widened to 16 bits
47
+    umull           v30.8h, \v\().8b, \v\().8b   // squares of the low 8 bytes
48
+    uaddw2          v1.8h, v1.8h, \v\().16b    // v1 += high 8 bytes, widened to 16 bits
49
+    umull2          v31.8h, \v\().16b, \v\().16b // squares of the high 8 bytes
50
+    uadalp          v2.4s, v30.8h  // pairwise add the squares and accumulate into 32-bit lanes
51
+    uadalp          v3.4s, v31.8h
52
+.endm
53
+
54
+.macro pixel_var_end              // reduces the accumulators v0..v3 and returns x0 = sum | (sum of squares << 32)
55
+    uaddlv          s0, v0.8h     // s0 = total of the low-half pixel sums (the scalar write clears the rest of v0)
56
+    uaddlv          s1, v1.8h     // s1 = total of the high-half pixel sums
57
+    add             v2.4s, v2.4s, v3.4s   // merge the two sum-of-squares accumulators
58
+    add             d0, d0, d1    // integer add of the two totals; the previous fadd only matched integer addition while denormals were not flushed (FPCR.FZ = 0)
59
+    uaddlv          d2, v2.4s     // d2 = total sum of squares
60
+    fmov            w0, s0        // low 32 bits of the result: pixel sum
61
+    fmov            x2, d2
62
+    orr             x0, x0, x2, lsl #32   // high 32 bits of the result: sum of squares
63
+.endm
64
+
65
+.macro ssimDist_start             // zeroes the two accumulators that ssimDist_end reduces
66
+    movi            v0.16b, #0
67
+    movi            v1.16b, #0
68
+.endm
69
+
70
+.macro ssimDist_end               // reduces v0 and v1 to 64-bit totals and stores them through x6 and x4
71
+    uaddlv          d0, v0.4s     // d0 = sum of the four 32-bit lanes of v0
72
+    uaddlv          d1, v1.4s     // d1 = sum of the four 32-bit lanes of v1
73
+    str             d0, x6        // x6 and x4 are presumably output pointers supplied by the caller -- confirm; NOTE(review): the address brackets look stripped in this listing
74
+    str             d1, x4
75
+.endm
76
+
77
+.macro normFact_start             // zeroes the accumulator that normFact_end reduces
78
+    movi            v0.16b, #0
79
+.endm
80
+
81
+.macro normFact_end               // reduces v0 to a 64-bit total and stores it through x3
82
+    uaddlv          d0, v0.4s     // d0 = sum of the four 32-bit lanes of v0
83
+    str             d0, x3        // x3 is presumably an output pointer supplied by the caller -- confirm
84
+.endm
85
+
86
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S Added
201
 
1
@@ -0,0 +1,373 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sub_ps_8x16_sve)   // 8x16 difference of two 8-bit sources (x2, x3; strides x4, x5) written as 16-bit values to x0
41
+    lsl             x1, x1, #1    // x1 doubled; presumably the dst stride arrives in int16 elements and is converted to bytes -- confirm
42
+    ptrue           p0.h, vl8     // predicate covering exactly eight 16-bit lanes
43
+.rept 8                           // two rows per iteration, 16 rows in total
44
+    ld1b            {z0.h}, p0/z, x2   // 8 bytes from the first source, zero-extended to 16 bits
45
+    ld1b            {z1.h}, p0/z, x3   // 8 bytes from the second source
46
+    add             x2, x2, x4
47
+    add             x3, x3, x5
48
+    ld1b            {z2.h}, p0/z, x2   // next row of each source
49
+    ld1b            {z3.h}, p0/z, x3
50
+    add             x2, x2, x4
51
+    add             x3, x3, x5
52
+    sub             z4.h, z0.h, z1.h   // row differences
53
+    sub             z5.h, z2.h, z3.h
54
+    st1             {v4.8h}, x0, x1    // NEON store of the low 128 bits of z4, then advance dst by x1
55
+    st1             {v5.8h}, x0, x1
56
+.endr
57
+    ret
58
+endfunc
59
+
60
+//******* satd *******
61
+.macro satd_4x4_sve               // 4x4 SATD of the blocks at x0 (stride x1) and x2 (stride x3); result is left in d0, and x0/x2 end up four rows further on
62
+    ld1b            {z0.h}, p0/z, x0   // row 0 of each block, bytes zero-extended to 16 bits; p0 must be set up by the caller
63
+    ld1b            {z2.h}, p0/z, x2
64
+    add             x0, x0, x1
65
+    add             x2, x2, x3
66
+    ld1b            {z1.h}, p0/z, x0   // row 1
67
+    ld1b            {z3.h}, p0/z, x2
68
+    add             x0, x0, x1
69
+    add             x2, x2, x3
70
+    ld1b            {z4.h}, p0/z, x0   // row 2
71
+    ld1b            {z6.h}, p0/z, x2
72
+    add             x0, x0, x1
73
+    add             x2, x2, x3
74
+    ld1b            {z5.h}, p0/z, x0   // row 3
75
+    ld1b            {z7.h}, p0/z, x2
76
+    add             x0, x0, x1
77
+    add             x2, x2, x3
78
+
79
+    sub             z0.h, z0.h, z2.h   // per-row differences between the two blocks
80
+    sub             z1.h, z1.h, z3.h
81
+    sub             z2.h, z4.h, z6.h
82
+    sub             z3.h, z5.h, z7.h
83
+
84
+    add             z4.h, z0.h, z2.h   // first butterfly stage across rows
85
+    add             z5.h, z1.h, z3.h
86
+    sub             z6.h, z0.h, z2.h
87
+    sub             z7.h, z1.h, z3.h
88
+
89
+    add             z0.h, z4.h, z5.h   // second butterfly stage across rows
90
+    sub             z1.h, z4.h, z5.h
91
+
92
+    add             z2.h, z6.h, z7.h
93
+    sub             z3.h, z6.h, z7.h
94
+
95
+    trn1            z4.h, z0.h, z2.h   // interleave 16-bit lanes so the next butterfly works along each row
96
+    trn2            z5.h, z0.h, z2.h
97
+
98
+    trn1            z6.h, z1.h, z3.h
99
+    trn2            z7.h, z1.h, z3.h
100
+
101
+    add             z0.h, z4.h, z5.h   // butterfly stage along rows
102
+    sub             z1.h, z4.h, z5.h
103
+
104
+    add             z2.h, z6.h, z7.h
105
+    sub             z3.h, z6.h, z7.h
106
+
107
+    trn1            z4.s, z0.s, z1.s   // interleave 32-bit groups
108
+    trn2            z5.s, z0.s, z1.s
109
+
110
+    trn1            z6.s, z2.s, z3.s
111
+    trn2            z7.s, z2.s, z3.s
112
+
113
+    abs             z4.h, p0/m, z4.h   // absolute values of the active lanes
114
+    abs             z5.h, p0/m, z5.h
115
+    abs             z6.h, p0/m, z6.h
116
+    abs             z7.h, p0/m, z7.h
117
+
118
+    smax            z4.h, p0/m, z4.h, z5.h   // lane-wise maxima; presumably stands in for the last butterfly stage -- confirm against the NEON version
119
+    smax            z6.h, p0/m, z6.h, z7.h
120
+
121
+    add             z0.h, z4.h, z6.h
122
+
123
+    uaddlp          v0.2s, v0.4h   // NEON pairwise widening adds reduce the low four halfwords
124
+    uaddlp          v0.1d, v0.2s   // d0 = final sum
125
+.endm
126
+
127
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
128
+function PFX(pixel_satd_4x4_sve)  // 4x4 SATD; arguments as in the prototype comment above
129
+    ptrue           p0.h, vl4     // four 16-bit lanes active, one per pixel of a row
130
+    satd_4x4_sve                  // leaves the result in d0
131
+    fmov            x0, d0        // move the result to the integer return register
132
+    ret
133
+endfunc
134
+
135
+function PFX(pixel_satd_8x4_sve)  // 8x4 SATD computed as two side-by-side 4x4 SATDs
136
+    ptrue           p0.h, vl4     // four 16-bit lanes active
137
+    mov             x4, x0        // keep the original block pointers; satd_4x4_sve advances x0 and x2
138
+    mov             x5, x2
139
+    satd_4x4_sve                  // left 4x4 half
140
+    add             x0, x4, #4    // rewind to the first row, four pixels to the right
141
+    add             x2, x5, #4
142
+    umov            x6, v0.d0     // save the left-half result before v0 is reused
143
+    satd_4x4_sve                  // right 4x4 half
144
+    umov            x0, v0.d0
145
+    add             x0, x0, x6    // return the sum of both halves
146
+    ret
147
+endfunc
148
+
149
+function PFX(pixel_satd_8x12_sve) // 8x12 SATD computed as six 4x4 SATDs (three row bands, left and right halves); x7 is the running total
150
+    ptrue           p0.h, vl4     // four 16-bit lanes active
151
+    mov             x4, x0        // start of the current row band; satd_4x4_sve advances x0 and x2 by four rows
152
+    mov             x5, x2
153
+    mov             x7, #0
154
+    satd_4x4_sve                  // band 0, left half
155
+    umov            x6, v0.d0
156
+    add             x7, x7, x6
157
+    add             x0, x4, #4    // back to the top of the band, four pixels to the right
158
+    add             x2, x5, #4
159
+    satd_4x4_sve                  // band 0, right half
160
+    umov            x6, v0.d0
161
+    add             x7, x7, x6
162
+.rept 2                           // bands 1 and 2
163
+    sub             x0, x0, #4    // x0/x2 now sit four rows down in the right half; step back to the left edge
164
+    sub             x2, x2, #4
165
+    mov             x4, x0
166
+    mov             x5, x2
167
+    satd_4x4_sve                  // left half of this band
168
+    umov            x6, v0.d0
169
+    add             x7, x7, x6
170
+    add             x0, x4, #4
171
+    add             x2, x5, #4
172
+    satd_4x4_sve                  // right half of this band
173
+    umov            x6, v0.d0
174
+    add             x7, x7, x6
175
+.endr
176
+    mov             x0, x7        // return the accumulated total
177
+    ret
178
+endfunc
179
+
180
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
181
+    mov             x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
182
+    ld1b            {z0.h}, p0/z, x0
183
+    ld1b            {z1.h}, p0/z, x0, x11
184
+    ld1b            {z2.h}, p0/z, x2
185
+    ld1b            {z3.h}, p0/z, x2, x11
186
+    add             x0, x0, x1
187
+    add             x2, x2, x3
188
+    ld1b            {z4.h}, p0/z, x0
189
+    ld1b            {z5.h}, p0/z, x0, x11
190
+    ld1b            {z6.h}, p0/z, x2
191
+    ld1b            {z7.h}, p0/z, x2, x11
192
+    add             x0, x0, x1
193
+    add             x2, x2, x3
194
+    ld1b            {z29.h}, p0/z, x0
195
+    ld1b            {z9.h}, p0/z, x0, x11
196
+    ld1b            {z10.h}, p0/z, x2
197
+    ld1b            {z11.h}, p0/z, x2, x11
198
+    add             x0, x0, x1
199
+    add             x2, x2, x3
200
+    ld1b            {z12.h}, p0/z, x0
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S Added
201
 
1
@@ -0,0 +1,1686 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
41
+function PFX(pixel_var_8x8_sve2)  // 8x8 variance terms: returns x0 = sum | (sum of squares << 32)
42
+    ptrue           p0.h, vl8     // eight 16-bit lanes active, one per pixel of a row
43
+    ld1b            {z0.h}, p0/z, x0   // first row, zero-extended; z0 doubles as the running sum
44
+    add             x0, x0, x1
45
+    mul             z31.h, z0.h, z0.h  // squares of the first row
46
+    uaddlp          v1.4s, v31.8h      // initialise the sum-of-squares accumulator from the first row
47
+.rept 7                           // remaining seven rows
48
+    ld1b            {z4.h}, p0/z, x0
49
+    add             x0, x0, x1
50
+    add             z0.h, z0.h, z4.h   // sum += row
51
+    mul             z31.h, z4.h, z4.h
52
+    uadalp          z1.s, p0/m, z31.h  // squares accumulated pairwise into 32-bit lanes
53
+.endr
54
+    uaddlv          s0, v0.8h     // total pixel sum
55
+    uaddlv          d1, v1.4s     // total sum of squares
56
+    fmov            w0, s0
57
+    fmov            x1, d1
58
+    orr             x0, x0, x1, lsl #32   // pack: low 32 bits sum, high 32 bits sum of squares
59
+    ret
60
+endfunc
61
+
62
+function PFX(pixel_var_16x16_sve2)   // 16x16 variance terms: returns x0 = sum | (sum of squares << 32)
63
+    rdvl            x9, #1        // x9 = SVE vector length in bytes
64
+    cmp             x9, #16
65
+    bgt             .vl_gt_16_pixel_var_16x16   // vectors wider than 128 bits take the SVE2 path below
66
+    pixel_var_start               // 128-bit vectors: plain NEON path using the shared pixel_var macros
67
+    mov             w12, #16      // row counter
68
+.loop_var_16_sve2:
69
+    sub             w12, w12, #1
70
+    ld1             {v4.16b}, x0, x1
71
+    pixel_var_1 v4
72
+    cbnz            w12, .loop_var_16_sve2
73
+    pixel_var_end
74
+    ret
75
+.vl_gt_16_pixel_var_16x16:
76
+    ptrue           p0.h, vl16    // sixteen 16-bit lanes active, one per pixel of a row
77
+    mov             z0.d, #0      // running pixel sum
+    mov             z1.d, #0      // running sum of squares; it was accumulated into without being cleared on this path
78
+.rept 16
79
+    ld1b            {z4.h}, p0/z, x0   // one row, bytes zero-extended to 16 bits
80
+    add             x0, x0, x1
81
+    add             z0.h, z0.h, z4.h
82
+    mul             z30.h, z4.h, z4.h
83
+    uadalp          z1.s, p0/m, z30.h  // squares accumulated pairwise into 32-bit lanes
84
+.endr
85
+    uaddv           d0, p0, z0.h  // total pixel sum
86
+    uaddv           d1, p0, z1.s  // total sum of squares
87
+    fmov            w0, s0
88
+    fmov            x1, d1
89
+    orr             x0, x0, x1, lsl #32   // pack: low 32 bits sum, high 32 bits sum of squares
90
+    ret
91
+endfunc
92
+
93
+function PFX(pixel_var_32x32_sve2)   // 32x32 variance terms: returns x0 = sum | (sum of squares << 32); three paths chosen by vector length
94
+    rdvl            x9, #1        // x9 = SVE vector length in bytes
95
+    cmp             x9, #16
96
+    bgt             .vl_gt_16_pixel_var_32x32
97
+    pixel_var_start               // 128-bit vectors: plain NEON path using the shared pixel_var macros
98
+    mov             w12, #32      // row counter
99
+.loop_var_32_sve2:
100
+    sub             w12, w12, #1
101
+    ld1             {v4.16b-v5.16b}, x0, x1   // one 32-byte row in two registers
102
+    pixel_var_1 v4
103
+    pixel_var_1 v5
104
+    cbnz            w12, .loop_var_32_sve2
105
+    pixel_var_end
106
+    ret
107
+.vl_gt_16_pixel_var_32x32:
108
+    cmp             x9, #48
109
+    bgt             .vl_gt_48_pixel_var_32x32
110
+    ptrue           p0.b, vl32    // vector length between 17 and 48 bytes: a row is loaded as 32 byte lanes
111
+    mov             z0.d, #0      // running pixel sum (16-bit lanes)
112
+    mov             z1.d, #0      // running sum of squares (32-bit lanes)
113
+.rept 32
114
+    ld1b            {z4.b}, p0/z, x0
115
+    add             x0, x0, x1
116
+    uaddwb          z0.h, z0.h, z4.b   // add the even-numbered bytes, widened
117
+    uaddwt          z0.h, z0.h, z4.b   // add the odd-numbered bytes, widened
118
+    umullb          z28.h, z4.b, z4.b  // squares of the even-numbered bytes
119
+    umullt          z29.h, z4.b, z4.b  // squares of the odd-numbered bytes
120
+    uadalp          z1.s, p0/m, z28.h
121
+    uadalp          z1.s, p0/m, z29.h
122
+.endr
123
+    uaddv           d0, p0, z0.h  // total pixel sum
124
+    uaddv           d1, p0, z1.s  // total sum of squares
125
+    fmov            w0, s0
126
+    fmov            x1, d1
127
+    orr             x0, x0, x1, lsl #32
128
+    ret
129
+.vl_gt_48_pixel_var_32x32:
130
+    ptrue           p0.h, vl32    // longer vectors: a whole row fits as 32 zero-extended 16-bit lanes
131
+    mov             z0.d, #0
132
+    mov             z1.d, #0
133
+.rept 32
134
+    ld1b            {z4.h}, p0/z, x0
135
+    add             x0, x0, x1
136
+    add             z0.h, z0.h, z4.h
137
+    mul             z28.h, z4.h, z4.h
138
+    uadalp          z1.s, p0/m, z28.h
139
+.endr
140
+    uaddv           d0, p0, z0.h
141
+    uaddv           d1, p0, z1.s
142
+    fmov            w0, s0
143
+    fmov            x1, d1
144
+    orr             x0, x0, x1, lsl #32   // pack: low 32 bits sum, high 32 bits sum of squares
145
+    ret
146
+endfunc
147
+
148
+function PFX(pixel_var_64x64_sve2)
149
+    rdvl            x9, #1
150
+    cmp             x9, #16
151
+    bgt             .vl_gt_16_pixel_var_64x64
152
+    pixel_var_start
153
+    mov             w12, #64
154
+.loop_var_64_sve2:
155
+    sub             w12, w12, #1
156
+    ld1             {v4.16b-v7.16b}, x0, x1
157
+    pixel_var_1 v4
158
+    pixel_var_1 v5
159
+    pixel_var_1 v6
160
+    pixel_var_1 v7
161
+    cbnz            w12, .loop_var_64_sve2
162
+    pixel_var_end
163
+    ret
164
+.vl_gt_16_pixel_var_64x64:
165
+    cmp             x9, #48
166
+    bgt             .vl_gt_48_pixel_var_64x64
167
+    ptrue           p0.b, vl32
168
+    mov             z0.d, #0
169
+    mov             z2.d, #0
170
+.rept 64
171
+    ld1b            {z4.b}, p0/z, x0
172
+    ld1b            {z5.b}, p0/z, x0, #1, mul vl
173
+    add             x0, x0, x1
174
+    uaddwb          z0.h, z0.h, z4.b
175
+    uaddwt          z0.h, z0.h, z4.b
176
+    uaddwb          z0.h, z0.h, z5.b
177
+    uaddwt          z0.h, z0.h, z5.b
178
+    umullb          z24.h, z4.b, z4.b
179
+    umullt          z25.h, z4.b, z4.b
180
+    umullb          z26.h, z5.b, z5.b
181
+    umullt          z27.h, z5.b, z5.b
182
+    uadalp          z2.s, p0/m, z24.h
183
+    uadalp          z2.s, p0/m, z25.h
184
+    uadalp          z2.s, p0/m, z26.h
185
+    uadalp          z2.s, p0/m, z27.h
186
+.endr
187
+    uaddv           d0, p0, z0.h
188
+    uaddv           d1, p0, z2.s
189
+    fmov            w0, s0
190
+    fmov            x1, d1
191
+    orr             x0, x0, x1, lsl #32
192
+    ret
193
+.vl_gt_48_pixel_var_64x64:
194
+    cmp             x9, #112
195
+    bgt             .vl_gt_112_pixel_var_64x64
196
+    ptrue           p0.b, vl64
197
+    mov             z0.d, #0
198
+    mov             z1.d, #0
199
+.rept 64
200
+    ld1b            {z4.b}, p0/z, x0
201
x265_3.5.tar.gz/source/common/aarch64/pixel-util.S -> x265_3.6.tar.gz/source/common/aarch64/pixel-util.S Changed
201
 
1
@@ -1,8 +1,9 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Yimeng Su <yimeng.su@huawei.com>
7
  *          Hongbin Liu <liuhongbin1@huawei.com>
8
+ *          Sebastian Pop <spop@amazon.com>
9
  *
10
  * This program is free software; you can redistribute it and/or modify
11
  * it under the terms of the GNU General Public License as published by
12
@@ -23,13 +24,652 @@
13
  *****************************************************************************/
14
 
15
 #include "asm.S"
16
+#include "pixel-util-common.S"
17
 
18
+#ifdef __APPLE__
19
+.section __RODATA,__rodata
20
+#else
21
 .section .rodata
22
+#endif
23
 
24
 .align 4
25
 
26
 .text
27
 
28
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
29
+function PFX(pixel_var_8x8_neon)
30
+    ld1             {v4.8b}, x0, x1        // pixx
31
+    uxtl            v0.8h, v4.8b             // sum = pixx
32
+    umull           v1.8h, v4.8b, v4.8b
33
+    uaddlp          v1.4s, v1.8h             // sqr = pixx * pixx
34
+
35
+.rept 7
36
+    ld1             {v4.8b}, x0, x1        // pixx
37
+    umull           v31.8h, v4.8b, v4.8b
38
+    uaddw           v0.8h, v0.8h, v4.8b      // sum += pixx
39
+    uadalp          v1.4s, v31.8h            // sqr += pixx * pixx
40
+.endr
41
+    uaddlv          s0, v0.8h
42
+    uaddlv          d1, v1.4s
43
+    fmov            w0, s0
44
+    fmov            x1, d1
45
+    orr             x0, x0, x1, lsl #32      // return sum + ((uint64_t)sqr << 32);
46
+    ret
47
+endfunc
48
+
49
+function PFX(pixel_var_16x16_neon)
50
+    pixel_var_start
51
+    mov             w12, #16
52
+.loop_var_16:
53
+    sub             w12, w12, #1
54
+    ld1             {v4.16b}, x0, x1
55
+    pixel_var_1 v4
56
+    cbnz            w12, .loop_var_16
57
+    pixel_var_end
58
+    ret
59
+endfunc
60
+
61
+function PFX(pixel_var_32x32_neon)
62
+    pixel_var_start
63
+    mov             w12, #32
64
+.loop_var_32:
65
+    sub             w12, w12, #1
66
+    ld1             {v4.16b-v5.16b}, x0, x1
67
+    pixel_var_1 v4
68
+    pixel_var_1 v5
69
+    cbnz            w12, .loop_var_32
70
+    pixel_var_end
71
+    ret
72
+endfunc
73
+
74
+function PFX(pixel_var_64x64_neon)
75
+    pixel_var_start
76
+    mov             w12, #64
77
+.loop_var_64:
78
+    sub             w12, w12, #1
79
+    ld1             {v4.16b-v7.16b}, x0, x1
80
+    pixel_var_1 v4
81
+    pixel_var_1 v5
82
+    pixel_var_1 v6
83
+    pixel_var_1 v7
84
+    cbnz            w12, .loop_var_64
85
+    pixel_var_end
86
+    ret
87
+endfunc
88
+
89
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
90
+function PFX(getResidual4_neon)
91
+    lsl             x4, x3, #1
92
+.rept 2
93
+    ld1             {v0.8b}, x0, x3
94
+    ld1             {v1.8b}, x1, x3
95
+    ld1             {v2.8b}, x0, x3
96
+    ld1             {v3.8b}, x1, x3
97
+    usubl           v4.8h, v0.8b, v1.8b
98
+    usubl           v5.8h, v2.8b, v3.8b
99
+    st1             {v4.8b}, x2, x4
100
+    st1             {v5.8b}, x2, x4
101
+.endr
102
+    ret
103
+endfunc
104
+
105
+function PFX(getResidual8_neon)
106
+    lsl             x4, x3, #1
107
+.rept 4
108
+    ld1             {v0.8b}, x0, x3
109
+    ld1             {v1.8b}, x1, x3
110
+    ld1             {v2.8b}, x0, x3
111
+    ld1             {v3.8b}, x1, x3
112
+    usubl           v4.8h, v0.8b, v1.8b
113
+    usubl           v5.8h, v2.8b, v3.8b
114
+    st1             {v4.16b}, x2, x4
115
+    st1             {v5.16b}, x2, x4
116
+.endr
117
+    ret
118
+endfunc
119
+
120
+function PFX(getResidual16_neon)
121
+    lsl             x4, x3, #1
122
+.rept 8
123
+    ld1             {v0.16b}, x0, x3
124
+    ld1             {v1.16b}, x1, x3
125
+    ld1             {v2.16b}, x0, x3
126
+    ld1             {v3.16b}, x1, x3
127
+    usubl           v4.8h, v0.8b, v1.8b
128
+    usubl2          v5.8h, v0.16b, v1.16b
129
+    usubl           v6.8h, v2.8b, v3.8b
130
+    usubl2          v7.8h, v2.16b, v3.16b
131
+    st1             {v4.8h-v5.8h}, x2, x4
132
+    st1             {v6.8h-v7.8h}, x2, x4
133
+.endr
134
+    ret
135
+endfunc
136
+
137
+function PFX(getResidual32_neon)
138
+    lsl             x4, x3, #1
139
+    mov             w12, #4
140
+.loop_residual_32:
141
+    sub             w12, w12, #1
142
+.rept 4
143
+    ld1             {v0.16b-v1.16b}, x0, x3
144
+    ld1             {v2.16b-v3.16b}, x1, x3
145
+    ld1             {v4.16b-v5.16b}, x0, x3
146
+    ld1             {v6.16b-v7.16b}, x1, x3
147
+    usubl           v16.8h, v0.8b, v2.8b
148
+    usubl2          v17.8h, v0.16b, v2.16b
149
+    usubl           v18.8h, v1.8b, v3.8b
150
+    usubl2          v19.8h, v1.16b, v3.16b
151
+    usubl           v20.8h, v4.8b, v6.8b
152
+    usubl2          v21.8h, v4.16b, v6.16b
153
+    usubl           v22.8h, v5.8b, v7.8b
154
+    usubl2          v23.8h, v5.16b, v7.16b
155
+    st1             {v16.8h-v19.8h}, x2, x4
156
+    st1             {v20.8h-v23.8h}, x2, x4
157
+.endr
158
+    cbnz            w12, .loop_residual_32
159
+    ret
160
+endfunc
161
+
162
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
163
+function PFX(pixel_sub_ps_4x4_neon)
164
+    lsl             x1, x1, #1
165
+.rept 2
166
+    ld1             {v0.8b}, x2, x4
167
+    ld1             {v1.8b}, x3, x5
168
+    ld1             {v2.8b}, x2, x4
169
+    ld1             {v3.8b}, x3, x5
170
+    usubl           v4.8h, v0.8b, v1.8b
171
+    usubl           v5.8h, v2.8b, v3.8b
172
+    st1             {v4.4h}, x0, x1
173
+    st1             {v5.4h}, x0, x1
174
+.endr
175
+    ret
176
+endfunc
177
+
178
+function PFX(pixel_sub_ps_8x8_neon)
179
+    lsl             x1, x1, #1
180
+.rept 4
181
+    ld1             {v0.8b}, x2, x4
182
+    ld1             {v1.8b}, x3, x5
183
+    ld1             {v2.8b}, x2, x4
184
+    ld1             {v3.8b}, x3, x5
185
+    usubl           v4.8h, v0.8b, v1.8b
186
+    usubl           v5.8h, v2.8b, v3.8b
187
+    st1             {v4.8h}, x0, x1
188
+    st1             {v5.8h}, x0, x1
189
+.endr
190
+    ret
191
+endfunc
192
+
193
+function PFX(pixel_sub_ps_16x16_neon)
194
+    lsl             x1, x1, #1
195
+.rept 8
196
+    ld1             {v0.16b}, x2, x4
197
+    ld1             {v1.16b}, x3, x5
198
+    ld1             {v2.16b}, x2, x4
199
+    ld1             {v3.16b}, x3, x5
200
+    usubl           v4.8h, v0.8b, v1.8b
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S Added
201
 
1
@@ -0,0 +1,514 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+#ifdef __APPLE__
33
+.section __RODATA,__rodata
34
+#else
35
+.section .rodata
36
+#endif
37
+
38
+.align 4
39
+
40
+.macro SAD_START_4 f
41
+    ld1             {v0.s}0, x0, x1
42
+    ld1             {v0.s}1, x0, x1
43
+    ld1             {v1.s}0, x2, x3
44
+    ld1             {v1.s}1, x2, x3
45
+    \f              v16.8h, v0.8b, v1.8b
46
+.endm
47
+
48
+.macro SAD_4 h
49
+.rept \h / 2 - 1
50
+    SAD_START_4 uabal
51
+.endr
52
+.endm
53
+
54
+.macro SAD_START_8 f
55
+    ld1             {v0.8b}, x0, x1
56
+    ld1             {v1.8b}, x2, x3
57
+    ld1             {v2.8b}, x0, x1
58
+    ld1             {v3.8b}, x2, x3
59
+    \f              v16.8h, v0.8b, v1.8b
60
+    \f              v17.8h, v2.8b, v3.8b
61
+.endm
62
+
63
+.macro SAD_8 h
64
+.rept \h / 2 - 1
65
+    SAD_START_8 uabal
66
+.endr
67
+.endm
68
+
69
+.macro SAD_START_16 f
70
+    ld1             {v0.16b}, x0, x1
71
+    ld1             {v1.16b}, x2, x3
72
+    ld1             {v2.16b}, x0, x1
73
+    ld1             {v3.16b}, x2, x3
74
+    \f              v16.8h, v0.8b, v1.8b
75
+    \f\()2          v17.8h, v0.16b, v1.16b
76
+    uabal           v16.8h, v2.8b, v3.8b
77
+    uabal2          v17.8h, v2.16b, v3.16b
78
+.endm
79
+
80
+.macro SAD_16 h
81
+.rept \h / 2 - 1
82
+    SAD_START_16 uabal
83
+.endr
84
+.endm
85
+
86
+.macro SAD_START_32
87
+    movi            v16.16b, #0
88
+    movi            v17.16b, #0
89
+    movi            v18.16b, #0
90
+    movi            v19.16b, #0
91
+.endm
92
+
93
+.macro SAD_32
94
+    ld1             {v0.16b-v1.16b}, x0, x1
95
+    ld1             {v2.16b-v3.16b}, x2, x3
96
+    ld1             {v4.16b-v5.16b}, x0, x1
97
+    ld1             {v6.16b-v7.16b}, x2, x3
98
+    uabal           v16.8h, v0.8b, v2.8b
99
+    uabal2          v17.8h, v0.16b, v2.16b
100
+    uabal           v18.8h, v1.8b, v3.8b
101
+    uabal2          v19.8h, v1.16b, v3.16b
102
+    uabal           v16.8h, v4.8b, v6.8b
103
+    uabal2          v17.8h, v4.16b, v6.16b
104
+    uabal           v18.8h, v5.8b, v7.8b
105
+    uabal2          v19.8h, v5.16b, v7.16b
106
+.endm
107
+
108
+.macro SAD_END_32
109
+    add             v16.8h, v16.8h, v17.8h
110
+    add             v17.8h, v18.8h, v19.8h
111
+    add             v16.8h, v16.8h, v17.8h
112
+    uaddlv          s0, v16.8h
113
+    fmov            w0, s0
114
+    ret
115
+.endm
116
+
117
+.macro SAD_START_64
118
+    movi            v16.16b, #0
119
+    movi            v17.16b, #0
120
+    movi            v18.16b, #0
121
+    movi            v19.16b, #0
122
+    movi            v20.16b, #0
123
+    movi            v21.16b, #0
124
+    movi            v22.16b, #0
125
+    movi            v23.16b, #0
126
+.endm
127
+
128
+.macro SAD_64
129
+    ld1             {v0.16b-v3.16b}, x0, x1
130
+    ld1             {v4.16b-v7.16b}, x2, x3
131
+    ld1             {v24.16b-v27.16b}, x0, x1
132
+    ld1             {v28.16b-v31.16b}, x2, x3
133
+    uabal           v16.8h, v0.8b, v4.8b
134
+    uabal2          v17.8h, v0.16b, v4.16b
135
+    uabal           v18.8h, v1.8b, v5.8b
136
+    uabal2          v19.8h, v1.16b, v5.16b
137
+    uabal           v20.8h, v2.8b, v6.8b
138
+    uabal2          v21.8h, v2.16b, v6.16b
139
+    uabal           v22.8h, v3.8b, v7.8b
140
+    uabal2          v23.8h, v3.16b, v7.16b
141
+
142
+    uabal           v16.8h, v24.8b, v28.8b
143
+    uabal2          v17.8h, v24.16b, v28.16b
144
+    uabal           v18.8h, v25.8b, v29.8b
145
+    uabal2          v19.8h, v25.16b, v29.16b
146
+    uabal           v20.8h, v26.8b, v30.8b
147
+    uabal2          v21.8h, v26.16b, v30.16b
148
+    uabal           v22.8h, v27.8b, v31.8b
149
+    uabal2          v23.8h, v27.16b, v31.16b
150
+.endm
151
+
152
+.macro SAD_END_64
153
+    add             v16.8h, v16.8h, v17.8h
154
+    add             v17.8h, v18.8h, v19.8h
155
+    add             v16.8h, v16.8h, v17.8h
156
+    uaddlp          v16.4s, v16.8h
157
+    add             v18.8h, v20.8h, v21.8h
158
+    add             v19.8h, v22.8h, v23.8h
159
+    add             v17.8h, v18.8h, v19.8h
160
+    uaddlp          v17.4s, v17.8h
161
+    add             v16.4s, v16.4s, v17.4s
162
+    uaddlv          d0, v16.4s
163
+    fmov            x0, d0
164
+    ret
165
+.endm
166
+
167
+.macro SAD_START_12
168
+    movrel          x12, sad12_mask
169
+    ld1             {v31.16b}, x12
170
+    movi            v16.16b, #0
171
+    movi            v17.16b, #0
172
+.endm
173
+
174
+.macro SAD_12
175
+    ld1             {v0.16b}, x0, x1
176
+    and             v0.16b, v0.16b, v31.16b
177
+    ld1             {v1.16b}, x2, x3
178
+    and             v1.16b, v1.16b, v31.16b
179
+    ld1             {v2.16b}, x0, x1
180
+    and             v2.16b, v2.16b, v31.16b
181
+    ld1             {v3.16b}, x2, x3
182
+    and             v3.16b, v3.16b, v31.16b
183
+    uabal           v16.8h, v0.8b, v1.8b
184
+    uabal2          v17.8h, v0.16b, v1.16b
185
+    uabal           v16.8h, v2.8b, v3.8b
186
+    uabal2          v17.8h, v2.16b, v3.16b
187
+.endm
188
+
189
+.macro SAD_END_12
190
+    add             v16.8h, v16.8h, v17.8h
191
+    uaddlv          s0, v16.8h
192
+    fmov            w0, s0
193
+    ret
194
+.endm
195
+
196
+.macro SAD_START_24
197
+    movi            v16.16b, #0
198
+    movi            v17.16b, #0
199
+    movi            v18.16b, #0
200
+    sub             x1, x1, #16
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S Added
201
 
1
@@ -0,0 +1,511 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "sad-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.macro SAD_SVE2_16 h
41
+    mov             z16.d, #0
42
+    ptrue           p0.h, vl16
43
+.rept \h
44
+    ld1b            {z0.h}, p0/z, x0
45
+    ld1b            {z2.h}, p0/z, x2
46
+    add             x0, x0, x1
47
+    add             x2, x2, x3
48
+    uaba            z16.h, z0.h, z2.h
49
+.endr
50
+    uaddv           d0, p0, z16.h
51
+    fmov            w0, s0
52
+    ret
53
+.endm
54
+
55
+.macro SAD_SVE2_32 h
56
+    ptrue           p0.b, vl32
57
+.rept \h
58
+    ld1b            {z0.b}, p0/z, x0
59
+    ld1b            {z4.b}, p0/z, x2
60
+    add             x0, x0, x1
61
+    add             x2, x2, x3
62
+    uabalb          z16.h, z0.b, z4.b
63
+    uabalt          z16.h, z0.b, z4.b
64
+.endr
65
+    uaddv           d0, p0, z16.h
66
+    fmov            w0, s0
67
+    ret
68
+.endm
69
+
70
+.macro SAD_SVE2_64 h
71
+    cmp             x9, #48
72
+    bgt             .vl_gt_48_pixel_sad_64x\h
73
+    mov             z16.d, #0
74
+    mov             z17.d, #0
75
+    mov             z18.d, #0
76
+    mov             z19.d, #0
77
+    ptrue           p0.b, vl32
78
+.rept \h
79
+    ld1b            {z0.b}, p0/z, x0
80
+    ld1b            {z1.b}, p0/z, x0, #1, mul vl
81
+    ld1b            {z4.b}, p0/z, x2
82
+    ld1b            {z5.b}, p0/z, x2, #1, mul vl
83
+    add             x0, x0, x1
84
+    add             x2, x2, x3
85
+    uabalb          z16.h, z0.b, z4.b
86
+    uabalt          z17.h, z0.b, z4.b
87
+    uabalb          z18.h, z1.b, z5.b
88
+    uabalt          z19.h, z1.b, z5.b
89
+.endr
90
+    add             z16.h, z16.h, z17.h
91
+    add             z17.h, z18.h, z19.h
92
+    add             z16.h, z16.h, z17.h
93
+    uadalp          z24.s, p0/m, z16.h
94
+    uaddv           d5, p0, z24.s
95
+    fmov            x0, d5
96
+    ret
97
+.vl_gt_48_pixel_sad_64x\h\():
98
+    mov             z16.d, #0
99
+    mov             z17.d, #0
100
+    mov             z24.d, #0
101
+    ptrue           p0.b, vl64
102
+.rept \h
103
+    ld1b            {z0.b}, p0/z, x0
104
+    ld1b            {z4.b}, p0/z, x2
105
+    add             x0, x0, x1
106
+    add             x2, x2, x3
107
+    uabalb          z16.h, z0.b, z4.b
108
+    uabalt          z17.h, z0.b, z4.b
109
+.endr
110
+    add             z16.h, z16.h, z17.h
111
+    uadalp          z24.s, p0/m, z16.h
112
+    uaddv           d5, p0, z24.s
113
+    fmov            x0, d5
114
+    ret
115
+.endm
116
+
117
+.macro SAD_SVE2_24 h
118
+    mov             z16.d, #0
119
+    mov             x10, #24
120
+    mov             x11, #0
121
+    whilelt         p0.b, x11, x10
122
+.rept \h
123
+    ld1b            {z0.b}, p0/z, x0
124
+    ld1b            {z8.b}, p0/z, x2
125
+    add             x0, x0, x1
126
+    add             x2, x2, x3
127
+    uabalb          z16.h, z0.b, z8.b
128
+    uabalt          z16.h, z0.b, z8.b
129
+.endr
130
+    uaddv           d5, p0, z16.h
131
+    fmov            w0, s5
132
+    ret
133
+.endm
134
+
135
+.macro SAD_SVE2_48 h
136
+    cmp             x9, #48
137
+    bgt             .vl_gt_48_pixel_sad_48x\h
138
+    mov             z16.d, #0
139
+    mov             z17.d, #0
140
+    mov             z18.d, #0
141
+    mov             z19.d, #0
142
+    ptrue           p0.b, vl32
143
+    ptrue           p1.b, vl16
144
+.rept \h
145
+    ld1b            {z0.b}, p0/z, x0
146
+    ld1b            {z1.b}, p1/z, x0, #1, mul vl
147
+    ld1b            {z8.b}, p0/z, x2
148
+    ld1b            {z9.b}, p1/z, x2, #1, mul vl
149
+    add             x0, x0, x1
150
+    add             x2, x2, x3
151
+    uabalb          z16.h, z0.b, z8.b
152
+    uabalt          z17.h, z0.b, z8.b
153
+    uabalb          z18.h, z1.b, z9.b
154
+    uabalt          z19.h, z1.b, z9.b
155
+.endr
156
+    add             z16.h, z16.h, z17.h
157
+    add             z17.h, z18.h, z19.h
158
+    add             z16.h, z16.h, z17.h
159
+    uaddv           d5, p0, z16.h
160
+    fmov            w0, s5
161
+    ret
162
+.vl_gt_48_pixel_sad_48x\h\():
163
+    mov             z16.d, #0
164
+    mov             z17.d, #0
165
+    mov             x10, #48
166
+    mov             x11, #0
167
+    whilelt         p0.b, x11, x10
168
+.rept \h
169
+    ld1b            {z0.b}, p0/z, x0
170
+    ld1b            {z8.b}, p0/z, x2
171
+    add             x0, x0, x1
172
+    add             x2, x2, x3
173
+    uabalb          z16.h, z0.b, z8.b
174
+    uabalt          z17.h, z0.b, z8.b
175
+.endr
176
+    add             z16.h, z16.h, z17.h
177
+    uaddv           d5, p0, z16.h
178
+    fmov            w0, s5
179
+    ret
180
+.endm
181
+
182
+// Fully unrolled.
183
+.macro SAD_FUNC_SVE2 w, h
184
+function PFX(pixel_sad_\w\()x\h\()_sve2)
185
+    rdvl            x9, #1
186
+    cmp             x9, #16
187
+    bgt             .vl_gt_16_pixel_sad_\w\()x\h
188
+    SAD_START_\w uabdl
189
+    SAD_\w \h
190
+.if \w > 4
191
+    add             v16.8h, v16.8h, v17.8h
192
+.endif
193
+    uaddlv          s0, v16.8h
194
+    fmov            w0, s0
195
+    ret
196
+.vl_gt_16_pixel_sad_\w\()x\h\():
197
+.if \w == 4 || \w == 8 || \w == 12
198
+    SAD_START_\w uabdl
199
+    SAD_\w \h
200
+.if \w > 4
201
x265_3.5.tar.gz/source/common/aarch64/sad-a.S -> x265_3.6.tar.gz/source/common/aarch64/sad-a.S Changed
201
 
1
@@ -1,7 +1,8 @@
2
 /*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
  *
6
  * Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ *          Sebastian Pop <spop@amazon.com>
8
  *
9
  * This program is free software; you can redistribute it and/or modify
10
  * it under the terms of the GNU General Public License as published by
11
@@ -22,84 +23,186 @@
12
  *****************************************************************************/
13
 
14
 #include "asm.S"
15
+#include "sad-a-common.S"
16
 
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
 .section .rodata
21
+#endif
22
 
23
 .align 4
24
 
25
 .text
26
 
27
-.macro SAD_X_START_8 x
28
-    ld1             {v0.8b}, x0, x9
29
-.if \x == 3
30
-    ld1             {v1.8b}, x1, x4
31
-    ld1             {v2.8b}, x2, x4
32
-    ld1             {v3.8b}, x3, x4
33
-.elseif \x == 4
34
-    ld1             {v1.8b}, x1, x5
35
-    ld1             {v2.8b}, x2, x5
36
-    ld1             {v3.8b}, x3, x5
37
-    ld1             {v4.8b}, x4, x5
38
-.endif
39
-    uabdl           v16.8h, v0.8b, v1.8b
40
-    uabdl           v17.8h, v0.8b, v2.8b
41
-    uabdl           v18.8h, v0.8b, v3.8b
42
-.if \x == 4
43
-    uabdl           v19.8h, v0.8b, v4.8b
44
+// Fully unrolled.
45
+.macro SAD_FUNC w, h
46
+function PFX(pixel_sad_\w\()x\h\()_neon)
47
+    SAD_START_\w uabdl
48
+    SAD_\w \h
49
+.if \w > 4
50
+    add             v16.8h, v16.8h, v17.8h
51
 .endif
52
+    uaddlv          s0, v16.8h
53
+    fmov            w0, s0
54
+    ret
55
+endfunc
56
+.endm
57
+
58
+// Loop unrolled 4.
59
+.macro SAD_FUNC_LOOP w, h
60
+function PFX(pixel_sad_\w\()x\h\()_neon)
61
+    SAD_START_\w
62
+
63
+    mov             w9, #\h/8
64
+.loop_\w\()x\h:
65
+    sub             w9, w9, #1
66
+.rept 4
67
+    SAD_\w
68
+.endr
69
+    cbnz            w9, .loop_\w\()x\h
70
+
71
+    SAD_END_\w
72
+endfunc
73
 .endm
74
 
75
-.macro SAD_X_8 x
76
-    ld1             {v0.8b}, x0, x9
77
+SAD_FUNC  4,  4
78
+SAD_FUNC  4,  8
79
+SAD_FUNC  4,  16
80
+SAD_FUNC  8,  4
81
+SAD_FUNC  8,  8
82
+SAD_FUNC  8,  16
83
+SAD_FUNC  8,  32
84
+SAD_FUNC  16, 4
85
+SAD_FUNC  16, 8
86
+SAD_FUNC  16, 12
87
+SAD_FUNC  16, 16
88
+SAD_FUNC  16, 32
89
+SAD_FUNC  16, 64
90
+
91
+SAD_FUNC_LOOP  32, 8
92
+SAD_FUNC_LOOP  32, 16
93
+SAD_FUNC_LOOP  32, 24
94
+SAD_FUNC_LOOP  32, 32
95
+SAD_FUNC_LOOP  32, 64
96
+SAD_FUNC_LOOP  64, 16
97
+SAD_FUNC_LOOP  64, 32
98
+SAD_FUNC_LOOP  64, 48
99
+SAD_FUNC_LOOP  64, 64
100
+SAD_FUNC_LOOP  12, 16
101
+SAD_FUNC_LOOP  24, 32
102
+SAD_FUNC_LOOP  48, 64
103
+
104
+// SAD_X3 and SAD_X4 code start
105
+
106
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores3)
107
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores4)
108
+.macro SAD_X_FUNC x, w, h
109
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
110
+    mov             x9, #FENC_STRIDE
111
+
112
+// Make function arguments for x == 3 look like x == 4.
113
 .if \x == 3
114
-    ld1             {v1.8b}, x1, x4
115
-    ld1             {v2.8b}, x2, x4
116
-    ld1             {v3.8b}, x3, x4
117
-.elseif \x == 4
118
-    ld1             {v1.8b}, x1, x5
119
-    ld1             {v2.8b}, x2, x5
120
-    ld1             {v3.8b}, x3, x5
121
-    ld1             {v4.8b}, x4, x5
122
+    mov             x6, x5
123
+    mov             x5, x4
124
 .endif
125
-    uabal           v16.8h, v0.8b, v1.8b
126
-    uabal           v17.8h, v0.8b, v2.8b
127
-    uabal           v18.8h, v0.8b, v3.8b
128
-.if \x == 4
129
-    uabal           v19.8h, v0.8b, v4.8b
130
+
131
+.if \w == 12
132
+    movrel          x12, sad12_mask
133
+    ld1             {v31.16b}, x12
134
 .endif
135
+
136
+    SAD_X_START_\w \h, \x, uabdl
137
+    SAD_X_\w \h, \x
138
+    SAD_X_END_\w \x
139
+endfunc
140
 .endm
141
 
142
-.macro SAD_X_8xN x, h
143
-function x265_sad_x\x\()_8x\h\()_neon
144
+.macro SAD_X_LOOP x, w, h
145
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
146
     mov             x9, #FENC_STRIDE
147
-    SAD_X_START_8 \x
148
-.rept \h - 1
149
-    SAD_X_8 \x
150
-.endr
151
-    uaddlv          s0, v16.8h
152
-    uaddlv          s1, v17.8h
153
-    uaddlv          s2, v18.8h
154
-.if \x == 4
155
-    uaddlv          s3, v19.8h
156
-.endif
157
 
158
+// Make function arguments for x == 3 look like x == 4.
159
 .if \x == 3
160
-    stp             s0, s1, x5
161
-    str             s2, x5, #8
162
-.elseif \x == 4
163
-    stp             s0, s1, x6
164
-    stp             s2, s3, x6, #8
165
+    mov             x6, x5
166
+    mov             x5, x4
167
 .endif
168
-    ret
169
+    SAD_X_START_\w \x
170
+    mov             w12, #\h/4
171
+.loop_sad_x\x\()_\w\()x\h:
172
+    sub             w12, w12, #1
173
+ .rept 4
174
+  .if \w == 24
175
+    ld1             {v6.16b}, x0, #16
176
+    ld1             {v7.8b}, x0, x9
177
+  .elseif \w == 32
178
+    ld1             {v6.16b-v7.16b}, x0, x9
179
+  .elseif \w == 48
180
+    ld1             {v4.16b-v6.16b}, x0, x9
181
+  .elseif \w == 64
182
+    ld1             {v4.16b-v7.16b}, x0, x9
183
+  .endif
184
+    SAD_X_\w x1, v16, v20
185
+    SAD_X_\w x2, v17, v21
186
+    SAD_X_\w x3, v18, v22
187
+  .if \x == 4
188
+    SAD_X_\w x4, v19, v23
189
+  .endif
190
+ .endr
191
+    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
192
+    SAD_X_END_\w \x
193
 endfunc
194
 .endm
195
 
196
-SAD_X_8xN 3 4
197
-SAD_X_8xN 3 8
198
-SAD_X_8xN 3 16
199
-SAD_X_8xN 3 32
200
 
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S Added
39
 
1
@@ -0,0 +1,37 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch           armv8-a
31
+
32
+.macro ret_v0_w0
33
+    trn2            v1.2d, v0.2d, v0.2d
34
+    add             v0.2s, v0.2s, v1.2s
35
+    addp            v0.2s, v0.2s, v0.2s
36
+    fmov            w0, s0
37
+    ret
38
+.endm
39
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S Added
80
 
1
@@ -0,0 +1,78 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+function PFX(pixel_sse_pp_4x4_sve)
40
+    ptrue           p0.s, vl4
41
+    ld1b            {z0.s}, p0/z, x0
42
+    ld1b            {z17.s}, p0/z, x2
43
+    add             x0, x0, x1
44
+    add             x2, x2, x3
45
+    sub             z0.s, p0/m, z0.s, z17.s
46
+    mul             z0.s, p0/m, z0.s, z0.s
47
+.rept 3
48
+    ld1b            {z16.s}, p0/z, x0
49
+    ld1b            {z17.s}, p0/z, x2
50
+    add             x0, x0, x1
51
+    add             x2, x2, x3
52
+    sub             z16.s, p0/m, z16.s, z17.s
53
+    mla             z0.s, p0/m, z16.s, z16.s
54
+.endr
55
+    uaddv           d0, p0, z0.s
56
+    fmov            w0, s0
57
+    ret
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_sve)
61
+    ptrue           p0.s, vl4
62
+    ld1b            {z0.s}, p0/z, x0
63
+    ld1b            {z17.s}, p0/z, x2
64
+    add             x0, x0, x1
65
+    add             x2, x2, x3
66
+    sub             z0.s, p0/m, z0.s, z17.s
67
+    mul             z0.s, p0/m, z0.s, z0.s
68
+.rept 7
69
+    ld1b            {z16.s}, p0/z, x0
70
+    ld1b            {z17.s}, p0/z, x2
71
+    add             x0, x0, x1
72
+    add             x2, x2, x3
73
+    sub             z16.s, p0/m, z16.s, z17.s
74
+    mla             z0.s, p0/m, z16.s, z16.s
75
+.endr
76
+    uaddv           d0, p0, z0.s
77
+    fmov            w0, s0
78
+    ret
79
+endfunc
80
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S Added
201
 
1
@@ -0,0 +1,887 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "ssd-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sse_pp_32x32_sve2)
41
+    rdvl            x9, #1
42
+    cmp             x9, #16
43
+    bgt             .vl_gt_16_pixel_sse_pp_32x32
44
+    mov             w12, #8
45
+    movi            v0.16b, #0
46
+    movi            v1.16b, #0
47
+.loop_sse_pp_32_sve2:
48
+    sub             w12, w12, #1
49
+.rept 4
50
+    ld1             {v16.16b,v17.16b}, x0, x1
51
+    ld1             {v18.16b,v19.16b}, x2, x3
52
+    usubl           v2.8h, v16.8b, v18.8b
53
+    usubl2          v3.8h, v16.16b, v18.16b
54
+    usubl           v4.8h, v17.8b, v19.8b
55
+    usubl2          v5.8h, v17.16b, v19.16b
56
+    smlal           v0.4s, v2.4h, v2.4h
57
+    smlal2          v1.4s, v2.8h, v2.8h
58
+    smlal           v0.4s, v3.4h, v3.4h
59
+    smlal2          v1.4s, v3.8h, v3.8h
60
+    smlal           v0.4s, v4.4h, v4.4h
61
+    smlal2          v1.4s, v4.8h, v4.8h
62
+    smlal           v0.4s, v5.4h, v5.4h
63
+    smlal2          v1.4s, v5.8h, v5.8h
64
+.endr
65
+    cbnz            w12, .loop_sse_pp_32_sve2
66
+    add             v0.4s, v0.4s, v1.4s
67
+    ret_v0_w0
68
+.vl_gt_16_pixel_sse_pp_32x32:
69
+    ptrue           p0.b, vl32
70
+    ld1b            {z16.b}, p0/z, x0
71
+    ld1b            {z18.b}, p0/z, x2
72
+    add             x0, x0, x1
73
+    add             x2, x2, x3
74
+    usublb          z1.h, z16.b, z18.b
75
+    usublt          z2.h, z16.b, z18.b
76
+    smullb          z0.s, z1.h, z1.h
77
+    smlalt          z0.s, z1.h, z1.h
78
+    smlalb          z0.s, z2.h, z2.h
79
+    smlalt          z0.s, z2.h, z2.h
80
+.rept 31
81
+    ld1b            {z16.b}, p0/z, x0
82
+    ld1b            {z18.b}, p0/z, x2
83
+    add             x0, x0, x1
84
+    add             x2, x2, x3
85
+    usublb          z1.h, z16.b, z18.b
86
+    usublt          z2.h, z16.b, z18.b
87
+    smullb          z0.s, z1.h, z1.h
88
+    smlalt          z0.s, z1.h, z1.h
89
+    smlalb          z0.s, z2.h, z2.h
90
+    smlalt          z0.s, z2.h, z2.h
91
+.endr
92
+    uaddv           d3, p0, z0.s
93
+    fmov            w0, s3
94
+    ret
95
+endfunc
96
+
97
+function PFX(pixel_sse_pp_32x64_sve2)
98
+    rdvl            x9, #1
99
+    cmp             x9, #16
100
+    bgt             .vl_gt_16_pixel_sse_pp_32x64
101
+    ptrue           p0.b, vl16
102
+    ld1b            {z16.b}, p0/z, x0
103
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
104
+    ld1b            {z18.b}, p0/z, x2
105
+    ld1b            {z19.b}, p0/z, x2, #1, mul vl
106
+    add             x0, x0, x1
107
+    add             x2, x2, x3
108
+    usublb          z1.h, z16.b, z18.b
109
+    usublt          z2.h, z16.b, z18.b
110
+    usublb          z3.h, z17.b, z19.b
111
+    usublt          z4.h, z17.b, z19.b
112
+    smullb          z20.s, z1.h, z1.h
113
+    smullt          z21.s, z1.h, z1.h
114
+    smlalb          z20.s, z2.h, z2.h
115
+    smlalt          z21.s, z2.h, z2.h
116
+    smlalb          z20.s, z3.h, z3.h
117
+    smlalt          z21.s, z3.h, z3.h
118
+    smlalb          z20.s, z4.h, z4.h
119
+    smlalt          z21.s, z4.h, z4.h
120
+.rept 63
121
+    ld1b            {z16.b}, p0/z, x0
122
+    ld1b            {z17.b}, p0/z, x0, #1, mul vl
123
+    ld1b            {z18.b}, p0/z, x2
124
+    ld1b            {z19.b}, p0/z, x2, #1, mul vl
125
+    add             x0, x0, x1
126
+    add             x2, x2, x3
127
+    usublb          z1.h, z16.b, z18.b
128
+    usublt          z2.h, z16.b, z18.b
129
+    usublb          z3.h, z17.b, z19.b
130
+    usublt          z4.h, z17.b, z19.b
131
+    smlalb          z20.s, z1.h, z1.h
132
+    smlalt          z21.s, z1.h, z1.h
133
+    smlalb          z20.s, z2.h, z2.h
134
+    smlalt          z21.s, z2.h, z2.h
135
+    smlalb          z20.s, z3.h, z3.h
136
+    smlalt          z21.s, z3.h, z3.h
137
+    smlalb          z20.s, z4.h, z4.h
138
+    smlalt          z21.s, z4.h, z4.h
139
+.endr
140
+    uaddv           d3, p0, z20.s
141
+    fmov            w0, s3
142
+    uaddv           d4, p0, z21.s
143
+    fmov            w1, s4
144
+    add             w0, w0, w1
145
+    ret
146
+.vl_gt_16_pixel_sse_pp_32x64:
147
+    ptrue           p0.b, vl32
148
+    ld1b            {z16.b}, p0/z, x0
149
+    ld1b            {z18.b}, p0/z, x2
150
+    add             x0, x0, x1
151
+    add             x2, x2, x3
152
+    usublb          z1.h, z16.b, z18.b
153
+    usublt          z2.h, z16.b, z18.b
154
+    smullb          z20.s, z1.h, z1.h
155
+    smullt          z21.s, z1.h, z1.h
156
+    smlalb          z20.s, z2.h, z2.h
157
+    smlalt          z21.s, z2.h, z2.h
158
+.rept 63
159
+    ld1b            {z16.b}, p0/z, x0
160
+    ld1b            {z18.b}, p0/z, x2
161
+    add             x0, x0, x1
162
+    add             x2, x2, x3
163
+    usublb          z1.h, z16.b, z18.b
164
+    usublt          z2.h, z16.b, z18.b
165
+    smlalb          z20.s, z1.h, z1.h
166
+    smlalt          z21.s, z1.h, z1.h
167
+    smlalb          z20.s, z2.h, z2.h
168
+    smlalt          z21.s, z2.h, z2.h
169
+.endr
170
+    uaddv           d3, p0, z20.s
171
+    fmov            w0, s3
172
+    uaddv           d4, p0, z21.s
173
+    fmov            w1, s4
174
+    add             w0, w0, w1
175
+    ret
176
+endfunc
177
+
178
+function PFX(pixel_sse_pp_64x64_sve2)
179
+    rdvl            x9, #1
180
+    cmp             x9, #16
181
+    bgt             .vl_gt_16_pixel_sse_pp_64x64
182
+    mov             w12, #16
183
+    movi            v0.16b, #0
184
+    movi            v1.16b, #0
185
+
186
+.loop_sse_pp_64_sve2:
187
+    sub             w12, w12, #1
188
+.rept 4
189
+    ld1             {v16.16b-v19.16b}, x0, x1
190
+    ld1             {v20.16b-v23.16b}, x2, x3
191
+
192
+    usubl           v2.8h, v16.8b, v20.8b
193
+    usubl2          v3.8h, v16.16b, v20.16b
194
+    usubl           v4.8h, v17.8b, v21.8b
195
+    usubl2          v5.8h, v17.16b, v21.16b
196
+    smlal           v0.4s, v2.4h, v2.4h
197
+    smlal2          v1.4s, v2.8h, v2.8h
198
+    smlal           v0.4s, v3.4h, v3.4h
199
+    smlal2          v1.4s, v3.8h, v3.8h
200
+    smlal           v0.4s, v4.4h, v4.4h
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S Added
201
 
1
@@ -0,0 +1,476 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "ssd-a-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+function PFX(pixel_sse_pp_4x4_neon)
39
+    ld1             {v16.s}0, x0, x1
40
+    ld1             {v17.s}0, x2, x3
41
+    ld1             {v18.s}0, x0, x1
42
+    ld1             {v19.s}0, x2, x3
43
+    ld1             {v20.s}0, x0, x1
44
+    ld1             {v21.s}0, x2, x3
45
+    ld1             {v22.s}0, x0, x1
46
+    ld1             {v23.s}0, x2, x3
47
+
48
+    usubl           v1.8h, v16.8b, v17.8b
49
+    usubl           v2.8h, v18.8b, v19.8b
50
+    usubl           v3.8h, v20.8b, v21.8b
51
+    usubl           v4.8h, v22.8b, v23.8b
52
+
53
+    smull           v0.4s, v1.4h, v1.4h
54
+    smlal           v0.4s, v2.4h, v2.4h
55
+    smlal           v0.4s, v3.4h, v3.4h
56
+    smlal           v0.4s, v4.4h, v4.4h
57
+    ret_v0_w0
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_neon)
61
+    ld1             {v16.s}0, x0, x1
62
+    ld1             {v17.s}0, x2, x3
63
+    usubl           v1.8h, v16.8b, v17.8b
64
+    ld1             {v16.s}0, x0, x1
65
+    ld1             {v17.s}0, x2, x3
66
+    smull           v0.4s, v1.4h, v1.4h
67
+.rept 6
68
+    usubl           v1.8h, v16.8b, v17.8b
69
+    ld1             {v16.s}0, x0, x1
70
+    smlal           v0.4s, v1.4h, v1.4h
71
+    ld1             {v17.s}0, x2, x3
72
+.endr
73
+    usubl           v1.8h, v16.8b, v17.8b
74
+    smlal           v0.4s, v1.4h, v1.4h
75
+    ret_v0_w0
76
+endfunc
77
+
78
+function PFX(pixel_sse_pp_8x8_neon)
79
+    ld1             {v16.8b}, x0, x1
80
+    ld1             {v17.8b}, x2, x3
81
+    usubl           v1.8h, v16.8b, v17.8b
82
+    ld1             {v16.8b}, x0, x1
83
+    smull           v0.4s, v1.4h, v1.4h
84
+    smlal2          v0.4s, v1.8h, v1.8h
85
+    ld1             {v17.8b}, x2, x3
86
+
87
+.rept 6
88
+    usubl           v1.8h, v16.8b, v17.8b
89
+    ld1             {v16.8b}, x0, x1
90
+    smlal           v0.4s, v1.4h, v1.4h
91
+    smlal2          v0.4s, v1.8h, v1.8h
92
+    ld1             {v17.8b}, x2, x3
93
+.endr
94
+    usubl           v1.8h, v16.8b, v17.8b
95
+    smlal           v0.4s, v1.4h, v1.4h
96
+    smlal2          v0.4s, v1.8h, v1.8h
97
+    ret_v0_w0
98
+endfunc
99
+
100
+function PFX(pixel_sse_pp_8x16_neon)
101
+    ld1             {v16.8b}, x0, x1
102
+    ld1             {v17.8b}, x2, x3
103
+    usubl           v1.8h, v16.8b, v17.8b
104
+    ld1             {v16.8b}, x0, x1
105
+    smull           v0.4s, v1.4h, v1.4h
106
+    smlal2          v0.4s, v1.8h, v1.8h
107
+    ld1             {v17.8b}, x2, x3
108
+
109
+.rept 14
110
+    usubl           v1.8h, v16.8b, v17.8b
111
+    ld1             {v16.8b}, x0, x1
112
+    smlal           v0.4s, v1.4h, v1.4h
113
+    smlal2          v0.4s, v1.8h, v1.8h
114
+    ld1             {v17.8b}, x2, x3
115
+.endr
116
+    usubl           v1.8h, v16.8b, v17.8b
117
+    smlal           v0.4s, v1.4h, v1.4h
118
+    smlal2          v0.4s, v1.8h, v1.8h
119
+    ret_v0_w0
120
+endfunc
121
+
122
+.macro sse_pp_16xN h
123
+function PFX(pixel_sse_pp_16x\h\()_neon)
124
+    ld1             {v16.16b}, x0, x1
125
+    ld1             {v17.16b}, x2, x3
126
+    usubl           v1.8h, v16.8b, v17.8b
127
+    usubl2          v2.8h, v16.16b, v17.16b
128
+    ld1             {v16.16b}, x0, x1
129
+    ld1             {v17.16b}, x2, x3
130
+    smull           v0.4s, v1.4h, v1.4h
131
+    smlal2          v0.4s, v1.8h, v1.8h
132
+    smlal           v0.4s, v2.4h, v2.4h
133
+    smlal2          v0.4s, v2.8h, v2.8h
134
+.rept \h - 2
135
+    usubl           v1.8h, v16.8b, v17.8b
136
+    usubl2          v2.8h, v16.16b, v17.16b
137
+    ld1             {v16.16b}, x0, x1
138
+    smlal           v0.4s, v1.4h, v1.4h
139
+    smlal2          v0.4s, v1.8h, v1.8h
140
+    ld1             {v17.16b}, x2, x3
141
+    smlal           v0.4s, v2.4h, v2.4h
142
+    smlal2          v0.4s, v2.8h, v2.8h
143
+.endr
144
+    usubl           v1.8h, v16.8b, v17.8b
145
+    usubl2          v2.8h, v16.16b, v17.16b
146
+    smlal           v0.4s, v1.4h, v1.4h
147
+    smlal2          v0.4s, v1.8h, v1.8h
148
+    smlal           v0.4s, v2.4h, v2.4h
149
+    smlal2          v0.4s, v2.8h, v2.8h
150
+    ret_v0_w0
151
+endfunc
152
+.endm
153
+
154
+sse_pp_16xN 16
155
+sse_pp_16xN 32
156
+
157
+function PFX(pixel_sse_pp_32x32_neon)
158
+    mov             w12, #8
159
+    movi            v0.16b, #0
160
+    movi            v1.16b, #0
161
+.loop_sse_pp_32:
162
+    sub             w12, w12, #1
163
+.rept 4
164
+    ld1             {v16.16b,v17.16b}, x0, x1
165
+    ld1             {v18.16b,v19.16b}, x2, x3
166
+    usubl           v2.8h, v16.8b, v18.8b
167
+    usubl2          v3.8h, v16.16b, v18.16b
168
+    usubl           v4.8h, v17.8b, v19.8b
169
+    usubl2          v5.8h, v17.16b, v19.16b
170
+    smlal           v0.4s, v2.4h, v2.4h
171
+    smlal2          v1.4s, v2.8h, v2.8h
172
+    smlal           v0.4s, v3.4h, v3.4h
173
+    smlal2          v1.4s, v3.8h, v3.8h
174
+    smlal           v0.4s, v4.4h, v4.4h
175
+    smlal2          v1.4s, v4.8h, v4.8h
176
+    smlal           v0.4s, v5.4h, v5.4h
177
+    smlal2          v1.4s, v5.8h, v5.8h
178
+.endr
179
+    cbnz            w12, .loop_sse_pp_32
180
+    add             v0.4s, v0.4s, v1.4s
181
+    ret_v0_w0
182
+endfunc
183
+
184
+function PFX(pixel_sse_pp_32x64_neon)
185
+    mov             w12, #16
186
+    movi            v0.16b, #0
187
+    movi            v1.16b, #0
188
+.loop_sse_pp_32x64:
189
+    sub             w12, w12, #1
190
+.rept 4
191
+    ld1             {v16.16b,v17.16b}, x0, x1
192
+    ld1             {v18.16b,v19.16b}, x2, x3
193
+    usubl           v2.8h, v16.8b, v18.8b
194
+    usubl2          v3.8h, v16.16b, v18.16b
195
+    usubl           v4.8h, v17.8b, v19.8b
196
+    usubl2          v5.8h, v17.16b, v19.16b
197
+    smlal           v0.4s, v2.4h, v2.4h
198
+    smlal2          v1.4s, v2.8h, v2.8h
199
+    smlal           v0.4s, v3.4h, v3.4h
200
+    smlal2          v1.4s, v3.8h, v3.8h
201
x265_3.5.tar.gz/source/common/common.h -> x265_3.6.tar.gz/source/common/common.h Changed
51
 
1
@@ -130,7 +130,6 @@
2
 typedef uint64_t pixel4;
3
 typedef int64_t  ssum2_t;
4
 #define SHIFT_TO_BITPLANE 9
5
-#define HISTOGRAM_BINS 1024
6
 #else
7
 typedef uint8_t  pixel;
8
 typedef uint16_t sum_t;
9
@@ -138,7 +137,6 @@
10
 typedef uint32_t pixel4;
11
 typedef int32_t  ssum2_t; // Signed sum
12
 #define SHIFT_TO_BITPLANE 7
13
-#define HISTOGRAM_BINS 256
14
 #endif // if HIGH_BIT_DEPTH
15
 
16
 #if X265_DEPTH < 10
17
@@ -162,6 +160,8 @@
18
 
19
 #define MIN_QPSCALE     0.21249999999999999
20
 #define MAX_MAX_QPSCALE 615.46574234477100
21
+#define FRAME_BRIGHTNESS_THRESHOLD  50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
22
+#define FRAME_EDGE_THRESHOLD  10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
23
 
24
 
25
 template<typename T>
26
@@ -340,6 +340,9 @@
27
 #define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
28
 
29
 #define MAX_NUM_DYN_REFINE          (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
30
+#define X265_BYTE 8
31
+
32
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
33
 
34
 namespace X265_NS {
35
 
36
@@ -434,6 +437,14 @@
37
 #define  x265_unlink(fileName) unlink(fileName)
38
 #define  x265_rename(oldName, newName) rename(oldName, newName)
39
 #endif
40
+/* Close a file */
41
+#define  x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
42
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
43
+    if (fread(val, size, readSize, fileOffset) != readSize)\
44
+    {\
45
+        x265_log(NULL, X265_LOG_ERROR, errorMessage); \
46
+        return; \
47
+    }
48
 int      x265_exp2fix8(double x);
49
 
50
 double   x265_ssim2dB(double ssim);
51
x265_3.5.tar.gz/source/common/cpu.cpp -> x265_3.6.tar.gz/source/common/cpu.cpp Changed
58
 
1
@@ -7,6 +7,8 @@
2
  *          Steve Borho <steve@borho.org>
3
  *          Hongbin Liu <liuhongbin1@huawei.com>
4
  *          Yimeng Su <yimeng.su@huawei.com>
5
+ *          Josh Dekker <josh@itanimul.li>
6
+ *          Jean-Baptiste Kempf <jb@videolan.org>
7
  *
8
  * This program is free software; you can redistribute it and/or modify
9
  * it under the terms of the GNU General Public License as published by
10
@@ -105,6 +107,14 @@
11
     { "NEON",            X265_CPU_NEON },
12
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
13
 
14
+#elif X265_ARCH_ARM64
15
+    { "NEON",            X265_CPU_NEON },
16
+#if defined(HAVE_SVE)
17
+    { "SVE",            X265_CPU_SVE },
18
+#endif
19
+#if defined(HAVE_SVE2)
20
+    { "SVE2",            X265_CPU_SVE2 },
21
+#endif
22
 #elif X265_ARCH_POWER8
23
     { "Altivec",         X265_CPU_ALTIVEC },
24
 
25
@@ -369,12 +379,30 @@
26
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
27
 #endif
28
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
29
-#elif X265_ARCH_ARM64
30
-    flags |= X265_CPU_NEON;
31
 #endif // if HAVE_ARMV6
32
     return flags;
33
 }
34
 
35
+#elif X265_ARCH_ARM64
36
+
37
+uint32_t cpu_detect(bool benableavx512)
38
+{
39
+    int flags = 0;
40
+
41
+    #if defined(HAVE_SVE2)
42
+         flags |= X265_CPU_SVE2;
43
+         flags |= X265_CPU_SVE;
44
+         flags |= X265_CPU_NEON;
45
+    #elif defined(HAVE_SVE)
46
+         flags |= X265_CPU_SVE;
47
+         flags |= X265_CPU_NEON;
48
+    #elif HAVE_NEON
49
+         flags |= X265_CPU_NEON;
50
+    #endif
51
+        
52
+    return flags;
53
+}
54
+
55
 #elif X265_ARCH_POWER8
56
 
57
 uint32_t cpu_detect(bool benableavx512)
58
x265_3.5.tar.gz/source/common/frame.cpp -> x265_3.6.tar.gz/source/common/frame.cpp Changed
102
 
1
@@ -64,12 +64,40 @@
2
     m_edgeBitPlane = NULL;
3
     m_edgeBitPic = NULL;
4
     m_isInsideWindow = 0;
5
+
6
+    // mcstf
7
+    m_isSubSampled = NULL;
8
+    m_mcstf = NULL;
9
+    m_refPicCnt0 = 0;
10
+    m_refPicCnt1 = 0;
11
+    m_nextMCSTF = NULL;
12
+    m_prevMCSTF = NULL;
13
+
14
+    m_tempLayer = 0;
15
+    m_sameLayerRefPic = false;
16
 }
17
 
18
 bool Frame::create(x265_param *param, float* quantOffsets)
19
 {
20
     m_fencPic = new PicYuv;
21
     m_param = param;
22
+
23
+    if (m_param->bEnableTemporalFilter)
24
+    {
25
+        m_mcstf = new TemporalFilter;
26
+        m_mcstf->init(param);
27
+
28
+        m_fencPicSubsampled2 = new PicYuv;
29
+        m_fencPicSubsampled4 = new PicYuv;
30
+
31
+        if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
32
+            return false;
33
+        if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
34
+            return false;
35
+
36
+        CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
37
+    }
38
+
39
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
40
 
41
     if (param->bCTUInfo)
42
@@ -151,6 +179,22 @@
43
     return false;
44
 }
45
 
46
+bool Frame::createSubSample()
47
+{
48
+
49
+    m_fencPicSubsampled2 = new PicYuv;
50
+    m_fencPicSubsampled4 = new PicYuv;
51
+
52
+    if (!m_fencPicSubsampled2->createScaledPicYUV(m_param, 2))
53
+        return false;
54
+    if (!m_fencPicSubsampled4->createScaledPicYUV(m_param, 4))
55
+        return false;
56
+    CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
57
+    return true;
58
+fail:
59
+    return false;
60
+}
61
+
62
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
63
 {
64
     m_encData = new FrameData;
65
@@ -207,6 +251,26 @@
66
         m_fencPic = NULL;
67
     }
68
 
69
+    if (m_param->bEnableTemporalFilter)
70
+    {
71
+
72
+        if (m_fencPicSubsampled2)
73
+        {
74
+            m_fencPicSubsampled2->destroy();
75
+            delete m_fencPicSubsampled2;
76
+            m_fencPicSubsampled2 = NULL;
77
+        }
78
+
79
+        if (m_fencPicSubsampled4)
80
+        {
81
+            m_fencPicSubsampled4->destroy();
82
+            delete m_fencPicSubsampled4;
83
+            m_fencPicSubsampled4 = NULL;
84
+        }
85
+        delete m_mcstf;
86
+        X265_FREE(m_isSubSampled);
87
+    }
88
+
89
     if (m_reconPic)
90
     {
91
         m_reconPic->destroy();
92
@@ -267,7 +331,8 @@
93
         X265_FREE(m_addOnPrevChange);
94
         m_addOnPrevChange = NULL;
95
     }
96
-    m_lowres.destroy();
97
+
98
+    m_lowres.destroy(m_param);
99
     X265_FREE(m_rcData);
100
 
101
     if (m_param->bDynamicRefine)
102
x265_3.5.tar.gz/source/common/frame.h -> x265_3.6.tar.gz/source/common/frame.h Changed
60
 
1
@@ -28,6 +28,7 @@
2
 #include "common.h"
3
 #include "lowres.h"
4
 #include "threading.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 // private namespace
9
@@ -70,6 +71,7 @@
10
     double   count4;
11
     double   offset4;
12
     double   bufferFillFinal;
13
+    int64_t  currentSatd;
14
 };
15
 
16
 class Frame
17
@@ -83,8 +85,12 @@
18
 
19
     /* Data associated with x265_picture */
20
     PicYuv*                m_fencPic;
21
+    PicYuv*                m_fencPicSubsampled2;
22
+    PicYuv*                m_fencPicSubsampled4;
23
+
24
     int                    m_poc;
25
     int                    m_encodeOrder;
26
+    int                    m_gopOffset;
27
     int64_t                m_pts;                // user provided presentation time stamp
28
     int64_t                m_reorderedPts;
29
     int64_t                m_dts;
30
@@ -132,6 +138,13 @@
31
     bool                   m_classifyFrame;
32
     int                    m_fieldNum;
33
 
34
+    /*MCSTF*/
35
+    TemporalFilter*        m_mcstf;
36
+    int                    m_refPicCnt2;
37
+    Frame*                 m_nextMCSTF;           // PicList doubly linked list pointers
38
+    Frame*                 m_prevMCSTF;
39
+    int*                   m_isSubSampled;
40
+
41
     /* aq-mode 4 : Gaussian, edge and theta frames for edge information */
42
     pixel*                 m_edgePic;
43
     pixel*                 m_gaussianPic;
44
@@ -143,9 +156,15 @@
45
 
46
     int                    m_isInsideWindow;
47
 
48
+    /*Frame's temporal layer info*/
49
+    uint8_t                m_tempLayer;
50
+    int8_t                 m_gopId;
51
+    bool                   m_sameLayerRefPic;
52
+
53
     Frame();
54
 
55
     bool create(x265_param *param, float* quantOffsets);
56
+    bool createSubSample();
57
     bool allocEncodeData(x265_param *param, const SPS& sps);
58
     void reinit(const SPS& sps);
59
     void destroy();
60
x265_3.5.tar.gz/source/common/framedata.cpp -> x265_3.6.tar.gz/source/common/framedata.cpp Changed
10
 
1
@@ -62,7 +62,7 @@
2
     }
3
     else
4
         return false;
5
-    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
6
+    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame + 1);
7
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
8
     reinit(sps);
9
     
10
x265_3.5.tar.gz/source/common/lowres.cpp -> x265_3.6.tar.gz/source/common/lowres.cpp Changed
154
 
1
@@ -28,6 +28,28 @@
2
 
3
 using namespace X265_NS;
4
 
5
+/*
6
+ * Down Sample input picture
7
+ */
8
+static
9
+void frame_lowres_core(const pixel* src0, pixel* dst0,
10
+    intptr_t src_stride, intptr_t dst_stride, int width, int height)
11
+{
12
+    for (int y = 0; y < height; y++)
13
+    {
14
+        const pixel* src1 = src0 + src_stride;
15
+        for (int x = 0; x < width; x++)
16
+        {
17
+            // slower than naive bilinear, but matches asm
18
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
19
+            dst0x = FILTER(src02 * x, src12 * x, src02 * x + 1, src12 * x + 1);
20
+#undef FILTER
21
+        }
22
+        src0 += src_stride * 2;
23
+        dst0 += dst_stride;
24
+    }
25
+}
26
+
27
 bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
28
 {
29
     aqPartWidth = partWidth;
30
@@ -73,7 +95,7 @@
31
 
32
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
33
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
34
-    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
35
+    if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
36
     {
37
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
38
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
39
@@ -190,13 +212,45 @@
40
         }
41
     }
42
 
43
+    if (param->bHistBasedSceneCut)
44
+    {
45
+        quarterSampleLowResWidth = widthFullRes / 4;
46
+        quarterSampleLowResHeight = heightFullRes / 4;
47
+        quarterSampleLowResOriginX = 16;
48
+        quarterSampleLowResOriginY = 16;
49
+        quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
50
+
51
+        size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
52
+        /* allocate quarter sampled lowres buffers */
53
+        CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
54
+
55
+        // Allocate memory for Histograms
56
+        picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
57
+        picHistogram0 = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
58
+        for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
59
+            picHistogramwd = picHistogram0 + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
60
+        }
61
+
62
+        for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
63
+        {
64
+            for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
65
+            {
66
+                picHistogramregionInPictureWidthIndexregionInPictureHeightIndex = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
67
+                picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
68
+                for (uint32_t wd = 1; wd < 3; wd++) {
69
+                    picHistogramregionInPictureWidthIndexregionInPictureHeightIndexwd = picHistogramregionInPictureWidthIndexregionInPictureHeightIndex0 + wd * HISTOGRAM_NUMBER_OF_BINS;
70
+                }
71
+            }
72
+        }
73
+    }
74
+
75
     return true;
76
 
77
 fail:
78
     return false;
79
 }
80
 
81
-void Lowres::destroy()
82
+void Lowres::destroy(x265_param* param)
83
 {
84
     X265_FREE(buffer0);
85
     if(bEnableHME)
86
@@ -234,7 +288,8 @@
87
     X265_FREE(invQscaleFactor8x8);
88
     X265_FREE(edgeInclined);
89
     X265_FREE(qpAqMotionOffset);
90
-    X265_FREE(blockVariance);
91
+    if (param->bDynamicRefine || param->bEnableFades)
92
+        X265_FREE(blockVariance);
93
     if (maxAQDepth > 0)
94
     {
95
         for (uint32_t d = 0; d < 4; d++)
96
@@ -254,6 +309,29 @@
97
 
98
         delete pAQLayer;
99
     }
100
+
101
+    // Histograms
102
+    if (param->bHistBasedSceneCut)
103
+    {
104
+        for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
105
+        {
106
+            if (picHistogramsegmentInFrameWidthIdx)
107
+            {
108
+                for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
109
+                {
110
+                    if (picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx)
111
+                        X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx0);
112
+                    X265_FREE(picHistogramsegmentInFrameWidthIdxsegmentInFrameHeightIdx);
113
+                }
114
+            }
115
+        }
116
+        if (picHistogram)
117
+            X265_FREE(picHistogram0);
118
+        X265_FREE(picHistogram);
119
+
120
+        X265_FREE(quarterSampleLowResBuffer);
121
+
122
+    }
123
 }
124
 // (re) initialize lowres state
125
 void Lowres::init(PicYuv *origPic, int poc)
126
@@ -266,10 +344,6 @@
127
     indB = 0;
128
     memset(costEst, -1, sizeof(costEst));
129
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
130
-    interPCostPercDiff = 0.0;
131
-    intraCostPercDiff = 0.0;
132
-    m_bIsMaxThres = false;
133
-    m_bIsHardScenecut = false;
134
 
135
     if (qpAqOffset && invQscaleFactor)
136
         memset(costEstAq, -1, sizeof(costEstAq));
137
@@ -314,4 +388,16 @@
138
     }
139
 
140
     fpelPlane0 = lowresPlane0;
141
+
142
+    if (origPic->m_param->bHistBasedSceneCut)
143
+    {
144
+        // Quarter Sampled Input Picture Formation
145
+        // TO DO: Replace with ASM function
146
+        frame_lowres_core(
147
+            lowresPlane0,
148
+            quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
149
+            lumaStride,
150
+            quarterSampleLowResStrideY,
151
+            widthFullRes / 4, heightFullRes / 4);
152
+    }
153
 }
154
x265_3.5.tar.gz/source/common/lowres.h -> x265_3.6.tar.gz/source/common/lowres.h Changed
73
 
1
@@ -32,6 +32,10 @@
2
 namespace X265_NS {
3
 // private namespace
4
 
5
+#define HISTOGRAM_NUMBER_OF_BINS         256
6
+#define NUMBER_OF_SEGMENTS_IN_WIDTH      4
7
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT     4
8
+
9
 struct ReferencePlanes
10
 {
11
     ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
12
@@ -171,6 +175,7 @@
13
 
14
     int    frameNum;         // Presentation frame number
15
     int    sliceType;        // Slice type decided by lookahead
16
+    int    sliceTypeReq;     // Slice type required as per the QP file
17
     int    width;            // width of lowres frame in pixels
18
     int    lines;            // height of lowres frame in pixel lines
19
     int    leadingBframes;   // number of leading B frames for P or I
20
@@ -214,13 +219,13 @@
21
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
22
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
23
     double*   qpAqMotionOffset;
24
-    int*      invQscaleFactor; // qScale values for qp Aq Offsets
25
+    int*      invQscaleFactor;    // qScale values for qp Aq Offsets
26
     int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
27
     uint32_t* blockVariance;
28
     uint64_t  wp_ssd3;       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
29
     uint64_t  wp_sum3;
30
     double    frameVariance;
31
-    int* edgeInclined;
32
+    int*      edgeInclined;
33
 
34
 
35
     /* cutree intermediate data */
36
@@ -230,18 +235,30 @@
37
     uint32_t heightFullRes;
38
     uint32_t m_maxCUSize;
39
     uint32_t m_qgSize;
40
-    
41
+
42
     uint16_t* propagateCost;
43
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
44
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
45
+
46
     /* For hist-based scenecut */
47
-    bool   m_bIsMaxThres;
48
-    double interPCostPercDiff;
49
-    double intraCostPercDiff;
50
-    bool   m_bIsHardScenecut;
51
+    int          quarterSampleLowResWidth;     // width of 1/4 lowres frame in pixels
52
+    int          quarterSampleLowResHeight;    // height of 1/4 lowres frame in pixels
53
+    int          quarterSampleLowResStrideY;
54
+    int          quarterSampleLowResOriginX;
55
+    int          quarterSampleLowResOriginY;
56
+    pixel       *quarterSampleLowResBuffer;
57
+    bool         bHistScenecutAnalyzed;
58
+
59
+    uint16_t     picAvgVariance;
60
+    uint16_t     picAvgVarianceCb;
61
+    uint16_t     picAvgVarianceCr;
62
+
63
+    uint32_t ****picHistogram;
64
+    uint64_t     averageIntensityPerSegment[NUMBER_OF_SEGMENTS_IN_WIDTH][NUMBER_OF_SEGMENTS_IN_HEIGHT][3];
65
+    uint8_t      averageIntensity[3];
66
 
67
     bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
68
-    void destroy();
69
+    void destroy(x265_param* param);
70
     void init(PicYuv *origPic, int poc);
71
 };
72
 }
73
x265_3.5.tar.gz/source/common/mv.h -> x265_3.6.tar.gz/source/common/mv.h Changed
10
 
1
@@ -105,6 +105,8 @@
2
     {
3
         return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y;
4
     }
5
+
6
+    void set(int32_t _x, int32_t _y) { x = _x; y = _y; }
7
 };
8
 }
9
 
10
x265_3.5.tar.gz/source/common/param.cpp -> x265_3.6.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -145,6 +145,8 @@
2
     param->bAnnexB = 1;
3
     param->bRepeatHeaders = 0;
4
     param->bEnableAccessUnitDelimiters = 0;
5
+    param->bEnableEndOfBitstream = 0;
6
+    param->bEnableEndOfSequence = 0;
7
     param->bEmitHRDSEI = 0;
8
     param->bEmitInfoSEI = 1;
9
     param->bEmitHDRSEI = 0; /*Deprecated*/
10
@@ -163,12 +165,12 @@
11
     param->keyframeMax = 250;
12
     param->gopLookahead = 0;
13
     param->bOpenGOP = 1;
14
+   param->craNal = 0;
15
     param->bframes = 4;
16
     param->lookaheadDepth = 20;
17
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
18
     param->bBPyramid = 1;
19
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
20
-    param->edgeTransitionThreshold = 0.03;
21
     param->bHistBasedSceneCut = 0;
22
     param->lookaheadSlices = 8;
23
     param->lookaheadThreads = 0;
24
@@ -179,12 +181,20 @@
25
     param->bEnableHRDConcatFlag = 0;
26
     param->bEnableFades = 0;
27
     param->bEnableSceneCutAwareQp = 0;
28
-    param->fwdScenecutWindow = 500;
29
-    param->fwdRefQpDelta = 5;
30
-    param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
31
-    param->bwdScenecutWindow = 100;
32
-    param->bwdRefQpDelta = -1;
33
-    param->bwdNonRefQpDelta = -1;
34
+    param->fwdMaxScenecutWindow = 1200;
35
+    param->bwdMaxScenecutWindow = 600;
36
+    for (int i = 0; i < 6; i++)
37
+    {
38
+        int deltas[6] = { 5, 4, 3, 2, 1, 0 };
39
+
40
+        param->fwdScenecutWindow[i] = 200;
41
+        param->fwdRefQpDelta[i] = deltas[i];
42
+        param->fwdNonRefQpDelta[i] = param->fwdRefQpDelta[i] + (SLICE_TYPE_DELTA * param->fwdRefQpDelta[i]);
43
+
44
+        param->bwdScenecutWindow[i] = 100;
45
+        param->bwdRefQpDelta[i] = -1;
46
+        param->bwdNonRefQpDelta[i] = -1;
47
+    }
48
 
49
     /* Intra Coding Tools */
50
     param->bEnableConstrainedIntra = 0;
51
@@ -278,7 +288,10 @@
52
     param->rc.rfConstantMin = 0;
53
     param->rc.bStatRead = 0;
54
     param->rc.bStatWrite = 0;
55
+    param->rc.dataShareMode = X265_SHARE_MODE_FILE;
56
     param->rc.statFileName = NULL;
57
+    param->rc.sharedMemName = NULL;
58
+    param->rc.bEncFocusedFramesOnly = 0;
59
     param->rc.complexityBlur = 20;
60
     param->rc.qblur = 0.5;
61
     param->rc.zoneCount = 0;
62
@@ -321,6 +334,7 @@
63
     param->maxLuma = PIXEL_MAX;
64
     param->log2MaxPocLsb = 8;
65
     param->maxSlices = 1;
66
+    param->videoSignalTypePreset = NULL;
67
 
68
     /*Conformance window*/
69
     param->confWinRightOffset = 0;
70
@@ -373,10 +387,17 @@
71
     param->bEnableSvtHevc = 0;
72
     param->svtHevcParam = NULL;
73
 
74
+    /* MCSTF */
75
+    param->bEnableTemporalFilter = 0;
76
+    param->temporalFilterStrength = 0.95;
77
+
78
 #ifdef SVT_HEVC
79
     param->svtHevcParam = svtParam;
80
     svt_param_default(param);
81
 #endif
82
+    /* Film grain characteristics model filename */
83
+    param->filmGrain = NULL;
84
+    param->bEnableSBRC = 0;
85
 }
86
 
87
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
88
@@ -666,6 +687,46 @@
89
 #define atof(str) x265_atof(str, bError)
90
 #define atobool(str) (x265_atobool(str, bError))
91
 
92
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
93
+{
94
+    bool bError = false;
95
+    char nameBuf[64];
96
+    if (!name)
97
+        return X265_PARAM_BAD_NAME;
98
+    // skip -- prefix if provided
99
+    if (name[0] == '-' && name[1] == '-')
100
+        name += 2;
101
+    // s/_/-/g
102
+    if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
103
+    {
104
+        char *c;
105
+        strcpy(nameBuf, name);
106
+        while ((c = strchr(nameBuf, '_')) != 0)
107
+            *c = '-';
108
+        name = nameBuf;
109
+    }
110
+    if (!value)
111
+        value = "true";
112
+    else if (value[0] == '=')
113
+        value++;
114
+#define OPT(STR) else if (!strcmp(name, STR))
115
+    if (0);
116
+    OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
117
+    OPT("masking-strength") bError = parseMaskingStrength(p, value);
118
+    else
119
+        return X265_PARAM_BAD_NAME;
120
+#undef OPT
121
+    return bError ? X265_PARAM_BAD_VALUE : 0;
122
+}
123
+
124
+
125
+/* internal versions of string-to-int with additional error checking */
126
+#undef atoi
127
+#undef atof
128
+#define atoi(str) x265_atoi(str, bError)
129
+#define atof(str) x265_atof(str, bError)
130
+#define atobool(str) (x265_atobool(str, bError))
131
+
132
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
133
 {
134
     bool bError = false;
135
@@ -949,10 +1010,9 @@
136
        {
137
            bError = false;
138
            p->scenecutThreshold = atoi(value);
139
-           p->bHistBasedSceneCut = 0;
140
        }
141
     }
142
-    OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
143
+    OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
144
     OPT("keyint") p->keyframeMax = atoi(value);
145
     OPT("min-keyint") p->keyframeMin = atoi(value);
146
     OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
147
@@ -1184,6 +1244,7 @@
148
         int pass = x265_clip3(0, 3, atoi(value));
149
         p->rc.bStatWrite = pass & 1;
150
         p->rc.bStatRead = pass & 2;
151
+        p->rc.dataShareMode = X265_SHARE_MODE_FILE;
152
     }
153
     OPT("stats") p->rc.statFileName = strdup(value);
154
     OPT("scaling-list") p->scalingLists = strdup(value);
155
@@ -1216,21 +1277,7 @@
156
         OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
157
         OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
158
         OPT("scenecut-bias") p->scenecutBias = atof(value);
159
-        OPT("hist-scenecut")
160
-        {
161
-            p->bHistBasedSceneCut = atobool(value);
162
-            if (bError)
163
-            {
164
-                bError = false;
165
-                p->bHistBasedSceneCut = 0;
166
-            }
167
-            if (p->bHistBasedSceneCut)
168
-            {
169
-                bError = false;
170
-                p->scenecutThreshold = 0;
171
-            }
172
-        }
173
-        OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
174
+        OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
175
         OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
176
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
177
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
178
@@ -1238,6 +1285,7 @@
179
         OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
180
         OPT("aq-motion") p->bAQMotion = atobool(value);
181
         OPT("dynamic-rd") p->dynamicRd = atof(value);
182
+       OPT("cra-nal") p->craNal = atobool(value);
183
         OPT("analysis-reuse-level")
184
         {
185
             p->analysisReuseLevel = atoi(value);
186
@@ -1348,71 +1396,7 @@
187
         }
188
         OPT("fades") p->bEnableFades = atobool(value);
189
         OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
190
-        OPT("masking-strength")
191
-        {
192
-            int window1;
193
-            double refQpDelta1, nonRefQpDelta1;
194
-
195
-            if (p->bEnableSceneCutAwareQp == FORWARD)
196
-            {
197
-                if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
198
-                {
199
-                    if (window1 > 0)
200
-                        p->fwdScenecutWindow = window1;
201
x265_3.5.tar.gz/source/common/param.h -> x265_3.6.tar.gz/source/common/param.h Changed
17
 
1
@@ -38,6 +38,7 @@
2
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
3
 bool  parseLambdaFile(x265_param *param);
4
 void x265_copy_params(x265_param* dst, x265_param* src);
5
+bool parseMaskingStrength(x265_param* p, const char* value);
6
 
7
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
8
 static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
9
@@ -52,6 +53,7 @@
10
 int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
11
 int x265_param_apply_profile(x265_param *, const char *profile);
12
 int x265_param_parse(x265_param *p, const char *name, const char *value);
13
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
14
 int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
15
 #define PARAM_NS X265_NS
16
 #endif
17
x265_3.5.tar.gz/source/common/piclist.cpp -> x265_3.6.tar.gz/source/common/piclist.cpp Changed
134
 
1
@@ -45,6 +45,25 @@
2
     m_count++;
3
 }
4
 
5
+void PicList::pushFrontMCSTF(Frame& curFrame)
6
+{
7
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_nextMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
8
+    curFrame.m_nextMCSTF = m_start;
9
+    curFrame.m_prevMCSTF = NULL;
10
+
11
+    if (m_count)
12
+    {
13
+        m_start->m_prevMCSTF = &curFrame;
14
+        m_start = &curFrame;
15
+    }
16
+    else
17
+    {
18
+        m_start = m_end = &curFrame;
19
+    }
20
+    m_count++;
21
+
22
+}
23
+
24
 void PicList::pushBack(Frame& curFrame)
25
 {
26
     X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
27
@@ -63,6 +82,24 @@
28
     m_count++;
29
 }
30
 
31
+void PicList::pushBackMCSTF(Frame& curFrame)
32
+{
33
+    X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
34
+    curFrame.m_nextMCSTF = NULL;
35
+    curFrame.m_prevMCSTF = m_end;
36
+
37
+    if (m_count)
38
+    {
39
+        m_end->m_nextMCSTF = &curFrame;
40
+        m_end = &curFrame;
41
+    }
42
+    else
43
+    {
44
+        m_start = m_end = &curFrame;
45
+    }
46
+    m_count++;
47
+}
48
+
49
 Frame *PicList::popFront()
50
 {
51
     if (m_start)
52
@@ -94,6 +131,14 @@
53
     return curFrame;
54
 }
55
 
56
+Frame* PicList::getPOCMCSTF(int poc)
57
+{
58
+    Frame *curFrame = m_start;
59
+    while (curFrame && curFrame->m_poc != poc)
60
+        curFrame = curFrame->m_nextMCSTF;
61
+    return curFrame;
62
+}
63
+
64
 Frame *PicList::popBack()
65
 {
66
     if (m_end)
67
@@ -117,6 +162,29 @@
68
         return NULL;
69
 }
70
 
71
+Frame *PicList::popBackMCSTF()
72
+{
73
+    if (m_end)
74
+    {
75
+        Frame* temp = m_end;
76
+        m_count--;
77
+
78
+        if (m_count)
79
+        {
80
+            m_end = m_end->m_prevMCSTF;
81
+            m_end->m_nextMCSTF = NULL;
82
+        }
83
+        else
84
+        {
85
+            m_start = m_end = NULL;
86
+        }
87
+        temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
88
+        return temp;
89
+    }
90
+    else
91
+        return NULL;
92
+}
93
+
94
 Frame* PicList::getCurFrame(void)
95
 {
96
     Frame *curFrame = m_start;
97
@@ -158,3 +226,36 @@
98
 
99
     curFrame.m_next = curFrame.m_prev = NULL;
100
 }
101
+
102
+void PicList::removeMCSTF(Frame& curFrame)
103
+{
104
+#if _DEBUG
105
+    Frame *tmp = m_start;
106
+    while (tmp && tmp != &curFrame)
107
+    {
108
+        tmp = tmp->m_nextMCSTF;
109
+    }
110
+
111
+    X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
112
+#endif
113
+
114
+    m_count--;
115
+    if (m_count)
116
+    {
117
+        if (m_start == &curFrame)
118
+            m_start = curFrame.m_nextMCSTF;
119
+        if (m_end == &curFrame)
120
+            m_end = curFrame.m_prevMCSTF;
121
+
122
+        if (curFrame.m_nextMCSTF)
123
+            curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
124
+        if (curFrame.m_prevMCSTF)
125
+            curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
126
+    }
127
+    else
128
+    {
129
+        m_start = m_end = NULL;
130
+    }
131
+
132
+    curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
133
+}
134
x265_3.5.tar.gz/source/common/piclist.h -> x265_3.6.tar.gz/source/common/piclist.h Changed
33
 
1
@@ -49,24 +49,31 @@
2
 
3
     /** Push picture to end of the list */
4
     void pushBack(Frame& pic);
5
+    void pushBackMCSTF(Frame& pic);
6
 
7
     /** Push picture to beginning of the list */
8
     void pushFront(Frame& pic);
9
+    void pushFrontMCSTF(Frame& pic);
10
 
11
     /** Pop picture from end of the list */
12
     Frame* popBack();
13
+    Frame* popBackMCSTF();
14
 
15
     /** Pop picture from beginning of the list */
16
     Frame* popFront();
17
 
18
     /** Find frame with specified POC */
19
     Frame* getPOC(int poc);
20
+    /* Find next MCSTF frame with specified POC */
21
+    Frame* getPOCMCSTF(int poc);
22
 
23
     /** Get the current Frame from the list **/
24
     Frame* getCurFrame(void);
25
 
26
     /** Remove picture from list */
27
     void remove(Frame& pic);
28
+    /* Remove MCSTF picture from list */
29
+    void removeMCSTF(Frame& pic);
30
 
31
     Frame* first()        { return m_start;   }
32
 
33
x265_3.5.tar.gz/source/common/picyuv.cpp -> x265_3.6.tar.gz/source/common/picyuv.cpp Changed
60
 
1
@@ -125,6 +125,58 @@
2
     return false;
3
 }
4
 
5
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
6
+void PicYuv::copyFromFrame(PicYuv* source)
7
+{
8
+    uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
9
+
10
+    int maxHeight = numCuInHeight * m_param->maxCUSize;
11
+    memcpy(m_picBuf[0], source->m_picBuf[0], sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
12
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
13
+
14
+    if (m_picCsp != X265_CSP_I400)
15
+    {
16
+        memcpy(m_picBuf[1], source->m_picBuf[1], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
17
+        memcpy(m_picBuf[2], source->m_picBuf[2], sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
18
+
19
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
20
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
21
+    }
22
+    else
23
+    {
24
+        m_picBuf[1] = m_picBuf[2] = NULL;
25
+        m_picOrg[1] = m_picOrg[2] = NULL;
26
+    }
27
+}
28
+
29
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
30
+{
31
+    m_param = param;
32
+    m_picWidth = m_param->sourceWidth / scaleFactor;
33
+    m_picHeight = m_param->sourceHeight / scaleFactor;
34
+
35
+    m_picCsp = m_param->internalCsp;
36
+    m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
37
+    m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
38
+
39
+    uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
40
+    uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
41
+
42
+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
43
+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
44
+    m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
45
+
46
+    int maxHeight = numCuInHeight * param->maxCUSize;
47
+    CHECKED_MALLOC_ZERO(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
48
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
49
+    m_picBuf[1] = m_picBuf[2] = NULL;
50
+    m_picOrg[1] = m_picOrg[2] = NULL;
51
+    return true;
52
+
53
+fail:
54
+    return false;
55
+}
56
+
57
 int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
58
 {
59
     m_picWidth = picWidth;
60
x265_3.5.tar.gz/source/common/picyuv.h -> x265_3.6.tar.gz/source/common/picyuv.h Changed
15
 
1
@@ -78,11 +78,13 @@
2
     PicYuv();
3
 
4
     bool  create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
5
+    bool  createScaledPicYUV(x265_param* param, uint8_t scaleFactor);
6
     bool  createOffsets(const SPS& sps);
7
     void  destroy();
8
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
9
 
10
     void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
11
+    void  copyFromFrame(PicYuv* source);
12
 
13
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
14
 
15
x265_3.5.tar.gz/source/common/pixel.cpp -> x265_3.6.tar.gz/source/common/pixel.cpp Changed
51
 
1
@@ -266,7 +266,7 @@
2
 {
3
     int satd = 0;
4
 
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
6
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
7
     pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
8
 #endif
9
 
10
@@ -284,7 +284,7 @@
11
 {
12
     int satd = 0;
13
 
14
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
15
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
16
     pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
17
 #endif
18
 
19
@@ -627,6 +627,23 @@
20
     }
21
 }
22
 
23
+static
24
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
25
+{
26
+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
27
+    {
28
+        const pixel *inRow = src0;
29
+        const pixel *inRowBelow = src0 + src_stride;
30
+        pixel *target = dst0;
31
+        for (int x = 0; x < width; x++)
32
+        {
33
+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;
34
+            inRow += 2;
35
+            inRowBelow += 2;
36
+        }
37
+    }
38
+}
39
+
40
 /* structural similarity metric */
41
 static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
42
 {
43
@@ -1355,5 +1372,7 @@
44
     p.cu[BLOCK_16x16].normFact = normFact_c;
45
     p.cu[BLOCK_32x32].normFact = normFact_c;
46
     p.cu[BLOCK_64x64].normFact = normFact_c;
47
+    /* SubSample Luma*/
48
+    p.frameSubSampleLuma = frame_subsample_luma;
49
 }
50
 }
51
x265_3.5.tar.gz/source/common/ppc/intrapred_altivec.cpp -> x265_3.6.tar.gz/source/common/ppc/intrapred_altivec.cpp Changed
10
 
1
@@ -27,7 +27,7 @@
2
 #include <assert.h>
3
 #include <math.h>
4
 #include <cmath>
5
-#include <linux/types.h>
6
+#include <sys/types.h>
7
 #include <stdlib.h>
8
 #include <stdio.h>
9
 #include <stdint.h>
10
x265_3.5.tar.gz/source/common/primitives.h -> x265_3.6.tar.gz/source/common/primitives.h Changed
28
 
1
@@ -232,6 +232,8 @@
2
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
3
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
4
 typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
5
+/* SubSampling Luma */
6
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
 /* Function pointers to optimized encoder primitives. Each pointer can reference
8
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
9
 struct EncoderPrimitives
10
@@ -353,6 +355,8 @@
11
 
12
     downscale_t           frameInitLowres;
13
     downscale_t           frameInitLowerRes;
14
+    /* Sub Sample Luma */
15
+    downscaleluma_t        frameSubSampleLuma;
16
     cutree_propagate_cost propagateCost;
17
     cutree_fix8_unpack    fix8Unpack;
18
     cutree_fix8_pack      fix8Pack;
19
@@ -488,7 +492,7 @@
20
 
21
 #if ENABLE_ASSEMBLY && X265_ARCH_ARM64
22
 extern "C" {
23
-#include "aarch64/pixel-util.h"
24
+#include "aarch64/fun-decls.h"
25
 }
26
 #endif
27
 
28
x265_3.6.tar.gz/source/common/ringmem.cpp Added
201
 
1
@@ -0,0 +1,357 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#include "ringmem.h"
26
+
27
+#ifndef _WIN32
28
+#include <sys/mman.h>
29
+#endif ////< _WIN32
30
+
31
+#ifdef _WIN32
32
+#define X265_SHARED_MEM_NAME                    "Local\\_x265_shr_mem_"
33
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "_x265_semW_"
34
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "_x265_semR_"
35
+#else /* POSIX / pthreads */
36
+#define X265_SHARED_MEM_NAME                    "/tmp/_x265_shr_mem_"
37
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME     "/tmp/_x265_semW_"
38
+#define X265_SEMAPHORE_RINGMEM_READER_NAME     "/tmp/_x265_semR_"
39
+#endif
40
+
41
+#define RINGMEM_ALLIGNMENT                       64
42
+
43
+namespace X265_NS {
44
+    RingMem::RingMem() 
45
+        : m_initialized(false)
46
+        , m_protectRW(false)
47
+        , m_itemSize(0)
48
+        , m_itemCnt(0)
49
+        , m_dataPool(NULL)
50
+        , m_shrMem(NULL)
51
+#ifdef _WIN32
52
+        , m_handle(NULL)
53
+#else //_WIN32
54
+        , m_filepath(NULL)
55
+#endif //_WIN32
56
+        , m_writeSem(NULL)
57
+        , m_readSem(NULL)
58
+    {
59
+    }
60
+
61
+
62
+    RingMem::~RingMem()
63
+    {
64
+    }
65
+
66
+    bool RingMem::skipRead(int32_t cnt) {
67
+        if (!m_initialized)
68
+        {
69
+            return false;
70
+        }
71
+
72
+        if (m_protectRW)
73
+        {
74
+            for (int i = 0; i < cnt; i++)
75
+            {
76
+                m_readSem->take();
77
+            }
78
+        }
79
+        
80
+        ATOMIC_ADD(&m_shrMem->m_read, cnt);
81
+
82
+        if (m_protectRW)
83
+        {
84
+            m_writeSem->give(cnt);
85
+        }
86
+
87
+        return true;
88
+    }
89
+
90
+    bool RingMem::skipWrite(int32_t cnt) {
91
+        if (!m_initialized)
92
+        {
93
+            return false;
94
+        }
95
+
96
+        if (m_protectRW)
97
+        {
98
+            for (int i = 0; i < cnt; i++)
99
+            {
100
+                m_writeSem->take();
101
+            }
102
+        }
103
+
104
+        ATOMIC_ADD(&m_shrMem->m_write, cnt);
105
+
106
+        if (m_protectRW)
107
+        {
108
+            m_readSem->give(cnt);
109
+        }
110
+
111
+        return true;
112
+    }
113
+
114
+    ///< initialize
115
+    bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
116
+    {
117
+        ///< check parameters
118
+        if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
119
+        {
120
+            ///< invalid parameters 
121
+            return false;
122
+        }
123
+
124
+        if (!m_initialized)
125
+        {
126
+            ///< formating names
127
+            char nameBuf[MAX_SHR_NAME_LEN] = { 0 };
128
+
129
+            ///< shared memory name
130
+            snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
131
+
132
+            ///< create or open shared memory
133
+            bool newCreated = false;
134
+
135
+            ///< calculate the size of the shared memory
136
+            int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
137
+
138
+#ifdef _WIN32
139
+            HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
140
+            if (!h)
141
+            {
142
+                h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
143
+
144
+                if (!h)
145
+                {
146
+                    return false;
147
+                }
148
+
149
+                newCreated = true;
150
+            }
151
+
152
+            void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
153
+
154
+            ///< should not close the handle here, otherwise the OpenFileMapping would fail
155
+            //CloseHandle(h);
156
+            m_handle = h;
157
+
158
+            if (!pool)
159
+            {
160
+                return false;
161
+            }
162
+
163
+#else /* POSIX / pthreads */
164
+            mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
165
+            int flag = O_RDWR;
166
+            int shrfd = -1;
167
+            if ((shrfd = open(nameBuf, flag, mode)) < 0)
168
+            {
169
+                flag |= O_CREAT;
170
+                
171
+                shrfd = open(nameBuf, flag, mode);
172
+                if (shrfd < 0)
173
+                {
174
+                    return false;
175
+                }
176
+                newCreated = true;
177
+
178
+                lseek(shrfd, shrMemSize - 1, SEEK_SET);
179
+
180
+                if (-1 == write(shrfd, "\0", 1))
181
+                {
182
+                    close(shrfd);
183
+                    return false;
184
+                }
185
+
186
+                if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
187
+                {
188
+                    close(shrfd);
189
+                    return false;
190
+                }
191
+            }
192
+
193
+            void *pool = mmap(0,
194
+                shrMemSize,
195
+                PROT_READ | PROT_WRITE,
196
+                MAP_SHARED,
197
+                shrfd,
198
+                0);
199
+
200
+            close(shrfd);
201
x265_3.6.tar.gz/source/common/ringmem.h Added
92
 
1
@@ -0,0 +1,90 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_RINGMEM_H
26
+#define X265_RINGMEM_H
27
+
28
+#include "common.h"
29
+#include "threading.h"
30
+
31
+#if _MSC_VER
32
+#define snprintf _snprintf
33
+#define strdup _strdup
34
+#endif
35
+
36
+namespace X265_NS {
37
+
38
+#define MAX_SHR_NAME_LEN                         256
39
+
40
+    class RingMem {
41
+    public:
42
+        RingMem();
43
+        ~RingMem();
44
+
45
+        bool skipRead(int32_t cnt);
46
+
47
+        bool skipWrite(int32_t cnt);
48
+
49
+        ///< initialize
50
+        ///< protectRW: if use the semaphore the protect the write and read operation.
51
+        bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
52
+        ///< finalize
53
+        void release();
54
+
55
+        typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
56
+
57
+        ///< data read
58
+        bool readNext(void* dst, fnRWSharedData callback);
59
+        ///< data write
60
+        bool writeData(void *data, fnRWSharedData callback);
61
+
62
+    private:        
63
+        bool    m_initialized;
64
+        bool    m_protectRW;
65
+
66
+        int32_t m_itemSize;
67
+        int32_t m_itemCnt;
68
+        ///< data pool
69
+        void   *m_dataPool;
70
+        typedef struct {
71
+            ///< index to write
72
+            int32_t m_write;
73
+            ///< index to read
74
+            int32_t m_read;
75
+            
76
+        }ShrMemCtrl;
77
+
78
+        ShrMemCtrl *m_shrMem;
79
+#ifdef _WIN32
80
+        void       *m_handle;
81
+#else // _WIN32
82
+        char       *m_filepath;
83
+#endif // _WIN32
84
+
85
+        ///< Semaphores
86
+        NamedSemaphore *m_writeSem;
87
+        NamedSemaphore *m_readSem;
88
+    };
89
+};
90
+
91
+#endif // ifndef X265_RINGMEM_H
92
x265_3.5.tar.gz/source/common/slice.h -> x265_3.6.tar.gz/source/common/slice.h Changed
35
 
1
@@ -156,9 +156,9 @@
2
     HRDInfo          hrdParameters;
3
     ProfileTierLevel ptl;
4
     uint32_t         maxTempSubLayers;
5
-    uint32_t         numReorderPics;
6
-    uint32_t         maxDecPicBuffering;
7
-    uint32_t         maxLatencyIncrease;
8
+    uint32_t         numReorderPics[MAX_T_LAYERS];
9
+    uint32_t         maxDecPicBuffering[MAX_T_LAYERS];
10
+    uint32_t         maxLatencyIncrease[MAX_T_LAYERS];
11
 };
12
 
13
 struct Window
14
@@ -235,9 +235,9 @@
15
     uint32_t maxAMPDepth;
16
 
17
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
18
-    uint32_t maxDecPicBuffering; // these are dups of VPS values
19
-    uint32_t maxLatencyIncrease;
20
-    int      numReorderPics;
21
+    uint32_t maxDecPicBuffering[MAX_T_LAYERS]; // these are dups of VPS values
22
+    uint32_t maxLatencyIncrease[MAX_T_LAYERS];
23
+    int      numReorderPics[MAX_T_LAYERS];
24
 
25
     RPS      spsrps[MAX_NUM_SHORT_TERM_RPS];
26
     int      spsrpsNum;
27
@@ -363,6 +363,7 @@
28
     int         m_iNumRPSInSPS;
29
     const x265_param *m_param;
30
     int         m_fieldNum;
31
+    Frame*      m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
32
 
33
     Slice()
34
     {
35
x265_3.6.tar.gz/source/common/temporalfilter.cpp Added
201
 
1
@@ -0,0 +1,1017 @@
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+#include "common.h"
25
+#include "temporalfilter.h"
26
+#include "primitives.h"
27
+
28
+#include "frame.h"
29
+#include "slice.h"
30
+#include "framedata.h"
31
+#include "analysis.h"
32
+
33
+using namespace X265_NS;
34
+
35
+void OrigPicBuffer::addPicture(Frame* inFrame)
36
+{
37
+    m_mcstfPicList.pushFrontMCSTF(*inFrame);
38
+}
39
+
40
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
41
+{
42
+    m_mcstfOrigPicFreeList.pushFrontMCSTF(*inFrame);
43
+}
44
+
45
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
46
+{
47
+    m_mcstfOrigPicList.pushFrontMCSTF(*inFrame);
48
+}
49
+
50
+OrigPicBuffer::~OrigPicBuffer()
51
+{
52
+    while (!m_mcstfOrigPicList.empty())
53
+    {
54
+        Frame* curFrame = m_mcstfOrigPicList.popBackMCSTF();
55
+        curFrame->destroy();
56
+        delete curFrame;
57
+    }
58
+
59
+    while (!m_mcstfOrigPicFreeList.empty())
60
+    {
61
+        Frame* curFrame = m_mcstfOrigPicFreeList.popBackMCSTF();
62
+        curFrame->destroy();
63
+        delete curFrame;
64
+    }
65
+}
66
+
67
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
68
+{
69
+    Slice* slice = inFrame->m_encData->m_slice;
70
+    uint8_t j = 0;
71
+    for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
72
+        iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
73
+    {
74
+        if (iterPOC != inFrame->m_poc)
75
+        {
76
+            if (iterPOC < 0)
77
+                continue;
78
+            if (iterPOC >= frameCnt)
79
+                break;
80
+
81
+            Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
82
+            X265_CHECK(iterFrame, "Reference frame not found in OPB");
83
+            if (iterFrame != NULL)
84
+            {
85
+                slice->m_mcstfRefFrameList[1][j] = iterFrame;
86
+                iterFrame->m_refPicCnt[1]--;
87
+            }
88
+
89
+            iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
90
+            if (iterFrame != NULL)
91
+            {
92
+
93
+                slice->m_mcstfRefFrameList[1][j] = iterFrame;
93
+
94
+                iterFrame->m_refPicCnt[1]--;
95
+                Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
96
+                X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
97
+                cFrame->m_refPicCnt[1]--;
99
+            }
100
+            j++;
101
+        }
102
+    }
103
+}
104
+
105
+void OrigPicBuffer::recycleOrigPicList()
106
+{
107
+    Frame *iterFrame = m_mcstfPicList.first();
108
+
109
+    while (iterFrame)
110
+    {
111
+        Frame *curFrame = iterFrame;
112
+        iterFrame = iterFrame->m_nextMCSTF;
113
+        if (!curFrame->m_refPicCnt[1])
114
+        {
115
+            m_mcstfPicList.removeMCSTF(*curFrame);
116
+            iterFrame = m_mcstfPicList.first();
117
+        }
118
+    }
119
+
120
+    iterFrame = m_mcstfOrigPicList.first();
121
+
122
+    while (iterFrame)
123
+    {
124
+        Frame *curFrame = iterFrame;
125
+        iterFrame = iterFrame->m_nextMCSTF;
126
+        if (!curFrame->m_refPicCnt[1])
127
+        {
128
+            m_mcstfOrigPicList.removeMCSTF(*curFrame);
129
+            *curFrame->m_isSubSampled = false;
130
+            m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
131
+            iterFrame = m_mcstfOrigPicList.first();
132
+        }
133
+    }
134
+}
135
+
136
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
137
+{
138
+    m_mcstfOrigPicFreeList.pushBack(*inFrame);
139
+}
140
+
141
+TemporalFilter::TemporalFilter()
142
+{
143
+    m_sourceWidth = 0;
144
+    m_sourceHeight = 0,
145
+    m_QP = 0;
146
+    m_sliceTypeConfig = 3;
147
+    m_numRef = 0;
148
+    m_useSADinME = 1;
149
+
150
+    m_range = 2;
151
+    m_chromaFactor = 0.55;
152
+    m_sigmaMultiplier = 9.0;
153
+    m_sigmaZeroPoint = 10.0;
154
+    m_motionVectorFactor = 16;
155
+}
156
+
157
+void TemporalFilter::init(const x265_param* param)
158
+{
159
+    m_param = param;
160
+    m_bitDepth = param->internalBitDepth;
161
+    m_sourceWidth = param->sourceWidth;
162
+    m_sourceHeight = param->sourceHeight;
163
+    m_internalCsp = param->internalCsp;
164
+    m_numComponents = (m_internalCsp != X265_CSP_I400) ? MAX_NUM_COMPONENT : 1;
165
+
166
+    m_metld = new MotionEstimatorTLD;
167
+
168
+    predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
169
+}
170
+
171
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
172
+{
173
+    CHECKED_MALLOC_ZERO(refFrame->mvs, MV, sizeof(MV)* ((m_sourceWidth ) / 4) * ((m_sourceHeight ) / 4));
174
+    refFrame->mvsStride = m_sourceWidth / 4;
175
+    CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
176
+    refFrame->mvsStride0 = m_sourceWidth / 16;
177
+    CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
178
+    refFrame->mvsStride1 = m_sourceWidth / 16;
179
+    CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, sizeof(MV)* ((m_sourceWidth ) / 16)*((m_sourceHeight ) / 16));
180
+    refFrame->mvsStride2 = m_sourceWidth / 16;
181
+
182
+    CHECKED_MALLOC_ZERO(refFrame->noise, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
183
+    CHECKED_MALLOC_ZERO(refFrame->error, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
184
+
185
+    refFrame->slicetype = X265_TYPE_AUTO;
186
+
187
+    refFrame->compensatedPic = new PicYuv;
188
+    refFrame->compensatedPic->create(param, true);
189
+
190
+    return 1;
191
+fail:
192
+    return 0;
193
+}
194
+
195
+int TemporalFilter::motionErrorLumaSAD(
196
+    PicYuv *orig,
197
+    PicYuv *buffer,
198
+    int x,
199
+    int y,
200
+    int dx,
201
x265_3.6.tar.gz/source/common/temporalfilter.h Added
187
 
1
@@ -0,0 +1,185 @@
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+
25
+#ifndef X265_TEMPORAL_FILTER_H
26
+#define X265_TEMPORAL_FILTER_H
27
+
28
+#include "x265.h"
29
+#include "picyuv.h"
30
+#include "mv.h"
31
+#include "piclist.h"
32
+#include "yuv.h"
33
+#include "motion.h"
34
+
35
+const int s_interpolationFilter[16][8] =
36
+{
37
+    {   0,   0,   0,  64,   0,   0,   0,   0 },   //0
38
+    {   0,   1,  -3,  64,   4,  -2,   0,   0 },   //1 -->-->
39
+    {   0,   1,  -6,  62,   9,  -3,   1,   0 },   //2 -->
40
+    {   0,   2,  -8,  60,  14,  -5,   1,   0 },   //3 -->-->
41
+    {   0,   2,  -9,  57,  19,  -7,   2,   0 },   //4
42
+    {   0,   3, -10,  53,  24,  -8,   2,   0 },   //5 -->-->
43
+    {   0,   3, -11,  50,  29,  -9,   2,   0 },   //6 -->
44
+    {   0,   3, -11,  44,  35, -10,   3,   0 },   //7 -->-->
45
+    {   0,   1,  -7,  38,  38,  -7,   1,   0 },   //8
46
+    {   0,   3, -10,  35,  44, -11,   3,   0 },   //9 -->-->
47
+    {   0,   2,  -9,  29,  50, -11,   3,   0 },   //10-->
48
+    {   0,   2,  -8,  24,  53, -10,   3,   0 },   //11-->-->
49
+    {   0,   2,  -7,  19,  57,  -9,   2,   0 },   //12
50
+    {   0,   1,  -5,  14,  60,  -8,   2,   0 },   //13-->-->
51
+    {   0,   1,  -3,   9,  62,  -6,   1,   0 },   //14-->
52
+    {   0,   0,  -2,   4,  64,  -3,   1,   0 }    //15-->-->
53
+};
54
+
55
+const double s_refStrengths[3][4] =
56
+{ // abs(POC offset)
57
+  //  1,    2     3     4
58
+  {0.85, 0.57, 0.41, 0.33},  // m_range * 2
59
+  {1.13, 0.97, 0.81, 0.57},  // m_range
60
+  {0.30, 0.30, 0.30, 0.30}   // otherwise
61
+};
62
+
63
+namespace X265_NS {
64
+    class OrigPicBuffer
65
+    {
66
+    public:
67
+        PicList    m_mcstfPicList;
68
+        PicList    m_mcstfOrigPicFreeList;
69
+        PicList    m_mcstfOrigPicList;
70
+
71
+        ~OrigPicBuffer();
72
+        void addPicture(Frame*);
73
+        void addEncPicture(Frame*);
74
+        void setOrigPicList(Frame*, int);
75
+        void recycleOrigPicList();
76
+        void addPictureToFreelist(Frame*);
77
+        void addEncPictureToPicList(Frame*);
78
+    };
79
+
80
+    struct MotionEstimatorTLD
81
+    {
82
+        MotionEstimate  me;
83
+
84
+        MotionEstimatorTLD()
85
+        {
86
+            me.init(X265_CSP_I400);
87
+            me.setQP(X265_LOOKAHEAD_QP);
88
+        }
89
+
90
+        ~MotionEstimatorTLD() {}
91
+    };
92
+
93
+    struct TemporalFilterRefPicInfo
94
+    {
95
+        PicYuv*    picBuffer;
96
+        PicYuv*    picBufferSubSampled2;
97
+        PicYuv*    picBufferSubSampled4;
98
+        MV*        mvs;
99
+        MV*        mvs0;
100
+        MV*        mvs1;
101
+        MV*        mvs2;
102
+        uint32_t   mvsStride;
103
+        uint32_t   mvsStride0;
104
+        uint32_t   mvsStride1;
105
+        uint32_t   mvsStride2;
106
+        int*       error;
107
+        int*       noise;
108
+
109
+        int16_t    origOffset;
110
+        bool       isFilteredFrame;
111
+        PicYuv*    compensatedPic;
112
+
113
+        int*       isSubsampled;
114
+
115
+        int        slicetype;
116
+    };
117
+
118
+    class TemporalFilter
119
+    {
120
+    public:
121
+        TemporalFilter();
122
+        ~TemporalFilter() {}
123
+
124
+        void init(const x265_param* param);
125
+
126
+        //private:
127
+            // Private static member variables
128
+        const x265_param *m_param;
129
+        int32_t  m_bitDepth;
130
+        int m_range;
131
+        uint8_t m_numRef;
132
+        double m_chromaFactor;
133
+        double m_sigmaMultiplier;
134
+        double m_sigmaZeroPoint;
135
+        int m_motionVectorFactor;
136
+        int m_padding;
137
+
138
+        // Private member variables
139
+
140
+        int m_sourceWidth;
141
+        int m_sourceHeight;
142
+        int m_QP;
143
+
144
+        int m_internalCsp;
145
+        int m_numComponents;
146
+        uint8_t m_sliceTypeConfig;
147
+
148
+        MotionEstimatorTLD* m_metld;
149
+        Yuv  predPUYuv;
150
+        int m_useSADinME;
151
+
152
+        int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
153
+
154
+        void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
155
+
156
+        void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
157
+            MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
158
+
159
+        void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
160
+            MV *previous, uint32_t prevMvStride, int factor, int* minError);
161
+
162
+        int motionErrorLumaSSD(PicYuv *orig,
163
+            PicYuv *buffer,
164
+            int x,
165
+            int y,
166
+            int dx,
167
+            int dy,
168
+            int bs,
169
+            int besterror = 8 * 8 * 1024 * 1024);
170
+
171
+        int motionErrorLumaSAD(PicYuv *orig,
172
+            PicYuv *buffer,
173
+            int x,
174
+            int y,
175
+            int dx,
176
+            int dy,
177
+            int bs,
178
+            int besterror = 8 * 8 * 1024 * 1024);
179
+
180
+        void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
181
+
182
+        void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
183
+
184
+    };
185
+}
186
+#endif
187
x265_3.5.tar.gz/source/common/threading.h -> x265_3.6.tar.gz/source/common/threading.h Changed
201
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Steve Borho <steve@borho.org>
4
  *          Min Chen <chenm003@163.com>
5
+            liwei <liwei@multicorewareinc.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -253,6 +254,47 @@
10
     int                m_val;
11
 };
12
 
13
+class NamedSemaphore
14
+{
15
+public:
16
+    NamedSemaphore() : m_sem(NULL)
17
+    {
18
+    }
19
+
20
+    ~NamedSemaphore()
21
+    {
22
+    }
23
+
24
+    bool create(const char* name, const int initcnt, const int maxcnt)
25
+    {
26
+        if(!m_sem)
27
+        {
28
+            m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
29
+        }
30
+        return m_sem != NULL;
31
+    }
32
+
33
+    bool give(const int32_t cnt)
34
+    {
35
+        return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
36
+    }
37
+
38
+    bool take(const uint32_t time_out = INFINITE)
39
+    {
40
+        int32_t rt = WaitForSingleObject(m_sem, time_out);
41
+        return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
42
+    }
43
+
44
+    void release()
45
+    {
46
+        CloseHandle(m_sem);
47
+        m_sem = NULL;
48
+    }
49
+
50
+private:
51
+    HANDLE m_sem;
52
+};
53
+
54
 #else /* POSIX / pthreads */
55
 
56
 typedef pthread_t ThreadHandle;
57
@@ -459,6 +501,282 @@
58
     int             m_val;
59
 };
60
 
61
+#define TIMEOUT_INFINITE 0xFFFFFFFF
62
+
63
+class NamedSemaphore
64
+{
65
+public:
66
+    NamedSemaphore() 
67
+        : m_sem(NULL)
68
+#ifndef __APPLE__
69
+        , m_name(NULL)
70
+#endif //__APPLE__
71
+    {
72
+    }
73
+
74
+    ~NamedSemaphore()
75
+    {
76
+    }
77
+
78
+    bool create(const char* name, const int initcnt, const int maxcnt)
79
+    {
80
+        bool ret = false;
81
+
82
+        if (initcnt >= maxcnt)
83
+        {
84
+            return false;
85
+        }
86
+
87
+#ifdef __APPLE__
88
+        do
89
+        {
90
+            int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
91
+
92
+            m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
93
+            if (!m_sem)
94
+            {
95
+                break;
96
+            }
97
+
98
+            if (pthread_mutexattr_init(&m_sem->mutexAttr))
99
+            {
100
+                break;
101
+            }
102
+
103
+            if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
104
+            {
105
+                break;
106
+            }
107
+
108
+            if (pthread_condattr_init(&m_sem->condAttr))
109
+            {
110
+                break;
111
+            }
112
+
113
+            if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
114
+            {
115
+                break;
116
+            }
117
+
118
+            if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
119
+            {
120
+                break;
121
+            }
122
+
123
+            if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
124
+            {
125
+                break;
126
+            }
127
+
128
+            m_sem->curCnt = initcnt;
129
+            m_sem->maxCnt = maxcnt;
130
+
131
+            ret = true;
132
+        } while (0);
133
+        
134
+        if (!ret)
135
+        {
136
+            release();
137
+        }
138
+
139
+#else  //__APPLE__
140
+        m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
141
+        if (m_sem != SEM_FAILED) 
142
+        {
143
+            m_name = strdup(name);
144
+            ret = true;
145
+        }
146
+        else 
147
+        {
148
+            if (EEXIST == errno) 
149
+            {
150
+                m_sem = sem_open(name, 0);
151
+                if (m_sem != SEM_FAILED) 
152
+                {
153
+                    m_name = strdup(name);
154
+                    ret = true;
155
+                }
156
+            }
157
+        }
158
+#endif //__APPLE__
159
+
160
+        return ret;
161
+    }
162
+
163
+    bool give(const int32_t cnt)
164
+    {
165
+        if (!m_sem)
166
+        {
167
+            return false;
168
+        }
169
+
170
+#ifdef __APPLE__
171
+        if (pthread_mutex_lock(&m_sem->mutex))
172
+        {
173
+            return false;
174
+        }
175
+
176
+        int oldCnt = m_sem->curCnt;
177
+        m_sem->curCnt += cnt;
178
+        if (m_sem->curCnt > m_sem->maxCnt)
179
+        {
180
+            m_sem->curCnt = m_sem->maxCnt;
181
+        }
182
+
183
+        bool ret = true;
184
+        if (!oldCnt)
185
+        {
186
+            ret = 0 == pthread_cond_broadcast(&m_sem->cond);
187
+        }
188
+
189
+        if (pthread_mutex_unlock(&m_sem->mutex))
190
+        {
191
+            return false;
192
+        }
193
+
194
+        return ret;
195
+#else //__APPLE__
196
+        int ret = 0;
197
+        int32_t curCnt = cnt;
198
+        while (curCnt-- && !ret) {
199
+            ret = sem_post(m_sem);
200
+        }
201
x265_3.5.tar.gz/source/common/threadpool.cpp -> x265_3.6.tar.gz/source/common/threadpool.cpp Changed
10
 
1
@@ -301,7 +301,7 @@
2
     /* limit threads based on param->numaPools
3
      * For windows because threads can't be allocated to live across sockets
4
      * changing the default behavior to be per-socket pools -- FIXME */
5
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
7
     if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
8
     {
9
          char poolString[50] = "";
10
x265_3.5.tar.gz/source/common/version.cpp -> x265_3.6.tar.gz/source/common/version.cpp Changed
10
 
1
@@ -71,7 +71,7 @@
2
 #define ONOS    "Unk-OS"
3
 #endif
4
 
5
-#if X86_64
6
+#if defined(_LP64) || defined(_WIN64)
7
 #define BITS    "64 bit"
8
 #else
9
 #define BITS    "32 bit"
10
x265_3.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/x86/asm-primitives.cpp Changed
85
 
1
@@ -1091,6 +1091,7 @@
2
 
3
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
4
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
5
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
6
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it 
7
         //p.planecopy_sp = PFX(downShift_16_sse2);
8
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
9
@@ -1121,6 +1122,7 @@
10
     {
11
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
12
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
13
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
14
 
15
         // p.puLUMA_4x4.satd = p.cuBLOCK_4x4.sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
16
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
17
@@ -1462,6 +1464,7 @@
18
         p.puLUMA_64x48.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
19
         p.puLUMA_64x64.copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
20
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
21
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
22
     }
23
     if (cpuMask & X265_CPU_XOP)
24
     {
25
@@ -1473,6 +1476,7 @@
26
         LUMA_VAR(xop);
27
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
28
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
29
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
30
     }
31
     if (cpuMask & X265_CPU_AVX2)
32
     {
33
@@ -2301,6 +2305,9 @@
34
 
35
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
36
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
37
+
38
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
39
+
40
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
41
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
42
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
43
@@ -3300,6 +3307,7 @@
44
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
45
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
46
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
47
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
48
 
49
         ALL_LUMA_TU(blockfill_sNONALIGNED, blockfill_s, sse2);
50
         ALL_LUMA_TU(blockfill_sALIGNED, blockfill_s, sse2);
51
@@ -3424,6 +3432,8 @@
52
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
53
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
54
 
55
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
56
+
57
         ASSIGN2(p.puLUMA_8x4.convert_p2s, filterPixelToShort_8x4_ssse3);
58
         ASSIGN2(p.puLUMA_8x8.convert_p2s, filterPixelToShort_8x8_ssse3);
59
         ASSIGN2(p.puLUMA_8x16.convert_p2s, filterPixelToShort_8x16_ssse3);
60
@@ -3691,6 +3701,7 @@
61
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
62
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
63
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
64
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
65
     }
66
     if (cpuMask & X265_CPU_XOP)
67
     {
68
@@ -3702,6 +3713,7 @@
69
         p.cuBLOCK_16x16.sse_pp = PFX(pixel_ssd_16x16_xop);
70
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
71
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
72
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
73
 
74
     }
75
 #if X86_64
76
@@ -4684,6 +4696,8 @@
77
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
78
         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
79
 
80
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
81
+
82
         if (cpuMask & X265_CPU_BMI2)
83
         {
84
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
85
x265_3.5.tar.gz/source/common/x86/const-a.asm -> x265_3.6.tar.gz/source/common/x86/const-a.asm Changed
10
 
1
@@ -100,7 +100,7 @@
2
 const pw_2000,              times 16 dw 0x2000
3
 const pw_8000,              times  8 dw 0x8000
4
 const pw_3fff,              times 16 dw 0x3fff
5
-const pw_32_0,              times  4 dw 32,
6
+const pw_32_0,              times  4 dw 32
7
                             times  4 dw 0
8
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
9
 
10
x265_3.5.tar.gz/source/common/x86/h-ipfilter8.asm -> x265_3.6.tar.gz/source/common/x86/h-ipfilter8.asm Changed
20
 
1
@@ -125,6 +125,9 @@
2
 ALIGN 32
3
 interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
4
 
5
+ALIGN 32
6
+const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
7
+
8
 SECTION .text
9
 
10
 cextern pw_1
11
@@ -1459,8 +1462,6 @@
12
 
13
     RET
14
 
15
-ALIGN 32
16
-const interp_4tap_8x8_horiz_shuf,   dd 0, 4, 1, 5, 2, 6, 3, 7
17
 
18
 %macro FILTER_H4_w6 3
19
     movu        %1, srcq - 1
20
x265_3.5.tar.gz/source/common/x86/mc-a2.asm -> x265_3.6.tar.gz/source/common/x86/mc-a2.asm Changed
201
 
1
@@ -992,6 +992,262 @@
2
 FRAME_INIT_LOWRES
3
 %endif
4
 
5
+%macro SUBSAMPLEFILT8x4 7
6
+    mova      %3, r0+%7
7
+    mova      %4, r0+r2+%7
8
+    pavgb     %3, %4
9
+    pavgb     %4, r0+r2*2+%7
10
+    PALIGNR   %1, %3, 1, m6
11
+    PALIGNR   %2, %4, 1, m6
12
+%if cpuflag(xop)
13
+    pavgb     %1, %3
14
+    pavgb     %2, %4
15
+%else
16
+    pavgb     %1, %3
17
+    pavgb     %2, %4
18
+    psrlw     %5, %1, 8
19
+    psrlw     %6, %2, 8
20
+    pand      %1, m7
21
+    pand      %2, m7
22
+%endif
23
+%endmacro
24
+
25
+%macro SUBSAMPLEFILT32x4U 1
26
+    movu      m1, r0+r2
27
+    pavgb     m0, m1, r0
28
+    movu      m3, r0+r2+1
29
+    pavgb     m2, m3, r0+1
30
+    pavgb     m1, r0+r2*2
31
+    pavgb     m3, r0+r2*2+1
32
+    pavgb     m0, m2
33
+    pavgb     m1, m3
34
+
35
+    movu      m3, r0+r2+mmsize
36
+    pavgb     m2, m3, r0+mmsize
37
+    movu      m5, r0+r2+1+mmsize
38
+    pavgb     m4, m5, r0+1+mmsize
39
+    pavgb     m2, m4
40
+
41
+    pshufb    m0, m7
42
+    pshufb    m2, m7
43
+    punpcklqdq m0, m0, m2
44
+    vpermq    m0, m0, q3120
45
+    movu    %1, m0
46
+%endmacro
47
+
48
+%macro SUBSAMPLEFILT16x2 3
49
+    mova      m3, r0+%3+mmsize
50
+    mova      m2, r0+%3
51
+    pavgb     m3, r0+%3+r2+mmsize
52
+    pavgb     m2, r0+%3+r2
53
+    PALIGNR   %1, m3, 1, m6
54
+    pavgb     %1, m3
55
+    PALIGNR   m3, m2, 1, m6
56
+    pavgb     m3, m2
57
+%if cpuflag(xop)
58
+    vpperm    m3, m3, %1, m6
59
+%else
60
+    pand      m3, m7
61
+    pand      %1, m7
62
+    packuswb  m3, %1
63
+%endif
64
+    mova    %2, m3
65
+    mova      %1, m2
66
+%endmacro
67
+
68
+%macro SUBSAMPLEFILT8x2U 2
69
+    mova      m2, r0+%2
70
+    pavgb     m2, r0+%2+r2
71
+    mova      m0, r0+%2+1
72
+    pavgb     m0, r0+%2+r2+1
73
+    pavgb     m1, m3
74
+    pavgb     m0, m2
75
+    pand      m1, m7
76
+    pand      m0, m7
77
+    packuswb  m0, m1
78
+    mova    %1, m0
79
+%endmacro
80
+
81
+%macro SUBSAMPLEFILT8xU 2
82
+    mova      m3, r0+%2+8
83
+    mova      m2, r0+%2
84
+    pavgw     m3, r0+%2+r2+8
85
+    pavgw     m2, r0+%2+r2
86
+    movu      m1, r0+%2+10
87
+    movu      m0, r0+%2+2
88
+    pavgw     m1, r0+%2+r2+10
89
+    pavgw     m0, r0+%2+r2+2
90
+    pavgw     m1, m3
91
+    pavgw     m0, m2
92
+    psrld     m3, m1, 16
93
+    pand      m1, m7
94
+    pand      m0, m7
95
+    packssdw  m0, m1
96
+    movu    %1, m0
97
+%endmacro
98
+
99
+%macro SUBSAMPLEFILT8xA 3
100
+    movu      m3, r0+%3+mmsize
101
+    movu      m2, r0+%3
102
+    pavgw     m3, r0+%3+r2+mmsize
103
+    pavgw     m2, r0+%3+r2
104
+    PALIGNR   %1, m3, 2, m6
105
+    pavgw     %1, m3
106
+    PALIGNR   m3, m2, 2, m6
107
+    pavgw     m3, m2
108
+%if cpuflag(xop)
109
+    vpperm    m3, m3, %1, m6
110
+%else
111
+    pand      m3, m7
112
+    pand      %1, m7
113
+    packssdw  m3, %1
114
+%endif
115
+%if cpuflag(avx2)
116
+    vpermq     m3, m3, q3120
117
+%endif
118
+    movu    %2, m3
119
+    movu      %1, m2
120
+%endmacro
121
+
122
+;-----------------------------------------------------------------------------
123
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
124
+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
125
+;-----------------------------------------------------------------------------
126
+
127
+%macro FRAME_SUBSAMPLE_LUMA 0
128
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
129
+%if HIGH_BIT_DEPTH
130
+    shl   dword r3m, 1
131
+    FIX_STRIDES r2
132
+    shl   dword r4m, 1
133
+%endif
134
+%if mmsize >= 16
135
+    add   dword r4m, mmsize-1
136
+    and   dword r4m, ~(mmsize-1)
137
+%endif
138
+    ; src += 2*(height-1)*stride + 2*width
139
+    mov      r6d, r5m
140
+    dec      r6d
141
+    imul     r6d, r2d
142
+    add      r6d, r4m
143
+    lea       r0, r0+r6*2
144
+    ; dst += (height-1)*stride + width
145
+    mov      r6d, r5m
146
+    dec      r6d
147
+    imul     r6d, r3m
148
+    add      r6d, r4m
149
+    add       r1, r6
150
+    ; gap = stride - width
151
+    mov      r6d, r3m
152
+    sub      r6d, r4m
153
+    PUSH      r6
154
+    %define dst_gap rsp+gprsize
155
+    mov      r6d, r2d
156
+    sub      r6d, r4m
157
+    shl      r6d, 1
158
+    PUSH      r6
159
+    %define src_gap rsp
160
+%if HIGH_BIT_DEPTH
161
+%if cpuflag(xop)
162
+    mova      m6, deinterleave_shuf32a
163
+    mova      m7, deinterleave_shuf32b
164
+%else
165
+    pcmpeqw   m7, m7
166
+    psrld     m7, 16
167
+%endif
168
+.vloop:
169
+    mov      r6d, r4m
170
+%ifnidn cpuname, mmx2
171
+    movu      m0, r0
172
+    movu      m1, r0+r2
173
+    pavgw     m0, m1
174
+    pavgw     m1, r0+r2*2
175
+%endif
176
+.hloop:
177
+    sub       r0, mmsize*2
178
+    sub       r1, mmsize
179
+%ifidn cpuname, mmx2
180
+    SUBSAMPLEFILT8xU r1, 0
181
+%else
182
+    SUBSAMPLEFILT8xA m0, r1, 0
183
+%endif
184
+    sub      r6d, mmsize
185
+    jg .hloop
186
+%else ; !HIGH_BIT_DEPTH
187
+%if cpuflag(avx2)
188
+    mova      m7, deinterleave_shuf
189
+%elif cpuflag(xop)
190
+    mova      m6, deinterleave_shuf32a
191
+    mova      m7, deinterleave_shuf32b
192
+%else
193
+    pcmpeqb   m7, m7
194
+    psrlw     m7, 8
195
+%endif
196
+.vloop:
197
+    mov      r6d, r4m
198
+%ifnidn cpuname, mmx2
199
+%if mmsize <= 16
200
+    mova      m0, r0
201
x265_3.5.tar.gz/source/common/x86/mc.h -> x265_3.6.tar.gz/source/common/x86/mc.h Changed
19
 
1
@@ -36,6 +36,17 @@
2
 
3
 #undef LOWRES
4
 
5
+#define SUBSAMPLELUMA(cpu) \
6
+    void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
+SUBSAMPLELUMA(mmx2)
8
+SUBSAMPLELUMA(sse2)
9
+SUBSAMPLELUMA(ssse3)
10
+SUBSAMPLELUMA(avx)
11
+SUBSAMPLELUMA(avx2)
12
+SUBSAMPLELUMA(xop)
13
+
14
+#undef SUBSAMPLELUMA
15
+
16
 #define PROPAGATE_COST(cpu) \
17
     void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
18
                                               const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
19
x265_3.5.tar.gz/source/common/x86/x86inc.asm -> x265_3.6.tar.gz/source/common/x86/x86inc.asm Changed
96
 
1
@@ -401,16 +401,6 @@
2
     %endif
3
 %endmacro
4
 
5
-%macro DEFINE_ARGS_INTERNAL 3+
6
-    %ifnum %2
7
-        DEFINE_ARGS %3
8
-    %elif %1 == 4
9
-        DEFINE_ARGS %2
10
-    %elif %1 > 4
11
-        DEFINE_ARGS %2, %3
12
-    %endif
13
-%endmacro
14
-
15
 %if WIN64 ; Windows x64 ;=================================================
16
 
17
 DECLARE_REG 0,  rcx
18
@@ -429,7 +419,7 @@
19
 DECLARE_REG 13, R12, 112
20
 DECLARE_REG 14, R13, 120
21
 
22
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
23
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
24
     %assign num_args %1
25
     %assign regs_used %2
26
     ASSERT regs_used >= num_args
27
@@ -441,7 +431,15 @@
28
         WIN64_SPILL_XMM %3
29
     %endif
30
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
31
-    DEFINE_ARGS_INTERNAL %0, %4, %5
32
+    %if %0 > 4
33
+         %ifnum %4
34
+             DEFINE_ARGS %5
35
+         %else
36
+             DEFINE_ARGS %4, %5
37
+         %endif
38
+     %elifnnum %4
39
+         DEFINE_ARGS %4
40
+     %endif
41
 %endmacro
42
 
43
 %macro WIN64_PUSH_XMM 0
44
@@ -537,7 +535,7 @@
45
 DECLARE_REG 13, R12, 64
46
 DECLARE_REG 14, R13, 72
47
 
48
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
49
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
50
     %assign num_args %1
51
     %assign regs_used %2
52
     %assign xmm_regs_used %3
53
@@ -547,7 +545,15 @@
54
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
55
     ALLOC_STACK %4
56
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
57
-    DEFINE_ARGS_INTERNAL %0, %4, %5
58
+    %if %0 > 4
59
+         %ifnum %4
60
+             DEFINE_ARGS %5
61
+         %else
62
+             DEFINE_ARGS %4, %5
63
+         %endif
64
+     %elifnnum %4
65
+         DEFINE_ARGS %4
66
+     %endif
67
 %endmacro
68
 
69
 %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
70
@@ -588,7 +594,7 @@
71
 
72
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
73
 
74
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
75
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
76
     %assign num_args %1
77
     %assign regs_used %2
78
     ASSERT regs_used >= num_args
79
@@ -603,7 +609,15 @@
80
     PUSH_IF_USED 3, 4, 5, 6
81
     ALLOC_STACK %4
82
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
83
-    DEFINE_ARGS_INTERNAL %0, %4, %5
84
+    %if %0 > 4
85
+         %ifnum %4
86
+             DEFINE_ARGS %5
87
+         %else
88
+             DEFINE_ARGS %4, %5
89
+         %endif
90
+     %elifnnum %4
91
+         DEFINE_ARGS %4
92
+     %endif
93
 %endmacro
94
 
95
 %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
96
x265_3.5.tar.gz/source/common/x86/x86util.asm -> x265_3.6.tar.gz/source/common/x86/x86util.asm Changed
13
 
1
@@ -578,8 +578,10 @@
2
     %elif %1==2
3
         %if mmsize==8
4
             SBUTTERFLY dq, %3, %4, %5
5
-        %else
6
+        %elif %0==6
7
             TRANS q, ORDER, %3, %4, %5, %6
8
+        %else
9
+            TRANS q, ORDER, %3, %4, %5
10
         %endif
11
     %elif %1==4
12
         SBUTTERFLY qdq, %3, %4, %5
13
x265_3.5.tar.gz/source/encoder/analysis.cpp -> x265_3.6.tar.gz/source/encoder/analysis.cpp Changed
10
 
1
@@ -3645,7 +3645,7 @@
2
             qp += distortionData->offset[ctu.m_cuAddr];
3
     }
4
 
5
-    if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
6
+    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
7
     {
8
         int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
9
         if (ctu.m_slice->m_sliceType == I_SLICE)
10
x265_3.5.tar.gz/source/encoder/api.cpp -> x265_3.6.tar.gz/source/encoder/api.cpp Changed
50
 
1
@@ -208,7 +208,6 @@
2
     memcpy(zoneParam, param, sizeof(x265_param));
3
     for (int i = 0; i < param->rc.zonefileCount; i++)
4
     {
5
-        param->rc.zones[i].startFrame = -1;
6
         encoder->configureZone(zoneParam, param->rc.zones[i].zoneParam);
7
     }
8
 
9
@@ -608,6 +607,14 @@
10
     if (numEncoded < 0)
11
         encoder->m_aborted = true;
12
 
13
+    if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
14
+    {
15
+        Bitstream bs;
16
+        encoder->getEndNalUnits(encoder->m_nalList, bs);
17
+        *pp_nal = &encoder->m_nalList.m_nal[0];
18
+        if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
19
+    }
20
+
21
     return numEncoded;
22
 }
23
 
24
@@ -1042,6 +1049,7 @@
25
     &PARAM_NS::x265_param_free,
26
     &PARAM_NS::x265_param_default,
27
     &PARAM_NS::x265_param_parse,
28
+    &PARAM_NS::x265_scenecut_aware_qp_param_parse,
29
     &PARAM_NS::x265_param_apply_profile,
30
     &PARAM_NS::x265_param_default_preset,
31
     &x265_picture_alloc,
32
@@ -1288,6 +1296,8 @@
33
             if (param->csvLogLevel)
34
             {
35
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
36
+                if (!!param->bEnableTemporalSubLayers)
37
+                    fprintf(csvfp, "Temporal Sub Layer ID, ");
38
                 if (param->csvLogLevel >= 2)
39
                     fprintf(csvfp, "I/P cost ratio, ");
40
                 if (param->rc.rateControlMode == X265_RC_CRF)
41
@@ -1401,6 +1411,8 @@
42
     const x265_frame_stats* frameStats = &pic->frameData;
43
     fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
44
                                                                    frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
45
+    if (!!param->bEnableTemporalSubLayers)
46
+        fprintf(param->csvfpt, "%d,", frameStats->tLayer);
47
     if (param->csvLogLevel >= 2)
48
         fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
49
     if (param->rc.rateControlMode == X265_RC_CRF)
50
x265_3.5.tar.gz/source/encoder/dpb.cpp -> x265_3.6.tar.gz/source/encoder/dpb.cpp Changed
201
 
1
@@ -70,10 +70,18 @@
2
     {
3
         Frame *curFrame = iterFrame;
4
         iterFrame = iterFrame->m_next;
5
-        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
6
+        bool isMCSTFReferenced = false;
7
+
8
+        if (curFrame->m_param->bEnableTemporalFilter)
9
+            isMCSTFReferenced =!!(curFrame->m_refPicCnt1);
10
+
11
+        if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
12
         {
13
             curFrame->m_bChromaExtended = false;
14
 
15
+            if (curFrame->m_param->bEnableTemporalFilter)
16
+                *curFrame->m_isSubSampled = false;
17
+
18
             // Reset column counter
19
             X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
20
             X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
21
@@ -142,12 +150,13 @@
22
     {
23
         newFrame->m_encData->m_bHasReferences = false;
24
 
25
+        newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
26
         // Adjust NAL type for unreferenced B frames (change from _R "referenced"
27
         // to _N "non-referenced" NAL unit type)
28
         switch (slice->m_nalUnitType)
29
         {
30
         case NAL_UNIT_CODED_SLICE_TRAIL_R:
31
-            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
32
+            slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
33
             break;
34
         case NAL_UNIT_CODED_SLICE_RADL_R:
35
             slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
36
@@ -168,13 +177,94 @@
37
 
38
     m_picList.pushFront(*newFrame);
39
 
40
+    if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
41
+    {
42
+        switch (slice->m_nalUnitType)
43
+        {
44
+        case NAL_UNIT_CODED_SLICE_TRAIL_R:
45
+            slice->m_nalUnitType =  NAL_UNIT_CODED_SLICE_TRAIL_N;
46
+            break;
47
+        case NAL_UNIT_CODED_SLICE_RADL_R:
48
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
49
+            break;
50
+        case NAL_UNIT_CODED_SLICE_RASL_R:
51
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
52
+            break;
53
+        default:
54
+            break;
55
+        }
56
+    }
57
     // Do decoding refresh marking if any
58
     decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
59
 
60
-    computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
61
-
62
+    computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
63
+    bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
64
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
65
-    applyReferencePictureSet(&slice->m_rps, pocCurr);
66
+    applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
67
+
68
+
69
+    if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
70
+        && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N     // Check if not a leading picture
71
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
72
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
73
+            || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
74
+        )
75
+    {
76
+        if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
77
+        {
78
+            if (getTemporalLayerNonReferenceFlag())
79
+            {
80
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
81
+            }
82
+            else
83
+            {
84
+                slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
85
+            }
86
+        }
87
+        else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
88
+        {
89
+            bool isSTSA = true;
90
+            int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
91
+            for (int ii = id; (ii < x265_gop_ra_length[newFrame->m_gopId] && isSTSA == true); ii++)
92
+            {
93
+                int tempIdRef = x265_gop_ra[newFrame->m_gopId][ii].layer;
94
+                if (tempIdRef == newFrame->m_tempLayer)
95
+                {
96
+                    for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
97
+                    {
98
+                        if (slice->m_rps.bUsed[jj])
99
+                        {
100
+                            int refPoc = x265_gop_ra[newFrame->m_gopId][ii].poc_offset + slice->m_rps.deltaPOC[jj];
101
+                            int kk = 0;
102
+                            for (kk = 0; kk < x265_gop_ra_length[newFrame->m_gopId]; kk++)
103
+                            {
104
+                                if (x265_gop_ra[newFrame->m_gopId][kk].poc_offset == refPoc)
105
+                                {
106
+                                    break;
107
+                                }
108
+                            }
109
+                            if (x265_gop_ra[newFrame->m_gopId][kk].layer >= newFrame->m_tempLayer)
110
+                            {
111
+                                isSTSA = false;
112
+                                break;
113
+                            }
114
+                        }
115
+                    }
116
+                }
117
+            }
118
+            if (isSTSA == true)
119
+            {
120
+                if (getTemporalLayerNonReferenceFlag())
121
+                {
122
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
123
+                }
124
+                else
125
+                {
126
+                    slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
127
+                }
128
+            }
129
+        }
130
+    }
131
 
132
     if (slice->m_sliceType != I_SLICE)
133
         slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
134
@@ -218,7 +308,7 @@
135
     }
136
 }
137
 
138
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
139
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
140
 {
141
     unsigned int poci = 0, numNeg = 0, numPos = 0;
142
 
143
@@ -228,7 +318,7 @@
144
     {
145
         if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
146
         {
147
-            if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
148
+            if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
149
             {
150
                     rps->poc[poci] = iterPic->m_poc;
151
                     rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
152
@@ -247,6 +337,18 @@
153
     rps->sortDeltaPOC();
154
 }
155
 
156
+bool DPB::getTemporalLayerNonReferenceFlag()
157
+{
158
+    Frame* curFrame = m_picList.first();
159
+    if (curFrame->m_encData->m_bHasReferences)
160
+    {
161
+        curFrame->m_sameLayerRefPic = true;
162
+        return false;
163
+    }
164
+    else
165
+        return true;
166
+}
167
+
168
 /* Marking reference pictures when an IDR/CRA is encountered. */
169
 void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
170
 {
171
@@ -296,7 +398,7 @@
172
 }
173
 
174
 /** Function for applying picture marking based on the Reference Picture Set */
175
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
176
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
177
 {
178
     // loop through all pictures in the reference picture buffer
179
     Frame* iterFrame = m_picList.first();
180
@@ -317,9 +419,68 @@
181
             }
182
             if (!referenced)
183
                 iterFrame->m_encData->m_bHasReferences = false;
184
+
185
+            if (m_bTemporalSublayer)
186
+            {
187
+                //check that pictures of higher temporal layers are not used
188
+                assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
189
+
190
+                //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
191
+                if (isTSAPicture)
192
+                {
193
+                    assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
194
+                }
195
+                //check that pictures marked as temporal layer non-reference pictures are not used for reference
196
+                if (iterFrame->m_tempLayer == tempId)
197
+                {
198
+                    assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
199
+                }
200
+            }
201
x265_3.5.tar.gz/source/encoder/dpb.h -> x265_3.6.tar.gz/source/encoder/dpb.h Changed
35
 
1
@@ -40,6 +40,7 @@
2
     int                m_lastIDR;
3
     int                m_pocCRA;
4
     int                m_bOpenGOP;
5
+   int                m_craNal;
6
     int                m_bhasLeadingPicture;
7
     bool               m_bRefreshPending;
8
     bool               m_bTemporalSublayer;
9
@@ -66,7 +67,8 @@
10
         m_bRefreshPending = false;
11
         m_frameDataFreeList = NULL;
12
         m_bOpenGOP = param->bOpenGOP;
13
-        m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
14
+       m_craNal = param->craNal;
15
+        m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
16
     }
17
 
18
     ~DPB();
19
@@ -77,10 +79,13 @@
20
 
21
 protected:
22
 
23
-    void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
24
+    void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
25
 
26
-    void applyReferencePictureSet(RPS *rps, int curPoc);
27
+    void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
28
+    bool getTemporalLayerNonReferenceFlag();
29
     void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
30
+    bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
31
+    bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
32
 
33
     NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
34
 };
35
x265_3.5.tar.gz/source/encoder/encoder.cpp -> x265_3.6.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -72,7 +72,40 @@
2
 {
3
     { 1, 1, 1, 1, 1, 5, 1,  2, 2, 2, 50 },
4
     { 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
5
-    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 }
6
+    { 1, 1, 1, 1, 1, 5, 0,  1, 1, 1, 82 },
7
+    { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
8
+};
9
+
10
+typedef struct
11
+{
12
+    int bEnableVideoSignalTypePresentFlag;
13
+    int bEnableColorDescriptionPresentFlag;
14
+    int bEnableChromaLocInfoPresentFlag;
15
+    int colorPrimaries;
16
+    int transferCharacteristics;
17
+    int matrixCoeffs;
18
+    int bEnableVideoFullRangeFlag;
19
+    int chromaSampleLocTypeTopField;
20
+    int chromaSampleLocTypeBottomField;
21
+    const char* systemId;
22
+}VideoSignalTypePresets;
23
+
24
+VideoSignalTypePresets vstPresets[] =
25
+{
26
+    {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
27
+    {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
28
+    {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
29
+    {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
30
+    {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
31
+    {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
32
+    {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
33
+    {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
34
+    {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
35
+    {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
36
+    {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
37
+    {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
38
+    {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
39
+    {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
40
 };
41
 }
42
 
43
@@ -109,6 +142,7 @@
44
     m_threadPool = NULL;
45
     m_analysisFileIn = NULL;
46
     m_analysisFileOut = NULL;
47
+    m_filmGrainIn = NULL;
48
     m_naluFile = NULL;
49
     m_offsetEmergency = NULL;
50
     m_iFrameNum = 0;
51
@@ -134,12 +168,8 @@
52
     m_prevTonemapPayload.payload = NULL;
53
     m_startPoint = 0;
54
     m_saveCTUSize = 0;
55
-    m_edgePic = NULL;
56
-    m_edgeHistThreshold = 0;
57
-    m_chromaHistThreshold = 0.0;
58
-    m_scaledEdgeThreshold = 0.0;
59
-    m_scaledChromaThreshold = 0.0;
60
     m_zoneIndex = 0;
61
+    m_origPicBuffer = 0;
62
 }
63
 
64
 inline char *strcatFilename(const char *input, const char *suffix)
65
@@ -216,34 +246,6 @@
66
         }
67
     }
68
 
69
-    if (m_param->bHistBasedSceneCut)
70
-    {
71
-        m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
72
-        uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
73
-        m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
74
-        m_edgeHistThreshold = m_param->edgeTransitionThreshold;
75
-        m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
76
-        m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
77
-        m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
78
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
79
-        {
80
-            int size = m_param->sourceWidth * m_param->sourceHeight;
81
-            int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
82
-            int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
83
-            int widthC = m_param->sourceWidth >> hshift;
84
-            int heightC = m_param->sourceHeight >> vshift;
85
-
86
-            m_inputPic[0] = X265_MALLOC(pixel, size);
87
-            if (m_param->internalCsp != X265_CSP_I400)
88
-            {
89
-                for (int j = 1; j < 3; j++)
90
-                {
91
-                    m_inputPic[j] = X265_MALLOC(pixel, widthC * heightC);
92
-                }
93
-            }
94
-        }
95
-    }
96
-
97
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
98
     if (rows == 1 || cols < 3)
99
     {
100
@@ -357,6 +359,10 @@
101
             lookAheadThreadPool[i].start();
102
     m_lookahead->m_numPools = pools;
103
     m_dpb = new DPB(m_param);
104
+
105
+    if (m_param->bEnableTemporalFilter)
106
+        m_origPicBuffer = new OrigPicBuffer();
107
+
108
     m_rateControl = new RateControl(*m_param, this);
109
     if (!m_param->bResetZoneConfig)
110
     {
111
@@ -518,6 +524,15 @@
112
             }
113
         }
114
     }
115
+    if (m_param->filmGrain)
116
+    {
117
+        m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
118
+        if (!m_filmGrainIn)
119
+        {
120
+            x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
121
+        }
122
+    }
123
+
124
     m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
125
     m_aborted |= parseLambdaFile(m_param);
126
 
127
@@ -879,26 +894,6 @@
128
         }
129
     }
130
 
131
-    if (m_param->bHistBasedSceneCut)
132
-    {
133
-        if (m_edgePic != NULL)
134
-        {
135
-            X265_FREE_ZERO(m_edgePic);
136
-        }
137
-
138
-        if (m_param->sourceBitDepth != m_param->internalBitDepth)
139
-        {
140
-            X265_FREE_ZERO(m_inputPic[0]);
141
-            if (m_param->internalCsp != X265_CSP_I400)
142
-            {
143
-                for (int i = 1; i < 3; i++)
144
-                {
145
-                    X265_FREE_ZERO(m_inputPic[i]);
146
-                }
147
-            }
148
-        }
149
-    }
150
-
151
     for (int i = 0; i < m_param->frameNumThreads; i++)
152
     {
153
         if (m_frameEncoder[i])
154
@@ -924,6 +919,10 @@
155
         delete zoneReadCount;
156
         delete zoneWriteCount;
157
     }
158
+
159
+    if (m_param->bEnableTemporalFilter)
160
+        delete m_origPicBuffer;
161
+
162
     if (m_rateControl)
163
     {
164
         m_rateControl->destroy();
165
@@ -963,6 +962,8 @@
166
      }
167
     if (m_naluFile)
168
         fclose(m_naluFile);
169
+    if (m_filmGrainIn)
170
+        x265_fclose(m_filmGrainIn);
171
 
172
 #ifdef SVT_HEVC
173
     X265_FREE(m_svtAppData);
174
@@ -974,6 +975,7 @@
175
         /* release string arguments that were strdup'd */
176
         free((char*)m_param->rc.lambdaFileName);
177
         free((char*)m_param->rc.statFileName);
178
+        free((char*)m_param->rc.sharedMemName);
179
         free((char*)m_param->analysisReuseFileName);
180
         free((char*)m_param->scalingLists);
181
         free((char*)m_param->csvfn);
182
@@ -982,6 +984,7 @@
183
         free((char*)m_param->toneMapFile);
184
         free((char*)m_param->analysisSave);
185
         free((char*)m_param->analysisLoad);
186
+        free((char*)m_param->videoSignalTypePreset);
187
         PARAM_NS::x265_param_free(m_param);
188
     }
189
 }
190
@@ -1358,215 +1361,90 @@
191
     dest->planes[2] = (char*)dest->planes[1] + src->stride[1] * (src->height >> x265_cli_csps[src->colorSpace].height[1]);
192
 }
193
 
194
-bool Encoder::computeHistograms(x265_picture *pic)
195
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
196
 {
197
-    pixel *src = NULL, *planeV = NULL, *planeU = NULL;
198
-    uint32_t widthC, heightC;
199
-    int hshift, vshift;
200
-
201
x265_3.5.tar.gz/source/encoder/encoder.h -> x265_3.6.tar.gz/source/encoder/encoder.h Changed
72
 
1
@@ -32,6 +32,7 @@
2
 #include "nal.h"
3
 #include "framedata.h"
4
 #include "svt.h"
5
+#include "temporalfilter.h"
6
 #ifdef ENABLE_HDR10_PLUS
7
     #include "dynamicHDR10/hdr10plus.h"
8
 #endif
9
@@ -256,19 +257,6 @@
10
     int                m_bToneMap; // Enables tone-mapping
11
     int                m_enableNal;
12
 
13
-    /* For histogram based scene-cut detection */
14
-    pixel*             m_edgePic;
15
-    pixel*             m_inputPic[3];
16
-    int32_t            m_curYUVHist[3][HISTOGRAM_BINS];
17
-    int32_t            m_prevYUVHist[3][HISTOGRAM_BINS];
18
-    int32_t            m_curEdgeHist[2];
19
-    int32_t            m_prevEdgeHist[2];
20
-    uint32_t           m_planeSizes[3];
21
-    double             m_edgeHistThreshold;
22
-    double             m_chromaHistThreshold;
23
-    double             m_scaledEdgeThreshold;
24
-    double             m_scaledChromaThreshold;
25
-
26
 #ifdef ENABLE_HDR10_PLUS
27
     const hdr10plus_api     *m_hdr10plus_api;
28
     uint8_t                 **m_cim;
29
@@ -295,6 +283,9 @@
30
 
31
     ThreadSafeInteger* zoneReadCount;
32
     ThreadSafeInteger* zoneWriteCount;
33
+    /* Film grain model file */
34
+    FILE* m_filmGrainIn;
35
+    OrigPicBuffer*          m_origPicBuffer;
36
 
37
     Encoder();
38
     ~Encoder()
39
@@ -327,6 +318,8 @@
40
 
41
     void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
42
 
43
+    void getEndNalUnits(NALList& list, Bitstream& bs);
44
+
45
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
46
 
47
     void printSummary();
48
@@ -373,11 +366,6 @@
49
 
50
     void copyPicture(x265_picture *dest, const x265_picture *src);
51
 
52
-    bool computeHistograms(x265_picture *pic);
53
-    void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
54
-    double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
55
-    void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
56
-
57
     void initRefIdx();
58
     void analyseRefIdx(int *numRefIdx);
59
     void updateRefIdx();
60
@@ -387,6 +375,11 @@
61
 
62
     void configureDolbyVisionParams(x265_param* p);
63
 
64
+    void configureVideoSignalTypePreset(x265_param* p);
65
+
66
+    bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
67
+    bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
68
+
69
 protected:
70
 
71
     void initVPS(VPS *vps);
72
x265_3.5.tar.gz/source/encoder/entropy.cpp -> x265_3.6.tar.gz/source/encoder/entropy.cpp Changed
41
 
1
@@ -245,9 +245,9 @@
2
 
3
     for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
4
     {
5
-        WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]");
6
-        WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_pics[i]");
7
-        WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1[i]");
8
+        WRITE_UVLC(vps.maxDecPicBuffering[i] - 1, "vps_max_dec_pic_buffering_minus1[i]");
9
+        WRITE_UVLC(vps.numReorderPics[i],         "vps_num_reorder_pics[i]");
10
+        WRITE_UVLC(vps.maxLatencyIncrease[i] + 1, "vps_max_latency_increase_plus1[i]");
11
     }
12
 
13
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
14
@@ -291,9 +291,9 @@
15
 
16
     for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
17
     {
18
-        WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
19
-        WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_pics[i]");
20
-        WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
21
+        WRITE_UVLC(sps.maxDecPicBuffering[i] - 1, "sps_max_dec_pic_buffering_minus1[i]");
22
+        WRITE_UVLC(sps.numReorderPics[i],         "sps_num_reorder_pics[i]");
23
+        WRITE_UVLC(sps.maxLatencyIncrease[i] + 1, "sps_max_latency_increase_plus1[i]");
24
     }
25
 
26
     WRITE_UVLC(sps.log2MinCodingBlockSize - 3,    "log2_min_coding_block_size_minus3");
27
@@ -418,8 +418,11 @@
28
 
29
     if (maxTempSubLayers > 1)
30
     {
31
-         WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
32
-         WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
33
+        for(int i = 0; i < maxTempSubLayers - 1; i++)
34
+        {
35
+            WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
36
+            WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
37
+        }
38
          for (int i = maxTempSubLayers - 1; i < 8 ; i++)
39
              WRITE_CODE(0, 2, "reserved_zero_2bits");
40
     }
41
x265_3.5.tar.gz/source/encoder/frameencoder.cpp -> x265_3.6.tar.gz/source/encoder/frameencoder.cpp Changed
200
 
1
@@ -34,6 +34,7 @@
2
 #include "common.h"
3
 #include "slicetype.h"
4
 #include "nal.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
9
@@ -101,6 +102,16 @@
10
         delete m_rce.picTimingSEI;
11
         delete m_rce.hrdTiming;
12
     }
13
+
14
+    if (m_param->bEnableTemporalFilter)
15
+    {
16
+        delete m_frameEncTF->m_metld;
17
+
18
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
19
+            m_frameEncTF->destroyRefPicInfo(&m_mcstfRefList[i]);
20
+
21
+        delete m_frameEncTF;
22
+    }
23
 }
24
 
25
 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
26
@@ -195,6 +206,16 @@
27
         m_sliceAddrBits = (uint16_t)(tmp + 1);
28
     }
29
 
30
+    if (m_param->bEnableTemporalFilter)
31
+    {
32
+        m_frameEncTF = new TemporalFilter();
33
+        if (m_frameEncTF)
34
+            m_frameEncTF->init(m_param);
35
+
36
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
37
+            ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefList[i], m_param);
38
+    }
39
+
40
     return ok;
41
 }
42
 
43
@@ -450,7 +471,7 @@
44
     m_ssimCnt = 0;
45
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
46
 
47
-    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
48
+    if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
49
     {
50
         int height = m_frame->m_fencPic->m_picHeight;
51
         int width = m_frame->m_fencPic->m_picWidth;
52
@@ -467,6 +488,12 @@
53
      * unit) */
54
     Slice* slice = m_frame->m_encData->m_slice;
55
 
56
+    if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
57
+    {
58
+        m_bs.resetBits();
59
+        m_nalList.serialize(NAL_UNIT_EOS, m_bs);
60
+    }
61
+
62
     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
63
     {
64
         m_bs.resetBits();
65
@@ -573,6 +600,12 @@
66
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
67
     m_rce.newQp = qp;
68
 
69
+    if (m_param->bEnableTemporalFilter)
70
+    {
71
+        m_frameEncTF->m_QP = qp;
72
+        m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
73
+    }
74
+
75
     if (m_nr)
76
     {
77
         if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
78
@@ -744,7 +777,7 @@
79
             // wait after removal of the access unit with the most recent
80
             // buffering period SEI message
81
             sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
82
-            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
83
+            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics[m_frame->m_tempLayer] + poc - m_rce.encodeOrder;
84
         }
85
 
86
         sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
87
@@ -756,7 +789,14 @@
88
         m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
89
         m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
90
     }
91
-
92
+    /* Write Film grain characteristics if present */
93
+    if (this->m_top->m_filmGrainIn)
94
+    {
95
+        FilmGrainCharacteristics m_filmGrain;
96
+        /* Read the Film grain model file */
97
+        readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
98
+        m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
99
+    }
100
     /* Write user SEI */
101
     for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
102
     {
103
@@ -933,6 +973,23 @@
104
     if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
105
         collectDynDataFrame();
106
 
107
+    if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
108
+    {
109
+        //Reset the MCSTF context in Frame Encoder and Frame
110
+        for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
111
+        {
112
+            memset(m_mcstfRefList[i].mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
113
+            memset(m_mcstfRefList[i].mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
114
+            memset(m_mcstfRefList[i].mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
115
+            memset(m_mcstfRefList[i].mvs,  0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
116
+            memset(m_mcstfRefList[i].noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
117
+            memset(m_mcstfRefList[i].error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
118
+
119
+            m_frame->m_mcstf->m_numRef = 0;
120
+        }
121
+    }
122
+
123
+
124
     if (m_param->rc.bStatWrite)
125
     {
126
         int totalI = 0, totalP = 0, totalSkip = 0;
127
@@ -1041,7 +1098,7 @@
128
             
129
             m_bs.writeByteAlignment();
130
 
131
-            m_nalList.serialize(slice->m_nalUnitType, m_bs);
132
+            m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
133
         }
134
     }
135
     else
136
@@ -1062,7 +1119,7 @@
137
             m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
138
         m_bs.writeByteAlignment();
139
 
140
-        m_nalList.serialize(slice->m_nalUnitType, m_bs);
141
+        m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
142
     }
143
 
144
     if (m_param->decodedPictureHashSEI)
145
@@ -2127,6 +2184,54 @@
146
         m_nr->nrOffsetDenoise[cat][0] = 0;
147
     }
148
 }
149
+
150
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
151
+{
152
+    char const* errorMessage = "Error reading FilmGrain characteristics\n";
153
+    FilmGrain m_fg;
154
+    x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
155
+    m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
156
+    m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
157
+    m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
158
+    m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
159
+    if (m_filmGrain->m_separateColourDescriptionPresentFlag)
160
+    {
161
+        ColourDescription m_clr;
162
+        x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
163
+        m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
164
+        m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
165
+        m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
166
+        m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
167
+        m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
168
+        m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
169
+    }
170
+    FGPresent m_present;
171
+    x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
172
+    m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
173
+    m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
174
+    m_filmGrain->m_compModel[0].bPresentFlag = m_present.m_presentFlag[0];
175
+    m_filmGrain->m_compModel[1].bPresentFlag = m_present.m_presentFlag[1];
176
+    m_filmGrain->m_compModel[2].bPresentFlag = m_present.m_presentFlag[2];
177
+    for (int i = 0; i < MAX_NUM_COMPONENT; i++)
178
+    {
179
+        if (m_filmGrain->m_compModel[i].bPresentFlag)
180
+        {
181
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
182
+            x265_fread((char* )(&m_filmGrain->m_compModel[i].numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
183
+            m_filmGrain->m_compModel[i].intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1+1)) ;
184
+            for (int j = 0; j <= m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1; j++)
185
+            {
186
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
187
+                x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
188
+                m_filmGrain->m_compModel[i].intensityValues[j].compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModel[i].numModelValues));
189
+                for (int k = 0; k < m_filmGrain->m_compModel[i].numModelValues; k++)
190
+                {
191
+                    x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].compModelValue[k]), sizeof(int), 1, filmgrain, errorMessage);
192
+                }
193
+            }
194
+        }
195
+    }
196
+}
197
 #if ENABLE_LIBVMAF
198
 void FrameEncoder::vmafFrameLevelScore()
199
 {
200
x265_3.5.tar.gz/source/encoder/frameencoder.h -> x265_3.6.tar.gz/source/encoder/frameencoder.h Changed
63
 
1
@@ -40,6 +40,7 @@
2
 #include "ratecontrol.h"
3
 #include "reference.h"
4
 #include "nal.h"
5
+#include "temporalfilter.h"
6
 
7
 namespace X265_NS {
8
 // private x265 namespace
9
@@ -113,6 +114,34 @@
10
     }
11
 };
12
 
13
+/*Film grain characteristics*/
14
+struct FilmGrain
15
+{
16
+    bool    m_filmGrainCharacteristicsCancelFlag;
17
+    bool    m_filmGrainCharacteristicsPersistenceFlag;
18
+    bool    m_separateColourDescriptionPresentFlag;
19
+    uint8_t m_filmGrainModelId;
20
+    uint8_t m_blendingModeId;
21
+    uint8_t m_log2ScaleFactor;
22
+};
23
+
24
+struct ColourDescription
25
+{
26
+    bool        m_filmGrainFullRangeFlag;
27
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
28
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
29
+    uint8_t     m_filmGrainColourPrimaries;
30
+    uint8_t     m_filmGrainTransferCharacteristics;
31
+    uint8_t     m_filmGrainMatrixCoeffs;
32
+};
33
+
34
+struct FGPresent
35
+{
36
+    uint8_t     m_blendingModeId;
37
+    uint8_t     m_log2ScaleFactor;
38
+    bool        m_presentFlag[3];
39
+};
40
+
41
 // Manages the wave-front processing of a single encoding frame
42
 class FrameEncoder : public WaveFront, public Thread
43
 {
44
@@ -205,6 +234,10 @@
45
     FrameFilter              m_frameFilter;
46
     NALList                  m_nalList;
47
 
48
+    // initialization for mcstf
49
+    TemporalFilter*          m_frameEncTF;
50
+    TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
51
+
52
     class WeightAnalysis : public BondedTaskGroup
53
     {
54
     public:
55
@@ -250,6 +283,7 @@
56
     void collectDynDataFrame();
57
     void computeAvgTrainingData();
58
     void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
59
+    void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
60
 };
61
 }
62
 
63
x265_3.5.tar.gz/source/encoder/level.cpp -> x265_3.6.tar.gz/source/encoder/level.cpp Changed
86
 
1
@@ -72,7 +72,7 @@
2
      * for intra-only profiles (vps.ptl.intraConstraintFlag) */
3
     vps.ptl.lowerBitRateConstraintFlag = true;
4
 
5
-    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
6
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
7
     
8
     if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
9
     {
10
@@ -167,7 +167,7 @@
11
 
12
         /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than
13
          * or equal to MaxDpbSize */
14
-        if (vps.maxDecPicBuffering > maxDpbSize)
15
+        if (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize)
16
             continue;
17
 
18
         /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
19
@@ -182,8 +182,8 @@
20
         }
21
 
22
         /* The value of NumPocTotalCurr shall be less than or equal to 8 */
23
-        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
24
-        if (numPocTotalCurr > 8)
25
+        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics[vps.maxTempSubLayers - 1];
26
+        if (numPocTotalCurr > 10)
27
         {
28
             x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levels[i].name);
29
             vps.ptl.profileIdc = Profile::NONE;
30
@@ -289,9 +289,40 @@
31
  * circumstances it will be quite noisy */
32
 bool enforceLevel(x265_param& param, VPS& vps)
33
 {
34
-    vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
35
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
36
+    vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
37
+    for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
38
+    {
39
+        vps.numReorderPics[i] = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
40
+        vps.maxDecPicBuffering[i] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[i] + 2, (uint32_t)param.maxNumReferences) + 1);
41
+    }
42
 
43
+    if (!!param.bEnableTemporalSubLayers)
44
+    {
45
+        for (int i = 0; i < MAX_T_LAYERS - 1; i++)
46
+        {
47
+            // a lower layer can not have higher value of numReorderPics than a higher layer
48
+            if (vps.numReorderPics[i + 1] < vps.numReorderPics[i])
49
+            {
50
+                vps.numReorderPics[i + 1] = vps.numReorderPics[i];
51
+            }
52
+            // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
53
+            if (vps.numReorderPics[i] > vps.maxDecPicBuffering[i] - 1)
54
+            {
55
+                vps.maxDecPicBuffering[i] = vps.numReorderPics[i] + 1;
56
+            }
57
+            // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
58
+            if (vps.maxDecPicBuffering[i + 1] < vps.maxDecPicBuffering[i])
59
+            {
60
+                vps.maxDecPicBuffering[i + 1] = vps.maxDecPicBuffering[i];
61
+            }
62
+        }
63
+
64
+        // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[ i ] -  1, inclusive
65
+        if (vps.numReorderPics[MAX_T_LAYERS - 1] > vps.maxDecPicBuffering[MAX_T_LAYERS - 1] - 1)
66
+        {
67
+            vps.maxDecPicBuffering[MAX_T_LAYERS - 1] = vps.numReorderPics[MAX_T_LAYERS - 1] + 1;
68
+        }
69
+    }
70
     /* no level specified by user, just auto-detect from the configuration */
71
     if (param.levelIdc <= 0)
72
         return true;
73
@@ -391,10 +422,10 @@
74
     }
75
 
76
     int savedRefCount = param.maxNumReferences;
77
-    while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
78
+    while (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize && param.maxNumReferences > 1)
79
     {
80
         param.maxNumReferences--;
81
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
82
+        vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[vps.maxTempSubLayers - 1] + 1, (uint32_t)param.maxNumReferences) + 1);
83
     }
84
     if (param.maxNumReferences != savedRefCount)
85
         x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
86
x265_3.5.tar.gz/source/encoder/motion.cpp -> x265_3.6.tar.gz/source/encoder/motion.cpp Changed
33
 
1
@@ -190,6 +190,31 @@
2
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
3
 }
4
 
5
+/* Called by lookahead, luma only, no use of PicYuv */
6
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
7
+{
8
+    partEnum = partitionFromSizes(pwidth, pheight);
9
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
10
+    sad = primitives.pu[partEnum].sad;
11
+    ads = primitives.pu[partEnum].ads;
12
+    satd = primitives.pu[partEnum].satd;
13
+    sad_x3 = primitives.pu[partEnum].sad_x3;
14
+    sad_x4 = primitives.pu[partEnum].sad_x4;
15
+
16
+
17
+    blockwidth = pwidth;
18
+    blockOffset = offset;
19
+    absPartIdx = ctuAddr = -1;
20
+
21
+    /* Search params */
22
+    searchMethod = method;
23
+    subpelRefine = refine;
24
+
25
+    /* copy PU block into cache */
26
+    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
27
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
28
+}
29
+
30
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
31
 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
32
 {
33
x265_3.5.tar.gz/source/encoder/motion.h -> x265_3.6.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -77,7 +77,7 @@
2
     void init(int csp);
3
 
4
     /* Methods called at slice setup */
5
-
6
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
7
     void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
8
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
9
 
10
x265_3.5.tar.gz/source/encoder/nal.cpp -> x265_3.6.tar.gz/source/encoder/nal.cpp Changed
19
 
1
@@ -57,7 +57,7 @@
2
     other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
 }
4
 
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
7
 {
8
     static const char startCodePrefix[] = { 0, 0, 0, 1 };
9
 
10
@@ -114,7 +114,7 @@
11
      * nuh_reserved_zero_6bits  6-bits
12
      * nuh_temporal_id_plus1    3-bits */
13
     out[bytes++] = (uint8_t)nalUnitType << 1;
14
-    out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
15
+    out[bytes++] = temporalID;
16
 
17
     /* 7.4.1 ...
18
      * Within the NAL unit, the following three-byte sequences shall not occur at
19
x265_3.5.tar.gz/source/encoder/nal.h -> x265_3.6.tar.gz/source/encoder/nal.h Changed
10
 
1
@@ -56,7 +56,7 @@
2
 
3
     void takeContents(NALList& other);
4
 
5
-    void serialize(NalUnitType nalUnitType, const Bitstream& bs);
6
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
7
 
8
     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
9
 };
10
x265_3.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.6.tar.gz/source/encoder/ratecontrol.cpp Changed
201
 
1
@@ -41,6 +41,10 @@
2
 #define BR_SHIFT  6
3
 #define CPB_SHIFT 4
4
 
5
+#define SHARED_DATA_ALIGNMENT      4 ///< 4btye, 32bit
6
+#define CUTREE_SHARED_MEM_NAME     "cutree"
7
+#define GOP_CNT_CU_TREE            3
8
+
9
 using namespace X265_NS;
10
 
11
 /* Amortize the partial cost of I frames over the next N frames */
12
@@ -104,6 +108,37 @@
13
     return output;
14
 }
15
 
16
+typedef struct CUTreeSharedDataItem
17
+{
18
+    uint8_t  *type;
19
+    uint16_t *stats;
20
+}CUTreeSharedDataItem;
21
+
22
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
23
+{
24
+    CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
25
+    uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
26
+    *statsDst->type = *typeSrc;
27
+
28
+    ///< for memory alignment, the type will take 32bit in the shared memory
29
+    int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
30
+    uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
31
+    memcpy(statsDst->stats, statsSrc, size - offset);
32
+}
33
+
34
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
35
+{
36
+    CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
37
+    uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
38
+    *typeDst = *statsSrc->type;
39
+
40
+    ///< for memory alignment, the type will take 32bit in the shared memory
41
+    int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
42
+    uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
43
+    memcpy(statsDst, statsSrc->stats, size - offset);
44
+}
45
+
46
+
47
 inline double qScale2bits(RateControlEntry *rce, double qScale)
48
 {
49
     if (qScale < 0.1)
50
@@ -209,6 +244,7 @@
51
     m_lastAbrResetPoc = -1;
52
     m_statFileOut = NULL;
53
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
54
+    m_cutreeShrMem = NULL;
55
     m_rce2Pass = NULL;
56
     m_encOrder = NULL;
57
     m_lastBsliceSatdCost = 0;
58
@@ -224,6 +260,8 @@
59
     m_initVbv = false;
60
     m_singleFrameVbv = 0;
61
     m_rateTolerance = 1.0;
62
+    m_encodedSegmentBits = 0;
63
+    m_segDur = 0;
64
 
65
     if (m_param->rc.vbvBufferSize)
66
     {
67
@@ -320,47 +358,86 @@
68
         m_cuTreeStats.qpBuffer[i] = NULL;
69
 }
70
 
71
-bool RateControl::init(const SPS& sps)
72
+bool RateControl::initCUTreeSharedMem()
73
 {
74
-    if (m_isVbv && !m_initVbv)
75
-    {
76
-        /* We don't support changing the ABR bitrate right now,
77
-         * so if the stream starts as CBR, keep it CBR. */
78
-        if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
79
+    if (!m_cutreeShrMem) {
80
+        m_cutreeShrMem = new RingMem();
81
+        if (!m_cutreeShrMem)
82
         {
83
-            m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
84
-            x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
85
-                     m_param->rc.vbvBufferSize);
86
+            return false;
87
         }
88
-        int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
89
-        int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
90
 
91
-        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
92
+        ///< now cutree data form at most 3 gops would be stored in the shared memory at the same time
93
+        int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
94
+        if (m_param->rc.qgSize == 8)
95
         {
96
-            const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
97
-            vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
98
-            vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
99
+            itemSize += sizeof(uint16_t) * m_ncu * 4;
100
         }
101
-        m_bufferRate = vbvMaxBitrate / m_fps;
102
-        m_vbvMaxRate = vbvMaxBitrate;
103
-        m_bufferSize = vbvBufferSize;
104
-        m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
105
+        else
106
+        {
107
+            itemSize += sizeof(uint16_t) * m_ncu;
108
+        }
109
+
110
+        int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
111
+        itemCnt *= GOP_CNT_CU_TREE;
112
 
113
-        if (m_param->rc.vbvBufferInit > 1.)
114
-            m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
115
-        if (m_param->vbvBufferEnd > 1.)
116
-            m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
117
-        if (m_param->vbvEndFrameAdjust > 1.)
118
-            m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
119
-        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
120
-        m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
121
-        m_bufferFillActual = m_bufferFillFinal;
122
-        m_bufferExcess = 0;
123
-        m_minBufferFill = m_param->minVbvFullness / 100;
124
-        m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
125
-        m_initVbv = true;
126
+        char shrname[MAX_SHR_NAME_LEN] = { 0 };
127
+        strcpy(shrname, m_param->rc.sharedMemName);
128
+        strcat(shrname, CUTREE_SHARED_MEM_NAME);
129
+
130
+        if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
131
+        {
132
+            return false;
133
+        }
134
     }
135
 
136
+    return true;
137
+}
138
+
139
+void RateControl::initVBV(const SPS& sps)
140
+{
141
+    /* We don't support changing the ABR bitrate right now,
142
+ * so if the stream starts as CBR, keep it CBR. */
143
+    if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
144
+    {
145
+        m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
146
+        x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
147
+            m_param->rc.vbvBufferSize);
148
+    }
149
+    int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
150
+    int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
151
+
152
+    if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
153
+    {
154
+        const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
155
+        vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
156
+        vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
157
+    }
158
+    m_bufferRate = vbvMaxBitrate / m_fps;
159
+    m_vbvMaxRate = vbvMaxBitrate;
160
+    m_bufferSize = vbvBufferSize;
161
+    m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
162
+
163
+    if (m_param->rc.vbvBufferInit > 1.)
164
+        m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
165
+    if (m_param->vbvBufferEnd > 1.)
166
+        m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
167
+    if (m_param->vbvEndFrameAdjust > 1.)
168
+        m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
169
+    m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
170
+    m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
171
+    m_bufferFillActual = m_bufferFillFinal;
172
+    m_bufferExcess = 0;
173
+    m_minBufferFill = m_param->minVbvFullness / 100;
174
+    m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
175
+    m_initVbv = true;
176
+}
177
+
178
+bool RateControl::init(const SPS& sps)
179
+{
180
+    if (m_isVbv && !m_initVbv)
181
+        initVBV(sps);
182
+
183
     if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
184
     {
185
         m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
186
@@ -373,7 +450,9 @@
187
 
188
     m_totalBits = 0;
189
     m_encodedBits = 0;
190
+    m_encodedSegmentBits = 0;
191
     m_framesDone = 0;
192
+    m_segDur = 0;
193
     m_residualCost = 0;
194
     m_partialResidualCost = 0;
195
     m_amortizeFraction = 0.85;
196
@@ -421,244 +500,257 @@
197
         /* Load stat file and init 2pass algo */
198
         if (m_param->rc.bStatRead)
199
         {
200
-            m_expectedBitsSum = 0;
201
x265_3.5.tar.gz/source/encoder/ratecontrol.h -> x265_3.6.tar.gz/source/encoder/ratecontrol.h Changed
90
 
1
@@ -28,6 +28,7 @@
2
 
3
 #include "common.h"
4
 #include "sei.h"
5
+#include "ringmem.h"
6
 
7
 namespace X265_NS {
8
 // encoder namespace
9
@@ -46,11 +47,6 @@
10
 #define MIN_AMORTIZE_FRACTION 0.2
11
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
12
 
13
-/*Scenecut Aware QP*/
14
-#define WINDOW1_DELTA           1.0 /* The offset for the frames coming in the window-1*/
15
-#define WINDOW2_DELTA           0.7 /* The offset for the frames coming in the window-2*/
16
-#define WINDOW3_DELTA           0.4 /* The offset for the frames coming in the window-3*/
17
-
18
 struct Predictor
19
 {
20
     double coeffMin;
21
@@ -73,6 +69,7 @@
22
     Predictor  rowPreds[3][2];
23
     Predictor* rowPred2;
24
 
25
+    int64_t currentSatd;
26
     int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
27
     int64_t leadingNoBSatd;
28
     int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
29
@@ -87,6 +84,8 @@
30
     double  rowCplxrSum;
31
     double  qpNoVbv;
32
     double  bufferFill;
33
+    double  bufferFillFinal;
34
+    double  bufferFillActual;
35
     double  targetFill;
36
     bool    vbvEndAdj;
37
     double  frameDuration;
38
@@ -192,6 +191,8 @@
39
     double  m_qCompress;
40
     int64_t m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
41
     int64_t m_encodedBits;      /* bits used for encoded frames (without ammortization) */
42
+    int64_t m_encodedSegmentBits;      /* bits used for encoded frames in a segment*/
43
+    double  m_segDur;
44
     double  m_fps;
45
     int64_t m_satdCostWindow[50];
46
     int64_t m_encodedBitsWindow[50];
47
@@ -237,6 +238,8 @@
48
     FILE*   m_statFileOut;
49
     FILE*   m_cutreeStatFileOut;
50
     FILE*   m_cutreeStatFileIn;
51
+    ///< store the cutree data in memory instead of file
52
+    RingMem *m_cutreeShrMem;
53
     double  m_lastAccumPNorm;
54
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
55
     int64_t m_predictedBits;
56
@@ -254,6 +257,7 @@
57
     RateControl(x265_param& p, Encoder *enc);
58
     bool init(const SPS& sps);
59
     void initHRD(SPS& sps);
60
+    void initVBV(const SPS& sps);
61
     void reconfigureRC();
62
 
63
     void setFinalFrameCount(int count);
64
@@ -271,6 +275,9 @@
65
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
66
     bool   initPass2();
67
 
68
+    bool initCUTreeSharedMem();
69
+    void skipCUTreeSharedMemRead(int32_t cnt);
70
+
71
     double forwardMasking(Frame* curFrame, double q);
72
     double backwardMasking(Frame* curFrame, double q);
73
 
74
@@ -291,6 +298,7 @@
75
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
76
     double tuneAbrQScaleFromFeedback(double qScale);
77
     double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
78
+    double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
79
     void   accumPQpUpdate();
80
 
81
     int    getPredictorType(int lowresSliceType, int sliceType);
82
@@ -311,6 +319,7 @@
83
     double tuneQScaleForGrain(double rcOverflow);
84
     void   splitdeltaPOC(char deltapoc, RateControlEntry *rce);
85
     void   splitbUsed(char deltapoc, RateControlEntry *rce);
86
+    void   checkAndResetCRF(RateControlEntry* rce);
87
 };
88
 }
89
 #endif // ifndef X265_RATECONTROL_H
90
x265_3.5.tar.gz/source/encoder/sei.cpp -> x265_3.6.tar.gz/source/encoder/sei.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
     {
3
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
4
             bs.writeByteAlignment();
5
-        list.serialize(nalUnitType, bs);
6
+        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
7
     }
8
 }
9
 
10
x265_3.5.tar.gz/source/encoder/sei.h -> x265_3.6.tar.gz/source/encoder/sei.h Changed
103
 
1
@@ -73,6 +73,101 @@
2
     }
3
 };
4
 
5
+/* Film grain characteristics */
6
+class FilmGrainCharacteristics : public SEI
7
+{
8
+  public:
9
+
10
+    FilmGrainCharacteristics()
11
+    {
12
+        m_payloadType = FILM_GRAIN_CHARACTERISTICS;
13
+        m_payloadSize = 0;
14
+    }
15
+
16
+    struct CompModelIntensityValues
17
+    {
18
+        uint8_t intensityIntervalLowerBound;
19
+        uint8_t intensityIntervalUpperBound;
20
+        int*    compModelValue;
21
+    };
22
+
23
+    struct CompModel
24
+    {
25
+        bool    bPresentFlag;
26
+        uint8_t numModelValues;
27
+        uint8_t m_filmGrainNumIntensityIntervalMinus1;
28
+        CompModelIntensityValues* intensityValues;
29
+    };
30
+
31
+    CompModel   m_compModel[MAX_NUM_COMPONENT];
32
+    bool        m_filmGrainCharacteristicsPersistenceFlag;
33
+    bool        m_filmGrainCharacteristicsCancelFlag;
34
+    bool        m_separateColourDescriptionPresentFlag;
35
+    bool        m_filmGrainFullRangeFlag;
36
+    uint8_t     m_filmGrainModelId;
37
+    uint8_t     m_blendingModeId;
38
+    uint8_t     m_log2ScaleFactor;
39
+    uint8_t     m_filmGrainBitDepthLumaMinus8;
40
+    uint8_t     m_filmGrainBitDepthChromaMinus8;
41
+    uint8_t     m_filmGrainColourPrimaries;
42
+    uint8_t     m_filmGrainTransferCharacteristics;
43
+    uint8_t     m_filmGrainMatrixCoeffs;
44
+
45
+    void writeSEI(const SPS&)
46
+    {
47
+        WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
48
+
49
+        if (!m_filmGrainCharacteristicsCancelFlag)
50
+        {
51
+            WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
52
+            WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
53
+            if (m_separateColourDescriptionPresentFlag)
54
+            {
55
+                WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
56
+                WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
57
+                WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
58
+                WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
59
+                WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
60
+                WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
61
+            }
62
+            WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
63
+            WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
64
+            for (uint8_t c = 0; c < 3; c++)
65
+            {
66
+                WRITE_FLAG(m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0, "comp_model_present_flag[c]");
67
+            }
68
+            for (uint8_t c = 0; c < 3; c++)
69
+            {
70
+                if (m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0)
71
+                {
72
+                    assert(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
73
+                    assert(m_compModel[c].numModelValues <= X265_BYTE);
74
+                    WRITE_CODE(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 , X265_BYTE, "num_intensity_intervals_minus1[c]");
75
+                    WRITE_CODE(m_compModel[c].numModelValues - 1, 3, "num_model_values_minus1[c]");
76
+                    for (uint8_t interval = 0; interval < m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
77
+                    {
78
+                        WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_bound[c][i]");
79
+                        WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_bound[c][i]");
80
+                        for (uint8_t j = 0; j < m_compModel[c].numModelValues; j++)
81
+                        {
82
+                            WRITE_SVLC(m_compModel[c].intensityValues[interval].compModelValue[j],"comp_model_value[c][i]");
83
+                        }
84
+                    }
85
+                }
86
+            }
87
+            WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
88
+        }
89
+        if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
90
+        {
91
+            WRITE_FLAG(1, "payload_bit_equal_to_one");
92
+            while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
93
+            {
94
+                WRITE_FLAG(0, "payload_bit_equal_to_zero");
95
+            }
96
+        }
97
+    }
98
+};
99
+
100
 static const uint32_t ISO_IEC_11578_LEN = 16;
101
 
102
 class SEIuserDataUnregistered : public SEI
103
x265_3.5.tar.gz/source/encoder/slicetype.cpp -> x265_3.6.tar.gz/source/encoder/slicetype.cpp Changed
201
 
1
@@ -87,6 +87,14 @@
2
 
3
 namespace X265_NS {
4
 
5
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)
6
+{
7
+    uint32_t sum = (uint32_t)sum_ssd;
8
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
9
+
10
+    return ssd - ((uint64_t)sum * sum >> shift);
11
+}
12
+
13
 bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
14
 {
15
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
16
@@ -184,7 +192,7 @@
17
     {
18
         for (int colNum = 0; colNum < width; colNum++)
19
         {
20
-            if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
21
+            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
22
             {
23
                 /*  5x5 Gaussian filter
24
                     2   4   5   4   2
25
@@ -519,7 +527,7 @@
26
                 if (param->rc.aqMode == X265_AQ_EDGE)
27
                     edgeFilter(curFrame, param);
28
 
29
-                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
30
+                if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
31
                 {
32
                     pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
33
                     primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
34
@@ -1050,7 +1058,48 @@
35
     m_countPreLookahead = 0;
36
 #endif
37
 
38
-    memset(m_histogram, 0, sizeof(m_histogram));
39
+    m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
40
+    m_accHistDiffRunningAvgCb0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
41
+    memset(m_accHistDiffRunningAvgCb0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
42
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
43
+        m_accHistDiffRunningAvgCbw = m_accHistDiffRunningAvgCb0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
44
+    }
45
+
46
+    m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
47
+    m_accHistDiffRunningAvgCr0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
48
+    memset(m_accHistDiffRunningAvgCr0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
49
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
50
+        m_accHistDiffRunningAvgCrw = m_accHistDiffRunningAvgCr0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
51
+    }
52
+
53
+    m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
54
+    m_accHistDiffRunningAvg0 = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
55
+    memset(m_accHistDiffRunningAvg0, 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
56
+    for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
57
+        m_accHistDiffRunningAvgw = m_accHistDiffRunningAvg0 + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
58
+    }
59
+
60
+    m_resetRunningAvg = true;
61
+
62
+    m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
63
+
64
+    if (m_param->bEnableTemporalSubLayers > 2)
65
+    {
66
+        switch (m_param->bEnableTemporalSubLayers)
67
+        {
68
+        case 3:
69
+            m_gopId = 0;
70
+            break;
71
+        case 4:
72
+            m_gopId = 1;
73
+            break;
74
+        case 5:
75
+            m_gopId = 2;
76
+            break;
77
+        default:
78
+            break;
79
+        }
80
+    }
81
 }
82
 
83
 #if DETAILED_CU_STATS
84
@@ -1098,6 +1147,7 @@
85
             m_pooli.stopWorkers();
86
     }
87
 }
88
+
89
 void Lookahead::destroy()
90
 {
91
     // these two queues will be empty unless the encode was aborted
92
@@ -1309,32 +1359,32 @@
93
     default:
94
         return;
95
     }
96
-    if (!m_param->analysisLoad || !m_param->bDisableLookahead)
97
+    if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
98
     {
99
         X265_CHECK(curFrame->m_lowres.costEstb - p0p1 - b > 0, "Slice cost not estimated\n")
100
 
101
-        if (m_param->rc.cuTree && !m_param->rc.bStatRead)
102
+        if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
103
             /* update row satds based on cutree offsets */
104
             curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
105
-        else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
106
+        else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
107
         {
108
-            if (m_param->rc.aqMode)
109
+            if (curFrame->m_param->rc.aqMode)
110
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAqb - p0p1 - b;
111
             else
112
                 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstb - p0p1 - b;
113
         }
114
-        if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
115
+        if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
116
         {
117
             /* aggregate lowres row satds to CTU resolution */
118
             curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCostsb - p0p1 - b;
119
             uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
120
-            uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
121
-            uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
122
+            uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
123
+            uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
124
             uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
125
             double *qp_offset = 0;
126
             /* Factor in qpoffsets based on Aq/Cutree in CU costs */
127
-            if (m_param->rc.aqMode || m_param->bAQMotion)
128
-                qp_offset = (framesb->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
129
+            if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
130
+                qp_offset = (framesb->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? framesb->qpAqOffset : framesb->qpCuTreeOffset;
131
 
132
             for (uint32_t row = 0; row < numCuInHeight; row++)
133
             {
134
@@ -1350,7 +1400,7 @@
135
                         if (qp_offset)
136
                         {
137
                             double qpOffset;
138
-                            if (m_param->rc.qgSize == 8)
139
+                            if (curFrame->m_param->rc.qgSize == 8)
140
                                 qpOffset = (qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 +
141
                                 qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1 +
142
                                 qp_offsetlowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes +
143
@@ -1361,7 +1411,7 @@
144
                             int32_t intraCuCost = curFrame->m_lowres.intraCostlowresCuIdx;
145
                             curFrame->m_lowres.intraCostlowresCuIdx = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
146
                         }
147
-                        if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
148
+                        if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
149
                             for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
150
                                 diff += curFrame->m_lowres.intraCostlowresCuIdx - lowresCuCost;
151
                         curFrame->m_lowres.lowresCostForRclowresCuIdx = lowresCuCost;
152
@@ -1377,6 +1427,291 @@
153
     }
154
 }
155
 
156
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)
157
+{
158
+    pixel* src = inpSrc + blockOffset;
159
+
160
+    uint32_t var;
161
+    if (!plane)
162
+        var = acEnergyVarHist(primitives.cuBLOCK_8x8.var(src, stride), 6);
163
+    else
164
+        var = acEnergyVarHist(primitives.cuBLOCK_4x4.var(src, stride), 4);
165
+
166
+    x265_emms();
167
+    return var;
168
+}
169
+
170
+/*
171
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
172
+*/
173
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
174
+{
175
+    int maxCol = curFrame->m_fencPic->m_picWidth;
176
+    int maxRow = curFrame->m_fencPic->m_picHeight;
177
+    intptr_t inpStride = curFrame->m_fencPic->m_stride;
178
+
179
+    // Variance
180
+    uint64_t picTotVariance = 0;
181
+    uint32_t variance;
182
+
183
+    uint64_t blockXY = 0;
184
+    pixel* src = curFrame->m_fencPic->m_picOrg0;
185
+
186
+    for (int blockY = 0; blockY < maxRow; blockY += 8)
187
+    {
188
+        uint64_t rowVariance = 0;
189
+        for (int blockX = 0; blockX < maxCol; blockX += 8)
190
+        {
191
+            intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
192
+
193
+            variance = calcVariance(
194
+                src,
195
+                inpStride,
196
+                blockOffsetLuma, 0);
197
+
198
+            rowVariance += variance;
199
+            blockXY++;
200
+        }
201
x265_3.5.tar.gz/source/encoder/slicetype.h -> x265_3.6.tar.gz/source/encoder/slicetype.h Changed
110
 
1
@@ -44,6 +44,24 @@
2
 #define EDGE_INCLINATION 45
3
 #define TEMPORAL_SCENECUT_THRESHOLD 50
4
 
5
+#define X265_ABS(a)                        (((a) < 0) ? (-(a)) : (a))
6
+
7
+#define PICTURE_DIFF_VARIANCE_TH            390
8
+#define PICTURE_VARIANCE_TH                 1500
9
+#define LOW_VAR_SCENE_CHANGE_TH             2250
10
+#define HIGH_VAR_SCENE_CHANGE_TH            3500
11
+
12
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH     10
13
+#define PICTURE_VARIANCE_CHROMA_TH          20
14
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH      2250/4
15
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH     3500/4
16
+
17
+#define FLASH_TH                            1.5
18
+#define FADE_TH                             4
19
+#define INTENSITY_CHANGE_TH                 4
20
+
21
+#define NUM64x64INPIC(w,h)                  ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
22
+
23
 #if HIGH_BIT_DEPTH
24
 #define EDGE_THRESHOLD 1023.0
25
 #else
26
@@ -93,7 +111,29 @@
27
 
28
     ~LookaheadTLD() { X265_FREE(wbuffer0); }
29
 
30
+    void collectPictureStatistics(Frame *curFrame);
31
+    void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
32
+
33
+    void computeIntensityHistogramBinsChroma(
34
+        Frame    *curFrame,
35
+        uint64_t *sumAverageIntensityCb,
36
+        uint64_t *sumAverageIntensityCr);
37
+
38
+    void calculateHistogram(
39
+        pixel    *inputSrc,
40
+        uint32_t  inputWidth,
41
+        uint32_t  inputHeight,
42
+        intptr_t  stride,
43
+        uint8_t   dsFactor,
44
+        uint32_t *histogram,
45
+        uint64_t *sum);
46
+
47
+    void computePictureStatistics(Frame *curFrame);
48
+
49
+    uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
50
+
51
     void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
52
+    void calcFrameSegment(Frame *curFrame);
53
     void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
54
 
55
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
56
@@ -124,7 +164,6 @@
57
 
58
     /* pre-lookahead */
59
     int           m_fullQueueSize;
60
-    int           m_histogramX265_BFRAME_MAX + 1;
61
     int           m_lastKeyframe;
62
     int           m_8x8Width;
63
     int           m_8x8Height;
64
@@ -153,6 +192,16 @@
65
     bool          m_isFadeIn;
66
     uint64_t      m_fadeCount;
67
     int           m_fadeStart;
68
+
69
+    uint32_t    **m_accHistDiffRunningAvgCb;
70
+    uint32_t    **m_accHistDiffRunningAvgCr;
71
+    uint32_t    **m_accHistDiffRunningAvg;
72
+
73
+    bool          m_resetRunningAvg;
74
+    uint32_t      m_segmentCountThreshold;
75
+
76
+    int8_t                  m_gopId;
77
+
78
     Lookahead(x265_param *param, ThreadPool *pool);
79
 #if DETAILED_CU_STATS
80
     int64_t       m_slicetypeDecideElapsedTime;
81
@@ -174,6 +223,7 @@
82
 
83
     void    getEstimatedPictureCost(Frame *pic);
84
     void    setLookaheadQueue();
85
+    int     findSliceType(int poc);
86
 
87
 protected:
88
 
89
@@ -184,6 +234,10 @@
90
     /* called by slicetypeAnalyse() to make slice decisions */
91
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
92
     bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
93
+
94
+    bool    histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
95
+    bool    detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
96
+
97
     void    slicetypePath(Lowres **frames, int length, char(*best_paths)X265_LOOKAHEAD_MAX + 1);
98
     int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
99
     int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
100
@@ -199,6 +253,9 @@
101
 
102
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
103
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
104
+    /*Compute index for positioning B-Ref frames*/
105
+    void     placeBref(Frame** frames, int start, int end, int num, int *brefs);
106
+    void     compCostBref(Lowres **frame, int start, int end, int num);
107
 };
108
 
109
 class PreLookaheadGroup : public BondedTaskGroup
110
x265_3.5.tar.gz/source/output/output.cpp -> x265_3.6.tar.gz/source/output/output.cpp Changed
19
 
1
@@ -30,14 +30,14 @@
2
 
3
 using namespace X265_NS;
4
 
5
-ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
6
+ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth)
7
 {
8
     const char * s = strrchr(fname, '.');
9
 
10
     if (s && !strcmp(s, ".y4m"))
11
-        return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp);
12
+        return new Y4MOutput(fname, width, height, bitdepth, fpsNum, fpsDenom, csp, sourceBitDepth);
13
     else
14
-        return new YUVOutput(fname, width, height, bitdepth, csp);
15
+        return new YUVOutput(fname, width, height, bitdepth, csp, sourceBitDepth);
16
 }
17
 
18
 OutputFile* OutputFile::open(const char *fname, InputFileInfo& inputInfo)
19
x265_3.5.tar.gz/source/output/output.h -> x265_3.6.tar.gz/source/output/output.h Changed
10
 
1
@@ -42,7 +42,7 @@
2
     ReconFile()           {}
3
 
4
     static ReconFile* open(const char *fname, int width, int height, uint32_t bitdepth,
5
-                           uint32_t fpsNum, uint32_t fpsDenom, int csp);
6
+                           uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth);
7
 
8
     virtual bool isFail() const = 0;
9
 
10
x265_3.5.tar.gz/source/output/y4m.cpp -> x265_3.6.tar.gz/source/output/y4m.cpp Changed
145
 
1
@@ -28,11 +28,13 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 
5
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
6
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
7
     : width(w)
8
     , height(h)
9
+    , bitDepth(bitdepth)
10
     , colorSpace(csp)
11
     , frameSize(0)
12
+    , inputDepth(inputdepth)
13
 {
14
     ofs.open(filename, ios::binary | ios::out);
15
     buf = new charwidth;
16
@@ -41,7 +43,13 @@
17
 
18
     if (ofs)
19
     {
20
-        ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
21
+        if (bitDepth == 10)
22
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
23
+        else if (bitDepth == 12)
24
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
25
+        else
26
+            ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
27
+
28
         header = ofs.tellp();
29
     }
30
 
31
@@ -58,52 +66,81 @@
32
 bool Y4MOutput::writePicture(const x265_picture& pic)
33
 {
34
     std::ofstream::pos_type outPicPos = header;
35
-    outPicPos += (uint64_t)pic.poc * (6 + frameSize);
36
+    if (pic.bitDepth > 8)
37
+        outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
38
+    else
39
+        outPicPos += (uint64_t)pic.poc * (6 + frameSize);
40
     ofs.seekp(outPicPos);
41
     ofs << "FRAME\n";
42
 
43
-#if HIGH_BIT_DEPTH
44
-    if (pic.bitDepth > 8 && pic.poc == 0)
45
-        x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
46
-#else
47
-    if (pic.bitDepth > 8 && pic.poc == 0)
48
-        x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
49
-#endif
50
+    if (inputDepth > 8)
51
+    {
52
+        if (pic.bitDepth == 8 && pic.poc == 0)
53
+            x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
54
+    }
55
 
56
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
57
 
58
-#if HIGH_BIT_DEPTH
59
-
60
-    // encoder gave us short pixels, downshift, then write
61
-    X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
62
-    int shift = pic.bitDepth - 8;
63
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
64
+    if (inputDepth > 8)//if HIGH_BIT_DEPTH
65
     {
66
-        uint16_t *src = (uint16_t*)pic.planesi;
67
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
68
+        if (pic.bitDepth == 8)
69
         {
70
-            for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
71
-                bufw = (char)(srcw >> shift);
72
-
73
-            ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
74
-            src += pic.stridei / sizeof(*src);
75
+            // encoder gave us short pixels, downshift, then write
76
+            X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
77
+            int shift = pic.bitDepth - 8;
78
+            for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
79
+            {
80
+                char *src = (char*)pic.planesi;
81
+                for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
82
+                {
83
+                    for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
84
+                        bufw = (char)(srcw >> shift);
85
+
86
+                    ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
87
+                    src += pic.stridei / sizeof(*src);
88
+                }
89
+            }
90
+        }
91
+        else
92
+        {
93
+            X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
94
+            for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
95
+            {
96
+                uint16_t *src = (uint16_t*)pic.planesi;
97
+                for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
98
+                {
99
+                    ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
100
+                    src += pic.stridei / sizeof(*src);
101
+                }
102
+            }
103
         }
104
     }
105
-
106
-#else // if HIGH_BIT_DEPTH
107
-
108
-    X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
109
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
110
+    else if (inputDepth == 8 && pic.bitDepth > 8)
111
     {
112
-        char *src = (char*)pic.planesi;
113
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
114
+        X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
115
+        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
116
         {
117
-            ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
118
-            src += pic.stridei / sizeof(*src);
119
+            uint16_t* src = (uint16_t*)pic.planesi;
120
+            for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
121
+            {
122
+                ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
123
+                src += pic.stridei / sizeof(*src);
124
+            }
125
+        }
126
+    }
127
+    else
128
+    {
129
+        X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
130
+        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
131
+        {
132
+            char *src = (char*)pic.planesi;
133
+            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
134
+            {
135
+                ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
136
+                src += pic.stridei / sizeof(*src);
137
+            }
138
         }
139
     }
140
-
141
-#endif // if HIGH_BIT_DEPTH
142
 
143
     return true;
144
 }
145
x265_3.5.tar.gz/source/output/y4m.h -> x265_3.6.tar.gz/source/output/y4m.h Changed
25
 
1
@@ -38,10 +38,14 @@
2
 
3
     int height;
4
 
5
+    uint32_t bitDepth;
6
+
7
     int colorSpace;
8
 
9
     uint32_t frameSize;
10
 
11
+    int inputDepth;
12
+
13
     std::ofstream ofs;
14
 
15
     std::ofstream::pos_type header;
16
@@ -52,7 +56,7 @@
17
 
18
 public:
19
 
20
-    Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp);
21
+    Y4MOutput(const char *filename, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputDepth);
22
 
23
     virtual ~Y4MOutput();
24
 
25
x265_3.5.tar.gz/source/output/yuv.cpp -> x265_3.6.tar.gz/source/output/yuv.cpp Changed
107
 
1
@@ -28,12 +28,13 @@
2
 using namespace X265_NS;
3
 using namespace std;
4
 
5
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
6
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
7
     : width(w)
8
     , height(h)
9
     , depth(d)
10
     , colorSpace(csp)
11
     , frameSize(0)
12
+    , inputDepth(inputdepth)
13
 {
14
     ofs.open(filename, ios::binary | ios::out);
15
     buf = new charwidth;
16
@@ -56,50 +57,52 @@
17
     X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
18
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
19
 
20
-#if HIGH_BIT_DEPTH
21
-    if (depth == 8)
22
+    if (inputDepth > 8)
23
     {
24
-        int shift = pic.bitDepth - 8;
25
-        ofs.seekp((std::streamoff)fileOffset);
26
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
27
-        {
28
-            uint16_t *src = (uint16_t*)pic.planesi;
29
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
30
-            {
31
-                for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
32
-                    bufw = (char)(srcw >> shift);
33
+   if (depth == 8)
34
+   {
35
+       int shift = pic.bitDepth - 8;
36
+       ofs.seekp((std::streamoff)fileOffset);
37
+       for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
38
+       {
39
+           uint16_t *src = (uint16_t*)pic.planesi;
40
+           for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
41
+           {
42
+               for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
43
+                   bufw = (char)(srcw >> shift);
44
 
45
-                ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
46
-                src += pic.stridei / sizeof(*src);
47
-            }
48
-        }
49
+               ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
50
+               src += pic.stridei / sizeof(*src);
51
+           }
52
+       }
53
+   }
54
+   else
55
+   {
56
+       ofs.seekp((std::streamoff)(fileOffset * 2));
57
+       for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
58
+       {
59
+           uint16_t *src = (uint16_t*)pic.planesi;
60
+           for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
61
+           {
62
+               ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
63
+               src += pic.stridei / sizeof(*src);
64
+           }
65
+       }
66
+   }
67
     }
68
     else
69
     {
70
-        ofs.seekp((std::streamoff)(fileOffset * 2));
71
-        for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
72
-        {
73
-            uint16_t *src = (uint16_t*)pic.planesi;
74
-            for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
75
-            {
76
-                ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
77
-                src += pic.stridei / sizeof(*src);
78
-            }
79
-        }
80
+   ofs.seekp((std::streamoff)fileOffset);
81
+   for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
82
+   {
83
+       char *src = (char*)pic.planesi;
84
+       for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
85
+       {
86
+           ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
87
+           src += pic.stridei / sizeof(*src);
88
+       }
89
+   }
90
     }
91
-#else // if HIGH_BIT_DEPTH
92
-    ofs.seekp((std::streamoff)fileOffset);
93
-    for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
94
-    {
95
-        char *src = (char*)pic.planesi;
96
-        for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
97
-        {
98
-            ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
99
-            src += pic.stridei / sizeof(*src);
100
-        }
101
-    }
102
-
103
-#endif // if HIGH_BIT_DEPTH
104
 
105
     return true;
106
 }
107
x265_3.5.tar.gz/source/output/yuv.h -> x265_3.6.tar.gz/source/output/yuv.h Changed
18
 
1
@@ -46,13 +46,15 @@
2
 
3
     uint32_t frameSize;
4
 
5
+    int inputDepth;
6
+
7
     char *buf;
8
 
9
     std::ofstream ofs;
10
 
11
 public:
12
 
13
-    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp);
14
+    YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp, int inputDepth);
15
 
16
     virtual ~YUVOutput();
17
 
18
x265_3.5.tar.gz/source/test/CMakeLists.txt -> x265_3.6.tar.gz/source/test/CMakeLists.txt Changed
24
 
1
@@ -23,15 +23,13 @@
2
 
3
 # add ARM assembly files
4
 if(ARM OR CROSS_COMPILE_ARM)
5
-    if(NOT ARM64)
6
-        enable_language(ASM)
7
-        set(NASM_SRC checkasm-arm.S)
8
-        add_custom_command(
9
-            OUTPUT checkasm-arm.obj
10
-            COMMAND ${CMAKE_CXX_COMPILER}
11
-            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
12
-            DEPENDS checkasm-arm.S)
13
-    endif()
14
+    enable_language(ASM)
15
+    set(NASM_SRC checkasm-arm.S)
16
+    add_custom_command(
17
+        OUTPUT checkasm-arm.obj
18
+        COMMAND ${CMAKE_CXX_COMPILER}
19
+        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
20
+        DEPENDS checkasm-arm.S)
21
 endif(ARM OR CROSS_COMPILE_ARM)
22
 
23
 # add PowerPC assembly files
24
x265_3.5.tar.gz/source/test/pixelharness.cpp -> x265_3.6.tar.gz/source/test/pixelharness.cpp Changed
63
 
1
@@ -406,6 +406,32 @@
2
     return true;
3
 }
4
 
5
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
6
+{
7
+    ALIGN_VAR_16(pixel, ref_destf32 * 32);
8
+    ALIGN_VAR_16(pixel, opt_destf32 * 32);
9
+
10
+    intptr_t src_stride = 64;
11
+    intptr_t dst_stride = 32;
12
+    int bx = 32;
13
+    int by = 32;
14
+    int j = 0;
15
+    for (int i = 0; i < ITERS; i++)
16
+    {
17
+        int index = i % TEST_CASES;
18
+        ref(pixel_test_buffindex + j, ref_destf, src_stride, dst_stride, bx, by);
19
+        checked(opt, pixel_test_buffindex + j, opt_destf, src_stride, dst_stride, bx, by);
20
+
21
+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
22
+            return false;
23
+
24
+        reportfail();
25
+        j += INCR;
26
+    }
27
+
28
+    return true;
29
+}
30
+
31
 bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
32
 {
33
     ALIGN_VAR_16(int16_t, ref_dest64 * 64);
34
@@ -2793,6 +2819,15 @@
35
         }
36
     }
37
 
38
+    if (opt.frameSubSampleLuma)
39
+    {
40
+        if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
41
+        {
42
+            printf("SubSample Luma failed!\n");
43
+            return false;
44
+        }
45
+    }
46
+
47
     if (opt.scale1D_128to64NONALIGNED)
48
     {
49
         if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
50
@@ -3492,6 +3527,12 @@
51
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
52
     }
53
 
54
+    if (opt.frameSubSampleLuma)
55
+    {
56
+        HEADER0("downscaleluma");
57
+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
58
+    }
59
+
60
     if (opt.scale1D_128to64NONALIGNED)
61
     {
62
         HEADER0("scale1D_128to64");
63
x265_3.5.tar.gz/source/test/pixelharness.h -> x265_3.6.tar.gz/source/test/pixelharness.h Changed
9
 
1
@@ -138,6 +138,7 @@
2
     bool check_integral_inith(integralh_t ref, integralh_t opt);
3
     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
4
     bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
5
+    bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
6
 
7
 public:
8
 
9
x265_3.5.tar.gz/source/test/rate-control-tests.txt -> x265_3.6.tar.gz/source/test/rate-control-tests.txt Changed
10
 
1
@@ -15,7 +15,7 @@
2
 112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
3
 Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
4
 Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
5
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
6
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
7
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
8
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000  --tune grain
9
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
10
x265_3.5.tar.gz/source/test/regression-tests.txt -> x265_3.6.tar.gz/source/test/regression-tests.txt Changed
91
 
1
@@ -18,12 +18,12 @@
2
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
3
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
4
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
5
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
6
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
7
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
8
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
9
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
10
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
11
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
12
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
13
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
14
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
15
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
16
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
17
@@ -33,7 +33,7 @@
18
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
19
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
20
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
21
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
22
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
23
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
24
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
25
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
26
@@ -41,7 +41,7 @@
27
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
28
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
29
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
30
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
31
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
32
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
33
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
34
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
35
@@ -49,11 +49,11 @@
36
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
37
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
38
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
39
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
40
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
41
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
42
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
43
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
44
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
45
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
46
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
47
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
48
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
49
@@ -158,13 +158,10 @@
50
 ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
51
 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
52
 Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
53
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
54
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
55
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
56
 crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
57
 crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
58
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
59
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
60
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
61
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
62
  
63
 # Main12 intraCost overflow bug test
64
 720p50_parkrun_ter.y4m,--preset medium
65
@@ -182,14 +179,22 @@
66
 
67
 #scaled save/load test
68
 crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000 
69
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
70
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
71
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
72
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
73
 crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
74
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
75
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
76
 ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat 
--refine-intra 4 --refine-inter 2
77
 #save/load with ctu distortion refinement
78
 CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
79
 #segment encoding
80
 BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
81
 
82
+#Test FG SEI message addition
83
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
84
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
85
+
86
+#Temporal layers tests
87
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
88
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
89
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
90
 # vim: tw=200
91
x265_3.5.tar.gz/source/test/save-load-tests.txt -> x265_3.6.tar.gz/source/test/save-load-tests.txt Changed
16
 
1
@@ -12,10 +12,10 @@
2
 # not auto-detected.
3
 crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
4
 crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
5
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
6
-crowd_run_1080p50.y4m,  --preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
7
-crowd_run_1080p50.y4m,   --preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
8
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m,   --preset superfast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
9
+crowd_run_1080p50.y4m,  --preset fast --analysis-save x265_analysis.dat  --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m,   --preset fast --analysis-load x265_analysis.dat  --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
10
+crowd_run_1080p50.y4m,   --preset medium --analysis-save x265_analysis.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m,    --preset medium --analysis-load x265_analysis.dat  --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
11
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
12
-crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
13
+crowd_run_540p50.y4m,   --preset veryslow --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
14
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
15
 News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
16
x265_3.5.tar.gz/source/test/smoke-tests.txt -> x265_3.6.tar.gz/source/test/smoke-tests.txt Changed
9
 
1
@@ -23,3 +23,7 @@
2
 # Main12 intraCost overflow bug test
3
 720p50_parkrun_ter.y4m,--preset medium
4
 720p50_parkrun_ter.y4m,--preset=fast --hevc-aq --no-cutree
5
+# Test FG SEI message addition
6
+# CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
7
+# DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
8
+# NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
9
x265_3.5.tar.gz/source/test/testbench.cpp -> x265_3.6.tar.gz/source/test/testbench.cpp Changed
43
 
1
@@ -174,6 +174,8 @@
2
         { "AVX512", X265_CPU_AVX512 },
3
         { "ARMv6", X265_CPU_ARMV6 },
4
         { "NEON", X265_CPU_NEON },
5
+        { "SVE2", X265_CPU_SVE2 },
6
+        { "SVE", X265_CPU_SVE },
7
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
         { "", 0 },
9
     };
10
@@ -208,15 +210,8 @@
11
 
12
         EncoderPrimitives asmprim;
13
         memset(&asmprim, 0, sizeof(asmprim));
14
-        setupAssemblyPrimitives(asmprim, test_archi.flag);
15
-
16
-#if X265_ARCH_ARM64
17
-        /* Temporary workaround because luma_vsp assembly primitive has not been completed
18
-         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
19
-         * Otherwise, segment fault occurs. */
20
-        setupAliasCPrimitives(cprim, asmprim, test_archi.flag);
21
-#endif
22
 
23
+        setupAssemblyPrimitives(asmprim, test_archi.flag);
24
         setupAliasPrimitives(asmprim);
25
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
26
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
27
@@ -239,14 +234,8 @@
28
 #if X265_ARCH_X86
29
     setupInstrinsicPrimitives(optprim, cpuid);
30
 #endif
31
-    setupAssemblyPrimitives(optprim, cpuid);
32
 
33
-#if X265_ARCH_ARM64
34
-    /* Temporary workaround because luma_vsp assembly primitive has not been completed
35
-     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
36
-     * Otherwise, segment fault occurs. */
37
-    setupAliasCPrimitives(cprim, optprim, cpuid);
38
-#endif
39
+    setupAssemblyPrimitives(optprim, cpuid);
40
 
41
     /* Note that we do not setup aliases for performance tests, that would be
42
      * redundant. The testbench only verifies they are correctly aliased */
43
x265_3.5.tar.gz/source/test/testharness.h -> x265_3.6.tar.gz/source/test/testharness.h Changed
48
 
1
@@ -73,7 +73,7 @@
2
 #include <x86intrin.h>
3
 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
 #include <arm_neon.h>
5
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
6
+#else
7
 /* fallback for older GCC/MinGW */
8
 static inline uint32_t __rdtsc(void)
9
 {
10
@@ -82,15 +82,13 @@
11
 #if X265_ARCH_X86
12
     asm volatile("rdtsc" : "=a" (a) ::"edx");
13
 #elif X265_ARCH_ARM
14
-#if X265_ARCH_ARM64
15
-    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
16
-#else
17
     // TO-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
18
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
19
 
20
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
21
     a = clock();
22
-#endif
23
+#elif  X265_ARCH_ARM64
24
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
25
 #endif
26
     return a;
27
 }
28
@@ -128,8 +126,8 @@
29
         x265_emms(); \
30
         float optperf = (10.0f * cycles / runs) / 4; \
31
         float refperf = (10.0f * refcycles / refruns) / 4; \
32
-        printf("\t%3.2fx ", refperf / optperf); \
33
-        printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
34
+        printf(" | \t%3.2fx | ", refperf / optperf); \
35
+        printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
36
     }
37
 
38
 extern "C" {
39
@@ -140,7 +138,7 @@
40
  * needs an explicit asm check because it only sometimes crashes in normal use. */
41
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
42
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
43
-#elif X265_ARCH_ARM == 0
44
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
45
 #define PFX(stack_pagealign)(func, align) func()
46
 #endif
47
 
48
x265_3.5.tar.gz/source/x265.cpp -> x265_3.6.tar.gz/source/x265.cpp Changed
18
 
1
@@ -296,6 +296,16 @@
2
 
3
     int ret = 0;
4
 
5
+    if (cliopt[0].scenecutAwareQpConfig)
6
+    {
7
+        if (!cliopt[0].parseScenecutAwareQpConfig())
8
+        {
9
+            x265_log(NULL, X265_LOG_ERROR, "Unable to parse scenecut aware qp config file \n");
10
+            fclose(cliopt[0].scenecutAwareQpConfig);
11
+            cliopt[0].scenecutAwareQpConfig = NULL;
12
+        }
13
+    }
14
+
15
     AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
16
     int threadsActive = abrEnc->m_numActiveEncodes.get();
17
     while (threadsActive)
18
x265_3.5.tar.gz/source/x265.h -> x265_3.6.tar.gz/source/x265.h Changed
201
 
1
@@ -26,6 +26,7 @@
2
 #define X265_H
3
 #include <stdint.h>
4
 #include <stdio.h>
5
+#include <sys/stat.h>
6
 #include "x265_config.h"
7
 #ifdef __cplusplus
8
 extern "C" {
9
@@ -59,7 +60,7 @@
10
     NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
11
     NAL_UNIT_CODED_SLICE_TRAIL_R,
12
     NAL_UNIT_CODED_SLICE_TSA_N,
13
-    NAL_UNIT_CODED_SLICE_TLA_R,
14
+    NAL_UNIT_CODED_SLICE_TSA_R,
15
     NAL_UNIT_CODED_SLICE_STSA_N,
16
     NAL_UNIT_CODED_SLICE_STSA_R,
17
     NAL_UNIT_CODED_SLICE_RADL_N,
18
@@ -311,6 +312,7 @@
19
     double           vmafFrameScore;
20
     double           bufferFillFinal;
21
     double           unclippedBufferFillFinal;
22
+    uint8_t          tLayer;
23
 } x265_frame_stats;
24
 
25
 typedef struct x265_ctu_info_t
26
@@ -536,6 +538,8 @@
27
 /* ARM */
28
 #define X265_CPU_ARMV6           0x0000001
29
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
30
+#define X265_CPU_SVE2            0x0000008  /* ARM SVE2 */
31
+#define X265_CPU_SVE             0x0000010  /* ARM SVE */
32
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
33
 
34
 /* IBM Power8 */
35
@@ -613,6 +617,13 @@
36
 #define SLICE_TYPE_DELTA        0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
37
 #define BACKWARD_WINDOW         1 /* Scenecut window before a scenecut */
38
 #define FORWARD_WINDOW          2 /* Scenecut window after a scenecut */
39
+#define BWD_WINDOW_DELTA        0.4
40
+
41
+#define X265_MAX_GOP_CONFIG 3
42
+#define X265_MAX_GOP_LENGTH 16
43
+#define MAX_T_LAYERS 7
44
+
45
+#define X265_IPRATIO_STRENGTH   1.43
46
 
47
 typedef struct x265_cli_csp
48
 {
49
@@ -696,6 +707,7 @@
50
 typedef struct x265_zone
51
 {
52
     int   startFrame, endFrame; /* range of frame numbers */
53
+    int   keyframeMax;          /* stores the default or user-defined keyframeMax value */
54
     int   bForceQp;             /* whether to use qp vs bitrate factor */
55
     int   qp;
56
     float bitrateFactor;
57
@@ -747,6 +759,271 @@
58
 
59
 static const x265_vmaf_commondata vcd = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
60
 
61
+typedef struct x265_temporal_layer {
62
+    int poc_offset;      /* POC offset */
63
+    int8_t layer;        /* Current layer */
64
+    int8_t qp_offset;    /* QP offset */
65
+} x265_temporal_layer;
66
+
67
+static const int8_t x265_temporal_layer_bframes[MAX_T_LAYERS] = {-1, -1, 3, 7, 15, -1, -1};
68
+
69
+static const int8_t x265_gop_ra_length[X265_MAX_GOP_CONFIG] = { 4, 8, 16};
70
+static const x265_temporal_layer x265_gop_ra[X265_MAX_GOP_CONFIG][X265_MAX_GOP_LENGTH] = {
71
+    {
72
+        {
73
+            4,
74
+            0,
75
+            1,
76
+        },
77
+        {
78
+            2,
79
+            1,
80
+            5,
81
+        },
82
+        {
83
+            1,
84
+            2,
85
+            3,
86
+        },
87
+        {
88
+            3,
89
+            2,
90
+            5,
91
+        },
92
+        {
93
+            -1,
94
+            -1,
95
+            -1,
96
+        },
97
+        {
98
+            -1,
99
+            -1,
100
+            -1,
101
+        },
102
+        {
103
+            -1,
104
+            -1,
105
+            -1,
106
+        },
107
+        {
108
+            -1,
109
+            -1,
110
+            -1,
111
+        },
112
+        {
113
+            -1,
114
+            -1,
115
+            -1,
116
+        },
117
+        {
118
+            -1,
119
+            -1,
120
+            -1,
121
+        },
122
+        {
123
+            -1,
124
+            -1,
125
+            -1,
126
+        },
127
+        {
128
+            -1,
129
+            -1,
130
+            -1,
131
+        },
132
+        {
133
+            -1,
134
+            -1,
135
+            -1,
136
+        },
137
+        {
138
+            -1,
139
+            -1,
140
+            -1,
141
+        },
142
+        {
143
+            -1,
144
+            -1,
145
+            -1,
146
+        },
147
+        {
148
+            -1,
149
+            -1,
150
+            -1,
151
+        }
152
+    },
153
+
154
+    {
155
+        {
156
+            8,
157
+            0,
158
+            1,
159
+        },
160
+        {
161
+            4,
162
+            1,
163
+            5,
164
+        },
165
+        {
166
+            2,
167
+            2,
168
+            4,
169
+        },
170
+        {
171
+            1,
172
+            3,
173
+            5,
174
+        },
175
+        {
176
+            3,
177
+            3,
178
+            2,
179
+        },
180
+        {
181
+            6,
182
+            2,
183
+            5,
184
+        },
185
+        {
186
+            5,
187
+            3,
188
+            4,
189
+        },
190
+        {
191
+            7,
192
+            3,
193
+            5,
194
+        },
195
+        {
196
+            -1,
197
+            -1,
198
+            -1,
199
+        },
200
+        {
201
x265_3.5.tar.gz/source/x265cli.cpp -> x265_3.6.tar.gz/source/x265cli.cpp Changed
201
 
1
@@ -28,8 +28,8 @@
2
 #include "x265cli.h"
3
 #include "svt.h"
4
 
5
-#define START_CODE 0x00000001
6
-#define START_CODE_BYTES 4
7
+#define START_CODE 0x00000001
8
+#define START_CODE_BYTES 4
9
 
10
 #ifdef __cplusplus
11
 namespace X265_NS {
12
@@ -166,6 +166,7 @@
13
         H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
14
         H0("\nSlice decision options:\n");
15
         H0("   --[no-]open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
16
+       H0("   --cra-nal                     Force nal type to CRA to all frames expect first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
17
         H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
18
         H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
19
         H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
20
@@ -174,7 +175,6 @@
21
         H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
22
         H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
23
         H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
24
-        H1("   --hist-threshold <0.0..1.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
25
         H0("   --[no-]fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
26
         H1("   --scenecut-aware-qp <0..3>    Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
27
         H1("                                 0 - Disabled\n");
28
@@ -182,6 +182,7 @@
29
         H1("                                 2 - Backward masking\n");
30
         H1("                                 3 - Bidirectional masking\n");
31
         H1("   --masking-strength <string>   Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
32
+        H1("   --scenecut-qp-config <file>   File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
33
         H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
34
         H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
35
         H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
36
@@ -262,6 +263,7 @@
37
         H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
38
         H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
39
         H0("   --[no-]aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
40
+        H1("   --[no-]sbrc                   Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
41
         H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
42
         H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
43
         H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
44
@@ -282,6 +284,7 @@
45
         H1("                                       q=<integer> (force QP)\n");
46
         H1("                                   or  b=<float> (bitrate multiplier)\n");
47
         H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
48
+        H0("   --no-zonefile-rc-init         This allow to use rate-control history across zones in zonefile.\n");
49
         H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
50
         H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
51
         H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
52
@@ -314,6 +317,30 @@
53
         H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
54
         H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
55
         H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
56
+        H0("   --video-signal-type-preset <string>    Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
57
+        H0("                                            format: <system-id>:<color-volume>\n");
58
+        H0("                                            This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
59
+        H0("                                            which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
60
+        H0("                                            The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
61
+        H0("                                            system-id options and their corresponding values:\n");
62
+        H0("                                              BT601_525:       --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
63
+        H0("                                              BT601_626:       --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
64
+        H0("                                              BT709_YCC:       --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
65
+        H0("                                              BT709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
66
+        H0("                                              BT2020_YCC_NCL:  --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
67
+        H0("                                              BT2020_RGB:      --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
68
+        H0("                                              BT2100_PQ_YCC:   --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
69
+        H0("                                              BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
70
+        H0("                                              BT2100_PQ_RGB:   --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
71
+        H0("                                              BT2100_HLG_YCC:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
72
+        H0("                                              BT2100_HLG_RGB:  --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
73
+        H0("                                              FR709_RGB:       --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
74
+        H0("                                              FR2020_RGB:      --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
75
+        H0("                                              FRP3D65_YCC:     --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
76
+        H0("                                            color-volume options and their corresponding values:\n");
77
+        H0("                                              P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
78
+        H0("                                              P3D65x4000n005:  --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
79
+        H0("                                              BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
80
         H0("   --[no-]cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
81
         H0("   --[no-]hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
82
         H0("   --[no-]hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
83
@@ -324,9 +351,11 @@
84
         H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
85
         H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
86
         H0("   --[no-]hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
87
-        H0("   --[no-]idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
88
-        H0("   --[no-]temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
89
+        H0("   --[no-]idr-recovery-sei       Emit recovery point infor SEI at each IDR frame \n");
90
+        H0("   --temporal-layers             Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
91
         H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
92
+        H0("   --[no-]eob                    Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
93
+        H0("   --[no-]eos                    Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
94
         H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
95
         H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
96
         H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
97
@@ -344,6 +373,7 @@
98
         H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
99
         H0("   --[no-]frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
100
         H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
101
+        H0("   --[no-]mcstf                  Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
102
 #ifdef SVT_HEVC
103
         H0("   --[no]svt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
104
         H0("   --[no-]svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
105
@@ -365,6 +395,9 @@
106
         H1("    2 - unable to open encoder\n");
107
         H1("    3 - unable to generate stream headers\n");
108
         H1("    4 - encoder abort\n");
109
+        H0("\nSEI Message Options\n");
110
+        H0("   --film-grain <filename>           File containing Film Grain Characteristics to be written as a SEI Message\n");
111
+
112
 #undef OPT
113
 #undef H0
114
 #undef H1
115
@@ -484,6 +517,9 @@
116
 
117
         memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
118
 
119
+        if (zonefileCount == 0)
120
+            globalParam->rc.zones[zonefileCount].keyframeMax = globalParam->keyframeMax;
121
+
122
         for (optind = 0;;)
123
         {
124
             int long_options_index = -1;
125
@@ -708,12 +744,19 @@
126
                         return true;
127
                     }
128
                 }
129
+                OPT("scenecut-qp-config")
130
+                {
131
+                    this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
132
+                    if (!this->scenecutAwareQpConfig)
133
+                        x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
134
+                }
135
                 OPT("zonefile")
136
                 {
137
                     this->zoneFile = x265_fopen(optarg, "rb");
138
                     if (!this->zoneFile)
139
                         x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
140
                 }
141
+                OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
142
                 OPT("fullhelp")
143
                 {
144
                     param->logLevel = X265_LOG_FULL;
145
@@ -875,7 +918,7 @@
146
             if (reconFileBitDepth == 0)
147
                 reconFileBitDepth = param->internalBitDepth;
148
             this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
149
-                param->fpsNum, param->fpsDenom, param->internalCsp);
150
+                param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
151
             if (this->recon->isFail())
152
             {
153
                 x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
154
@@ -973,6 +1016,7 @@
155
         param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
156
         for (int i = 0; i < param->rc.zonefileCount; i++)
157
         {
158
+            param->rc.zones[i].startFrame = -1;
159
             while (fgets(line, sizeof(line), zoneFile))
160
             {
161
                 if (*line == '#' || (strcmp(line, "\r\n") == 0))
162
@@ -1010,57 +1054,179 @@
163
         return 1;
164
     }
165
 
166
-    /* Parse the RPU file and extract the RPU corresponding to the current picture
167
-    * and fill the rpu field of the input picture */
168
-    int CLIOptions::rpuParser(x265_picture * pic)
169
-    {
170
-        uint8_t byteVal;
171
-        uint32_t code = 0;
172
-        int bytesRead = 0;
173
-        pic->rpu.payloadSize = 0;
174
-
175
-        if (!pic->pts)
176
-        {
177
-            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
178
-                code = (code << 8) | byteVal;
179
-
180
-            if (code != START_CODE)
181
-            {
182
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
183
-                return 1;
184
-            }
185
-        }
186
-
187
-        bytesRead = 0;
188
-        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
189
-        {
190
-            code = (code << 8) | byteVal;
191
-            if (bytesRead++ < 3)
192
-                continue;
193
-            if (bytesRead >= 1024)
194
-            {
195
-                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
196
-                return 1;
197
-            }
198
-
199
-            if (code != START_CODE)
200
-                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
201
x265_3.5.tar.gz/source/x265cli.h -> x265_3.6.tar.gz/source/x265cli.h Changed
104
 
1
@@ -135,6 +135,7 @@
2
     { "no-fast-intra",        no_argument, NULL, 0 },
3
     { "no-open-gop",          no_argument, NULL, 0 },
4
     { "open-gop",             no_argument, NULL, 0 },
5
+    { "cra-nal",              no_argument, NULL, 0 },
6
     { "keyint",         required_argument, NULL, 'I' },
7
     { "min-keyint",     required_argument, NULL, 'i' },
8
     { "gop-lookahead",  required_argument, NULL, 0 },
9
@@ -143,7 +144,6 @@
10
     { "scenecut-bias",  required_argument, NULL, 0 },
11
     { "hist-scenecut",        no_argument, NULL, 0},
12
     { "no-hist-scenecut",     no_argument, NULL, 0},
13
-    { "hist-threshold", required_argument, NULL, 0},
14
     { "fades",                no_argument, NULL, 0 },
15
     { "no-fades",             no_argument, NULL, 0 },
16
     { "scenecut-aware-qp", required_argument, NULL, 0 },
17
@@ -182,6 +182,8 @@
18
     { "qp",             required_argument, NULL, 'q' },
19
     { "aq-mode",        required_argument, NULL, 0 },
20
     { "aq-strength",    required_argument, NULL, 0 },
21
+    { "sbrc",                 no_argument, NULL, 0 },
22
+    { "no-sbrc",              no_argument, NULL, 0 },
23
     { "rc-grain",             no_argument, NULL, 0 },
24
     { "no-rc-grain",          no_argument, NULL, 0 },
25
     { "ipratio",        required_argument, NULL, 0 },
26
@@ -244,6 +246,7 @@
27
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
28
     { "master-display", required_argument, NULL, 0 },
29
     { "max-cll",        required_argument, NULL, 0 },
30
+    {"video-signal-type-preset", required_argument, NULL, 0 },
31
     { "min-luma",       required_argument, NULL, 0 },
32
     { "max-luma",       required_argument, NULL, 0 },
33
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
34
@@ -263,11 +266,16 @@
35
     { "repeat-headers",       no_argument, NULL, 0 },
36
     { "aud",                  no_argument, NULL, 0 },
37
     { "no-aud",               no_argument, NULL, 0 },
38
+    { "eob",                  no_argument, NULL, 0 },
39
+    { "no-eob",               no_argument, NULL, 0 },
40
+    { "eos",                  no_argument, NULL, 0 },
41
+    { "no-eos",               no_argument, NULL, 0 },
42
     { "info",                 no_argument, NULL, 0 },
43
     { "no-info",              no_argument, NULL, 0 },
44
     { "zones",          required_argument, NULL, 0 },
45
     { "qpfile",         required_argument, NULL, 0 },
46
     { "zonefile",       required_argument, NULL, 0 },
47
+    { "no-zonefile-rc-init",  no_argument, NULL, 0 },
48
     { "lambda-file",    required_argument, NULL, 0 },
49
     { "b-intra",              no_argument, NULL, 0 },
50
     { "no-b-intra",           no_argument, NULL, 0 },
51
@@ -298,8 +306,7 @@
52
     { "dynamic-refine",       no_argument, NULL, 0 },
53
     { "no-dynamic-refine",    no_argument, NULL, 0 },
54
     { "strict-cbr",           no_argument, NULL, 0 },
55
-    { "temporal-layers",      no_argument, NULL, 0 },
56
-    { "no-temporal-layers",   no_argument, NULL, 0 },
57
+    { "temporal-layers",      required_argument, NULL, 0 },
58
     { "qg-size",        required_argument, NULL, 0 },
59
     { "recon-y4m-exec", required_argument, NULL, 0 },
60
     { "analyze-src-pics", no_argument, NULL, 0 },
61
@@ -349,6 +356,8 @@
62
     { "frame-dup",            no_argument, NULL, 0 },
63
     { "no-frame-dup", no_argument, NULL, 0 },
64
     { "dup-threshold", required_argument, NULL, 0 },
65
+    { "mcstf",                 no_argument, NULL, 0 },
66
+    { "no-mcstf",              no_argument, NULL, 0 },
67
 #ifdef SVT_HEVC
68
     { "svt",     no_argument, NULL, 0 },
69
     { "no-svt",  no_argument, NULL, 0 },
70
@@ -373,6 +382,8 @@
71
     { "abr-ladder", required_argument, NULL, 0 },
72
     { "min-vbv-fullness", required_argument, NULL, 0 },
73
     { "max-vbv-fullness", required_argument, NULL, 0 },
74
+    { "scenecut-qp-config", required_argument, NULL, 0 },
75
+    { "film-grain", required_argument, NULL, 0 },
76
     { 0, 0, 0, 0 },
77
     { 0, 0, 0, 0 },
78
     { 0, 0, 0, 0 },
79
@@ -388,6 +399,7 @@
80
         FILE*       qpfile;
81
         FILE*       zoneFile;
82
         FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
83
+        FILE*    scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
84
         const char* reconPlayCmd;
85
         const x265_api* api;
86
         x265_param* param;
87
@@ -425,6 +437,7 @@
88
             qpfile = NULL;
89
             zoneFile = NULL;
90
             dolbyVisionRpu = NULL;
91
+            scenecutAwareQpConfig = NULL;
92
             reconPlayCmd = NULL;
93
             api = NULL;
94
             param = NULL;
95
@@ -455,6 +468,8 @@
96
         bool parseQPFile(x265_picture &pic_org);
97
         bool parseZoneFile();
98
         int rpuParser(x265_picture * pic);
99
+        bool parseScenecutAwareQpConfig();
100
+        bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
101
     };
102
 #ifdef __cplusplus
103
 }
104
x265_3.5.tar.gz/x265Version.txt -> x265_3.6.tar.gz/x265Version.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 #Attribute:         Values
3
-repositorychangeset: f0c1022b6
4
+repositorychangeset: aa7f602f7
5
 releasetagdistance: 1
6
-releasetag: 3.5
7
+releasetag: 3.6
8