We truncated the diff of some files because they were too big.
Changes of Revision 42
x265.changes
Changed
-------------------------------------------------------------------
+Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 3.6
+  New features:
+  * Segment based Ratecontrol (SBRC) feature
+  * Motion-Compensated Spatio-Temporal Filtering
+  * Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware
+    Quantization)
+  * Histogram-Based Scene Change Detection
+  * Film-Grain characteristics as a SEI message to support Film
+    Grain Synthesis(FGS)
+  * Add temporal layer implementation(Hierarchical B-frame
+    implementation)
+  Enhancements to existing features:
+  * Added Dolby Vision 8.4 Profile Support
+  API changes:
+  * Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+  * Add command line parameter for mcstf feature: "--no-mctf".
+  * Add command line parameters for the scene cut aware qp
+    feature: "--scenecut-aware-qp" and "--masking-strength".
+  * Add command line parameters for Histogram-Based Scene Change
+    Detection: "--hist-scenecut".
+  * Add film grain characteristics as a SEI message to the
+    bitstream: "--film-grain <filename>"
+  * cli: add new option --cra-nal (Force nal type to CRA to all
+    frames expect for the first frame, works only with keyint 1)
+  Optimizations:
+  * ARM64 NEON optimizations:- Several time-consuming C
+    functions have been optimized for the targeted platform -
+    aarch64. The overall performance increased by around 20%.
+  * SVE/SVE2 optimizations
+  Bug fixes:
+  * Linux bug to utilize all the cores
+  * Crash with hist-scenecut build when source resolution is not
+    multiple of minCuSize
+  * 32bit and 64bit builds generation for ARM
+  * bugs in zonefile feature (Reflect Zonefile Parameters inside
+    Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
+  * Add x86 ASM implementation for subsampling luma
+  * Fix for abrladder segfault with load reuse level 1
+  * Reorder miniGOP based on temporal layer hierarchy and add
+    support for more B frame
+  * Add MacOS aarch64 build support
+  * Fix boundary condition issue for Gaussian filter
+- Drop arm.patch and replace it with 0001-Fix-arm-flags.patch
+  and 0004-Do-not-build-with-assembly-support-on-arm.patch
+  (courtesy of Debian)
+
+-------------------------------------------------------------------
Wed May 19 13:21:09 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>

- Build libx265_main10 and libx265_main12 unconditionally and
x265.spec
Changed
#
# spec file for package x265
#
-# Copyright (c) 2021 Packman Team <packman@links2linux.de>
+# Copyright (c) 2024 Packman Team <packman@links2linux.de>
# Copyright (c) 2014 Torsten Gruner <t.gruner@katodev.de>
#
# All modifications and additions to the file contributed by third parties

#

-%define sover 199
+%define sover 209
%define libname lib%{name}
%define libsoname %{libname}-%{sover}
-%define uver 3_5
+%define uver 3_6
Name: x265
-Version: 3.5
+Version: 3.6
Release: 0
Summary: A free h265/HEVC encoder - encoder binary
License: GPL-2.0-or-later
Group: Productivity/Multimedia/Video/Editors and Convertors
URL: https://bitbucket.org/multicoreware/x265_git
Source0: https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
-Patch0: arm.patch
Patch1: x265.pkgconfig.patch
Patch2: x265-fix_enable512.patch
+Patch3: 0001-Fix-arm-flags.patch
+Patch4: 0004-Do-not-build-with-assembly-support-on-arm.patch
BuildRequires: cmake >= 2.8.8
BuildRequires: gcc-c++
BuildRequires: nasm >= 2.13

%cmake_install
find %{buildroot} -type f -name "*.a" -delete -print0

+%check
+
%post -n %{libsoname} -p /sbin/ldconfig
%postun -n %{libsoname} -p /sbin/ldconfig
0001-Fix-arm-flags.patch
Added
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Sun, 21 Jun 2020 17:54:56 +0200
+Subject: Fix arm* flags
+
+---
+ source/CMakeLists.txt | 7 ++-----
+ 1 file changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index ab5ddfe..eb9b19b 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -253,10 +253,7 @@ if(GCC)
+ elseif(ARM)
+ find_package(Neon)
+ if(CPU_HAS_NEON)
+- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+ add_definitions(-DHAVE_NEON)
+- else()
+- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+ endif()
+ endif()
+ if(ARM64 OR CROSS_COMPILE_ARM64)
+@@ -265,13 +262,13 @@ if(GCC)
+ find_package(SVE2)
+ if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+ message(STATUS "Found SVE2")
+- set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
++ set(ARM_ARGS -fPIC -flax-vector-conversions)
+ add_definitions(-DHAVE_SVE2)
+ add_definitions(-DHAVE_SVE)
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+ elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+ message(STATUS "Found SVE")
+- set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
++ set(ARM_ARGS -fPIC -flax-vector-conversions)
+ add_definitions(-DHAVE_SVE)
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+ elseif(CPU_HAS_NEON)
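Dropping the hard-coded -mcpu=native/-march settings in this hunk means the compiler flags chosen by the packager decide the target ISA, rather than whatever CPU the build host happens to have. Purely as an illustrative sketch (not part of the patch), a distribution build would then supply its own architecture flags at configure time through the standard CMake variables:

  # illustrative only: distro-provided flags now take effect unchanged
  cmake -G "Unix Makefiles" -DCMAKE_C_FLAGS="$CFLAGS" -DCMAKE_CXX_FLAGS="$CXXFLAGS" ../../source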
0004-Do-not-build-with-assembly-support-on-arm.patch
Added
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Fri, 31 May 2024 23:38:23 +0200
+Subject: Do not build with assembly support on arm*
+
+---
+ source/CMakeLists.txt | 9 ---------
+ 1 file changed, 9 deletions(-)
+
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index 672cc2d..f112330 100755
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1")
+ add_definitions(-DPPC64=1)
+ message(STATUS "Detected POWER PPC64 target processor")
+ endif()
+-elseif(ARMMATCH GREATER "-1")
+- if(CROSS_COMPILE_ARM)
+- message(STATUS "Cross compiling for ARM arch")
+- else()
+- set(CROSS_COMPILE_ARM 0)
+- endif()
+- message(STATUS "Detected ARM target processor")
+- set(ARM 1)
+- add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+ elseif(ARM64MATCH GREATER "-1")
+ #if(CROSS_COMPILE_ARM64)
+ #message(STATUS "Cross compiling for ARM64 arch")
arm.patch
Deleted
110
1
2
-Index: x265_3.4/source/CMakeLists.txt
3
-===================================================================
4
---- x265_3.4.orig/source/CMakeLists.txt
5
-+++ x265_3.4/source/CMakeLists.txt
6
-@@ -64,26 +64,26 @@ elseif(POWERMATCH GREATER "-1")
7
- add_definitions(-DPPC64=1)
8
- message(STATUS "Detected POWER PPC64 target processor")
9
- endif()
10
--elseif(ARMMATCH GREATER "-1")
11
-- if(CROSS_COMPILE_ARM)
12
-- message(STATUS "Cross compiling for ARM arch")
13
-- else()
14
-- set(CROSS_COMPILE_ARM 0)
15
-- endif()
16
-- set(ARM 1)
17
-- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
-- message(STATUS "Detected ARM64 target processor")
19
-- set(ARM64 1)
20
-- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
21
-- else()
22
-- message(STATUS "Detected ARM target processor")
23
-- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
24
-- endif()
25
-+elseif(${SYSPROC} MATCHES "armv5.*")
26
-+ message(STATUS "Detected ARMV5 system processor")
27
-+ set(ARMV5 1)
28
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
29
-+elseif(${SYSPROC} STREQUAL "armv6l")
30
-+ message(STATUS "Detected ARMV6 system processor")
31
-+ set(ARMV6 1)
32
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
33
-+elseif(${SYSPROC} STREQUAL "armv7l")
34
-+ message(STATUS "Detected ARMV7 system processor")
35
-+ set(ARMV7 1)
36
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
37
-+elseif(${SYSPROC} STREQUAL "aarch64")
38
-+ message(STATUS "Detected AArch64 system processor")
39
-+ set(ARMV7 1)
40
-+ add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
41
- else()
42
- message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
43
- message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
44
- endif()
45
--
46
- if(UNIX)
47
- list(APPEND PLATFORM_LIBS pthread)
48
- find_library(LIBRT rt)
49
-@@ -238,28 +238,9 @@ if(GCC)
50
- endif()
51
- endif()
52
- endif()
53
-- if(ARM AND CROSS_COMPILE_ARM)
54
-- if(ARM64)
55
-- set(ARM_ARGS -fPIC)
56
-- else()
57
-- set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
58
-- endif()
59
-- message(STATUS "cross compile arm")
60
-- elseif(ARM)
61
-- if(ARM64)
62
-- set(ARM_ARGS -fPIC)
63
-- add_definitions(-DHAVE_NEON)
64
-- else()
65
-- find_package(Neon)
66
-- if(CPU_HAS_NEON)
67
-- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
68
-- add_definitions(-DHAVE_NEON)
69
-- else()
70
-- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
71
-- endif()
72
-- endif()
73
-+ if(ARMV7)
74
-+ add_definitions(-fPIC)
75
- endif()
76
-- add_definitions(${ARM_ARGS})
77
- if(FPROFILE_GENERATE)
78
- if(INTEL_CXX)
79
- add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
80
-Index: x265_3.4/source/common/cpu.cpp
81
-===================================================================
82
---- x265_3.4.orig/source/common/cpu.cpp
83
-+++ x265_3.4/source/common/cpu.cpp
84
-@@ -39,7 +39,7 @@
85
- #include <machine/cpu.h>
86
- #endif
87
-
88
--#if X265_ARCH_ARM && !defined(HAVE_NEON)
89
-+#if X265_ARCH_ARM && (!defined(HAVE_NEON) || HAVE_NEON==0)
90
- #include <signal.h>
91
- #include <setjmp.h>
92
- static sigjmp_buf jmpbuf;
93
-@@ -350,7 +350,6 @@ uint32_t cpu_detect(bool benableavx512)
94
- }
95
-
96
- canjump = 1;
97
-- PFX(cpu_neon_test)();
98
- canjump = 0;
99
- signal(SIGILL, oldsig);
100
- #endif // if !HAVE_NEON
101
-@@ -366,7 +365,7 @@ uint32_t cpu_detect(bool benableavx512)
102
- // which may result in incorrect detection and the counters stuck enabled.
103
- // right now Apple does not seem to support performance counters for this test
104
- #ifndef __MACH__
105
-- flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
106
-+ //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
107
- #endif
108
- // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
109
- #elif X265_ARCH_ARM64
110
baselibs.conf
Changed
-libx265-199
+libx265-209
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.S
Deleted
201
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#include "asm.S"
26
-
27
-.section .rodata
28
-
29
-.align 4
30
-
31
-.text
32
-
33
-
34
-
35
-.macro qpel_filter_0_32b
36
- movi v24.8h, #64
37
- uxtl v19.8h, v5.8b
38
- smull v17.4s, v19.4h, v24.4h
39
- smull2 v18.4s, v19.8h, v24.8h
40
-.endm
41
-
42
-.macro qpel_filter_1_32b
43
- movi v16.8h, #58
44
- uxtl v19.8h, v5.8b
45
- smull v17.4s, v19.4h, v16.4h
46
- smull2 v18.4s, v19.8h, v16.8h
47
-
48
- movi v24.8h, #10
49
- uxtl v21.8h, v1.8b
50
- smull v19.4s, v21.4h, v24.4h
51
- smull2 v20.4s, v21.8h, v24.8h
52
-
53
- movi v16.8h, #17
54
- uxtl v23.8h, v2.8b
55
- smull v21.4s, v23.4h, v16.4h
56
- smull2 v22.4s, v23.8h, v16.8h
57
-
58
- movi v24.8h, #5
59
- uxtl v1.8h, v6.8b
60
- smull v23.4s, v1.4h, v24.4h
61
- smull2 v16.4s, v1.8h, v24.8h
62
-
63
- sub v17.4s, v17.4s, v19.4s
64
- sub v18.4s, v18.4s, v20.4s
65
-
66
- uxtl v1.8h, v4.8b
67
- sshll v19.4s, v1.4h, #2
68
- sshll2 v20.4s, v1.8h, #2
69
-
70
- add v17.4s, v17.4s, v21.4s
71
- add v18.4s, v18.4s, v22.4s
72
-
73
- uxtl v1.8h, v0.8b
74
- uxtl v2.8h, v3.8b
75
- ssubl v21.4s, v2.4h, v1.4h
76
- ssubl2 v22.4s, v2.8h, v1.8h
77
-
78
- add v17.4s, v17.4s, v19.4s
79
- add v18.4s, v18.4s, v20.4s
80
- sub v21.4s, v21.4s, v23.4s
81
- sub v22.4s, v22.4s, v16.4s
82
- add v17.4s, v17.4s, v21.4s
83
- add v18.4s, v18.4s, v22.4s
84
-.endm
85
-
86
-.macro qpel_filter_2_32b
87
- movi v16.4s, #11
88
- uxtl v19.8h, v5.8b
89
- uxtl v20.8h, v2.8b
90
- saddl v17.4s, v19.4h, v20.4h
91
- saddl2 v18.4s, v19.8h, v20.8h
92
-
93
- uxtl v21.8h, v1.8b
94
- uxtl v22.8h, v6.8b
95
- saddl v19.4s, v21.4h, v22.4h
96
- saddl2 v20.4s, v21.8h, v22.8h
97
-
98
- mul v19.4s, v19.4s, v16.4s
99
- mul v20.4s, v20.4s, v16.4s
100
-
101
- movi v16.4s, #40
102
- mul v17.4s, v17.4s, v16.4s
103
- mul v18.4s, v18.4s, v16.4s
104
-
105
- uxtl v21.8h, v4.8b
106
- uxtl v22.8h, v3.8b
107
- saddl v23.4s, v21.4h, v22.4h
108
- saddl2 v16.4s, v21.8h, v22.8h
109
-
110
- uxtl v1.8h, v0.8b
111
- uxtl v2.8h, v7.8b
112
- saddl v21.4s, v1.4h, v2.4h
113
- saddl2 v22.4s, v1.8h, v2.8h
114
-
115
- shl v23.4s, v23.4s, #2
116
- shl v16.4s, v16.4s, #2
117
-
118
- add v19.4s, v19.4s, v21.4s
119
- add v20.4s, v20.4s, v22.4s
120
- add v17.4s, v17.4s, v23.4s
121
- add v18.4s, v18.4s, v16.4s
122
- sub v17.4s, v17.4s, v19.4s
123
- sub v18.4s, v18.4s, v20.4s
124
-.endm
125
-
126
-.macro qpel_filter_3_32b
127
- movi v16.8h, #17
128
- movi v24.8h, #5
129
-
130
- uxtl v19.8h, v5.8b
131
- smull v17.4s, v19.4h, v16.4h
132
- smull2 v18.4s, v19.8h, v16.8h
133
-
134
- uxtl v21.8h, v1.8b
135
- smull v19.4s, v21.4h, v24.4h
136
- smull2 v20.4s, v21.8h, v24.8h
137
-
138
- movi v16.8h, #58
139
- uxtl v23.8h, v2.8b
140
- smull v21.4s, v23.4h, v16.4h
141
- smull2 v22.4s, v23.8h, v16.8h
142
-
143
- movi v24.8h, #10
144
- uxtl v1.8h, v6.8b
145
- smull v23.4s, v1.4h, v24.4h
146
- smull2 v16.4s, v1.8h, v24.8h
147
-
148
- sub v17.4s, v17.4s, v19.4s
149
- sub v18.4s, v18.4s, v20.4s
150
-
151
- uxtl v1.8h, v3.8b
152
- sshll v19.4s, v1.4h, #2
153
- sshll2 v20.4s, v1.8h, #2
154
-
155
- add v17.4s, v17.4s, v21.4s
156
- add v18.4s, v18.4s, v22.4s
157
-
158
- uxtl v1.8h, v4.8b
159
- uxtl v2.8h, v7.8b
160
- ssubl v21.4s, v1.4h, v2.4h
161
- ssubl2 v22.4s, v1.8h, v2.8h
162
-
163
- add v17.4s, v17.4s, v19.4s
164
- add v18.4s, v18.4s, v20.4s
165
- sub v21.4s, v21.4s, v23.4s
166
- sub v22.4s, v22.4s, v16.4s
167
- add v17.4s, v17.4s, v21.4s
168
- add v18.4s, v18.4s, v22.4s
169
-.endm
170
-
171
-
172
-
173
-
174
-.macro vextin8
175
- ld1 {v3.16b}, x11, #16
176
- mov v7.d0, v3.d1
177
- ext v0.8b, v3.8b, v7.8b, #1
178
- ext v4.8b, v3.8b, v7.8b, #2
179
- ext v1.8b, v3.8b, v7.8b, #3
180
- ext v5.8b, v3.8b, v7.8b, #4
181
- ext v2.8b, v3.8b, v7.8b, #5
182
- ext v6.8b, v3.8b, v7.8b, #6
183
- ext v3.8b, v3.8b, v7.8b, #7
184
-.endm
185
-
186
-
187
-
188
-// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
189
-.macro HPS_FILTER a b filterhps
190
- mov w12, #8192
191
- mov w6, w10
192
- sub x3, x3, #\a
193
- lsl x3, x3, #1
194
- mov w9, #\a
195
- cmp w9, #4
196
- b.eq 14f
197
- cmp w9, #12
198
- b.eq 15f
199
- b 7f
200
-14:
201
x265_3.5.tar.gz/source/common/aarch64/ipfilter8.h
Deleted
57
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_IPFILTER8_AARCH64_H
26
-#define X265_IPFILTER8_AARCH64_H
27
-
28
-
29
-void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
30
-void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
31
-void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
32
-void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
33
-void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
34
-void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
35
-void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
36
-void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
37
-void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
38
-void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
39
-void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
40
-void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
41
-void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
42
-void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
43
-void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
44
-void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
45
-void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
46
-void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
47
-void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
48
-void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
49
-void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
50
-void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
51
-void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
52
-void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
53
-void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
54
-
55
-
56
-#endif // ifndef X265_IPFILTER8_AARCH64_H
57
x265_3.5.tar.gz/source/common/aarch64/pixel-util.h
Deleted
42
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Yimeng Su <yimeng.su@huawei.com>
6
- * Hongbin Liu <liuhongbin1@huawei.com>
7
- *
8
- * This program is free software; you can redistribute it and/or modify
9
- * it under the terms of the GNU General Public License as published by
10
- * the Free Software Foundation; either version 2 of the License, or
11
- * (at your option) any later version.
12
- *
13
- * This program is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- * GNU General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU General Public License
19
- * along with this program; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
- *
22
- * This program is also available under a commercial proprietary license.
23
- * For more information, contact us at license @ x265.com.
24
- *****************************************************************************/
25
-
26
-#ifndef X265_PIXEL_UTIL_AARCH64_H
27
-#define X265_PIXEL_UTIL_AARCH64_H
28
-
29
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
30
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
31
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
32
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
33
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
34
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
35
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
36
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
37
-
38
-uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
39
-int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
40
-
41
-#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
42
x265_3.5.tar.gz/source/common/aarch64/pixel.h
Deleted
107
1
2
-/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
- *
5
- * Authors: Hongbin Liu <liuhongbin1@huawei.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * This program is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License
18
- * along with this program; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
- *
21
- * This program is also available under a commercial proprietary license.
22
- * For more information, contact us at license @ x265.com.
23
- *****************************************************************************/
24
-
25
-#ifndef X265_I386_PIXEL_AARCH64_H
26
-#define X265_I386_PIXEL_AARCH64_H
27
-
28
-void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
29
-void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
30
-void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
31
-void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
32
-void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
33
-void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
34
-void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
35
-void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
36
-void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
37
-void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
38
-void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
39
-void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
40
-void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
41
-void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
42
-void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
43
-void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
44
-void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
45
-void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
46
-void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
47
-void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
48
-void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
49
-void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
50
-void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
51
-void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
52
-void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
53
-
54
-void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
55
-void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
56
-void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
57
-void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
58
-void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
59
-void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
60
-void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
61
-void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
62
-void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
63
-void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
64
-void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
65
-void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
66
-void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
67
-void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
68
-void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
69
-void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
70
-void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
71
-void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
72
-void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
73
-void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
74
-void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
75
-void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
76
-void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
77
-void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
78
-void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
79
-
80
-void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
81
-void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
82
-void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
83
-void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
84
-void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
85
-void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
86
-void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
87
-void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
88
-void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
89
-void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
90
-void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
91
-void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
92
-void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
93
-void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
94
-void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
95
-void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
96
-void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
97
-void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
98
-void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
99
-void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
100
-void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
101
-void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
102
-void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
103
-void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
104
-void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
105
-
106
-#endif // ifndef X265_I386_PIXEL_AARCH64_H
107
x265_3.6.tar.gz/.gitignore
Added
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# Build directory
+build/
+
x265_3.5.tar.gz/build/README.txt -> x265_3.6.tar.gz/build/README.txt
Changed
Note: MSVC12 requires cmake 2.8.11 or later

+Note: When the SVE/SVE2 instruction set of Arm AArch64 architecture is to be used, the GCC10.x and onwards must
+ be installed in order to compile x265.
+

= Optional Prerequisites =

building out of a Mercurial source repository. If you are building out of
a release source package, the version will not change. If Mercurial is not
found, the version will be "unknown".
+
+= Build Instructions for cross-compilation for Arm AArch64 Targets=
+
+When the target platform is based on Arm AArch64 architecture, the x265 can be
+built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
+enviroment variables should be set to point to the cross compilers of the
+appropriate gcc. For example:
+
+1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
+2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+
+The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
+Then, the normal building process can be followed.
+
+Moreover, if the target platform supports SVE or SVE2 instruction set, the
+CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
+to true, respectively. For example:
+
+1. export CROSS_COMPILE_SVE2=true
+2. export CROSS_COMPILE_SVE=true
+
+Then, the normal building process can be followed.
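For illustration only, a cross-compile for an SVE2-capable AArch64 target following the steps described in this README might look like this (the compiler triplet is the example one from the text; adjust it to the toolchain actually installed):

  export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
  export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
  export CROSS_COMPILE_SVE2=true
  cd build/aarch64-linux
  cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source
  make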
x265_3.6.tar.gz/build/aarch64-darwin
Added
+(directory)
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake
Added
+# CMake toolchain file for cross compiling x265 for aarch64
+# This feature is only supported as experimental. Use with caution.
+# Please report bugs on bitbucket
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+
+set(CROSS_COMPILE_ARM64 1)
+set(CMAKE_SYSTEM_NAME Darwin)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+# specify the cross compiler
+set(CMAKE_C_COMPILER gcc-12)
+set(CMAKE_CXX_COMPILER g++-12)
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH /opt/homebrew/bin/)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+ set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+ set(CROSS_COMPILE_SVE 1)
+endif()
+
x265_3.6.tar.gz/build/aarch64-darwin/make-Makefiles.bash
Added
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
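A plausible macOS build using this new directory, assuming Homebrew's gcc-12/g++-12 as named in the toolchain file above, would simply be:

  cd build/aarch64-darwin
  ./make-Makefiles.bash
  make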
x265_3.5.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake
Changed
# Please report bugs on bitbucket
# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source

-set(CROSS_COMPILE_ARM 1)
+set(CROSS_COMPILE_ARM64 1)
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)

# specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+if(DEFINED ENV{CMAKE_C_COMPILER})
+ set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
+else()
+ set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+endif()
+if(DEFINED ENV{CMAKE_CXX_COMPILER})
+ set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
+else()
+ set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+endif()

# specify the target environment
SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
+
+# specify whether SVE/SVE2 is supported by the target platform
+if(DEFINED ENV{CROSS_COMPILE_SVE2})
+ set(CROSS_COMPILE_SVE2 1)
+elseif(DEFINED ENV{CROSS_COMPILE_SVE})
+ set(CROSS_COMPILE_SVE 1)
+endif()
+
x265_3.5.tar.gz/build/arm-linux/make-Makefiles.bash -> x265_3.6.tar.gz/build/arm-linux/make-Makefiles.bash
Changed
#!/bin/bash
# Run this from within a bash shell

-cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
x265_3.5.tar.gz/doc/reST/cli.rst -> x265_3.6.tar.gz/doc/reST/cli.rst
Changed
201
1
2
auto-detection by the encoder. If specified, the encoder will
3
attempt to bring the encode specifications within that specified
4
level. If the encoder is unable to reach the level it issues a
5
- warning and aborts the encode. If the requested requirement level is
6
- higher than the actual level, the actual requirement level is
7
- signaled.
8
+ warning and aborts the encode. The requested level will be signaled
9
+ in the bitstream even if it is higher than the actual level.
10
11
Beware, specifying a decoder level will force the encoder to enable
12
VBV for constant rate factor encodes, which may introduce
13
14
(main, main10, etc). Second, an encoder is created from this
15
x265_param instance and the :option:`--level-idc` and
16
:option:`--high-tier` parameters are used to reduce bitrate or other
17
- features in order to enforce the target level. Finally, the encoder
18
- re-examines the final set of parameters and detects the actual
19
- minimum decoder requirement level and this is what is signaled in
20
- the bitstream headers. The detected decoder level will only use High
21
- tier if the user specified a High tier level.
22
+ features in order to enforce the target level. The detected decoder level
23
+ will only use High tier if the user specified a High tier level.
24
25
The signaled profile will be determined by the encoder's internal
26
bitdepth and input color space. If :option:`--keyint` is 0 or 1,
27
28
Note that :option:`--analysis-save-reuse-level` and :option:`--analysis-load-reuse-level` must be paired
29
with :option:`--analysis-save` and :option:`--analysis-load` respectively.
30
31
- +--------------+------------------------------------------+
32
- | Level | Description |
33
- +==============+==========================================+
34
- | 1 | Lookahead information |
35
- +--------------+------------------------------------------+
36
- | 2 to 4 | Level 1 + intra/inter modes, ref's |
37
- +--------------+------------------------------------------+
38
- | 5 and 6 | Level 2 + rect-amp |
39
- +--------------+------------------------------------------+
40
- | 7 | Level 5 + AVC size CU refinement |
41
- +--------------+------------------------------------------+
42
- | 8 and 9 | Level 5 + AVC size Full CU analysis-info |
43
- +--------------+------------------------------------------+
44
- | 10 | Level 5 + Full CU analysis-info |
45
- +--------------+------------------------------------------+
46
+ +--------------+---------------------------------------------------+
47
+ | Level | Description |
48
+ +==============+===================================================+
49
+ | 1 | Lookahead information |
50
+ +--------------+---------------------------------------------------+
51
+ | 2 to 4 | Level 1 + intra/inter modes, depth, ref's, cutree |
52
+ +--------------+---------------------------------------------------+
53
+ | 5 and 6 | Level 2 + rect-amp |
54
+ +--------------+---------------------------------------------------+
55
+ | 7 | Level 5 + AVC size CU refinement |
56
+ +--------------+---------------------------------------------------+
57
+ | 8 and 9 | Level 5 + AVC size Full CU analysis-info |
58
+ +--------------+---------------------------------------------------+
59
+ | 10 | Level 5 + Full CU analysis-info |
60
+ +--------------+---------------------------------------------------+
61
62
.. option:: --refine-mv-type <string>
63
64
65
Search range for HME level 0, 1 and 2.
66
The Search Range for each HME level must be between 0 and 32768(excluding).
67
Default search range is 16,32,48 for level 0,1,2 respectively.
68
+
69
+.. option:: --mcstf, --no-mcstf
70
+
71
+ Enable Motion Compensated Temporal filtering.
72
+ Default: disabled
73
74
Spatial/intra options
75
=====================
76
77
78
.. option:: --hist-scenecut, --no-hist-scenecut
79
80
- Indicates that scenecuts need to be detected using luma edge and chroma histograms.
81
- :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm.
82
- :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.
83
-
84
-.. option:: --hist-threshold <0.0..1.0>
85
-
86
- This value represents the threshold for normalized SAD of edge histograms used in scenecut detection.
87
- This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value
88
- greater than 0.2 against the previous frame as scenecut.
89
- Increasing the threshold reduces the number of scenecuts detected.
90
- Default 0.03.
91
+ Scenecuts detected based on histogram, intensity and variance of the picture.
92
+ :option:`--hist-scenecut` enables or :option:`--no-hist-scenecut` disables scenecut detection based on
93
+ histogram.
94
95
.. option:: --radl <integer>
96
97
98
Default 1.0.
99
**Range of values:** 0.0 to 3.0
100
101
+.. option:: --sbrc --no-sbrc
102
+
103
+ To enable and disable segment based rate control.Segment duration depends on the
104
+ keyframe interval specified.If unspecified,default keyframe interval will be used.
105
+ Default: disabled.
106
+
107
.. option:: --hevc-aq
108
109
Enable adaptive quantization
110
111
112
**CLI ONLY**
113
114
+.. option:: --scenecut-qp-config <filename>
115
+
116
+ Specify a text file which contains the scenecut aware QP options.
117
+ The options include :option:`--scenecut-aware-qp` and :option:`--masking-strength`
118
+
119
+ **CLI ONLY**
120
+
121
.. option:: --scenecut-aware-qp <integer>
122
123
It reduces the bits spent on the inter-frames within the scenecut window
124
before and after a scenecut by increasing their QP in ratecontrol pass2 algorithm
125
- without any deterioration in visual quality. If a scenecut falls within the window,
126
- the QP of the inter-frames after this scenecut will not be modified.
127
+ without any deterioration in visual quality.
128
:option:`--scenecut-aware-qp` works only with --pass 2. Default 0.
129
130
+-------+---------------------------------------------------------------+
131
132
for the QP increment for inter-frames when :option:`--scenecut-aware-qp`
133
is enabled.
134
135
- When :option:`--scenecut-aware-qp` is::
136
+ When :option:`--scenecut-aware-qp` is:
137
+
138
* 1 (Forward masking):
139
- --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta>
140
+ --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta>
141
+ or
142
+ --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
143
+ fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
144
+ fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6>
145
* 2 (Backward masking):
146
- --masking-strength <bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
147
+ --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
148
+ or
149
+ --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
150
+ bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
151
+ bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
152
* 3 (Bi-directional masking):
153
- --masking-strength <fwdWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdWindow,bwdRefQPDelta,bwdNonRefQPDelta>
154
+ --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta,bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
155
+ or
156
+ --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,
157
+ fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,
158
+ fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6,
159
+ bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2,
160
+ bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4,
161
+ bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6>
162
163
+-----------------+---------------------------------------------------------------+
164
| Parameter | Description |
165
+=================+===============================================================+
166
- | fwdWindow | The duration(in milliseconds) for which there is a reduction |
167
- | | in the bits spent on the inter-frames after a scenecut by |
168
- | | increasing their QP. Default 500ms. |
169
- | | **Range of values:** 0 to 1000 |
170
+ | fwdMaxWindow | The maximum duration(in milliseconds) for which there is a |
171
+ | | reduction in the bits spent on the inter-frames after a |
172
+ | | scenecut by increasing their QP. Default 500ms. |
173
+ | | **Range of values:** 0 to 2000 |
174
+ +-----------------+---------------------------------------------------------------+
175
+ | fwdWindow | The duration of a sub-window(in milliseconds) for which there |
176
+ | | is a reduction in the bits spent on the inter-frames after a |
177
+ | | scenecut by increasing their QP. Default 500ms. |
178
+ | | **Range of values:** 0 to 2000 |
179
+-----------------+---------------------------------------------------------------+
180
| fwdRefQPDelta | The offset by which QP is incremented for inter-frames |
181
| | after a scenecut. Default 5. |
182
- | | **Range of values:** 0 to 10 |
183
+ | | **Range of values:** 0 to 20 |
184
+-----------------+---------------------------------------------------------------+
185
| fwdNonRefQPDelta| The offset by which QP is incremented for non-referenced |
186
| | inter-frames after a scenecut. The offset is computed from |
187
| | fwdRefQPDelta when it is not explicitly specified. |
188
- | | **Range of values:** 0 to 10 |
189
+ | | **Range of values:** 0 to 20 |
190
+ +-----------------+---------------------------------------------------------------+
191
+ | bwdMaxWindow | The maximum duration(in milliseconds) for which there is a |
192
+ | | reduction in the bits spent on the inter-frames before a |
193
+ | | scenecut by increasing their QP. Default 100ms. |
194
+ | | **Range of values:** 0 to 2000 |
195
+-----------------+---------------------------------------------------------------+
196
- | bwdWindow | The duration(in milliseconds) for which there is a reduction |
197
- | | in the bits spent on the inter-frames before a scenecut by |
198
- | | increasing their QP. Default 100ms. |
199
- | | **Range of values:** 0 to 1000 |
200
+ | bwdWindow | The duration of a sub-window(in milliseconds) for which there |
201
x265_3.5.tar.gz/doc/reST/introduction.rst -> x265_3.6.tar.gz/doc/reST/introduction.rst
Changed
to start is with the `Motion Picture Experts Group - Licensing Authority
- HEVC Licensing Program <http://www.mpegla.com/main/PID/HEVC/default.aspx>`_.

-x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is
+x265 is a registered trademark of MulticoreWare, Inc. The X265 logo is
a trademark of MulticoreWare, and may only be used with explicit written
permission. All rights reserved.
x265_3.5.tar.gz/doc/reST/releasenotes.rst -> x265_3.6.tar.gz/doc/reST/releasenotes.rst
Changed
Release Notes
*************

+Version 3.6
+===========
+
+Release date - 4th April, 2024.
+
+New feature
+-----------
+1. Segment based Ratecontrol (SBRC) feature
+2. Motion-Compensated Spatio-Temporal Filtering
+3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization)
+4. Histogram-Based Scene Change Detection
+5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS)
+6. Add temporal layer implementation(Hierarchical B-frame implementation)
+
+Enhancements to existing features
+---------------------------------
+1. Added Dolby Vision 8.4 Profile Support
+
+
+API changes
+-----------
+1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc".
+2. Add command line parameter for mcstf feature: "--no-mctf".
+3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength".
+4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut".
+5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>"
+6. cli: add new option --cra-nal (Force nal type to CRA to all frames expect for the first frame, works only with keyint 1)
+
+Optimizations
+---------------------
+ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%.
+SVE/SVE2 optimizations
+
+
+Bug fixes
+---------
+1. Linux bug to utilize all the cores
+2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize
+3. 32bit and 64bit builds generation for ARM
+4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..)
+5. Add x86 ASM implementation for subsampling luma
+6. Fix for abrladder segfault with load reuse level 1
+7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame
+8. Add MacOS aarch64 build support
+9. Fix boundary condition issue for Gaussian filter
+
+
Version 3.5
===========
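To make the new 3.6 options concrete, here is a hypothetical second-pass command line exercising several of them; file names and masking values are placeholders chosen for illustration, and a prior first pass producing x265_stats.log is assumed:

  x265 --input source.y4m --pass 2 --stats x265_stats.log \
       --hist-scenecut --scenecut-aware-qp 3 --masking-strength 500,5,6,100,5,6 \
       --film-grain grain_characteristics.bin --output out.hevc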
x265_3.5.tar.gz/readme.rst -> x265_3.6.tar.gz/readme.rst
Changed
x265 HEVC Encoder
=================

-| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/master/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265_git/wiki/>`_
| **Download:** | `releases <http://ftp.videolan.org/pub/videolan/x265/>`_
| **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
x265_3.5.tar.gz/source/CMakeLists.txt -> x265_3.6.tar.gz/source/CMakeLists.txt
Changed
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
# X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 199)
+set(X265_BUILD 209)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"

SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")

# System architecture detection
-string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+if (APPLE AND CMAKE_OSX_ARCHITECTURES)
+ string(TOLOWER "${CMAKE_OSX_ARCHITECTURES}" SYSPROC)
+else()
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+endif()
set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l aarch64)
+set(ARM_ALIASES armv6l armv7l)
+set(ARM64_ALIASES arm64 arm64e aarch64)
list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
-set(POWER_ALIASES ppc64 ppc64le)
+list(FIND ARM64_ALIASES "${SYSPROC}" ARM64MATCH)
+set(POWER_ALIASES powerpc64 powerpc64le ppc64 ppc64le)
list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
-if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
+if(X86MATCH GREATER "-1")
set(X86 1)
add_definitions(-DX265_ARCH_X86=1)
if(CMAKE_CXX_FLAGS STREQUAL "-m32")

else()
set(CROSS_COMPILE_ARM 0)
endif()
+ message(STATUS "Detected ARM target processor")
set(ARM 1)
- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
- message(STATUS "Detected ARM64 target processor")
- set(ARM64 1)
- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
- else()
- message(STATUS "Detected ARM target processor")
- add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
- endif()
+ add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+elseif(ARM64MATCH GREATER "-1")
+ #if(CROSS_COMPILE_ARM64)
+ #message(STATUS "Cross compiling for ARM64 arch")
+ #else()
+ #set(CROSS_COMPILE_ARM64 0)
+ #endif()
+ message(STATUS "Detected ARM64 target processor")
+ set(ARM64 1)
+ add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
else()
message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")

endif()
endif()
if(ARM AND CROSS_COMPILE_ARM)
- if(ARM64)
- set(ARM_ARGS -fPIC)
- else()
- set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
- endif()
message(STATUS "cross compile arm")
+ set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
elseif(ARM)
- if(ARM64)
- set(ARM_ARGS -fPIC)
+ find_package(Neon)
+ if(CPU_HAS_NEON)
+ set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
add_definitions(-DHAVE_NEON)
else()
- find_package(Neon)
- if(CPU_HAS_NEON)
- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
- add_definitions(-DHAVE_NEON)
- else()
- set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
- endif()
+ set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
endif()
endif()
+ if(ARM64 OR CROSS_COMPILE_ARM64)
+ find_package(Neon)
+ find_package(SVE)
+ find_package(SVE2)
+ if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
+ message(STATUS "Found SVE2")
+ set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions)
+ add_definitions(-DHAVE_SVE2)
+ add_definitions(-DHAVE_SVE)
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2
+ elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE)
+ message(STATUS "Found SVE")
+ set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions)
+ add_definitions(-DHAVE_SVE)
+ add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE
+ elseif(CPU_HAS_NEON)
+ message(STATUS "Found NEON")
+ set(ARM_ARGS -fPIC -flax-vector-conversions)
+ add_definitions(-DHAVE_NEON)
+ else()
+ set(ARM_ARGS -fPIC -flax-vector-conversions)
+ endif()
+ endif()
+ if(ENABLE_PIC)
+ list(APPEND ARM_ARGS -DPIC)
+ endif()
add_definitions(${ARM_ARGS})
if(FPROFILE_GENERATE)
if(INTEL_CXX)

endif(GCC)

find_package(Nasm)
-if(ARM OR CROSS_COMPILE_ARM)
+if(ARM OR CROSS_COMPILE_ARM OR ARM64 OR CROSS_COMPILE_ARM64)
option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
127
elseif(NASM_FOUND AND X86)
128
if (NASM_VERSION_STRING VERSION_LESS "2.13.0")
129
130
endif(EXTRA_LIB)
131
mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
132
133
-if(X64)
134
+if(X64 OR ARM64 OR PPC64)
135
# NOTE: We only officially support high-bit-depth compiles of x265
136
# on 64bit architectures. Main10 plus large resolution plus slow
137
# preset plus 32bit address space usually means malloc failure. You
138
139
# license" so to speak. If it breaks you get to keep both halves.
140
# You will need to disable assembly manually.
141
option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
142
-endif(X64)
143
+endif(X64 OR ARM64 OR PPC64)
144
if(HIGH_BIT_DEPTH)
145
option(MAIN12 "Support Main12 instead of Main10" OFF)
146
if(MAIN12)
147
148
endif()
149
add_definitions(-DX265_NS=${X265_NS})
150
151
+if(ARM64)
152
+ if(HIGH_BIT_DEPTH)
153
+ if(MAIN12)
154
+ list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
155
+ else()
156
+ list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
157
+ endif()
158
+ else()
159
+ list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
160
+ endif()
161
+endif(ARM64)
162
+
163
option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
164
if(WARNINGS_AS_ERRORS)
165
if(GCC)
166
167
# compile ARM arch asm files here
168
enable_language(ASM)
169
foreach(ASM ${ARM_ASMS})
170
- if(ARM64)
171
- set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
172
- else()
173
- set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
174
- endif()
175
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
176
list(APPEND ASM_SRCS ${ASM_SRC})
177
list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
178
add_custom_command(
179
180
ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
181
DEPENDS ${ASM_SRC})
182
endforeach()
183
+ elseif(ARM64 OR CROSS_COMPILE_ARM64)
184
+ # compile ARM64 arch asm files here
185
+ enable_language(ASM)
186
+ foreach(ASM ${ARM_ASMS})
187
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
188
+ list(APPEND ASM_SRCS ${ASM_SRC})
189
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
190
+ add_custom_command(
191
+ OUTPUT ${ASM}.${SUFFIX}
192
+ COMMAND ${CMAKE_CXX_COMPILER}
193
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
194
+ DEPENDS ${ASM_SRC})
195
+ endforeach()
196
+ if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2)
197
+ foreach(ASM ${ARM_ASMS_SVE})
198
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
199
+ list(APPEND ASM_SRCS ${ASM_SRC})
200
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
201
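The net effect of the ARM/ARM64 branches above is a set of preprocessor definitions (HAVE_NEON, HAVE_SVE, HAVE_SVE2) plus per-ISA assembly source lists. A rough, illustrative C++ sketch of how such definitions typically gate code at compile time; the function below is a placeholder, not an actual x265 entry point:

    #include <cstdio>

    // Placeholder illustrating compile-time dispatch on the HAVE_* macros that the
    // CMake logic above defines; x265's real primitive setup lives elsewhere.
    static const char* armKernelSet()
    {
    #if defined(HAVE_SVE2)
        return "SVE2";
    #elif defined(HAVE_SVE)
        return "SVE";
    #elif defined(HAVE_NEON)
        return "NEON";
    #else
        return "portable C";
    #endif
    }

    int main()
    {
        printf("selected kernel set: %s\n", armKernelSet());
        return 0;
    }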
x265_3.5.tar.gz/source/abrEncApp.cpp -> x265_3.6.tar.gz/source/abrEncApp.cpp
Changed
201
1
2
-/*****************************************************************************
3
-* Copyright (C) 2013-2020 MulticoreWare, Inc
4
-*
5
-* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
6
-* Aruna Matheswaran <aruna@multicorewareinc.com>
7
-*
8
-* This program is free software; you can redistribute it and/or modify
9
-* it under the terms of the GNU General Public License as published by
10
-* the Free Software Foundation; either version 2 of the License, or
11
-* (at your option) any later version.
12
-*
13
-* This program is distributed in the hope that it will be useful,
14
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
-* GNU General Public License for more details.
17
-*
18
-* You should have received a copy of the GNU General Public License
19
-* along with this program; if not, write to the Free Software
20
-* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21
-*
22
-* This program is also available under a commercial proprietary license.
23
-* For more information, contact us at license @ x265.com.
24
-*****************************************************************************/
25
-
26
-#include "abrEncApp.h"
27
-#include "mv.h"
28
-#include "slice.h"
29
-#include "param.h"
30
-
31
-#include <signal.h>
32
-#include <errno.h>
33
-
34
-#include <queue>
35
-
36
-using namespace X265_NS;
37
-
38
-/* Ctrl-C handler */
39
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
40
-static void sigint_handler(int)
41
-{
42
- b_ctrl_c = 1;
43
-}
44
-
45
-namespace X265_NS {
46
- // private namespace
47
-#define X265_INPUT_QUEUE_SIZE 250
48
-
49
- AbrEncoder::AbrEncoder(CLIOptions cliopt, uint8_t numEncodes, int &ret)
50
- {
51
- m_numEncodes = numEncodes;
52
- m_numActiveEncodes.set(numEncodes);
53
- m_queueSize = (numEncodes > 1) ? X265_INPUT_QUEUE_SIZE : 1;
54
- m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
55
-
56
- for (uint8_t i = 0; i < m_numEncodes; i++)
57
- {
58
- m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
59
- if (!m_passEnc[i])
60
- {
61
- x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
62
- ret = 4;
63
- }
64
- m_passEnc[i]->init(ret);
65
- }
66
-
67
- if (!allocBuffers())
68
- {
69
- x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
70
- ret = 4;
71
- }
72
-
73
- /* start passEncoder worker threads */
74
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
75
- m_passEnc[pass]->startThreads();
76
- }
77
-
78
- bool AbrEncoder::allocBuffers()
79
- {
80
- m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
81
- m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
82
-
83
- m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
84
- m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
85
- m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
86
- m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
87
-
88
- m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
89
- m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
90
- m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
91
- m_readFlag = X265_MALLOC(int*, m_numEncodes);
92
-
93
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
94
- {
95
- m_inputPicBuffer[pass] = X265_MALLOC(x265_picture*, m_queueSize);
96
- for (uint32_t idx = 0; idx < m_queueSize; idx++)
97
- {
98
- m_inputPicBuffer[pass][idx] = x265_picture_alloc();
99
- x265_picture_init(m_passEnc[pass]->m_param, m_inputPicBuffer[pass][idx]);
100
- }
101
-
102
- CHECKED_MALLOC_ZERO(m_analysisBuffer[pass], x265_analysis_data, m_queueSize);
103
- m_picIdxReadCnt[pass] = new ThreadSafeInteger[m_queueSize];
104
- m_analysisWrite[pass] = new ThreadSafeInteger[m_queueSize];
105
- m_analysisRead[pass] = new ThreadSafeInteger[m_queueSize];
106
- m_readFlag[pass] = X265_MALLOC(int, m_queueSize);
107
- }
108
- return true;
109
- fail:
110
- return false;
111
- }
112
-
113
- void AbrEncoder::destroy()
114
- {
115
- x265_cleanup(); /* Free library singletons */
116
- for (uint8_t pass = 0; pass < m_numEncodes; pass++)
117
- {
118
- for (uint32_t index = 0; index < m_queueSize; index++)
119
- {
120
- X265_FREE(m_inputPicBuffer[pass][index]->planes[0]);
121
- x265_picture_free(m_inputPicBuffer[pass][index]);
122
- }
123
-
124
- X265_FREE(m_inputPicBuffer[pass]);
125
- X265_FREE(m_analysisBuffer[pass]);
126
- X265_FREE(m_readFlag[pass]);
127
- delete[] m_picIdxReadCnt[pass];
128
- delete[] m_analysisWrite[pass];
129
- delete[] m_analysisRead[pass];
130
- m_passEnc[pass]->destroy();
131
- delete m_passEnc[pass];
132
- }
133
- X265_FREE(m_inputPicBuffer);
134
- X265_FREE(m_analysisBuffer);
135
- X265_FREE(m_readFlag);
136
-
137
- delete m_picWriteCnt;
138
- delete m_picReadCnt;
139
- delete m_analysisWriteCnt;
140
- delete m_analysisReadCnt;
141
-
142
- X265_FREE(m_picIdxReadCnt);
143
- X265_FREE(m_analysisWrite);
144
- X265_FREE(m_analysisRead);
145
-
146
- X265_FREE(m_passEnc);
147
- }
148
-
149
- PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent)
150
- {
151
- m_id = id;
152
- m_cliopt = cliopt;
153
- m_parent = parent;
154
- if(!(m_cliopt.enableScaler && m_id))
155
- m_input = m_cliopt.input;
156
- m_param = cliopt.param;
157
- m_inputOver = false;
158
- m_lastIdx = -1;
159
- m_encoder = NULL;
160
- m_scaler = NULL;
161
- m_reader = NULL;
162
- m_ret = 0;
163
- }
164
-
165
- int PassEncoder::init(int &result)
166
- {
167
- if (m_parent->m_numEncodes > 1)
168
- setReuseLevel();
169
-
170
- if (!(m_cliopt.enableScaler && m_id))
171
- m_reader = new Reader(m_id, this);
172
- else
173
- {
174
- VideoDesc *src = NULL, *dst = NULL;
175
- dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
176
- int dstW = m_parent->m_passEnc[m_id - 1]->m_param->sourceWidth;
177
- int dstH = m_parent->m_passEnc[m_id - 1]->m_param->sourceHeight;
178
- src = new VideoDesc(dstW, dstH, m_param->internalCsp, m_param->internalBitDepth);
179
- if (src != NULL && dst != NULL)
180
- {
181
- m_scaler = new Scaler(0, 1, m_id, src, dst, this);
182
- if (!m_scaler)
183
- {
184
- x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
185
- result = 4;
186
- }
187
- }
188
- }
189
-
190
- /* note: we could try to acquire a different libx265 API here based on
191
- * the profile found during option parsing, but it must be done before
192
- * opening an encoder */
193
-
194
- if (m_param)
195
- m_encoder = m_cliopt.api->encoder_open(m_param);
196
- if (!m_encoder)
197
- {
198
- x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
199
- m_ret = 2;
200
- return -1;
201
x265_3.5.tar.gz/source/abrEncApp.h -> x265_3.6.tar.gz/source/abrEncApp.h
Changed
9
1
2
FILE* m_qpfile;
3
FILE* m_zoneFile;
4
FILE* m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
5
+ FILE* m_scenecutAwareQpConfig;
6
7
int m_ret;
8
9
x265_3.5.tar.gz/source/cmake/FindNeon.cmake -> x265_3.6.tar.gz/source/cmake/FindNeon.cmake
Changed
27
1
2
include(FindPackageHandleStandardArgs)
3
4
# Check the version of neon supported by the ARM CPU
5
-execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
6
- OUTPUT_VARIABLE neon_version
7
- ERROR_QUIET
8
- OUTPUT_STRIP_TRAILING_WHITESPACE)
9
+if(APPLE)
10
+ execute_process(COMMAND sysctl -a
11
+ COMMAND grep "hw.optional.neon: 1"
12
+ OUTPUT_VARIABLE neon_version
13
+ ERROR_QUIET
14
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
15
+else()
16
+ execute_process(COMMAND cat /proc/cpuinfo
17
+ COMMAND grep Features
18
+ COMMAND grep neon
19
+ OUTPUT_VARIABLE neon_version
20
+ ERROR_QUIET
21
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
22
+endif()
23
+
24
if(neon_version)
25
set(CPU_HAS_NEON 1)
26
endif()
27
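The module above performs this probe at configure time with shell tools. The same check expressed in C++ (illustrative only, not part of x265) would look roughly like this:

    #include <fstream>
    #include <string>
    #ifdef __APPLE__
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #endif

    // Illustrative runtime equivalent of the configure-time NEON probe above.
    static bool cpuHasNeon()
    {
    #ifdef __APPLE__
        // Same key the CMake module greps out of `sysctl -a`.
        int value = 0;
        size_t size = sizeof(value);
        return sysctlbyname("hw.optional.neon", &value, &size, nullptr, 0) == 0 && value == 1;
    #else
        // Same check as piping /proc/cpuinfo through grep Features | grep neon.
        std::ifstream cpuinfo("/proc/cpuinfo");
        std::string line;
        while (std::getline(cpuinfo, line))
            if (line.rfind("Features", 0) == 0 && line.find("neon") != std::string::npos)
                return true;
        return false;
    #endif
    }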
x265_3.6.tar.gz/source/cmake/FindSVE.cmake
Added
23
1
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE supported by the ARM CPU
5
+if(APPLE)
6
+ execute_process(COMMAND sysctl -a
7
+ COMMAND grep "hw.optional.sve: 1"
8
+ OUTPUT_VARIABLE sve_version
9
+ ERROR_QUIET
10
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+ execute_process(COMMAND cat /proc/cpuinfo
13
+ COMMAND grep Features
14
+ COMMAND grep -e "sve$" -e "sve[[:space:]]"
15
+ OUTPUT_VARIABLE sve_version
16
+ ERROR_QUIET
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve_version)
21
+ set(CPU_HAS_SVE 1)
22
+endif()
23
x265_3.6.tar.gz/source/cmake/FindSVE2.cmake
Added
24
1
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Check the version of SVE2 supported by the ARM CPU
5
+if(APPLE)
6
+ execute_process(COMMAND sysctl -a
7
+ COMMAND grep "hw.optional.sve2: 1"
8
+ OUTPUT_VARIABLE sve2_version
9
+ ERROR_QUIET
10
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
11
+else()
12
+ execute_process(COMMAND cat /proc/cpuinfo
13
+ COMMAND grep Features
14
+ COMMAND grep sve2
15
+ OUTPUT_VARIABLE sve2_version
16
+ ERROR_QUIET
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
18
+endif()
19
+
20
+if(sve2_version)
21
+ set(CPU_HAS_SVE 1)
22
+ set(CPU_HAS_SVE2 1)
23
+endif()
24
x265_3.5.tar.gz/source/common/CMakeLists.txt -> x265_3.6.tar.gz/source/common/CMakeLists.txt
Changed
76
1
2
endif(ENABLE_ASSEMBLY AND X86)
3
4
if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
5
- if(ARM64)
6
- if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
7
- message(STATUS "Detected CXX compiler using -O3 optimization level")
8
- add_definitions(-DAUTO_VECTORIZE=1)
9
- endif()
10
- set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
11
-
12
- # add ARM assembly/intrinsic files here
13
- set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
14
- set(VEC_PRIMITIVES)
15
+ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
16
17
- set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
18
- foreach(SRC ${C_SRCS})
19
- set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
20
- endforeach()
21
- else()
22
- set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
23
+ # add ARM assembly/intrinsic files here
24
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
25
+ set(VEC_PRIMITIVES)
26
27
- # add ARM assembly/intrinsic files here
28
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
29
- set(VEC_PRIMITIVES)
30
+ set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
31
+ foreach(SRC ${C_SRCS})
32
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
33
+ endforeach()
34
+ source_group(Assembly FILES ${ASM_PRIMITIVES})
35
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
36
37
- set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
38
- foreach(SRC ${C_SRCS})
39
- set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
40
- endforeach()
41
+if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
42
+ if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
43
+ message(STATUS "Detected CXX compiler using -O3 optimization level")
44
+ add_definitions(-DAUTO_VECTORIZE=1)
45
endif()
46
+
47
+ set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
48
+ enable_language(ASM)
49
+
50
+ # add ARM assembly/intrinsic files here
51
+ set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
52
+ set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
53
+ set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
54
+ set(VEC_PRIMITIVES)
55
+
56
+ set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
57
+ set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
58
+ set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
59
+ foreach(SRC ${C_SRCS})
60
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
61
+ endforeach()
62
source_group(Assembly FILES ${ASM_PRIMITIVES})
63
-endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
64
+endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
65
66
if(POWER)
67
set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
68
69
scalinglist.cpp scalinglist.h
70
quant.cpp quant.h contexts.h
71
deblock.cpp deblock.h
72
- scaler.cpp scaler.h)
73
+ scaler.cpp scaler.h
74
+ ringmem.cpp ringmem.h
75
+ temporalfilter.cpp temporalfilter.h)
76
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp
Added
201
1
2
+#include "common.h"
3
+#include "x265.h"
4
+#include "arm64-utils.h"
5
+#include <arm_neon.h>
6
+
7
+#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s)
8
+namespace X265_NS
9
+{
10
+
11
+
12
+
13
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
14
+{
15
+ uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
16
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
17
+
18
+ a0 = *(uint8x8_t *)(src + 0 * sstride);
19
+ a1 = *(uint8x8_t *)(src + 1 * sstride);
20
+ a2 = *(uint8x8_t *)(src + 2 * sstride);
21
+ a3 = *(uint8x8_t *)(src + 3 * sstride);
22
+ a4 = *(uint8x8_t *)(src + 4 * sstride);
23
+ a5 = *(uint8x8_t *)(src + 5 * sstride);
24
+ a6 = *(uint8x8_t *)(src + 6 * sstride);
25
+ a7 = *(uint8x8_t *)(src + 7 * sstride);
26
+
27
+ b0 = vtrn1_u32(a0, a4);
28
+ b1 = vtrn1_u32(a1, a5);
29
+ b2 = vtrn1_u32(a2, a6);
30
+ b3 = vtrn1_u32(a3, a7);
31
+ b4 = vtrn2_u32(a0, a4);
32
+ b5 = vtrn2_u32(a1, a5);
33
+ b6 = vtrn2_u32(a2, a6);
34
+ b7 = vtrn2_u32(a3, a7);
35
+
36
+ a0 = vtrn1_u16(b0, b2);
37
+ a1 = vtrn1_u16(b1, b3);
38
+ a2 = vtrn2_u16(b0, b2);
39
+ a3 = vtrn2_u16(b1, b3);
40
+ a4 = vtrn1_u16(b4, b6);
41
+ a5 = vtrn1_u16(b5, b7);
42
+ a6 = vtrn2_u16(b4, b6);
43
+ a7 = vtrn2_u16(b5, b7);
44
+
45
+ b0 = vtrn1_u8(a0, a1);
46
+ b1 = vtrn2_u8(a0, a1);
47
+ b2 = vtrn1_u8(a2, a3);
48
+ b3 = vtrn2_u8(a2, a3);
49
+ b4 = vtrn1_u8(a4, a5);
50
+ b5 = vtrn2_u8(a4, a5);
51
+ b6 = vtrn1_u8(a6, a7);
52
+ b7 = vtrn2_u8(a6, a7);
53
+
54
+ *(uint8x8_t *)(dst + 0 * dstride) = b0;
55
+ *(uint8x8_t *)(dst + 1 * dstride) = b1;
56
+ *(uint8x8_t *)(dst + 2 * dstride) = b2;
57
+ *(uint8x8_t *)(dst + 3 * dstride) = b3;
58
+ *(uint8x8_t *)(dst + 4 * dstride) = b4;
59
+ *(uint8x8_t *)(dst + 5 * dstride) = b5;
60
+ *(uint8x8_t *)(dst + 6 * dstride) = b6;
61
+ *(uint8x8_t *)(dst + 7 * dstride) = b7;
62
+}
63
+
64
+
65
+
66
+
67
+
68
+
69
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
70
+{
71
+ uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF;
72
+ uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF;
73
+ uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF;
74
+ uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF;
75
+
76
+ a0 = *(uint16x8_t *)(src + 0 * sstride);
77
+ a1 = *(uint16x8_t *)(src + 1 * sstride);
78
+ a2 = *(uint16x8_t *)(src + 2 * sstride);
79
+ a3 = *(uint16x8_t *)(src + 3 * sstride);
80
+ a4 = *(uint16x8_t *)(src + 4 * sstride);
81
+ a5 = *(uint16x8_t *)(src + 5 * sstride);
82
+ a6 = *(uint16x8_t *)(src + 6 * sstride);
83
+ a7 = *(uint16x8_t *)(src + 7 * sstride);
84
+ a8 = *(uint16x8_t *)(src + 8 * sstride);
85
+ a9 = *(uint16x8_t *)(src + 9 * sstride);
86
+ aA = *(uint16x8_t *)(src + 10 * sstride);
87
+ aB = *(uint16x8_t *)(src + 11 * sstride);
88
+ aC = *(uint16x8_t *)(src + 12 * sstride);
89
+ aD = *(uint16x8_t *)(src + 13 * sstride);
90
+ aE = *(uint16x8_t *)(src + 14 * sstride);
91
+ aF = *(uint16x8_t *)(src + 15 * sstride);
92
+
93
+ b0 = vtrn1q_u64(a0, a8);
94
+ b1 = vtrn1q_u64(a1, a9);
95
+ b2 = vtrn1q_u64(a2, aA);
96
+ b3 = vtrn1q_u64(a3, aB);
97
+ b4 = vtrn1q_u64(a4, aC);
98
+ b5 = vtrn1q_u64(a5, aD);
99
+ b6 = vtrn1q_u64(a6, aE);
100
+ b7 = vtrn1q_u64(a7, aF);
101
+ b8 = vtrn2q_u64(a0, a8);
102
+ b9 = vtrn2q_u64(a1, a9);
103
+ bA = vtrn2q_u64(a2, aA);
104
+ bB = vtrn2q_u64(a3, aB);
105
+ bC = vtrn2q_u64(a4, aC);
106
+ bD = vtrn2q_u64(a5, aD);
107
+ bE = vtrn2q_u64(a6, aE);
108
+ bF = vtrn2q_u64(a7, aF);
109
+
110
+ c0 = vtrn1q_u32(b0, b4);
111
+ c1 = vtrn1q_u32(b1, b5);
112
+ c2 = vtrn1q_u32(b2, b6);
113
+ c3 = vtrn1q_u32(b3, b7);
114
+ c4 = vtrn2q_u32(b0, b4);
115
+ c5 = vtrn2q_u32(b1, b5);
116
+ c6 = vtrn2q_u32(b2, b6);
117
+ c7 = vtrn2q_u32(b3, b7);
118
+ c8 = vtrn1q_u32(b8, bC);
119
+ c9 = vtrn1q_u32(b9, bD);
120
+ cA = vtrn1q_u32(bA, bE);
121
+ cB = vtrn1q_u32(bB, bF);
122
+ cC = vtrn2q_u32(b8, bC);
123
+ cD = vtrn2q_u32(b9, bD);
124
+ cE = vtrn2q_u32(bA, bE);
125
+ cF = vtrn2q_u32(bB, bF);
126
+
127
+ d0 = vtrn1q_u16(c0, c2);
128
+ d1 = vtrn1q_u16(c1, c3);
129
+ d2 = vtrn2q_u16(c0, c2);
130
+ d3 = vtrn2q_u16(c1, c3);
131
+ d4 = vtrn1q_u16(c4, c6);
132
+ d5 = vtrn1q_u16(c5, c7);
133
+ d6 = vtrn2q_u16(c4, c6);
134
+ d7 = vtrn2q_u16(c5, c7);
135
+ d8 = vtrn1q_u16(c8, cA);
136
+ d9 = vtrn1q_u16(c9, cB);
137
+ dA = vtrn2q_u16(c8, cA);
138
+ dB = vtrn2q_u16(c9, cB);
139
+ dC = vtrn1q_u16(cC, cE);
140
+ dD = vtrn1q_u16(cD, cF);
141
+ dE = vtrn2q_u16(cC, cE);
142
+ dF = vtrn2q_u16(cD, cF);
143
+
144
+ *(uint16x8_t *)(dst + 0 * dstride) = vtrn1q_u8(d0, d1);
145
+ *(uint16x8_t *)(dst + 1 * dstride) = vtrn2q_u8(d0, d1);
146
+ *(uint16x8_t *)(dst + 2 * dstride) = vtrn1q_u8(d2, d3);
147
+ *(uint16x8_t *)(dst + 3 * dstride) = vtrn2q_u8(d2, d3);
148
+ *(uint16x8_t *)(dst + 4 * dstride) = vtrn1q_u8(d4, d5);
149
+ *(uint16x8_t *)(dst + 5 * dstride) = vtrn2q_u8(d4, d5);
150
+ *(uint16x8_t *)(dst + 6 * dstride) = vtrn1q_u8(d6, d7);
151
+ *(uint16x8_t *)(dst + 7 * dstride) = vtrn2q_u8(d6, d7);
152
+ *(uint16x8_t *)(dst + 8 * dstride) = vtrn1q_u8(d8, d9);
153
+ *(uint16x8_t *)(dst + 9 * dstride) = vtrn2q_u8(d8, d9);
154
+ *(uint16x8_t *)(dst + 10 * dstride) = vtrn1q_u8(dA, dB);
155
+ *(uint16x8_t *)(dst + 11 * dstride) = vtrn2q_u8(dA, dB);
156
+ *(uint16x8_t *)(dst + 12 * dstride) = vtrn1q_u8(dC, dD);
157
+ *(uint16x8_t *)(dst + 13 * dstride) = vtrn2q_u8(dC, dD);
158
+ *(uint16x8_t *)(dst + 14 * dstride) = vtrn1q_u8(dE, dF);
159
+ *(uint16x8_t *)(dst + 15 * dstride) = vtrn2q_u8(dE, dF);
160
+
161
+
162
+}
163
+
164
+
165
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
166
+{
167
+ //assumption: there is no partial overlap
168
+ transpose16x16(dst, src, dstride, sstride);
169
+ transpose16x16(dst + 16 * dstride + 16, src + 16 * sstride + 16, dstride, sstride);
170
+ if (dst == src)
171
+ {
172
+ uint8_t tmp[16 * 16] __attribute__((aligned(64)));
173
+ transpose16x16(tmp, src + 16, 16, sstride);
174
+ transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
175
+ for (int i = 0; i < 16; i++)
176
+ {
177
+ COPY_16(dst + (16 + i)*dstride, tmp + 16 * i);
178
+ }
179
+ }
180
+ else
181
+ {
182
+ transpose16x16(dst + 16 * dstride, src + 16, dstride, sstride);
183
+ transpose16x16(dst + 16, src + 16 * sstride, dstride, sstride);
184
+ }
185
+
186
+}
187
+
188
+
189
+
190
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride)
191
+{
192
+ uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
193
+ uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
194
+
195
+ a0 = *(uint16x8_t *)(src + 0 * sstride);
196
+ a1 = *(uint16x8_t *)(src + 1 * sstride);
197
+ a2 = *(uint16x8_t *)(src + 2 * sstride);
198
+ a3 = *(uint16x8_t *)(src + 3 * sstride);
199
+ a4 = *(uint16x8_t *)(src + 4 * sstride);
200
+ a5 = *(uint16x8_t *)(src + 5 * sstride);
201
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h
Added
17
1
2
+#ifndef __ARM64_UTILS_H__
3
+#define __ARM64_UTILS_H__
4
+
5
+
6
+namespace X265_NS
7
+{
8
+void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
9
+void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
10
+void transpose32x32(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride);
11
+void transpose8x8(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
12
+void transpose16x16(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
13
+void transpose32x32(uint16_t *dst, const uint16_t *src, intptr_t dstride, intptr_t sstride);
14
+}
15
+
16
+#endif
17
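A small usage sketch for the transpose helpers declared above. Assumptions: it is compiled inside the x265 build (so common.h defines X265_NS and the basic integer types), an 8-bit build where the buffers are uint8_t, and an arbitrary stride of 16 bytes:

    #include "common.h"
    #include "arm64-utils.h"
    #include <cstdio>

    int main()
    {
        const intptr_t stride = 16;              // bytes between rows; may exceed the 8-pixel width
        uint8_t src[8 * 16], dst[8 * 16] = {0};

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                src[y * stride + x] = (uint8_t)(8 * y + x);

        X265_NS::transpose8x8(dst, src, stride, stride);

        // For the 8x8 block, dst[x * stride + y] == src[y * stride + x].
        printf("%d %d\n", dst[1], src[stride]);  // both print 8
        return 0;
    }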
x265_3.5.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp
Changed
201
1
2
*
3
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
4
* Yimeng Su <yimeng.su@huawei.com>
5
+ * Sebastian Pop <spop@amazon.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
10
* For more information, contact us at license @ x265.com.
11
*****************************************************************************/
12
13
+
14
#include "common.h"
15
#include "primitives.h"
16
#include "x265.h"
17
#include "cpu.h"
18
19
+extern "C" {
20
+#include "fun-decls.h"
21
+}
22
+
23
+#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
24
+ p.cuBLOCK_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
25
+ p.cuBLOCK_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
26
+ p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
27
+ p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
28
+ p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu)
29
+#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
30
+ p.cuBLOCK_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
31
+ p.cuBLOCK_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
32
+ p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
33
+ p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon)
34
+#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
35
+ p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve)
36
+#define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu)
37
+#define LUMA_TU_NEON(prim, fname) LUMA_TU_TYPED_NEON(prim, , fname)
38
+#define LUMA_TU_CAN_USE_SVE(prim, fname) LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
39
+
40
+#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
41
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
42
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
43
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
44
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
45
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
46
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
47
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
48
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
49
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
50
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
51
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
52
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
53
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
54
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
55
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
56
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
57
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu); \
58
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
59
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
60
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
61
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
62
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
63
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
64
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
65
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
66
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \
67
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \
68
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \
69
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu)
70
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \
71
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \
72
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \
73
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
74
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
75
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \
76
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \
77
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \
78
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \
79
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
80
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
81
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
82
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \
83
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
84
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \
85
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
86
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
87
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
88
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \
89
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
90
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
91
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \
92
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu)
93
+#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
94
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
95
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
96
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
97
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \
98
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
99
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
100
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
101
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
102
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
103
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
104
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
105
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \
106
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \
107
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
108
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \
109
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
110
+#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
111
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \
112
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \
113
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve); \
114
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve); \
115
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve); \
116
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve); \
117
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## sve); \
118
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \
119
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve)
120
+#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
121
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
122
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \
123
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
124
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \
125
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \
126
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \
127
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \
128
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \
129
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \
130
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \
131
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \
132
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \
133
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon)
134
+#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
135
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \
136
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \
137
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \
138
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \
139
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \
140
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \
141
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \
142
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \
143
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \
144
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \
145
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \
146
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu)
147
+#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
148
+ p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \
149
+ p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \
150
+ p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon)
151
+#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
152
+ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## sve2); \
153
+ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \
154
+ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve2); \
155
+ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve2); \
156
+ p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## sve2); \
157
+ p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## sve2); \
158
+ p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## sve2); \
159
+ p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## sve2); \
160
+ p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve2); \
161
+ p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## sve2); \
162
+ p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## sve2); \
163
+ p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## sve2); \
164
+ p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## sve2); \
165
+ p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## sve2); \
166
+ p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## sve2); \
167
+ p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## sve2); \
168
+ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## sve2); \
169
+ p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## sve2); \
170
+ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve2); \
171
+ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \
172
+ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \
173
+ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2)
174
+#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
175
+ p.puLUMA_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
176
+ p.puLUMA_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
177
+ p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
178
+ p.puLUMA_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
179
+ p.puLUMA_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
180
+ p.puLUMA_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
181
+ p.puLUMA_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
182
+ p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
183
+ p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
184
+ p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
185
+ p.puLUMA_16x4.prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
186
+ p.puLUMA_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
187
+ p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
188
+ p.puLUMA_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
189
+ p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
190
+#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
191
+ p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
192
+ p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
193
+ p.puLUMA_32x64.prim = fncdef PFX(filterPixelToShort ## _32x64_ ## sve); \
194
+ p.puLUMA_32x24.prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
195
+ p.puLUMA_32x8.prim = fncdef PFX(filterPixelToShort ## _32x8_ ## sve); \
196
+ p.puLUMA_64x64.prim = fncdef PFX(filterPixelToShort ## _64x64_ ## sve); \
197
+ p.puLUMA_64x32.prim = fncdef PFX(filterPixelToShort ## _64x32_ ## sve); \
198
+ p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
199
+ p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
200
+ p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
201
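To make the table-filling macros above concrete, here is roughly what a single invocation expands to. The argument names are examples only, and the p.cu[...] / p.pu[...] indexing follows x265's primitives.h conventions, with the [] brackets restored that the diff rendering drops:

    // Illustrative expansion only; not a claim about which invocations appear
    // later in this file.
    //
    //   ALL_LUMA_TU(sse_pp, pixel_sse_pp, neon);
    //
    // expands (via ALL_LUMA_TU_TYPED) to assignments of the form:
    //
    //   p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
    //   p.cu[BLOCK_8x8].sse_pp   = PFX(pixel_sse_pp_8x8_neon);
    //   p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
    //   p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
    //   p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
    //
    // where PFX() prepends the X265_NS namespace prefix to the assembly symbol.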
x265_3.6.tar.gz/source/common/aarch64/asm-sve.S
Added
41
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+.macro ABS2_SVE a b c
30
+ abs \a, \c\()/m, \a
31
+ abs \b, \c\()/m, \b
32
+.endm
33
+
34
+.macro ABS8_SVE z0, z1, z2, z3, z4, z5, z6, z7, p0
35
+ ABS2_SVE \z0, \z1, p0
36
+ ABS2_SVE \z2, \z3, p0
37
+ ABS2_SVE \z4, \z5, p0
38
+ ABS2_SVE \z6, \z7, p0
39
+.endm
40
+
41
x265_3.5.tar.gz/source/common/aarch64/asm.S -> x265_3.6.tar.gz/source/common/aarch64/asm.S
Changed
173
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ * Sebastian Pop <spop@amazon.com>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
12
* For more information, contact us at license @ x265.com.
13
*****************************************************************************/
14
15
+#ifndef ASM_S_ // #include guards
16
+#define ASM_S_
17
+
18
.arch armv8-a
19
20
+#define PFX3(prefix, name) prefix ## _ ## name
21
+#define PFX2(prefix, name) PFX3(prefix, name)
22
+#define PFX(name) PFX2(X265_NS, name)
23
+
24
+#ifdef __APPLE__
25
+#define PREFIX 1
26
+#endif
27
+
28
#ifdef PREFIX
29
#define EXTERN_ASM _
30
+#define HAVE_AS_FUNC 0
31
+#elif defined __clang__
32
+#define EXTERN_ASM
33
+#define HAVE_AS_FUNC 0
34
+#define PREFIX 1
35
#else
36
#define EXTERN_ASM
37
+#define HAVE_AS_FUNC 1
38
#endif
39
40
#ifdef __ELF__
41
#define ELF
42
#else
43
+#ifdef PREFIX
44
+#define ELF #
45
+#else
46
#define ELF @
47
#endif
48
-
49
-#define HAVE_AS_FUNC 1
50
+#endif
51
52
#if HAVE_AS_FUNC
53
#define FUNC
54
#else
55
+#ifdef PREFIX
56
+#define FUNC #
57
+#else
58
#define FUNC @
59
#endif
60
+#endif
61
+
62
+#define GLUE(a, b) a ## b
63
+#define JOIN(a, b) GLUE(a, b)
64
+
65
+#define PFX_C(name) JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
66
+
67
+#ifdef __APPLE__
68
+.macro endfunc
69
+ELF .size \name, . - \name
70
+FUNC .endfunc
71
+.endm
72
+#endif
73
74
.macro function name, export=1
75
+#ifdef __APPLE__
76
+ .global \name
77
+ endfunc
78
+#else
79
.macro endfunc
80
ELF .size \name, . - \name
81
FUNC .endfunc
82
.purgem endfunc
83
.endm
84
+#endif
85
.align 2
86
.if \export == 1
87
.global EXTERN_ASM\name
88
89
.endif
90
.endm
91
92
+.macro const name, align=2
93
+ .macro endconst
94
+ELF .size \name, . - \name
95
+ .purgem endconst
96
+ .endm
97
+#ifdef __MACH__
98
+ .const_data
99
+#else
100
+ .section .rodata
101
+#endif
102
+ .align \align
103
+\name:
104
+.endm
105
+
106
+.macro movrel rd, val, offset=0
107
+#if defined(__APPLE__)
108
+ .if \offset < 0
109
+ adrp \rd, \val@PAGE
110
+ add \rd, \rd, \val@PAGEOFF
111
+ sub \rd, \rd, -(\offset)
112
+ .else
113
+ adrp \rd, \val+(\offset)@PAGE
114
+ add \rd, \rd, \val+(\offset)@PAGEOFF
115
+ .endif
116
+#elif defined(PIC) && defined(_WIN32)
117
+ .if \offset < 0
118
+ adrp \rd, \val
119
+ add \rd, \rd, :lo12:\val
120
+ sub \rd, \rd, -(\offset)
121
+ .else
122
+ adrp \rd, \val+(\offset)
123
+ add \rd, \rd, :lo12:\val+(\offset)
124
+ .endif
125
+#else
126
+ adrp \rd, \val+(\offset)
127
+ add \rd, \rd, :lo12:\val+(\offset)
128
+#endif
129
+.endm
130
131
#define FENC_STRIDE 64
132
#define FDEC_STRIDE 32
133
+
134
+.macro SUMSUB_AB sum, diff, a, b
135
+ add \sum, \a, \b
136
+ sub \diff, \a, \b
137
+.endm
138
+
139
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
140
+ SUMSUB_AB \s1, \d1, \a, \b
141
+ SUMSUB_AB \s2, \d2, \c, \d
142
+.endm
143
+
144
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
145
+ SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
146
+ SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
147
+.endm
148
+
149
+.macro ABS2 a b
150
+ abs \a, \a
151
+ abs \b, \b
152
+.endm
153
+
154
+.macro ABS8 v0, v1, v2, v3, v4, v5, v6, v7
155
+ ABS2 \v0, \v1
156
+ ABS2 \v2, \v3
157
+ ABS2 \v4, \v5
158
+ ABS2 \v6, \v7
159
+.endm
160
+
161
+.macro vtrn t1, t2, s1, s2
162
+ trn1 \t1, \s1, \s2
163
+ trn2 \t2, \s1, \s2
164
+.endm
165
+
166
+.macro trn4 t1, t2, t3, t4, s1, s2, s3, s4
167
+ vtrn \t1, \t2, \s1, \s2
168
+ vtrn \t3, \t4, \s3, \s4
169
+.endm
170
+
171
+#endif
172
\ No newline at end of file
173
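The SUMSUB_AB and HADAMARD4_V macros added above are the usual butterfly building blocks for SATD-style transforms. A scalar C++ sketch of the same arithmetic, written on plain ints purely for clarity:

    #include <cstdio>

    static void sumsub(int& sum, int& diff, int a, int b)
    {
        sum  = a + b;   // SUMSUB_AB: sum  = a + b
        diff = a - b;   // SUMSUB_AB: diff = a - b
    }

    int main()
    {
        int r[4] = {1, 2, 3, 4};
        int t[4];

        // HADAMARD4_V: two rounds of paired sum/difference over four inputs
        sumsub(t[0], t[1], r[0], r[1]);
        sumsub(t[2], t[3], r[2], r[3]);
        sumsub(r[0], r[2], t[0], t[2]);
        sumsub(r[1], r[3], t[1], t[3]);

        printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);   // prints: 10 -2 -4 0
        return 0;
    }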
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-common.S
Added
56
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch armv8-a
31
+
32
+// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
33
+.macro cpy1Dto2D_shr_start
34
+ add x2, x2, x2
35
+ dup v0.8h, w3
36
+ cmeq v1.8h, v1.8h, v1.8h
37
+ sshl v1.8h, v1.8h, v0.8h
38
+ sri v1.8h, v1.8h, #1
39
+ neg v0.8h, v0.8h
40
+.endm
41
+
42
+.macro cpy2Dto1D_shr_start
43
+ add x2, x2, x2
44
+ dup v0.8h, w3
45
+ cmeq v1.8h, v1.8h, v1.8h
46
+ sshl v1.8h, v1.8h, v0.8h
47
+ sri v1.8h, v1.8h, #1
48
+ neg v0.8h, v0.8h
49
+.endm
50
+
51
+const xtn_xtn2_table, align=4
52
+.byte 0, 2, 4, 6, 8, 10, 12, 14
53
+.byte 16, 18, 20, 22, 24, 26, 28, 30
54
+endconst
55
+
56
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "blockcopy8-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
41
+ *
42
+ * r0 - a
43
+ * r1 - stridea
44
+ * r2 - b
45
+ * r3 - strideb */
46
+
47
+function PFX(blockcopy_sp_4x4_sve)
48
+ ptrue p0.h, vl4
49
+.rept 2
50
+ ld1h {z0.h}, p0/z, x2
51
+ add x2, x2, x3, lsl #1
52
+ st1b {z0.h}, p0, x0
53
+ add x0, x0, x1
54
+ ld1h {z1.h}, p0/z, x2
55
+ add x2, x2, x3, lsl #1
56
+ st1b {z1.h}, p0, x0
57
+ add x0, x0, x1
58
+.endr
59
+ ret
60
+endfunc
61
+
62
+function PFX(blockcopy_sp_8x8_sve)
63
+ ptrue p0.h, vl8
64
+.rept 4
65
+ ld1h {z0.h}, p0/z, x2
66
+ add x2, x2, x3, lsl #1
67
+ st1b {z0.h}, p0, x0
68
+ add x0, x0, x1
69
+ ld1h {z1.h}, p0/z, x2
70
+ add x2, x2, x3, lsl #1
71
+ st1b {z1.h}, p0, x0
72
+ add x0, x0, x1
73
+.endr
74
+ ret
75
+endfunc
76
+
77
+function PFX(blockcopy_sp_16x16_sve)
78
+ rdvl x9, #1
79
+ cmp x9, #16
80
+ bgt .vl_gt_16_blockcopy_sp_16_16
81
+ lsl x3, x3, #1
82
+ movrel x11, xtn_xtn2_table
83
+ ld1 {v31.16b}, x11
84
+.rept 8
85
+ ld1 {v0.8h-v1.8h}, x2, x3
86
+ ld1 {v2.8h-v3.8h}, x2, x3
87
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
88
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
89
+ st1 {v0.16b}, x0, x1
90
+ st1 {v1.16b}, x0, x1
91
+.endr
92
+ ret
93
+.vl_gt_16_blockcopy_sp_16_16:
94
+ ptrue p0.h, vl16
95
+.rept 8
96
+ ld1h {z0.h}, p0/z, x2
97
+ st1b {z0.h}, p0, x0
98
+ add x2, x2, x3, lsl #1
99
+ add x0, x0, x1
100
+ ld1h {z1.h}, p0/z, x2
101
+ st1b {z1.h}, p0, x0
102
+ add x2, x2, x3, lsl #1
103
+ add x0, x0, x1
104
+.endr
105
+ ret
106
+endfunc
107
+
108
+function PFX(blockcopy_sp_32x32_sve)
109
+ mov w12, #4
110
+ rdvl x9, #1
111
+ cmp x9, #16
112
+ bgt .vl_gt_16_blockcopy_sp_32_32
113
+ lsl x3, x3, #1
114
+ movrel x11, xtn_xtn2_table
115
+ ld1 {v31.16b}, x11
116
+.loop_csp32_sve:
117
+ sub w12, w12, #1
118
+.rept 4
119
+ ld1 {v0.8h-v3.8h}, x2, x3
120
+ ld1 {v4.8h-v7.8h}, x2, x3
121
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
122
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
123
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
124
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
125
+ st1 {v0.16b-v1.16b}, x0, x1
126
+ st1 {v2.16b-v3.16b}, x0, x1
127
+.endr
128
+ cbnz w12, .loop_csp32_sve
129
+ ret
130
+.vl_gt_16_blockcopy_sp_32_32:
131
+ cmp x9, #48
132
+ bgt .vl_gt_48_blockcopy_sp_32_32
133
+ ptrue p0.h, vl16
134
+.vl_gt_16_loop_csp32_sve:
135
+ sub w12, w12, #1
136
+.rept 4
137
+ ld1h {z0.h}, p0/z, x2
138
+ ld1h {z1.h}, p0/z, x2, #1, mul vl
139
+ st1b {z0.h}, p0, x0
140
+ st1b {z1.h}, p0, x0, #1, mul vl
141
+ add x2, x2, x3, lsl #1
142
+ add x0, x0, x1
143
+ ld1h {z2.h}, p0/z, x2
144
+ ld1h {z3.h}, p0/z, x2, #1, mul vl
145
+ st1b {z2.h}, p0, x0
146
+ st1b {z3.h}, p0, x0, #1, mul vl
147
+ add x2, x2, x3, lsl #1
148
+ add x0, x0, x1
149
+.endr
150
+ cbnz w12, .vl_gt_16_loop_csp32_sve
151
+ ret
152
+.vl_gt_48_blockcopy_sp_32_32:
153
+ ptrue p0.h, vl32
154
+.vl_gt_48_loop_csp32_sve:
155
+ sub w12, w12, #1
156
+.rept 4
157
+ ld1h {z0.h}, p0/z, x2
158
+ st1b {z0.h}, p0, x0
159
+ add x2, x2, x3, lsl #1
160
+ add x0, x0, x1
161
+ ld1h {z1.h}, p0/z, x2
162
+ st1b {z1.h}, p0, x0
163
+ add x2, x2, x3, lsl #1
164
+ add x0, x0, x1
165
+.endr
166
+ cbnz w12, .vl_gt_48_loop_csp32_sve
167
+ ret
168
+endfunc
169
+
170
+function PFX(blockcopy_ps_16x16_sve)
171
+ rdvl x9, #1
172
+ cmp x9, #16
173
+ bgt .vl_gt_16_blockcopy_ps_16_16
174
+ lsl x1, x1, #1
175
+.rept 8
176
+ ld1 {v4.16b}, x2, x3
177
+ ld1 {v5.16b}, x2, x3
178
+ uxtl v0.8h, v4.8b
179
+ uxtl2 v1.8h, v4.16b
180
+ uxtl v2.8h, v5.8b
181
+ uxtl2 v3.8h, v5.16b
182
+ st1 {v0.8h-v1.8h}, x0, x1
183
+ st1 {v2.8h-v3.8h}, x0, x1
184
+.endr
185
+ ret
186
+.vl_gt_16_blockcopy_ps_16_16:
187
+ ptrue p0.b, vl32
188
+.rept 16
189
+ ld1b {z1.h}, p0/z, x2
190
+ st1h {z1.h}, p0, x0
191
+ add x0, x0, x1, lsl #1
192
+ add x2, x2, x3
193
+.endr
194
+ ret
195
+endfunc
196
+
197
+function PFX(blockcopy_ps_32x32_sve)
198
+ rdvl x9, #1
199
+ cmp x9, #16
200
+ bgt .vl_gt_16_blockcopy_ps_32_32
201
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "blockcopy8-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
39
+ *
40
+ * r0 - a
41
+ * r1 - stridea
42
+ * r2 - b
43
+ * r3 - strideb */
44
+function PFX(blockcopy_sp_4x4_neon)
45
+ lsl x3, x3, #1
46
+.rept 2
47
+ ld1 {v0.8h}, x2, x3
48
+ ld1 {v1.8h}, x2, x3
49
+ xtn v0.8b, v0.8h
50
+ xtn v1.8b, v1.8h
51
+ st1 {v0.s}0, x0, x1
52
+ st1 {v1.s}0, x0, x1
53
+.endr
54
+ ret
55
+endfunc
56
+
57
+function PFX(blockcopy_sp_8x8_neon)
58
+ lsl x3, x3, #1
59
+.rept 4
60
+ ld1 {v0.8h}, x2, x3
61
+ ld1 {v1.8h}, x2, x3
62
+ xtn v0.8b, v0.8h
63
+ xtn v1.8b, v1.8h
64
+ st1 {v0.d}0, x0, x1
65
+ st1 {v1.d}0, x0, x1
66
+.endr
67
+ ret
68
+endfunc
69
+
70
+function PFX(blockcopy_sp_16x16_neon)
71
+ lsl x3, x3, #1
72
+ movrel x11, xtn_xtn2_table
73
+ ld1 {v31.16b}, x11
74
+.rept 8
75
+ ld1 {v0.8h-v1.8h}, x2, x3
76
+ ld1 {v2.8h-v3.8h}, x2, x3
77
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
78
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
79
+ st1 {v0.16b}, x0, x1
80
+ st1 {v1.16b}, x0, x1
81
+.endr
82
+ ret
83
+endfunc
84
+
85
+function PFX(blockcopy_sp_32x32_neon)
86
+ mov w12, #4
87
+ lsl x3, x3, #1
88
+ movrel x11, xtn_xtn2_table
89
+ ld1 {v31.16b}, x11
90
+.loop_csp32:
91
+ sub w12, w12, #1
92
+.rept 4
93
+ ld1 {v0.8h-v3.8h}, x2, x3
94
+ ld1 {v4.8h-v7.8h}, x2, x3
95
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
96
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
97
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
98
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
99
+ st1 {v0.16b-v1.16b}, x0, x1
100
+ st1 {v2.16b-v3.16b}, x0, x1
101
+.endr
102
+ cbnz w12, .loop_csp32
103
+ ret
104
+endfunc
105
+
106
+function PFX(blockcopy_sp_64x64_neon)
107
+ mov w12, #16
108
+ lsl x3, x3, #1
109
+ sub x3, x3, #64
110
+ movrel x11, xtn_xtn2_table
111
+ ld1 {v31.16b}, x11
112
+.loop_csp64:
113
+ sub w12, w12, #1
114
+.rept 4
115
+ ld1 {v0.8h-v3.8h}, x2, #64
116
+ ld1 {v4.8h-v7.8h}, x2, x3
117
+ tbl v0.16b, {v0.16b,v1.16b}, v31.16b
118
+ tbl v1.16b, {v2.16b,v3.16b}, v31.16b
119
+ tbl v2.16b, {v4.16b,v5.16b}, v31.16b
120
+ tbl v3.16b, {v6.16b,v7.16b}, v31.16b
121
+ st1 {v0.16b-v3.16b}, x0, x1
122
+.endr
123
+ cbnz w12, .loop_csp64
124
+ ret
125
+endfunc
126
+
127
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
128
+function PFX(blockcopy_ps_4x4_neon)
129
+ lsl x1, x1, #1
130
+.rept 2
131
+ ld1 {v0.8b}, x2, x3
132
+ ld1 {v1.8b}, x2, x3
133
+ uxtl v0.8h, v0.8b
134
+ uxtl v1.8h, v1.8b
135
+ st1 {v0.4h}, x0, x1
136
+ st1 {v1.4h}, x0, x1
137
+.endr
138
+ ret
139
+endfunc
140
+
141
+function PFX(blockcopy_ps_8x8_neon)
142
+ lsl x1, x1, #1
143
+.rept 4
144
+ ld1 {v0.8b}, x2, x3
145
+ ld1 {v1.8b}, x2, x3
146
+ uxtl v0.8h, v0.8b
147
+ uxtl v1.8h, v1.8b
148
+ st1 {v0.8h}, x0, x1
149
+ st1 {v1.8h}, x0, x1
150
+.endr
151
+ ret
152
+endfunc
153
+
154
+function PFX(blockcopy_ps_16x16_neon)
155
+ lsl x1, x1, #1
156
+.rept 8
157
+ ld1 {v4.16b}, x2, x3
158
+ ld1 {v5.16b}, x2, x3
159
+ uxtl v0.8h, v4.8b
160
+ uxtl2 v1.8h, v4.16b
161
+ uxtl v2.8h, v5.8b
162
+ uxtl2 v3.8h, v5.16b
163
+ st1 {v0.8h-v1.8h}, x0, x1
164
+ st1 {v2.8h-v3.8h}, x0, x1
165
+.endr
166
+ ret
167
+endfunc
168
+
169
+function PFX(blockcopy_ps_32x32_neon)
170
+ lsl x1, x1, #1
171
+ mov w12, #4
172
+.loop_cps32:
173
+ sub w12, w12, #1
174
+.rept 4
175
+ ld1 {v16.16b-v17.16b}, x2, x3
176
+ ld1 {v18.16b-v19.16b}, x2, x3
177
+ uxtl v0.8h, v16.8b
178
+ uxtl2 v1.8h, v16.16b
179
+ uxtl v2.8h, v17.8b
180
+ uxtl2 v3.8h, v17.16b
181
+ uxtl v4.8h, v18.8b
182
+ uxtl2 v5.8h, v18.16b
183
+ uxtl v6.8h, v19.8b
184
+ uxtl2 v7.8h, v19.16b
185
+ st1 {v0.8h-v3.8h}, x0, x1
186
+ st1 {v4.8h-v7.8h}, x0, x1
187
+.endr
188
+ cbnz w12, .loop_cps32
189
+ ret
190
+endfunc
191
+
192
+function PFX(blockcopy_ps_64x64_neon)
193
+ lsl x1, x1, #1
194
+ sub x1, x1, #64
195
+ mov w12, #16
196
+.loop_cps64:
197
+ sub w12, w12, #1
198
+.rept 4
199
+ ld1 {v16.16b-v19.16b}, x2, x3
200
+ uxtl v0.8h, v16.8b
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp
Added
201
1
2
+#include "dct-prim.h"
3
+
4
+
5
+#if HAVE_NEON
6
+
7
+#include <arm_neon.h>
8
+
9
+
10
+namespace
11
+{
12
+using namespace X265_NS;
13
+
14
+
15
+static int16x8_t rev16(const int16x8_t a)
16
+{
17
+ static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
18
+ return vqtbx1q_u8(a, a, tbl);
19
+}
20
+
21
+static int32x4_t rev32(const int32x4_t a)
22
+{
23
+ static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
24
+ return vqtbx1q_u8(a, a, tbl);
25
+}
26
+
27
+static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
28
+{
29
+ int16x4_t s0, s1, s2, s3;
30
+ s0 = vtrn1_s32(x0, x2);
31
+ s1 = vtrn1_s32(x1, x3);
32
+ s2 = vtrn2_s32(x0, x2);
33
+ s3 = vtrn2_s32(x1, x3);
34
+
35
+ x0 = vtrn1_s16(s0, s1);
36
+ x1 = vtrn2_s16(s0, s1);
37
+ x2 = vtrn1_s16(s2, s3);
38
+ x3 = vtrn2_s16(s2, s3);
39
+}
40
+
41
+
42
+
43
+static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
+                           uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
+{
+
+    // This is an optimized function for scanPosLast, which removes the rmw dependency, once integrated into mainline x265, should replace reference implementation
+    // For clarity, left the original reference code in comments
+    int scanPosLast = 0;
+
+    uint16_t cSign = 0;
+    uint16_t cFlag = 0;
+    uint8_t cNum = 0;
+
+    uint32_t prevcgIdx = 0;
+    do
+    {
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+
+        const uint32_t posLast = scan[scanPosLast];
+
+        const int curCoeff = coeff[posLast];
+        const uint32_t isNZCoeff = (curCoeff != 0);
+        /*
+        NOTE: the new algorithm is complicated, so I keep reference code here
+        uint32_t posy = posLast >> log2TrSize;
+        uint32_t posx = posLast - (posy << log2TrSize);
+        uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
+        const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
+        sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
+        */
+
+        // get L1 sig map
+        numSig -= isNZCoeff;
+
+        if (scanPosLast % (1 << MLS_CG_SIZE) == 0)
+        {
+            coeffSign[prevcgIdx] = cSign;
+            coeffFlag[prevcgIdx] = cFlag;
+            coeffNum[prevcgIdx] = cNum;
+            cSign = 0;
+            cFlag = 0;
+            cNum = 0;
+        }
+        // TODO: optimize by instruction BTS
+        cSign += (uint16_t)(((curCoeff < 0) ? 1 : 0) << cNum);
+        cFlag = (cFlag << 1) + (uint16_t)isNZCoeff;
+        cNum += (uint8_t)isNZCoeff;
+        prevcgIdx = cgIdx;
+        scanPosLast++;
+    }
+    while (numSig > 0);
+
+    coeffSign[prevcgIdx] = cSign;
+    coeffFlag[prevcgIdx] = cFlag;
+    coeffNum[prevcgIdx] = cNum;
+    return scanPosLast - 1;
+}
99
+
100
+
101
+#if (MLS_CG_SIZE == 4)
102
+template<int log2TrSize>
103
+static void nonPsyRdoQuant_neon(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost,
104
+ int64_t *totalRdCost, uint32_t blkPos)
105
+{
106
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
107
+ log2TrSize; /* Represents scaling through forward transform */
108
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
109
+ const uint32_t trSize = 1 << log2TrSize;
110
+
111
+ int64x2_t vcost_sum_0 = vdupq_n_s64(0);
112
+ int64x2_t vcost_sum_1 = vdupq_n_s64(0);
113
+ for (int y = 0; y < MLS_CG_SIZE; y++)
114
+ {
115
+        int16x4_t in = *(int16x4_t *)&m_resiDctCoeff[blkPos];
+        int32x4_t mul = vmull_s16(in, in);
+        int64x2_t cost0, cost1;
+        cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits);
+        cost1 = vshll_high_n_s32(mul, scaleBits);
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
122
+ vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
123
+ vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
124
+ blkPos += trSize;
125
+ }
126
+ int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
127
+ *totalUncodedCost += sum;
128
+ *totalRdCost += sum;
129
+}
130
+
131
+template<int log2TrSize>
132
+static void psyRdoQuant_neon(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded,
133
+ int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
134
+{
135
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
136
+ log2TrSize; /* Represents scaling through forward transform */
137
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
138
+ const uint32_t trSize = 1 << log2TrSize;
139
+ //using preprocessor to bypass clang bug
140
+ const int max = X265_MAX(0, (2 * transformShift + 1));
141
+
142
+ int64x2_t vcost_sum_0 = vdupq_n_s64(0);
143
+ int64x2_t vcost_sum_1 = vdupq_n_s64(0);
144
+ int32x4_t vpsy = vdupq_n_s32(*psyScale);
145
+ for (int y = 0; y < MLS_CG_SIZE; y++)
146
+ {
147
+        int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeff[blkPos]);
+        int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeff[blkPos]), signCoef);
149
+ int64x2_t cost0, cost1;
150
+ cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef));
151
+ cost1 = vmull_high_s32(signCoef, signCoef);
152
+ cost0 = vshlq_n_s64(cost0, scaleBits);
153
+ cost1 = vshlq_n_s64(cost1, scaleBits);
154
+ int64x2_t neg0 = vmull_s32(vget_low_s32(predictedCoef), vget_low_s32(vpsy));
155
+ int64x2_t neg1 = vmull_high_s32(predictedCoef, vpsy);
156
+ if (max > 0)
157
+ {
158
+ int64x2_t shift = vdupq_n_s64(-max);
159
+ neg0 = vshlq_s64(neg0, shift);
160
+ neg1 = vshlq_s64(neg1, shift);
161
+ }
162
+ cost0 = vsubq_s64(cost0, neg0);
163
+ cost1 = vsubq_s64(cost1, neg1);
164
+        *(int64x2_t *)&costUncoded[blkPos + 0] = cost0;
+        *(int64x2_t *)&costUncoded[blkPos + 2] = cost1;
166
+ vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0);
167
+ vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1);
168
+
169
+ blkPos += trSize;
170
+ }
171
+ int64_t sum = vaddvq_s64(vaddq_s64(vcost_sum_0, vcost_sum_1));
172
+ *totalUncodedCost += sum;
173
+ *totalRdCost += sum;
174
+}
175
+
176
+#else
177
+#error "MLS_CG_SIZE must be 4 for neon version"
178
+#endif
179
+
180
+
181
+
182
+template<int trSize>
183
+int count_nonzero_neon(const int16_t *quantCoeff)
184
+{
185
+ X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
186
+ int count = 0;
187
+ int16x8_t vcount = vdupq_n_s16(0);
188
+ const int numCoeff = trSize * trSize;
189
+ int i = 0;
190
+ for (; (i + 8) <= numCoeff; i += 8)
191
+ {
192
+        int16x8_t in = *(int16x8_t *)&quantCoeff[i];
193
+ vcount = vaddq_s16(vcount, vtstq_s16(in, in));
194
+ }
195
+ for (; i < numCoeff; i++)
196
+ {
197
+        count += quantCoeff[i] != 0;
198
+ }
199
+
200
+ return count - vaddvq_s16(vcount);
201
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h
Added
21
1
2
+#ifndef __DCT_PRIM_NEON_H__
3
+#define __DCT_PRIM_NEON_H__
4
+
5
+
6
+#include "common.h"
7
+#include "primitives.h"
8
+#include "contexts.h" // costCoeffNxN_c
9
+#include "threading.h" // CLZ
10
+
11
+namespace X265_NS
12
+{
13
+// x265 private namespace
14
+void setupDCTPrimitives_neon(EncoderPrimitives &p);
15
+};
16
+
17
+
18
+
19
+#endif
20
+
21
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp
Added
201
1
2
+#if HAVE_NEON
3
+
4
+#include "filter-prim.h"
5
+#include <arm_neon.h>
6
+
7
+namespace
8
+{
9
+
10
+using namespace X265_NS;
11
+
12
+
13
+template<int width, int height>
14
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
15
+{
16
+ const int shift = IF_INTERNAL_PREC - X265_DEPTH;
17
+ int row, col;
18
+ const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
19
+ for (row = 0; row < height; row++)
20
+ {
21
+
22
+ for (col = 0; col < width; col += 8)
23
+ {
24
+ int16x8_t in;
25
+
26
+#if HIGH_BIT_DEPTH
27
+            in = *(int16x8_t *)&src[col];
+#else
+            in = vmovl_u8(*(uint8x8_t *)&src[col]);
+#endif
+
+            int16x8_t tmp = vshlq_n_s16(in, shift);
+            tmp = vsubq_s16(tmp, off);
+            *(int16x8_t *)&dst[col] = tmp;
35
+
36
+ }
37
+
38
+ src += srcStride;
39
+ dst += dstStride;
40
+ }
41
+}
42
+
43
+
44
+template<int N, int width, int height>
45
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
46
+{
47
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
48
+ int headRoom = IF_FILTER_PREC;
49
+ int offset = (1 << (headRoom - 1));
50
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
51
+ int cStride = 1;
52
+
53
+ src -= (N / 2 - 1) * cStride;
54
+ int16x8_t vc;
55
+ vc = *(int16x8_t *)coeff;
56
+ int16x4_t low_vc = vget_low_s16(vc);
57
+ int16x4_t high_vc = vget_high_s16(vc);
58
+
59
+ const int32x4_t voffset = vdupq_n_s32(offset);
60
+ const int32x4_t vhr = vdupq_n_s32(-headRoom);
61
+
62
+ int row, col;
63
+ for (row = 0; row < height; row++)
64
+ {
65
+ for (col = 0; col < width; col += 8)
66
+ {
67
+ int32x4_t vsum1, vsum2;
68
+
69
+            int16x8_t input[N];
+
+            for (int i = 0; i < N; i++)
+            {
+#if HIGH_BIT_DEPTH
+                input[i] = *(int16x8_t *)&src[col + i];
+#else
+                input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+#endif
+            }
79
+ vsum1 = voffset;
80
+ vsum2 = voffset;
81
+
82
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+
+            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+
+            if (N == 8)
+            {
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
+                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
104
+
105
+ }
106
+
107
+ vsum1 = vshlq_s32(vsum1, vhr);
108
+ vsum2 = vshlq_s32(vsum2, vhr);
109
+
110
+ int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
111
+ vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
112
+ vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
113
+#if HIGH_BIT_DEPTH
114
+            *(int16x8_t *)&dst[col] = vsum;
115
+#else
116
+ uint8x16_t usum = vuzp1q_u8(vsum, vsum);
117
+            *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
118
+#endif
119
+
120
+ }
121
+
122
+ src += srcStride;
123
+ dst += dstStride;
124
+ }
125
+}
126
+
127
+#if HIGH_BIT_DEPTH
128
+
129
+template<int N, int width, int height>
130
+void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
131
+ int isRowExt)
132
+{
133
+    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
134
+ const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
135
+ const int shift = IF_FILTER_PREC - headRoom;
136
+ const int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
137
+
138
+ int blkheight = height;
139
+ src -= N / 2 - 1;
140
+
141
+ if (isRowExt)
142
+ {
143
+ src -= (N / 2 - 1) * srcStride;
144
+ blkheight += N - 1;
145
+ }
146
+ int16x8_t vc3 = vld1q_s16(coeff);
147
+ const int32x4_t voffset = vdupq_n_s32(offset);
148
+ const int32x4_t vhr = vdupq_n_s32(-shift);
149
+
150
+ int row, col;
151
+ for (row = 0; row < blkheight; row++)
152
+ {
153
+ for (col = 0; col < width; col += 8)
154
+ {
155
+ int32x4_t vsum, vsum2;
156
+
157
+            int16x8_t input[N];
+            for (int i = 0; i < N; i++)
+            {
+                input[i] = vld1q_s16((int16_t *)&src[col + i]);
+            }
162
+
163
+ vsum = voffset;
164
+ vsum2 = voffset;
165
+
166
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[0]), vget_low_s16(vc3), 0);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[1]), vget_low_s16(vc3), 1);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[2]), vget_low_s16(vc3), 2);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
+
+            vsum = vmlal_lane_s16(vsum, vget_low_u16(input[3]), vget_low_s16(vc3), 3);
+            vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
+
+            if (N == 8)
+            {
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[4]), vget_high_s16(vc3), 0);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[4], vget_high_s16(vc3), 0);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[5]), vget_high_s16(vc3), 1);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[5], vget_high_s16(vc3), 1);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[6]), vget_high_s16(vc3), 2);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[6], vget_high_s16(vc3), 2);
+
+                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[7]), vget_high_s16(vc3), 3);
+                vsum2 = vmlal_high_lane_s16(vsum2, input[7], vget_high_s16(vc3), 3);
191
+ }
192
+
193
+ vsum = vshlq_s32(vsum, vhr);
194
+ vsum2 = vshlq_s32(vsum2, vhr);
195
+            *(int16x4_t *)&dst[col] = vmovn_u32(vsum);
+            *(int16x4_t *)&dst[col + 4] = vmovn_u32(vsum2);
197
+ }
198
+
199
+ src += srcStride;
200
+ dst += dstStride;
201
x265_3.6.tar.gz/source/common/aarch64/filter-prim.h
Added
23
1
2
+#ifndef _FILTER_PRIM_ARM64_H__
3
+#define _FILTER_PRIM_ARM64_H__
4
+
5
+
6
+#include "common.h"
7
+#include "slicetype.h" // LOWRES_COST_MASK
8
+#include "primitives.h"
9
+#include "x265.h"
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+void setupFilterPrimitives_neon(EncoderPrimitives &p);
17
+
18
+};
19
+
20
+
21
+#endif
22
+
23
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#define FUNCDEF_TU(ret, name, cpu, ...) \
26
+ ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
27
+ ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
28
+ ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
29
+ ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
30
+ ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
31
+
32
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
33
+ ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
34
+ ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
35
+ ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
36
+ ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
37
+ ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
38
+
39
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
40
+ ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
41
+ ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
42
+ ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
43
+ ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
44
+ ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
45
+
46
+#define FUNCDEF_PU(ret, name, cpu, ...) \
47
+ ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
48
+ ret PFX(name ## _8x8_ ## cpu)(__VA_ARGS__); \
49
+ ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
50
+ ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
51
+ ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
52
+ ret PFX(name ## _8x4_ ## cpu)(__VA_ARGS__); \
53
+ ret PFX(name ## _4x8_ ## cpu)(__VA_ARGS__); \
54
+ ret PFX(name ## _16x8_ ## cpu)(__VA_ARGS__); \
55
+ ret PFX(name ## _8x16_ ## cpu)(__VA_ARGS__); \
56
+ ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
57
+ ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
58
+ ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
59
+ ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
60
+ ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
61
+ ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
62
+ ret PFX(name ## _16x4_ ## cpu)(__VA_ARGS__); \
63
+ ret PFX(name ## _4x16_ ## cpu)(__VA_ARGS__); \
64
+ ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
65
+ ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
66
+ ret PFX(name ## _32x8_ ## cpu)(__VA_ARGS__); \
67
+ ret PFX(name ## _8x32_ ## cpu)(__VA_ARGS__); \
68
+ ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
69
+ ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
70
+ ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
71
+ ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
72
+
73
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
74
+ FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
75
+ ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
76
+ ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
77
+ ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
78
+ ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
79
+ ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
80
+ ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
81
+ ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
82
+ ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
83
+ ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
84
+ ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
85
+ ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
86
+ ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
87
+ ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
88
+ ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
89
+ ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
90
+ ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
91
+ ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
92
+ ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
93
+ ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
94
+ ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
95
+ ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
96
+ ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
97
+ ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
98
+ ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
99
+ ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
100
+ ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
101
+ ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
102
+
103
+#define DECLS(cpu) \
104
+ FUNCDEF_TU(void, cpy2Dto1D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
105
+ FUNCDEF_TU(void, cpy2Dto1D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
106
+ FUNCDEF_TU(void, cpy1Dto2D_shl, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
107
+ FUNCDEF_TU(void, cpy1Dto2D_shl_aligned, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
108
+ FUNCDEF_TU(void, cpy1Dto2D_shr, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); \
109
+ FUNCDEF_TU_S(uint32_t, copy_cnt, cpu, int16_t* dst, const int16_t* src, intptr_t srcStride); \
110
+ FUNCDEF_TU_S(int, count_nonzero, cpu, const int16_t* quantCoeff); \
111
+ FUNCDEF_TU(void, blockfill_s, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
112
+ FUNCDEF_TU(void, blockfill_s_aligned, cpu, int16_t* dst, intptr_t dstride, int16_t val); \
113
+ FUNCDEF_CHROMA_PU(void, blockcopy_ss, cpu, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
114
+ FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
115
+ FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \
116
+ FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \
117
+ FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
118
+ FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
119
+ FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
120
+ FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
121
+ FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
122
+ FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
123
+ FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
124
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
125
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
126
+ FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
127
+ FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
128
+ FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
129
+ FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
130
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
131
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
132
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
133
+ FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
134
+ FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
135
+ FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
136
+ FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
137
+ FUNCDEF_PU(void, pixel_avg_pp_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
138
+ FUNCDEF_PU(void, sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
139
+ FUNCDEF_PU(void, sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
140
+ FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
141
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
142
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
143
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
144
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
145
+ FUNCDEF_PU(sse_t, pixel_sse_pp, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
146
+ FUNCDEF_CHROMA_PU(sse_t, pixel_sse_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
147
+ FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
148
+ FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
149
+ FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
150
+ FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
151
+ FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
152
+ FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
153
+
154
+DECLS(neon);
155
+DECLS(sve);
156
+DECLS(sve2);
157
+
158
+
159
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
160
+
161
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
162
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
163
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
164
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
165
+
166
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
167
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
168
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
169
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
170
+
171
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
172
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
173
+
174
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
175
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
176
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
177
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
178
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
179
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
180
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
181
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
182
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
183
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
184
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
185
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
186
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
187
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
188
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
189
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
190
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
191
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
192
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
193
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
194
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
195
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
196
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
197
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
198
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
199
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
200
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
201
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp
Added
201
1
2
+#include "common.h"
3
+#include "primitives.h"
4
+
5
+
6
+#if 1
7
+#include "arm64-utils.h"
8
+#include <arm_neon.h>
9
+
10
+using namespace X265_NS;
11
+
12
+namespace
13
+{
14
+
15
+
16
+
17
+template<int width>
18
+void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
19
+{
20
+ int width2 = width << 1;
21
+ // Flip the neighbours in the horizontal case.
22
+ int horMode = dirMode < 18;
23
+    pixel neighbourBuf[129];
24
+ const pixel *srcPix = srcPix0;
25
+
26
+ if (horMode)
27
+ {
28
+        neighbourBuf[0] = srcPix[0];
+        //for (int i = 0; i < width << 1; i++)
+        //{
+        //    neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
+        //    neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
+        //}
+        memcpy(&neighbourBuf[1], &srcPix[width2 + 1], sizeof(pixel) * (width << 1));
+        memcpy(&neighbourBuf[width2 + 1], &srcPix[1], sizeof(pixel) * (width << 1));
+        srcPix = neighbourBuf;
37
+ }
38
+
39
+ // Intra prediction angle and inverse angle tables.
40
+    const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+    const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
+
+    // Get the prediction angle.
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
+    int angle = angleTable[8 + angleOffset];
46
+
47
+ // Vertical Prediction.
48
+ if (!angle)
49
+ {
50
+        for (int y = 0; y < width; y++)
+        {
+            memcpy(&dst[y * dstStride], srcPix + 1, sizeof(pixel)*width);
+        }
+        if (bFilter)
+        {
+            int topLeft = srcPix[0], top = srcPix[1];
+            for (int y = 0; y < width; y++)
+            {
+                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
60
+ }
61
+ }
62
+ }
63
+ else // Angular prediction.
64
+ {
65
+ // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf1).
66
+        pixel refBuf[64];
67
+ const pixel *ref;
68
+
69
+ // Use the projected left neighbours and the top neighbours.
70
+ if (angle < 0)
71
+ {
72
+ // Number of neighbours projected.
73
+ int nbProjected = -((width * angle) >> 5) - 1;
74
+ pixel *ref_pix = refBuf + nbProjected + 1;
75
+
76
+ // Project the neighbours.
77
+            int invAngle = invAngleTable[-angleOffset - 1];
+            int invAngleSum = 128;
+            for (int i = 0; i < nbProjected; i++)
+            {
+                invAngleSum += invAngle;
+                ref_pix[-2 - i] = srcPix[width2 + (invAngleSum >> 8)];
+            }
+
+            // Copy the top-left and top pixels.
+            //for (int i = 0; i < width + 1; i++)
+            //    ref_pix[-1 + i] = srcPix[i];
+
+            memcpy(&ref_pix[-1], srcPix, (width + 1)*sizeof(pixel));
90
+ ref = ref_pix;
91
+ }
92
+ else // Use the top and top-right neighbours.
93
+ {
94
+ ref = srcPix + 1;
95
+ }
96
+
97
+ // Pass every row.
98
+ int angleSum = 0;
99
+ for (int y = 0; y < width; y++)
100
+ {
101
+ angleSum += angle;
102
+ int offset = angleSum >> 5;
103
+ int fraction = angleSum & 31;
104
+
105
+ if (fraction) // Interpolate
106
+ {
107
+ if (width >= 8 && sizeof(pixel) == 1)
108
+ {
109
+ const int16x8_t f0 = vdupq_n_s16(32 - fraction);
110
+ const int16x8_t f1 = vdupq_n_s16(fraction);
111
+ for (int x = 0; x < width; x += 8)
112
+ {
113
+                    uint8x8_t in0 = *(uint8x8_t *)&ref[offset + x];
+                    uint8x8_t in1 = *(uint8x8_t *)&ref[offset + x + 1];
+                    int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0);
+                    lo = vmlaq_s16(lo, vmovl_u8(in1), f1);
+                    lo = vshrq_n_s16(lo, 5);
+                    *(uint8x8_t *)&dst[y * dstStride + x] = vmovn_u16(lo);
119
+ }
120
+ }
121
+ else if (width >= 4 && sizeof(pixel) == 2)
122
+ {
123
+ const int32x4_t f0 = vdupq_n_s32(32 - fraction);
124
+ const int32x4_t f1 = vdupq_n_s32(fraction);
125
+ for (int x = 0; x < width; x += 4)
126
+ {
127
+                    uint16x4_t in0 = *(uint16x4_t *)&ref[offset + x];
+                    uint16x4_t in1 = *(uint16x4_t *)&ref[offset + x + 1];
+                    int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0);
+                    lo = vmlaq_s32(lo, vmovl_u16(in1), f1);
+                    lo = vshrq_n_s32(lo, 5);
+                    *(uint16x4_t *)&dst[y * dstStride + x] = vmovn_u32(lo);
133
+ }
134
+ }
135
+ else
136
+ {
137
+ for (int x = 0; x < width; x++)
138
+ {
139
+                    dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
140
+ }
141
+ }
142
+ }
143
+ else // Copy.
144
+ {
145
+                memcpy(&dst[y * dstStride], &ref[offset], sizeof(pixel)*width);
146
+ }
147
+ }
148
+ }
149
+
150
+ // Flip for horizontal.
151
+ if (horMode)
152
+ {
153
+ if (width == 8)
154
+ {
155
+ transpose8x8(dst, dst, dstStride, dstStride);
156
+ }
157
+ else if (width == 16)
158
+ {
159
+ transpose16x16(dst, dst, dstStride, dstStride);
160
+ }
161
+ else if (width == 32)
162
+ {
163
+ transpose32x32(dst, dst, dstStride, dstStride);
164
+ }
165
+ else
166
+ {
167
+ for (int y = 0; y < width - 1; y++)
168
+ {
169
+ for (int x = y + 1; x < width; x++)
170
+ {
171
+                    pixel tmp = dst[y * dstStride + x];
+                    dst[y * dstStride + x] = dst[x * dstStride + y];
+                    dst[x * dstStride + y] = tmp;
174
+ }
175
+ }
176
+ }
177
+ }
178
+}
179
+
180
+template<int log2Size>
181
+void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
182
+{
183
+ const int size = 1 << log2Size;
184
+ for (int mode = 2; mode <= 34; mode++)
185
+ {
186
+        pixel *srcPix = (g_intraFilterFlags[mode] & size ? filtPix : refPix);
187
+ pixel *out = dest + ((mode - 2) << (log2Size * 2));
188
+
189
+ intra_pred_ang_neon<size>(out, size, srcPix, mode, bLuma);
190
+
191
+ // Optimize code don't flip buffer
192
+ bool modeHor = (mode < 18);
193
+
194
+ // transpose the block if this is a horizontal mode
195
+ if (modeHor)
196
+ {
197
+ if (size == 8)
198
+ {
199
+ transpose8x8(out, out, size, size);
200
+ }
201
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.h
Added
17
1
2
+#ifndef INTRAPRED_PRIM_H__
3
+
4
+#if defined(__aarch64__)
5
+
6
+namespace X265_NS
7
+{
8
+// x265 private namespace
9
+
10
+void setupIntraPrimitives_neon(EncoderPrimitives &p);
11
+}
12
+
13
+#endif
14
+
15
+#endif
16
+
17
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+// Macros below follow these conventions:
29
+// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7
30
+// - constants in registers: v24, v25, v26, v27, v31
31
+// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30.
32
+// - _32b macros output a result in v17.4s
33
+// - _64b and _32b_1 macros output results in v17.4s, v18.4s
34
+
35
+#include "asm.S"
36
+
37
+.arch armv8-a
38
+
39
+#ifdef __APPLE__
40
+.section __RODATA,__rodata
41
+#else
42
+.section .rodata
43
+#endif
44
+
45
+.align 4
46
+
47
+.macro vextin8 v
48
+    ldp             d6, d7, [x11], #16
49
+.if \v == 0
50
+ // qpel_filter_0 only uses values in v3
51
+ ext v3.8b, v6.8b, v7.8b, #4
52
+.else
53
+.if \v != 3
54
+ ext v0.8b, v6.8b, v7.8b, #1
55
+.endif
56
+ ext v1.8b, v6.8b, v7.8b, #2
57
+ ext v2.8b, v6.8b, v7.8b, #3
58
+ ext v3.8b, v6.8b, v7.8b, #4
59
+ ext v4.8b, v6.8b, v7.8b, #5
60
+ ext v5.8b, v6.8b, v7.8b, #6
61
+ ext v6.8b, v6.8b, v7.8b, #7
62
+.endif
63
+.endm
64
+
65
+.macro vextin8_64 v
66
+    ldp             q6, q7, [x11], #32
67
+.if \v == 0
68
+ // qpel_filter_0 only uses values in v3
69
+ ext v3.16b, v6.16b, v7.16b, #4
70
+.else
71
+.if \v != 3
72
+ // qpel_filter_3 does not use values in v0
73
+ ext v0.16b, v6.16b, v7.16b, #1
74
+.endif
75
+ ext v1.16b, v6.16b, v7.16b, #2
76
+ ext v2.16b, v6.16b, v7.16b, #3
77
+ ext v3.16b, v6.16b, v7.16b, #4
78
+ ext v4.16b, v6.16b, v7.16b, #5
79
+ ext v5.16b, v6.16b, v7.16b, #6
80
+.if \v == 1
81
+ ext v6.16b, v6.16b, v7.16b, #7
82
+ // qpel_filter_1 does not use v7
83
+.else
84
+ ext v16.16b, v6.16b, v7.16b, #7
85
+ ext v7.16b, v6.16b, v7.16b, #8
86
+ mov v6.16b, v16.16b
87
+.endif
88
+.endif
89
+.endm
90
+
91
+.macro vextin8_chroma v
92
+    ldp             d6, d7, [x11], #16
93
+.if \v == 0
94
+ // qpel_filter_chroma_0 only uses values in v1
95
+ ext v1.8b, v6.8b, v7.8b, #2
96
+.else
97
+ ext v0.8b, v6.8b, v7.8b, #1
98
+ ext v1.8b, v6.8b, v7.8b, #2
99
+ ext v2.8b, v6.8b, v7.8b, #3
100
+ ext v3.8b, v6.8b, v7.8b, #4
101
+.endif
102
+.endm
103
+
104
+.macro vextin8_chroma_64 v
105
+    ldp             q16, q17, [x11], #32
106
+.if \v == 0
107
+ // qpel_filter_chroma_0 only uses values in v1
108
+ ext v1.16b, v16.16b, v17.16b, #2
109
+.else
110
+ ext v0.16b, v16.16b, v17.16b, #1
111
+ ext v1.16b, v16.16b, v17.16b, #2
112
+ ext v2.16b, v16.16b, v17.16b, #3
113
+ ext v3.16b, v16.16b, v17.16b, #4
114
+.endif
115
+.endm
116
+
117
+.macro qpel_load_32b v
118
+.if \v == 0
119
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
120
+ ld1 {v3.8b}, x6, x1
121
+.elseif \v == 1 || \v == 2 || \v == 3
122
+.if \v != 3 // not used in qpel_filter_3
123
+ ld1 {v0.8b}, x6, x1
124
+.else
125
+ add x6, x6, x1
126
+.endif
127
+ ld1 {v1.8b}, x6, x1
128
+ ld1 {v2.8b}, x6, x1
129
+ ld1 {v3.8b}, x6, x1
130
+ ld1 {v4.8b}, x6, x1
131
+ ld1 {v5.8b}, x6, x1
132
+.if \v != 1 // not used in qpel_filter_1
133
+ ld1 {v6.8b}, x6, x1
134
+ ld1 {v7.8b}, x6
135
+.else
136
+ ld1 {v6.8b}, x6
137
+.endif
138
+.endif
139
+.endm
140
+
141
+.macro qpel_load_64b v
142
+.if \v == 0
143
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
144
+ ld1 {v3.16b}, x6, x1
145
+.elseif \v == 1 || \v == 2 || \v == 3
146
+.if \v != 3 // not used in qpel_filter_3
147
+ ld1 {v0.16b}, x6, x1
148
+.else
149
+ add x6, x6, x1
150
+.endif
151
+ ld1 {v1.16b}, x6, x1
152
+ ld1 {v2.16b}, x6, x1
153
+ ld1 {v3.16b}, x6, x1
154
+ ld1 {v4.16b}, x6, x1
155
+ ld1 {v5.16b}, x6, x1
156
+.if \v != 1 // not used in qpel_filter_1
157
+ ld1 {v6.16b}, x6, x1
158
+ ld1 {v7.16b}, x6
159
+.else
160
+ ld1 {v6.16b}, x6
161
+.endif
162
+.endif
163
+.endm
164
+
165
+.macro qpel_chroma_load_32b v
166
+.if \v == 0
167
+ // qpel_filter_chroma_0 only uses values in v1
168
+ add x6, x6, x1
169
+    ldr             d1, [x6]
+.else
+    ld1             {v0.8b}, [x6], x1
+    ld1             {v1.8b}, [x6], x1
+    ld1             {v2.8b}, [x6], x1
+    ld1             {v3.8b}, [x6]
175
+.endif
176
+.endm
177
+
178
+.macro qpel_chroma_load_64b v
179
+.if \v == 0
180
+ // qpel_filter_chroma_0 only uses values in v1
181
+ add x6, x6, x1
182
+    ldr             q1, [x6]
+.else
+    ld1             {v0.16b}, [x6], x1
+    ld1             {v1.16b}, [x6], x1
+    ld1             {v2.16b}, [x6], x1
+    ld1             {v3.16b}, [x6]
188
+.endif
189
+.endm
190
+
191
+// a, b, c, d, e, f, g, h
192
+// .hword 0, 0, 0, 64, 0, 0, 0, 0
193
+.macro qpel_start_0
194
+ movi v24.16b, #64
195
+.endm
196
+
197
+.macro qpel_filter_0_32b
198
+ umull v17.8h, v3.8b, v24.8b // 64*d
199
+.endm
200
+
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm-sve.S"
40
+#include "ipfilter-common.S"
41
+
42
+.arch armv8-a+sve2
43
+
44
+#ifdef __APPLE__
45
+.section __RODATA,__rodata
46
+#else
47
+.section .rodata
48
+#endif
49
+
50
+.align 4
51
+
52
+.text
53
+
54
+.macro qpel_load_32b_sve2 v
55
+.if \v == 0
56
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
57
+ ld1b {z3.h}, p0/z, x6
58
+ add x6, x6, x1
59
+.elseif \v == 1 || \v == 2 || \v == 3
60
+.if \v != 3 // not used in qpel_filter_3
61
+ ld1b {z0.h}, p0/z, x6
62
+ add x6, x6, x1
63
+.else
64
+ add x6, x6, x1
65
+.endif
66
+ ld1b {z1.h}, p0/z, x6
67
+ add x6, x6, x1
68
+ ld1b {z2.h}, p0/z, x6
69
+ add x6, x6, x1
70
+ ld1b {z3.h}, p0/z, x6
71
+ add x6, x6, x1
72
+ ld1b {z4.h}, p0/z, x6
73
+ add x6, x6, x1
74
+ ld1b {z5.h}, p0/z, x6
75
+ add x6, x6, x1
76
+.if \v != 1 // not used in qpel_filter_1
77
+ ld1b {z6.h}, p0/z, x6
78
+ add x6, x6, x1
79
+ ld1b {z7.h}, p0/z, x6
80
+.else
81
+ ld1b {z6.h}, p0/z, x6
82
+.endif
83
+.endif
84
+.endm
85
+
86
+.macro qpel_load_64b_sve2_gt_16 v
87
+.if \v == 0
88
+ add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0
89
+ ld1b {z3.h}, p2/z, x6
90
+ add x6, x6, x1
91
+.elseif \v == 1 || \v == 2 || \v == 3
92
+.if \v != 3 // not used in qpel_filter_3
93
+ ld1b {z0.h}, p2/z, x6
94
+ add x6, x6, x1
95
+.else
96
+ add x6, x6, x1
97
+.endif
98
+ ld1b {z1.h}, p2/z, x6
99
+ add x6, x6, x1
100
+ ld1b {z2.h}, p2/z, x6
101
+ add x6, x6, x1
102
+ ld1b {z3.h}, p2/z, x6
103
+ add x6, x6, x1
104
+ ld1b {z4.h}, p2/z, x6
105
+ add x6, x6, x1
106
+ ld1b {z5.h}, p2/z, x6
107
+ add x6, x6, x1
108
+.if \v != 1 // not used in qpel_filter_1
109
+ ld1b {z6.h}, p2/z, x6
110
+ add x6, x6, x1
111
+ ld1b {z7.h}, p2/z, x6
112
+.else
113
+ ld1b {z6.h}, p2/z, x6
114
+.endif
115
+.endif
116
+.endm
117
+
118
+.macro qpel_chroma_load_32b_sve2 v
119
+.if \v == 0
120
+ // qpel_filter_chroma_0 only uses values in v1
121
+ add x6, x6, x1
122
+ ld1b {z1.h}, p0/z, x6
123
+.else
124
+ ld1b {z0.h}, p0/z, x6
125
+ add x6, x6, x1
126
+ ld1b {z1.h}, p0/z, x6
127
+ add x6, x6, x1
128
+ ld1b {z2.h}, p0/z, x6
129
+ add x6, x6, x1
130
+ ld1b {z3.h}, p0/z, x6
131
+.endif
132
+.endm
133
+
134
+.macro qpel_start_sve2_0
135
+ mov z24.h, #64
136
+.endm
137
+
138
+.macro qpel_filter_sve2_0_32b
139
+ mul z17.h, z3.h, z24.h // 64*d
140
+.endm
141
+
142
+.macro qpel_filter_sve2_0_64b
143
+ qpel_filter_sve2_0_32b
144
+ mul z18.h, z11.h, z24.h
145
+.endm
146
+
147
+.macro qpel_start_sve2_1
148
+ mov z24.h, #58
149
+ mov z25.h, #10
150
+ mov z26.h, #17
151
+ mov z27.h, #5
152
+.endm
153
+
154
+.macro qpel_filter_sve2_1_32b
155
+ mul z19.h, z2.h, z25.h // c*10
156
+ mul z17.h, z3.h, z24.h // d*58
157
+ mul z21.h, z4.h, z26.h // e*17
158
+ mul z23.h, z5.h, z27.h // f*5
159
+ sub z17.h, z17.h, z19.h // d*58 - c*10
160
+ lsl z18.h, z1.h, #2 // b*4
161
+ add z17.h, z17.h, z21.h // d*58 - c*10 + e*17
162
+ sub z21.h, z6.h, z0.h // g - a
163
+ add z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4
164
+ sub z21.h, z21.h, z23.h // g - a - f*5
165
+ add z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
166
+.endm
167
+
168
+.macro qpel_filter_sve2_1_64b
169
+ qpel_filter_sve2_1_32b
170
+ mul z20.h, z10.h, z25.h // c*10
171
+ mul z18.h, z11.h, z24.h // d*58
172
+ mul z21.h, z12.h, z26.h // e*17
173
+ mul z23.h, z13.h, z27.h // f*5
174
+ sub z18.h, z18.h, z20.h // d*58 - c*10
175
+ lsl z28.h, z30.h, #2 // b*4
176
+ add z18.h, z18.h, z21.h // d*58 - c*10 + e*17
177
+ sub z21.h, z14.h, z29.h // g - a
178
+ add z18.h, z18.h, z28.h // d*58 - c*10 + e*17 + b*4
179
+ sub z21.h, z21.h, z23.h // g - a - f*5
180
+ add z18.h, z18.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5
181
+.endm
182
+
183
+.macro qpel_start_sve2_2
184
+ mov z24.h, #11
185
+ mov z25.h, #40
186
+.endm
187
+
188
+.macro qpel_filter_sve2_2_32b
189
+ add z17.h, z3.h, z4.h // d + e
190
+ add z19.h, z2.h, z5.h // c + f
191
+ add z23.h, z1.h, z6.h // b + g
192
+ add z21.h, z0.h, z7.h // a + h
193
+ mul z17.h, z17.h, z25.h // 40 * (d + e)
194
+ mul z19.h, z19.h, z24.h // 11 * (c + f)
195
+ lsl z23.h, z23.h, #2 // (b + g) * 4
196
+ add z19.h, z19.h, z21.h // 11 * (c + f) + a + h
197
+ add z17.h, z17.h, z23.h // 40 * (d + e) + (b + g) * 4
198
+ sub z17.h, z17.h, z19.h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h
199
+.endm
200
+
201
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// Functions in this file:
26
+// ***** luma_vpp *****
27
+// ***** luma_vps *****
28
+// ***** luma_vsp *****
29
+// ***** luma_vss *****
30
+// ***** luma_hpp *****
31
+// ***** luma_hps *****
32
+// ***** chroma_vpp *****
33
+// ***** chroma_vps *****
34
+// ***** chroma_vsp *****
35
+// ***** chroma_vss *****
36
+// ***** chroma_hpp *****
37
+// ***** chroma_hps *****
38
+
39
+#include "asm.S"
40
+#include "ipfilter-common.S"
41
+
42
+#ifdef __APPLE__
43
+.section __RODATA,__rodata
44
+#else
45
+.section .rodata
46
+#endif
47
+
48
+.align 4
49
+
50
+.text
51
+
52
+// ***** luma_vpp *****
53
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
54
+.macro LUMA_VPP_4xN h
55
+function x265_interp_8tap_vert_pp_4x\h\()_neon
56
+ movrel x10, g_luma_s16
57
+ sub x0, x0, x1
58
+ sub x0, x0, x1, lsl #1 // src -= 3 * srcStride
59
+ lsl x4, x4, #4
60
+    ldr             q0, [x10, x4]   // q0 = luma interpolate coeff
+    dup             v24.8h, v0.h[0]
+    dup             v25.8h, v0.h[1]
+    trn1            v24.2d, v24.2d, v25.2d
+    dup             v26.8h, v0.h[2]
+    dup             v27.8h, v0.h[3]
+    trn1            v26.2d, v26.2d, v27.2d
+    dup             v28.8h, v0.h[4]
+    dup             v29.8h, v0.h[5]
+    trn1            v28.2d, v28.2d, v29.2d
+    dup             v30.8h, v0.h[6]
+    dup             v31.8h, v0.h[7]
+    trn1            v30.2d, v30.2d, v31.2d
73
+
74
+ // prepare to load 8 lines
75
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ushll           v0.8h, v0.8b, #0
+    ld1             {v1.s}[0], [x0], x1
+    ld1             {v1.s}[1], [x0], x1
+    ushll           v1.8h, v1.8b, #0
+    ld1             {v2.s}[0], [x0], x1
+    ld1             {v2.s}[1], [x0], x1
+    ushll           v2.8h, v2.8b, #0
+    ld1             {v3.s}[0], [x0], x1
+    ld1             {v3.s}[1], [x0], x1
+    ushll           v3.8h, v3.8b, #0
+
+    mov             x9, #\h
+.loop_4x\h:
+    ld1             {v4.s}[0], [x0], x1
+    ld1             {v4.s}[1], [x0], x1
+    ushll           v4.8h, v4.8b, #0
93
+
94
+ // row0-1
95
+ mul v16.8h, v0.8h, v24.8h
96
+ ext v21.16b, v0.16b, v1.16b, #8
97
+ mul v17.8h, v21.8h, v24.8h
98
+ mov v0.16b, v1.16b
99
+
100
+ // row2-3
101
+ mla v16.8h, v1.8h, v26.8h
102
+ ext v21.16b, v1.16b, v2.16b, #8
103
+ mla v17.8h, v21.8h, v26.8h
104
+ mov v1.16b, v2.16b
105
+
106
+ // row4-5
107
+ mla v16.8h, v2.8h, v28.8h
108
+ ext v21.16b, v2.16b, v3.16b, #8
109
+ mla v17.8h, v21.8h, v28.8h
110
+ mov v2.16b, v3.16b
111
+
112
+ // row6-7
113
+ mla v16.8h, v3.8h, v30.8h
114
+ ext v21.16b, v3.16b, v4.16b, #8
115
+ mla v17.8h, v21.8h, v30.8h
116
+ mov v3.16b, v4.16b
117
+
118
+ // sum row0-7
119
+ trn1 v20.2d, v16.2d, v17.2d
120
+ trn2 v21.2d, v16.2d, v17.2d
121
+ add v16.8h, v20.8h, v21.8h
122
+
123
+ sqrshrun v16.8b, v16.8h, #6
124
+    st1             {v16.s}[0], [x2], x3
+    st1             {v16.s}[1], [x2], x3
126
+
127
+ sub x9, x9, #2
128
+ cbnz x9, .loop_4x\h
129
+ ret
130
+endfunc
131
+.endm
132
+
133
+LUMA_VPP_4xN 4
134
+LUMA_VPP_4xN 8
135
+LUMA_VPP_4xN 16
136
+
137
+// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
138
+.macro LUMA_VPP w, h
139
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
140
+ cmp x4, #0
141
+ b.eq 0f
142
+ cmp x4, #1
143
+ b.eq 1f
144
+ cmp x4, #2
145
+ b.eq 2f
146
+ cmp x4, #3
147
+ b.eq 3f
148
+0:
149
+ FILTER_LUMA_VPP \w, \h, 0
150
+1:
151
+ FILTER_LUMA_VPP \w, \h, 1
152
+2:
153
+ FILTER_LUMA_VPP \w, \h, 2
154
+3:
155
+ FILTER_LUMA_VPP \w, \h, 3
156
+endfunc
157
+.endm
158
+
159
+LUMA_VPP 8, 4
160
+LUMA_VPP 8, 8
161
+LUMA_VPP 8, 16
162
+LUMA_VPP 8, 32
163
+LUMA_VPP 12, 16
164
+LUMA_VPP 16, 4
165
+LUMA_VPP 16, 8
166
+LUMA_VPP 16, 16
167
+LUMA_VPP 16, 32
168
+LUMA_VPP 16, 64
169
+LUMA_VPP 16, 12
170
+LUMA_VPP 24, 32
171
+LUMA_VPP 32, 8
172
+LUMA_VPP 32, 16
173
+LUMA_VPP 32, 32
174
+LUMA_VPP 32, 64
175
+LUMA_VPP 32, 24
176
+LUMA_VPP 48, 64
177
+LUMA_VPP 64, 16
178
+LUMA_VPP 64, 32
179
+LUMA_VPP 64, 64
180
+LUMA_VPP 64, 48
181
+
182
+// ***** luma_vps *****
183
+// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
184
+.macro LUMA_VPS_4xN h
185
+function x265_interp_8tap_vert_ps_4x\h\()_neon
186
+ lsl x3, x3, #1
187
+ lsl x5, x4, #6
188
+ lsl x4, x1, #2
189
+ sub x4, x4, x1
190
+ sub x0, x0, x4
191
+
192
+ mov w6, #8192
193
+ dup v28.4s, w6
194
+ mov x4, #\h
195
+ movrel x12, g_lumaFilter
196
+ add x12, x12, x5
197
+    ld1r            {v16.2d}, [x12], #8
+    ld1r            {v17.2d}, [x12], #8
+    ld1r            {v18.2d}, [x12], #8
+    ld1r            {v19.2d}, [x12], #8
201
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp
Added
201
1
2
+#include "loopfilter-prim.h"
3
+
4
+#define PIXEL_MIN 0
5
+
6
+
7
+
8
+#if !(HIGH_BIT_DEPTH) && defined(HAVE_NEON)
9
+#include<arm_neon.h>
10
+
11
+namespace
12
+{
13
+
14
+
15
+/* get the sign of input variable (TODO: this is a dup, make common) */
16
+static inline int8_t signOf(int x)
17
+{
18
+ return (x >> 31) | ((int)((((uint32_t) - x)) >> 31));
19
+}
20
+
21
+static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1)
22
+{
23
+ int16x8_t in = vsubl_u8(in0, in1);
24
+ return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1)));
25
+}
26
+
27
+static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
28
+{
29
+ int x = 0;
30
+ for (; (x + 8) <= endX; x += 8)
31
+ {
32
+        *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
33
+ }
34
+
35
+ for (; x < endX; x++)
36
+ {
37
+        dst[x] = signOf(src1[x] - src2[x]);
38
+ }
39
+}
40
+
41
+static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t *signLeft, intptr_t stride)
42
+{
43
+
44
+
45
+ int y;
46
+ int8_t signRight, signLeft0;
47
+ int8_t edgeType;
48
+
49
+ for (y = 0; y < 2; y++)
50
+ {
51
+        signLeft0 = signLeft[y];
52
+ int x = 0;
53
+
54
+ if (width >= 8)
55
+ {
56
+ int8x8_t vsignRight;
57
+ int8x8x2_t shifter;
58
+            shifter.val[1][0] = signLeft0;
59
+ static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
60
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
61
+ for (; (x + 8) <= width; x += 8)
62
+ {
63
+                uint8x8_t in = *(uint8x8_t *)&rec[x];
+                vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
+                shifter.val[0] = vneg_s8(vsignRight);
+                int8x8_t tmp = shifter.val[0];
+                int8x8_t edge = vtbl2_s8(shifter, index);
+                int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2));
+                shifter.val[1][0] = tmp[7];
+                int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
+                t1 = vaddw_u8(t1, in);
+                t1 = vmaxq_s16(t1, vdupq_n_s16(0));
+                t1 = vminq_s16(t1, vdupq_n_s16(255));
+                *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+            }
+            signLeft0 = shifter.val[1][0];
77
+ }
78
+ for (; x < width; x++)
79
+ {
80
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
81
+ edgeType = signRight + signLeft0 + 2;
82
+ signLeft0 = -signRight;
83
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
84
+ }
85
+ rec += stride;
86
+ }
87
+}
88
+
89
+static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
90
+{
91
+ int x = 0;
92
+ int8_t signDown;
93
+ int edgeType;
94
+
95
+ if (width >= 8)
96
+ {
97
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
98
+ for (; (x + 8) <= width; x += 8)
99
+ {
100
+            uint8x8_t in0 = *(uint8x8_t *)&rec[x];
+            uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+            int8x8_t vsignDown = sign_diff_neon(in0, in1);
+            int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
+            *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
105
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
106
+ t1 = vaddw_u8(t1, in0);
107
+            *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
108
+ }
109
+ }
110
+ for (; x < width; x++)
111
+ {
112
+        signDown = signOf(rec[x] - rec[x + stride]);
+        edgeType = signDown + upBuff1[x] + 2;
+        upBuff1[x] = -signDown;
+        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
116
+ }
117
+}
118
+
119
+static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int width)
120
+{
121
+ int y;
122
+ int8_t signDown;
123
+ int edgeType;
124
+
125
+ for (y = 0; y < 2; y++)
126
+ {
127
+ int x = 0;
128
+ if (width >= 8)
129
+ {
130
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
131
+ for (; (x + 8) <= width; x += 8)
132
+ {
133
+ uint8x8_t in0 = *(uint8x8_t *)&rec[x];
134
+ uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
135
+ int8x8_t vsignDown = sign_diff_neon(in0, in1);
136
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
137
+ *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
138
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
139
+ t1 = vaddw_u8(t1, in0);
140
+ t1 = vmaxq_s16(t1, vdupq_n_s16(0));
141
+ t1 = vminq_s16(t1, vdupq_n_s16(255));
142
+ *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
143
+
144
+ }
145
+ }
146
+ for (; x < width; x++)
147
+ {
148
+ signDown = signOf(rec[x] - rec[x + stride]);
149
+ edgeType = signDown + upBuff1[x] + 2;
150
+ upBuff1[x] = -signDown;
151
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
152
+ }
153
+ rec += stride;
154
+ }
155
+}
156
+
157
+static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t *offsetEo, int width, intptr_t stride)
158
+{
159
+ int x;
160
+
161
+ if (abs(buff1 - bufft) < 16)
162
+ {
163
+ for (x = 0; x < width; x++)
164
+ {
165
+ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
166
+ int edgeType = signDown + buff1[x] + 2;
167
+ bufft[x + 1] = -signDown;
168
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
169
+ }
170
+ }
171
+ else
172
+ {
173
+ int8x8_t tbl = *(int8x8_t *)offsetEo;
174
+ x = 0;
175
+ for (; (x + 8) <= width; x += 8)
176
+ {
177
+ uint8x8_t in0 = *(uint8x8_t *)&rec[x];
178
+ uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
179
+ int8x8_t vsignDown = sign_diff_neon(in0, in1);
180
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
181
+ *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
182
+ int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
183
+ t1 = vaddw_u8(t1, in0);
184
+ t1 = vmaxq_s16(t1, vdupq_n_s16(0));
185
+ t1 = vminq_s16(t1, vdupq_n_s16(255));
186
+ *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
187
+ }
188
+ for (; x < width; x++)
189
+ {
190
+ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
191
+ int edgeType = signDown + buff1[x] + 2;
192
+ bufft[x + 1] = -signDown;
193
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
194
+ }
195
+
196
+ }
197
+}
198
+
199
+
200
+static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
201
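The NEON routines above vectorize SAO edge-offset (EO) filtering; the scalar tail loop inside processSaoCUE0_neon shows the per-pixel rule being accelerated. A minimal scalar C++ sketch of that rule follows; the function and parameter names are illustrative rather than upstream API, and x265_clip is replaced by an explicit 8-bit clamp.

    #include <algorithm>
    #include <cstdint>

    // Scalar sketch of one SAO EO0 (horizontal) row, mirroring the scalar
    // fallback loop in processSaoCUE0_neon above. rec must have width + 1
    // readable pixels, since each pixel is compared with its right neighbour.
    static void saoEO0RowSketch(uint8_t *rec, const int8_t *offsetEo,
                                int width, int8_t signLeft0)
    {
        for (int x = 0; x < width; x++)
        {
            int diff = rec[x] - rec[x + 1];
            int8_t signRight = (diff < 0) ? -1 : (diff > 0) ? 1 : 0;
            int edgeType = signRight + signLeft0 + 2;  // index 0..4 into offsetEo
            signLeft0 = -signRight;                    // becomes signLeft for the next pixel
            rec[x] = (uint8_t)std::min(255, std::max(0, rec[x] + offsetEo[edgeType]));
        }
    }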
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.h
Added
18
1
2
+#ifndef _LOOPFILTER_NEON_H__
3
+#define _LOOPFILTER_NEON_H__
4
+
5
+#include "common.h"
6
+#include "primitives.h"
7
+
8
+#define PIXEL_MIN 0
9
+
10
+namespace X265_NS
11
+{
12
+void setupLoopFilterPrimitives_neon(EncoderPrimitives &p);
13
+
14
+};
15
+
16
+
17
+#endif
18
x265_3.6.tar.gz/source/common/aarch64/mc-a-common.S
Added
50
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.macro addAvg_start
37
+ lsl x3, x3, #1
38
+ lsl x4, x4, #1
39
+ mov w11, #0x40
40
+ dup v30.16b, w11
41
+.endm
42
+
43
+.macro addavg_1 v0, v1
44
+ add \v0\().8h, \v0\().8h, \v1\().8h
45
+ saddl v16.4s, \v0\().4h, v30.4h
46
+ saddl2 v17.4s, \v0\().8h, v30.8h
47
+ shrn \v0\().4h, v16.4s, #7
48
+ shrn2 \v0\().8h, v17.4s, #7
49
+.endm
50
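The addAvg_start/addavg_1 macros above implement the bi-prediction average of two 16-bit intermediate buffers: v30 is filled with the byte 0x40, so each 16-bit lane holds the rounding constant 0x4040, and the sum is shifted right by 7. A per-lane C++ sketch for the 8-bit build; the name is illustrative, and the final clamp is assumed to happen when the full routine narrows back to pixels.

    #include <algorithm>
    #include <cstdint>

    // Per-lane sketch of addavg_1: (s0 + s1 + 0x4040) >> 7, clamped to the
    // 8-bit pixel range (the clamp itself is done later in the asm routine).
    static inline uint8_t addAvgLaneSketch(int16_t s0, int16_t s1)
    {
        int val = (s0 + s1 + 0x4040) >> 7;
        return (uint8_t)std::min(255, std::max(0, val));
    }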
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "mc-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_avg_pp_12x16_sve2)
41
+ sub x1, x1, #4
42
+ sub x3, x3, #4
43
+ sub x5, x5, #4
44
+ ptrue p0.s, vl1
45
+ ptrue p1.b, vl8
46
+ mov x11, #4
47
+.rept 16
48
+ ld1w {z0.s}, p0/z, x2
49
+ ld1b {z1.b}, p1/z, x2, x11
50
+ ld1w {z2.s}, p0/z, x4
51
+ ld1b {z3.b}, p1/z, x4, x11
52
+ add x2, x2, #4
53
+ add x2, x2, x3
54
+ add x4, x4, #4
55
+ add x4, x4, x5
56
+ urhadd z0.b, p1/m, z0.b, z2.b
57
+ urhadd z1.b, p1/m, z1.b, z3.b
58
+ st1b {z0.b}, p1, x0
59
+ st1b {z1.b}, p1, x0, x11
60
+ add x0, x0, #4
61
+ add x0, x0, x1
62
+.endr
63
+ ret
64
+endfunc
65
+
66
+function PFX(pixel_avg_pp_24x32_sve2)
67
+ mov w12, #4
68
+ rdvl x9, #1
69
+ cmp x9, #16
70
+ bgt .vl_gt_16_pixel_avg_pp_24x32
71
+ sub x1, x1, #16
72
+ sub x3, x3, #16
73
+ sub x5, x5, #16
74
+.lpavg_24x32_sve2:
75
+ sub w12, w12, #1
76
+.rept 8
77
+ ld1 {v0.16b}, x2, #16
78
+ ld1 {v1.8b}, x2, x3
79
+ ld1 {v2.16b}, x4, #16
80
+ ld1 {v3.8b}, x4, x5
81
+ urhadd v0.16b, v0.16b, v2.16b
82
+ urhadd v1.8b, v1.8b, v3.8b
83
+ st1 {v0.16b}, x0, #16
84
+ st1 {v1.8b}, x0, x1
85
+.endr
86
+ cbnz w12, .lpavg_24x32_sve2
87
+ ret
88
+.vl_gt_16_pixel_avg_pp_24x32:
89
+ mov x10, #24
90
+ mov x11, #0
91
+ whilelt p0.b, x11, x10
92
+.vl_gt_16_loop_pixel_avg_pp_24x32:
93
+ sub w12, w12, #1
94
+.rept 8
95
+ ld1b {z0.b}, p0/z, x2
96
+ ld1b {z2.b}, p0/z, x4
97
+ add x2, x2, x3
98
+ add x4, x4, x5
99
+ urhadd z0.b, p0/m, z0.b, z2.b
100
+ st1b {z0.b}, p0, x0
101
+ add x0, x0, x1
102
+.endr
103
+ cbnz w12, .vl_gt_16_loop_pixel_avg_pp_24x32
104
+ ret
105
+endfunc
106
+
107
+.macro pixel_avg_pp_32xN_sve2 h
108
+function PFX(pixel_avg_pp_32x\h\()_sve2)
109
+ rdvl x9, #1
110
+ cmp x9, #16
111
+ bgt .vl_gt_16_pixel_avg_pp_32_\h
112
+.rept \h
113
+ ld1 {v0.16b-v1.16b}, x2, x3
114
+ ld1 {v2.16b-v3.16b}, x4, x5
115
+ urhadd v0.16b, v0.16b, v2.16b
116
+ urhadd v1.16b, v1.16b, v3.16b
117
+ st1 {v0.16b-v1.16b}, x0, x1
118
+.endr
119
+ ret
120
+.vl_gt_16_pixel_avg_pp_32_\h:
121
+ ptrue p0.b, vl32
122
+.rept \h
123
+ ld1b {z0.b}, p0/z, x2
124
+ ld1b {z2.b}, p0/z, x4
125
+ add x2, x2, x3
126
+ add x4, x4, x5
127
+ urhadd z0.b, p0/m, z0.b, z2.b
128
+ st1b {z0.b}, p0, x0
129
+ add x0, x0, x1
130
+.endr
131
+ ret
132
+endfunc
133
+.endm
134
+
135
+pixel_avg_pp_32xN_sve2 8
136
+pixel_avg_pp_32xN_sve2 16
137
+pixel_avg_pp_32xN_sve2 24
138
+
139
+.macro pixel_avg_pp_32xN1_sve2 h
140
+function PFX(pixel_avg_pp_32x\h\()_sve2)
141
+ rdvl x9, #1
142
+ cmp x9, #16
143
+ bgt .vl_gt_16_pixel_avg_pp_32xN1_\h
144
+ mov w12, #\h / 8
145
+.lpavg_sve2_32x\h\():
146
+ sub w12, w12, #1
147
+.rept 8
148
+ ld1 {v0.16b-v1.16b}, x2, x3
149
+ ld1 {v2.16b-v3.16b}, x4, x5
150
+ urhadd v0.16b, v0.16b, v2.16b
151
+ urhadd v1.16b, v1.16b, v3.16b
152
+ st1 {v0.16b-v1.16b}, x0, x1
153
+.endr
154
+ cbnz w12, .lpavg_sve2_32x\h
155
+ ret
156
+.vl_gt_16_pixel_avg_pp_32xN1_\h:
157
+ ptrue p0.b, vl32
158
+ mov w12, #\h / 8
159
+.eq_32_loop_pixel_avg_pp_32xN1_\h\():
160
+ sub w12, w12, #1
161
+.rept 8
162
+ ld1b {z0.b}, p0/z, x2
163
+ ld1b {z2.b}, p0/z, x4
164
+ add x2, x2, x3
165
+ add x4, x4, x5
166
+ urhadd z0.b, p0/m, z0.b, z2.b
167
+ st1b {z0.b}, p0, x0
168
+ add x0, x0, x1
169
+.endr
170
+ cbnz w12, .eq_32_loop_pixel_avg_pp_32xN1_\h
171
+ ret
172
+endfunc
173
+.endm
174
+
175
+pixel_avg_pp_32xN1_sve2 32
176
+pixel_avg_pp_32xN1_sve2 64
177
+
178
+function PFX(pixel_avg_pp_48x64_sve2)
179
+ rdvl x9, #1
180
+ cmp x9, #16
181
+ bgt .vl_gt_16_pixel_avg_pp_48x64
182
+ mov w12, #8
183
+.lpavg_48x64_sve2:
184
+ sub w12, w12, #1
185
+.rept 8
186
+ ld1 {v0.16b-v2.16b}, x2, x3
187
+ ld1 {v3.16b-v5.16b}, x4, x5
188
+ urhadd v0.16b, v0.16b, v3.16b
189
+ urhadd v1.16b, v1.16b, v4.16b
190
+ urhadd v2.16b, v2.16b, v5.16b
191
+ st1 {v0.16b-v2.16b}, x0, x1
192
+.endr
193
+ cbnz w12, .lpavg_48x64_sve2
194
+ ret
195
+.vl_gt_16_pixel_avg_pp_48x64:
196
+ cmp x9, #32
197
+ bgt .vl_gt_32_pixel_avg_pp_48x64
198
+ ptrue p0.b, vl32
199
+ ptrue p1.b, vl16
200
+ mov w12, #8
201
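Both the NEON and SVE2 pixel_avg_pp kernels reduce to a rounding average of two pixel blocks; urhadd computes (a + b + 1) >> 1 per byte lane. A scalar C++ sketch with illustrative names and the usual block/stride layout:

    #include <cstdint>

    // Scalar sketch of pixel_avg_pp for an 8-bit WxH block: each output pixel
    // is the rounded mean of the two sources, i.e. what urhadd does per lane.
    static void pixelAvgPpSketch(uint8_t *dst, intptr_t dstStride,
                                 const uint8_t *src0, intptr_t srcStride0,
                                 const uint8_t *src1, intptr_t srcStride1,
                                 int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint8_t)((src0[x] + src1[x] + 1) >> 1);
            dst += dstStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }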
x265_3.5.tar.gz/source/common/aarch64/mc-a.S -> x265_3.6.tar.gz/source/common/aarch64/mc-a.S
Changed
201
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ * Sebastian Pop <spop@amazon.com>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
12
*****************************************************************************/
13
14
#include "asm.S"
15
+#include "mc-a-common.S"
16
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
.section .rodata
21
+#endif
22
23
.align 4
24
25
.text
26
27
.macro pixel_avg_pp_4xN_neon h
28
-function x265_pixel_avg_pp_4x\h\()_neon
29
+function PFX(pixel_avg_pp_4x\h\()_neon)
30
.rept \h
31
ld1 {v0.s}0, x2, x3
32
ld1 {v1.s}0, x4, x5
33
34
pixel_avg_pp_4xN_neon 16
35
36
.macro pixel_avg_pp_8xN_neon h
37
-function x265_pixel_avg_pp_8x\h\()_neon
38
+function PFX(pixel_avg_pp_8x\h\()_neon)
39
.rept \h
40
ld1 {v0.8b}, x2, x3
41
ld1 {v1.8b}, x4, x5
42
43
pixel_avg_pp_8xN_neon 8
44
pixel_avg_pp_8xN_neon 16
45
pixel_avg_pp_8xN_neon 32
46
+
47
+function PFX(pixel_avg_pp_12x16_neon)
48
+ sub x1, x1, #4
49
+ sub x3, x3, #4
50
+ sub x5, x5, #4
51
+.rept 16
52
+ ld1 {v0.s}0, x2, #4
53
+ ld1 {v1.8b}, x2, x3
54
+ ld1 {v2.s}0, x4, #4
55
+ ld1 {v3.8b}, x4, x5
56
+ urhadd v4.8b, v0.8b, v2.8b
57
+ urhadd v5.8b, v1.8b, v3.8b
58
+ st1 {v4.s}0, x0, #4
59
+ st1 {v5.8b}, x0, x1
60
+.endr
61
+ ret
62
+endfunc
63
+
64
+.macro pixel_avg_pp_16xN_neon h
65
+function PFX(pixel_avg_pp_16x\h\()_neon)
66
+.rept \h
67
+ ld1 {v0.16b}, x2, x3
68
+ ld1 {v1.16b}, x4, x5
69
+ urhadd v2.16b, v0.16b, v1.16b
70
+ st1 {v2.16b}, x0, x1
71
+.endr
72
+ ret
73
+endfunc
74
+.endm
75
+
76
+pixel_avg_pp_16xN_neon 4
77
+pixel_avg_pp_16xN_neon 8
78
+pixel_avg_pp_16xN_neon 12
79
+pixel_avg_pp_16xN_neon 16
80
+pixel_avg_pp_16xN_neon 32
81
+
82
+function PFX(pixel_avg_pp_16x64_neon)
83
+ mov w12, #8
84
+.lpavg_16x64:
85
+ sub w12, w12, #1
86
+.rept 8
87
+ ld1 {v0.16b}, x2, x3
88
+ ld1 {v1.16b}, x4, x5
89
+ urhadd v2.16b, v0.16b, v1.16b
90
+ st1 {v2.16b}, x0, x1
91
+.endr
92
+ cbnz w12, .lpavg_16x64
93
+ ret
94
+endfunc
95
+
96
+function PFX(pixel_avg_pp_24x32_neon)
97
+ sub x1, x1, #16
98
+ sub x3, x3, #16
99
+ sub x5, x5, #16
100
+ mov w12, #4
101
+.lpavg_24x32:
102
+ sub w12, w12, #1
103
+.rept 8
104
+ ld1 {v0.16b}, x2, #16
105
+ ld1 {v1.8b}, x2, x3
106
+ ld1 {v2.16b}, x4, #16
107
+ ld1 {v3.8b}, x4, x5
108
+ urhadd v0.16b, v0.16b, v2.16b
109
+ urhadd v1.8b, v1.8b, v3.8b
110
+ st1 {v0.16b}, x0, #16
111
+ st1 {v1.8b}, x0, x1
112
+.endr
113
+ cbnz w12, .lpavg_24x32
114
+ ret
115
+endfunc
116
+
117
+.macro pixel_avg_pp_32xN_neon h
118
+function PFX(pixel_avg_pp_32x\h\()_neon)
119
+.rept \h
120
+ ld1 {v0.16b-v1.16b}, x2, x3
121
+ ld1 {v2.16b-v3.16b}, x4, x5
122
+ urhadd v0.16b, v0.16b, v2.16b
123
+ urhadd v1.16b, v1.16b, v3.16b
124
+ st1 {v0.16b-v1.16b}, x0, x1
125
+.endr
126
+ ret
127
+endfunc
128
+.endm
129
+
130
+pixel_avg_pp_32xN_neon 8
131
+pixel_avg_pp_32xN_neon 16
132
+pixel_avg_pp_32xN_neon 24
133
+
134
+.macro pixel_avg_pp_32xN1_neon h
135
+function PFX(pixel_avg_pp_32x\h\()_neon)
136
+ mov w12, #\h / 8
137
+.lpavg_32x\h\():
138
+ sub w12, w12, #1
139
+.rept 8
140
+ ld1 {v0.16b-v1.16b}, x2, x3
141
+ ld1 {v2.16b-v3.16b}, x4, x5
142
+ urhadd v0.16b, v0.16b, v2.16b
143
+ urhadd v1.16b, v1.16b, v3.16b
144
+ st1 {v0.16b-v1.16b}, x0, x1
145
+.endr
146
+ cbnz w12, .lpavg_32x\h
147
+ ret
148
+endfunc
149
+.endm
150
+
151
+pixel_avg_pp_32xN1_neon 32
152
+pixel_avg_pp_32xN1_neon 64
153
+
154
+function PFX(pixel_avg_pp_48x64_neon)
155
+ mov w12, #8
156
+.lpavg_48x64:
157
+ sub w12, w12, #1
158
+.rept 8
159
+ ld1 {v0.16b-v2.16b}, x2, x3
160
+ ld1 {v3.16b-v5.16b}, x4, x5
161
+ urhadd v0.16b, v0.16b, v3.16b
162
+ urhadd v1.16b, v1.16b, v4.16b
163
+ urhadd v2.16b, v2.16b, v5.16b
164
+ st1 {v0.16b-v2.16b}, x0, x1
165
+.endr
166
+ cbnz w12, .lpavg_48x64
167
+ ret
168
+endfunc
169
+
170
+.macro pixel_avg_pp_64xN_neon h
171
+function PFX(pixel_avg_pp_64x\h\()_neon)
172
+ mov w12, #\h / 4
173
+.lpavg_64x\h\():
174
+ sub w12, w12, #1
175
+.rept 4
176
+ ld1 {v0.16b-v3.16b}, x2, x3
177
+ ld1 {v4.16b-v7.16b}, x4, x5
178
+ urhadd v0.16b, v0.16b, v4.16b
179
+ urhadd v1.16b, v1.16b, v5.16b
180
+ urhadd v2.16b, v2.16b, v6.16b
181
+ urhadd v3.16b, v3.16b, v7.16b
182
+ st1 {v0.16b-v3.16b}, x0, x1
183
+.endr
184
+ cbnz w12, .lpavg_64x\h
185
+ ret
186
+endfunc
187
+.endm
188
+
189
+pixel_avg_pp_64xN_neon 16
190
+pixel_avg_pp_64xN_neon 32
191
+pixel_avg_pp_64xN_neon 48
192
+pixel_avg_pp_64xN_neon 64
193
+
194
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
195
+.macro addAvg_2xN h
196
+function PFX(addAvg_2x\h\()_neon)
197
+ addAvg_start
198
+.rept \h / 2
199
+ ldr w10, x0
200
+ ldr w11, x1
201
x265_3.6.tar.gz/source/common/aarch64/p2s-common.S
Added
104
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+#if HIGH_BIT_DEPTH
39
+# if BIT_DEPTH == 10
40
+# define P2S_SHIFT 4
41
+# elif BIT_DEPTH == 12
42
+# define P2S_SHIFT 2
43
+# endif
44
+.macro p2s_start
45
+ add x3, x3, x3
46
+ add x1, x1, x1
47
+ movi v31.8h, #0xe0, lsl #8
48
+.endm
49
+
50
+#else // if !HIGH_BIT_DEPTH
51
+# define P2S_SHIFT 6
52
+.macro p2s_start
53
+ add x3, x3, x3
54
+ movi v31.8h, #0xe0, lsl #8
55
+.endm
56
+#endif // HIGH_BIT_DEPTH
57
+
58
+.macro p2s_2x2
59
+#if HIGH_BIT_DEPTH
60
+ ld1 {v0.s}0, x0, x1
61
+ ld1 {v0.s}1, x0, x1
62
+ shl v3.8h, v0.8h, #P2S_SHIFT
63
+#else
64
+ ldrh w10, x0
65
+ add x0, x0, x1
66
+ ldrh w11, x0
67
+ orr w10, w10, w11, lsl #16
68
+ add x0, x0, x1
69
+ dup v0.4s, w10
70
+ ushll v3.8h, v0.8b, #P2S_SHIFT
71
+#endif
72
+ add v3.8h, v3.8h, v31.8h
73
+ st1 {v3.s}0, x2, x3
74
+ st1 {v3.s}1, x2, x3
75
+.endm
76
+
77
+.macro p2s_6x2
78
+#if HIGH_BIT_DEPTH
79
+ ld1 {v0.d}0, x0, #8
80
+ ld1 {v1.s}0, x0, x1
81
+ ld1 {v0.d}1, x0, #8
82
+ ld1 {v1.s}1, x0, x1
83
+ shl v3.8h, v0.8h, #P2S_SHIFT
84
+ shl v4.8h, v1.8h, #P2S_SHIFT
85
+#else
86
+ ldr s0, x0
87
+ ldrh w10, x0, #4
88
+ add x0, x0, x1
89
+ ld1 {v0.s}1, x0
90
+ ldrh w11, x0, #4
91
+ add x0, x0, x1
92
+ orr w10, w10, w11, lsl #16
93
+ dup v1.4s, w10
94
+ ushll v3.8h, v0.8b, #P2S_SHIFT
95
+ ushll v4.8h, v1.8b, #P2S_SHIFT
96
+#endif
97
+ add v3.8h, v3.8h, v31.8h
98
+ add v4.8h, v4.8h, v31.8h
99
+ st1 {v3.d}0, x2, #8
100
+ st1 {v4.s}0, x2, x3
101
+ st1 {v3.d}1, x2, #8
102
+ st1 {v4.s}1, x2, x3
103
+.endm
104
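The p2s macros above convert pixels to the 16-bit intermediate domain used by interpolation: shift left by P2S_SHIFT (6, 4 or 2 depending on bit depth) and add the constant broadcast into v31, 0xE000 per lane, which as a signed value subtracts 8192. A scalar C++ sketch of the 8-bit case, with illustrative names:

    #include <cstdint>

    // Scalar sketch of filterPixelToShort for 8-bit input:
    // dst = (src << 6) - 8192, where -8192 is 0xE000 as a signed 16-bit value.
    static void pixelToShortSketch(const uint8_t *src, intptr_t srcStride,
                                   int16_t *dst, intptr_t dstStride,
                                   int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 8192);
            src += srcStride;
            dst += dstStride;
        }
    }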
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "p2s-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+#if HIGH_BIT_DEPTH
41
+# if BIT_DEPTH == 10
42
+# define P2S_SHIFT 4
43
+# elif BIT_DEPTH == 12
44
+# define P2S_SHIFT 2
45
+# endif
46
+
47
+.macro p2s_start_sve
48
+ add x3, x3, x3
49
+ add x1, x1, x1
50
+ mov z31.h, #0xe0, lsl #8
51
+.endm
52
+
53
+#else // if !HIGH_BIT_DEPTH
54
+# define P2S_SHIFT 6
55
+.macro p2s_start_sve
56
+ add x3, x3, x3
57
+ mov z31.h, #0xe0, lsl #8
58
+.endm
59
+
60
+#endif // HIGH_BIT_DEPTH
61
+
62
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
63
+.macro p2s_2xN_sve h
64
+function PFX(filterPixelToShort_2x\h\()_sve)
65
+ p2s_start_sve
66
+.rept \h / 2
67
+ p2s_2x2
68
+.endr
69
+ ret
70
+endfunc
71
+.endm
72
+
73
+p2s_2xN_sve 4
74
+p2s_2xN_sve 8
75
+p2s_2xN_sve 16
76
+
77
+.macro p2s_6xN_sve h
78
+function PFX(filterPixelToShort_6x\h\()_sve)
79
+ p2s_start_sve
80
+ sub x3, x3, #8
81
+#if HIGH_BIT_DEPTH
82
+ sub x1, x1, #8
83
+#endif
84
+.rept \h / 2
85
+ p2s_6x2
86
+.endr
87
+ ret
88
+endfunc
89
+.endm
90
+
91
+p2s_6xN_sve 8
92
+p2s_6xN_sve 16
93
+
94
+function PFX(filterPixelToShort_4x2_sve)
95
+ p2s_start_sve
96
+#if HIGH_BIT_DEPTH
97
+ ptrue p0.h, vl8
98
+ index z1.d, #0, x1
99
+ index z2.d, #0, x3
100
+ ld1d {z3.d}, p0/z, x0, z1.d
101
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
102
+ add z3.h, p0/m, z3.h, z31.h
103
+ st1d {z3.d}, p0, x2, z2.d
104
+#else
105
+ ptrue p0.h, vl4
106
+ ld1b {z0.h}, p0/z, x0
107
+ add x0, x0, x1
108
+ ld1b {z1.h}, p0/z, x0
109
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
110
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
111
+ add z0.h, p0/m, z0.h, z31.h
112
+ add z1.h, p0/m, z1.h, z31.h
113
+ st1h {z0.h}, p0, x2
114
+ add x2, x2, x3
115
+ st1h {z1.h}, p0, x2
116
+#endif
117
+ ret
118
+endfunc
119
+
120
+
121
+.macro p2s_8xN_sve h
122
+function PFX(filterPixelToShort_8x\h\()_sve)
123
+ p2s_start_sve
124
+ ptrue p0.h, vl8
125
+.rept \h
126
+#if HIGH_BIT_DEPTH
127
+ ld1d {z0.d}, p0/z, x0
128
+ add x0, x0, x1
129
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
130
+ add z0.h, p0/m, z0.h, z31.h
131
+ st1h {z0.h}, p0, x2
132
+ add x2, x2, x3
133
+#else
134
+ ld1b {z0.h}, p0/z, x0
135
+ add x0, x0, x1
136
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
137
+ add z0.h, p0/m, z0.h, z31.h
138
+ st1h {z0.h}, p0, x2
139
+ add x2, x2, x3
140
+#endif
141
+.endr
142
+ ret
143
+endfunc
144
+.endm
145
+
146
+p2s_8xN_sve 2
147
+
148
+.macro p2s_32xN_sve h
149
+function PFX(filterPixelToShort_32x\h\()_sve)
150
+#if HIGH_BIT_DEPTH
151
+ p2s_start_sve
152
+ rdvl x9, #1
153
+ cmp x9, #16
154
+ bgt .vl_gt_16_filterPixelToShort_high_32x\h
155
+ ptrue p0.h, vl8
156
+.rept \h
157
+ ld1h {z0.h}, p0/z, x0
158
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
159
+ ld1h {z2.h}, p0/z, x0, #2, mul vl
160
+ ld1h {z3.h}, p0/z, x0, #3, mul vl
161
+ add x0, x0, x1
162
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
163
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
164
+ lsl z2.h, p0/m, z2.h, #P2S_SHIFT
165
+ lsl z3.h, p0/m, z3.h, #P2S_SHIFT
166
+ add z0.h, p0/m, z0.h, z31.h
167
+ add z1.h, p0/m, z1.h, z31.h
168
+ add z2.h, p0/m, z2.h, z31.h
169
+ add z3.h, p0/m, z3.h, z31.h
170
+ st1h {z0.h}, p0, x2
171
+ st1h {z1.h}, p0, x2, #1, mul vl
172
+ st1h {z2.h}, p0, x2, #2, mul vl
173
+ st1h {z3.h}, p0, x2, #3, mul vl
174
+ add x2, x2, x3
175
+.endr
176
+ ret
177
+.vl_gt_16_filterPixelToShort_high_32x\h\():
178
+ cmp x9, #48
179
+ bgt .vl_gt_48_filterPixelToShort_high_32x\h
180
+ ptrue p0.h, vl16
181
+.rept \h
182
+ ld1h {z0.h}, p0/z, x0
183
+ ld1h {z1.h}, p0/z, x0, #1, mul vl
184
+ add x0, x0, x1
185
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
186
+ lsl z1.h, p0/m, z1.h, #P2S_SHIFT
187
+ add z0.h, p0/m, z0.h, z31.h
188
+ add z1.h, p0/m, z1.h, z31.h
189
+ st1h {z0.h}, p0, x2
190
+ st1h {z1.h}, p0, x2, #1, mul vl
191
+ add x2, x2, x3
192
+.endr
193
+ ret
194
+.vl_gt_48_filterPixelToShort_high_32x\h\():
195
+ ptrue p0.h, vl32
196
+.rept \h
197
+ ld1h {z0.h}, p0/z, x0
198
+ add x0, x0, x1
199
+ lsl z0.h, p0/m, z0.h, #P2S_SHIFT
200
+ add z0.h, p0/m, z0.h, z31.h
201
x265_3.6.tar.gz/source/common/aarch64/p2s.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "p2s-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
39
+.macro p2s_2xN h
40
+function PFX(filterPixelToShort_2x\h\()_neon)
41
+ p2s_start
42
+.rept \h / 2
43
+ p2s_2x2
44
+.endr
45
+ ret
46
+endfunc
47
+.endm
48
+
49
+p2s_2xN 4
50
+p2s_2xN 8
51
+p2s_2xN 16
52
+
53
+.macro p2s_6xN h
54
+function PFX(filterPixelToShort_6x\h\()_neon)
55
+ p2s_start
56
+ sub x3, x3, #8
57
+#if HIGH_BIT_DEPTH
58
+ sub x1, x1, #8
59
+#endif
60
+.rept \h / 2
61
+ p2s_6x2
62
+.endr
63
+ ret
64
+endfunc
65
+.endm
66
+
67
+p2s_6xN 8
68
+p2s_6xN 16
69
+
70
+function PFX(filterPixelToShort_4x2_neon)
71
+ p2s_start
72
+#if HIGH_BIT_DEPTH
73
+ ld1 {v0.d}0, x0, x1
74
+ ld1 {v0.d}1, x0, x1
75
+ shl v3.8h, v0.8h, #P2S_SHIFT
76
+#else
77
+ ld1 {v0.s}0, x0, x1
78
+ ld1 {v0.s}1, x0, x1
79
+ ushll v3.8h, v0.8b, #P2S_SHIFT
80
+#endif
81
+ add v3.8h, v3.8h, v31.8h
82
+ st1 {v3.d}0, x2, x3
83
+ st1 {v3.d}1, x2, x3
84
+ ret
85
+endfunc
86
+
87
+function PFX(filterPixelToShort_4x4_neon)
88
+ p2s_start
89
+#if HIGH_BIT_DEPTH
90
+ ld1 {v0.d}0, x0, x1
91
+ ld1 {v0.d}1, x0, x1
92
+ shl v3.8h, v0.8h, #P2S_SHIFT
93
+#else
94
+ ld1 {v0.s}0, x0, x1
95
+ ld1 {v0.s}1, x0, x1
96
+ ushll v3.8h, v0.8b, #P2S_SHIFT
97
+#endif
98
+ add v3.8h, v3.8h, v31.8h
99
+ st1 {v3.d}0, x2, x3
100
+ st1 {v3.d}1, x2, x3
101
+#if HIGH_BIT_DEPTH
102
+ ld1 {v1.d}0, x0, x1
103
+ ld1 {v1.d}1, x0, x1
104
+ shl v4.8h, v1.8h, #P2S_SHIFT
105
+#else
106
+ ld1 {v1.s}0, x0, x1
107
+ ld1 {v1.s}1, x0, x1
108
+ ushll v4.8h, v1.8b, #P2S_SHIFT
109
+#endif
110
+ add v4.8h, v4.8h, v31.8h
111
+ st1 {v4.d}0, x2, x3
112
+ st1 {v4.d}1, x2, x3
113
+ ret
114
+endfunc
115
+
116
+.macro p2s_4xN h
117
+function PFX(filterPixelToShort_4x\h\()_neon)
118
+ p2s_start
119
+.rept \h / 2
120
+#if HIGH_BIT_DEPTH
121
+ ld1 {v0.16b}, x0, x1
122
+ shl v0.8h, v0.8h, #P2S_SHIFT
123
+#else
124
+ ld1 {v0.8b}, x0, x1
125
+ ushll v0.8h, v0.8b, #P2S_SHIFT
126
+#endif
127
+ add v2.4h, v0.4h, v31.4h
128
+ st1 {v2.4h}, x2, x3
129
+#if HIGH_BIT_DEPTH
130
+ ld1 {v1.16b}, x0, x1
131
+ shl v1.8h, v1.8h, #P2S_SHIFT
132
+#else
133
+ ld1 {v1.8b}, x0, x1
134
+ ushll v1.8h, v1.8b, #P2S_SHIFT
135
+#endif
136
+ add v3.4h, v1.4h, v31.4h
137
+ st1 {v3.4h}, x2, x3
138
+.endr
139
+ ret
140
+endfunc
141
+.endm
142
+
143
+p2s_4xN 8
144
+p2s_4xN 16
145
+p2s_4xN 32
146
+
147
+.macro p2s_8xN h
148
+function PFX(filterPixelToShort_8x\h\()_neon)
149
+ p2s_start
150
+.rept \h / 2
151
+#if HIGH_BIT_DEPTH
152
+ ld1 {v0.16b}, x0, x1
153
+ ld1 {v1.16b}, x0, x1
154
+ shl v0.8h, v0.8h, #P2S_SHIFT
155
+ shl v1.8h, v1.8h, #P2S_SHIFT
156
+#else
157
+ ld1 {v0.8b}, x0, x1
158
+ ld1 {v1.8b}, x0, x1
159
+ ushll v0.8h, v0.8b, #P2S_SHIFT
160
+ ushll v1.8h, v1.8b, #P2S_SHIFT
161
+#endif
162
+ add v2.8h, v0.8h, v31.8h
163
+ st1 {v2.8h}, x2, x3
164
+ add v3.8h, v1.8h, v31.8h
165
+ st1 {v3.8h}, x2, x3
166
+.endr
167
+ ret
168
+endfunc
169
+.endm
170
+
171
+p2s_8xN 2
172
+p2s_8xN 4
173
+p2s_8xN 6
174
+p2s_8xN 8
175
+p2s_8xN 12
176
+p2s_8xN 16
177
+p2s_8xN 32
178
+p2s_8xN 64
179
+
180
+.macro p2s_12xN h
181
+function PFX(filterPixelToShort_12x\h\()_neon)
182
+ p2s_start
183
+ sub x3, x3, #16
184
+.rept \h
185
+#if HIGH_BIT_DEPTH
186
+ ld1 {v0.16b-v1.16b}, x0, x1
187
+ shl v2.8h, v0.8h, #P2S_SHIFT
188
+ shl v3.8h, v1.8h, #P2S_SHIFT
189
+#else
190
+ ld1 {v0.16b}, x0, x1
191
+ ushll v2.8h, v0.8b, #P2S_SHIFT
192
+ ushll2 v3.8h, v0.16b, #P2S_SHIFT
193
+#endif
194
+ add v2.8h, v2.8h, v31.8h
195
+ add v3.8h, v3.8h, v31.8h
196
+ st1 {v2.16b}, x2, #16
197
+ st1 {v3.8b}, x2, x3
198
+.endr
199
+ ret
200
+endfunc
201
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp
Added
201
1
2
+#include "common.h"
3
+#include "slicetype.h" // LOWRES_COST_MASK
4
+#include "primitives.h"
5
+#include "x265.h"
6
+
7
+#include "pixel-prim.h"
8
+#include "arm64-utils.h"
9
+#if HAVE_NEON
10
+
11
+#include <arm_neon.h>
12
+
13
+using namespace X265_NS;
14
+
15
+
16
+
17
+namespace
18
+{
19
+
20
+
21
+/* SATD SA8D variants - based on x264 */
22
+static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
23
+{
24
+ sum = vaddq_s16(a, b);
25
+ sub = vsubq_s16(a, b);
26
+}
27
+
28
+static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
29
+{
30
+ t1 = vtrn1q_s16(s1, s2);
31
+ t2 = vtrn2q_s16(s1, s2);
32
+}
33
+
34
+static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
35
+{
36
+ t1 = vtrn1q_s32(s1, s2);
37
+ t2 = vtrn2q_s32(s1, s2);
38
+}
39
+
40
+#if (X265_DEPTH <= 10)
41
+static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
42
+{
43
+ t1 = vtrn1q_s64(s1, s2);
44
+ t2 = vtrn2q_s64(s1, s2);
45
+}
46
+#endif
47
+
48
+
49
+static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
50
+ int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d)
51
+{
52
+ SUMSUB_AB(s1, d1, a, b);
53
+ SUMSUB_AB(s2, d2, c, d);
54
+}
55
+
56
+static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
57
+ int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
58
+{
59
+ SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
60
+ SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
61
+}
62
+
63
+
64
+static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
65
+
66
+{
67
+
68
+ int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
69
+
70
+
71
+ SUMSUB_AB(v16, v17, v0, v1);
72
+ SUMSUB_AB(v18, v19, v2, v3);
73
+
74
+ SUMSUB_AB(v4 , v6 , v16, v18);
75
+ SUMSUB_AB(v5 , v7 , v17, v19);
76
+
77
+ v0 = vtrn1q_s16(v4, v5);
78
+ v1 = vtrn2q_s16(v4, v5);
79
+ v2 = vtrn1q_s16(v6, v7);
80
+ v3 = vtrn2q_s16(v6, v7);
81
+
82
+ SUMSUB_AB(v16, v17, v0, v1);
83
+ SUMSUB_AB(v18, v19, v2, v3);
84
+
85
+ v0 = vtrn1q_s32(v16, v18);
86
+ v1 = vtrn2q_s32(v16, v18);
87
+ v2 = vtrn1q_s32(v17, v19);
88
+ v3 = vtrn2q_s32(v17, v19);
89
+
90
+ v0 = vabsq_s16(v0);
91
+ v1 = vabsq_s16(v1);
92
+ v2 = vabsq_s16(v2);
93
+ v3 = vabsq_s16(v3);
94
+
95
+ v0 = vmaxq_u16(v0, v1);
96
+ v1 = vmaxq_u16(v2, v3);
97
+
98
+ v0 = vaddq_u16(v0, v1);
99
+ return vaddlvq_u16(v0);
100
+}
101
+
102
+static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
103
+{
104
+ int16x8_t v2, v3;
105
+ SUMSUB_AB(v2, v3, v0, v1);
106
+
107
+ v0 = vzip1q_s64(v2, v3);
108
+ v1 = vzip2q_s64(v2, v3);
109
+ SUMSUB_AB(v2, v3, v0, v1);
110
+
111
+ v0 = vtrn1q_s16(v2, v3);
112
+ v1 = vtrn2q_s16(v2, v3);
113
+ SUMSUB_AB(v2, v3, v0, v1);
114
+
115
+ v0 = vtrn1q_s32(v2, v3);
116
+ v1 = vtrn2q_s32(v2, v3);
117
+
118
+ v0 = vabsq_s16(v0);
119
+ v1 = vabsq_s16(v1);
120
+ v0 = vmaxq_u16(v0, v1);
121
+
122
+ return vaddlvq_s16(v0);
123
+}
124
+
125
+static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
126
+ int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
127
+{
128
+ int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
129
+
130
+ SUMSUB_AB(v16, v18, v0, v2);
131
+ SUMSUB_AB(v17, v19, v1, v3);
132
+
133
+ HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
134
+
135
+ transpose_8h(v0, v1, v16, v17);
136
+ transpose_8h(v2, v3, v18, v19);
137
+ transpose_8h(v4, v5, v20, v21);
138
+ transpose_8h(v6, v7, v22, v23);
139
+
140
+ SUMSUB_AB(v16, v17, v0, v1);
141
+ SUMSUB_AB(v18, v19, v2, v3);
142
+ SUMSUB_AB(v20, v21, v4, v5);
143
+ SUMSUB_AB(v22, v23, v6, v7);
144
+
145
+ transpose_4s(v0, v2, v16, v18);
146
+ transpose_4s(v1, v3, v17, v19);
147
+ transpose_4s(v4, v6, v20, v22);
148
+ transpose_4s(v5, v7, v21, v23);
149
+
150
+ v0 = vabsq_s16(v0);
151
+ v1 = vabsq_s16(v1);
152
+ v2 = vabsq_s16(v2);
153
+ v3 = vabsq_s16(v3);
154
+ v4 = vabsq_s16(v4);
155
+ v5 = vabsq_s16(v5);
156
+ v6 = vabsq_s16(v6);
157
+ v7 = vabsq_s16(v7);
158
+
159
+ v0 = vmaxq_u16(v0, v2);
160
+ v1 = vmaxq_u16(v1, v3);
161
+ v2 = vmaxq_u16(v4, v6);
162
+ v3 = vmaxq_u16(v5, v7);
163
+
164
+}
165
+
166
+#if HIGH_BIT_DEPTH
167
+
168
+#if (X265_DEPTH > 10)
169
+static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
170
+{
171
+ t1 = vtrn1q_s64(s1, s2);
172
+ t2 = vtrn2q_s64(s1, s2);
173
+}
174
+
175
+static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
176
+{
177
+ sum = vaddq_s32(a, b);
178
+ sub = vsubq_s32(a, b);
179
+}
180
+
181
+static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
182
+ const int16x8_t a, const int16x8_t b)
183
+{
184
+ suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
185
+ sumh = vaddl_high_s16(a, b);
186
+ subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
187
+ subh = vsubl_high_s16(a, b);
188
+}
189
+
190
+#endif
191
+
192
+static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
193
+ int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
194
+ int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
195
+{
196
+ uint16x8_t r0, r1, r2, r3;
197
+ uint16x8_t t0, t1, t2, t3;
198
+ int16x8_t v16, v17;
199
+ int16x8_t v18, v19;
200
+
201
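The SUMSUB_AB/HADAMARD4_V helpers above build the 4-point Hadamard butterflies used by the SATD primitives. A scalar C++ analogue of HADAMARD4_V on four values (an illustrative helper, not upstream code):

    // Two butterfly stages, mirroring the pair of SUMSUB_ABCD calls in
    // HADAMARD4_V: (a, b, c, d) -> unnormalized 4-point Hadamard transform.
    static inline void hadamard4Sketch(int &a, int &b, int &c, int &d)
    {
        int s0 = a + b, d0 = a - b;   // first SUMSUB stage
        int s1 = c + d, d1 = c - d;
        a = s0 + s1;                  // second SUMSUB stage
        c = s0 - s1;
        b = d0 + d1;
        d = d0 - d1;
    }

SATD then applies this transform across rows and columns of the pixel-difference block and sums absolute values, which is what the transpose/abs/add sequences in the intrinsics above compute in parallel.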
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.h
Added
25
1
2
+#ifndef PIXEL_PRIM_NEON_H__
3
+#define PIXEL_PRIM_NEON_H__
4
+
5
+#include "common.h"
6
+#include "slicetype.h" // LOWRES_COST_MASK
7
+#include "primitives.h"
8
+#include "x265.h"
9
+
10
+
11
+
12
+namespace X265_NS
13
+{
14
+
15
+
16
+
17
+void setupPixelPrimitives_neon(EncoderPrimitives &p);
18
+
19
+
20
+}
21
+
22
+
23
+#endif
24
+
25
x265_3.6.tar.gz/source/common/aarch64/pixel-util-common.S
Added
86
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+.arch armv8-a
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.macro pixel_var_start
39
+ movi v0.16b, #0
40
+ movi v1.16b, #0
41
+ movi v2.16b, #0
42
+ movi v3.16b, #0
43
+.endm
44
+
45
+.macro pixel_var_1 v
46
+ uaddw v0.8h, v0.8h, \v\().8b
47
+ umull v30.8h, \v\().8b, \v\().8b
48
+ uaddw2 v1.8h, v1.8h, \v\().16b
49
+ umull2 v31.8h, \v\().16b, \v\().16b
50
+ uadalp v2.4s, v30.8h
51
+ uadalp v3.4s, v31.8h
52
+.endm
53
+
54
+.macro pixel_var_end
55
+ uaddlv s0, v0.8h
56
+ uaddlv s1, v1.8h
57
+ add v2.4s, v2.4s, v3.4s
58
+ fadd s0, s0, s1
59
+ uaddlv d2, v2.4s
60
+ fmov w0, s0
61
+ fmov x2, d2
62
+ orr x0, x0, x2, lsl #32
63
+.endm
64
+
65
+.macro ssimDist_start
66
+ movi v0.16b, #0
67
+ movi v1.16b, #0
68
+.endm
69
+
70
+.macro ssimDist_end
71
+ uaddlv d0, v0.4s
72
+ uaddlv d1, v1.4s
73
+ str d0, x6
74
+ str d1, x4
75
+.endm
76
+
77
+.macro normFact_start
78
+ movi v0.16b, #0
79
+.endm
80
+
81
+.macro normFact_end
82
+ uaddlv d0, v0.4s
83
+ str d0, x3
84
+.endm
85
+
86
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sub_ps_8x16_sve)
41
+ lsl x1, x1, #1
42
+ ptrue p0.h, vl8
43
+.rept 8
44
+ ld1b {z0.h}, p0/z, x2
45
+ ld1b {z1.h}, p0/z, x3
46
+ add x2, x2, x4
47
+ add x3, x3, x5
48
+ ld1b {z2.h}, p0/z, x2
49
+ ld1b {z3.h}, p0/z, x3
50
+ add x2, x2, x4
51
+ add x3, x3, x5
52
+ sub z4.h, z0.h, z1.h
53
+ sub z5.h, z2.h, z3.h
54
+ st1 {v4.8h}, x0, x1
55
+ st1 {v5.8h}, x0, x1
56
+.endr
57
+ ret
58
+endfunc
59
+
60
+//******* satd *******
61
+.macro satd_4x4_sve
62
+ ld1b {z0.h}, p0/z, x0
63
+ ld1b {z2.h}, p0/z, x2
64
+ add x0, x0, x1
65
+ add x2, x2, x3
66
+ ld1b {z1.h}, p0/z, x0
67
+ ld1b {z3.h}, p0/z, x2
68
+ add x0, x0, x1
69
+ add x2, x2, x3
70
+ ld1b {z4.h}, p0/z, x0
71
+ ld1b {z6.h}, p0/z, x2
72
+ add x0, x0, x1
73
+ add x2, x2, x3
74
+ ld1b {z5.h}, p0/z, x0
75
+ ld1b {z7.h}, p0/z, x2
76
+ add x0, x0, x1
77
+ add x2, x2, x3
78
+
79
+ sub z0.h, z0.h, z2.h
80
+ sub z1.h, z1.h, z3.h
81
+ sub z2.h, z4.h, z6.h
82
+ sub z3.h, z5.h, z7.h
83
+
84
+ add z4.h, z0.h, z2.h
85
+ add z5.h, z1.h, z3.h
86
+ sub z6.h, z0.h, z2.h
87
+ sub z7.h, z1.h, z3.h
88
+
89
+ add z0.h, z4.h, z5.h
90
+ sub z1.h, z4.h, z5.h
91
+
92
+ add z2.h, z6.h, z7.h
93
+ sub z3.h, z6.h, z7.h
94
+
95
+ trn1 z4.h, z0.h, z2.h
96
+ trn2 z5.h, z0.h, z2.h
97
+
98
+ trn1 z6.h, z1.h, z3.h
99
+ trn2 z7.h, z1.h, z3.h
100
+
101
+ add z0.h, z4.h, z5.h
102
+ sub z1.h, z4.h, z5.h
103
+
104
+ add z2.h, z6.h, z7.h
105
+ sub z3.h, z6.h, z7.h
106
+
107
+ trn1 z4.s, z0.s, z1.s
108
+ trn2 z5.s, z0.s, z1.s
109
+
110
+ trn1 z6.s, z2.s, z3.s
111
+ trn2 z7.s, z2.s, z3.s
112
+
113
+ abs z4.h, p0/m, z4.h
114
+ abs z5.h, p0/m, z5.h
115
+ abs z6.h, p0/m, z6.h
116
+ abs z7.h, p0/m, z7.h
117
+
118
+ smax z4.h, p0/m, z4.h, z5.h
119
+ smax z6.h, p0/m, z6.h, z7.h
120
+
121
+ add z0.h, z4.h, z6.h
122
+
123
+ uaddlp v0.2s, v0.4h
124
+ uaddlp v0.1d, v0.2s
125
+.endm
126
+
127
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
128
+function PFX(pixel_satd_4x4_sve)
129
+ ptrue p0.h, vl4
130
+ satd_4x4_sve
131
+ fmov x0, d0
132
+ ret
133
+endfunc
134
+
135
+function PFX(pixel_satd_8x4_sve)
136
+ ptrue p0.h, vl4
137
+ mov x4, x0
138
+ mov x5, x2
139
+ satd_4x4_sve
140
+ add x0, x4, #4
141
+ add x2, x5, #4
142
+ umov x6, v0.d0
143
+ satd_4x4_sve
144
+ umov x0, v0.d0
145
+ add x0, x0, x6
146
+ ret
147
+endfunc
148
+
149
+function PFX(pixel_satd_8x12_sve)
150
+ ptrue p0.h, vl4
151
+ mov x4, x0
152
+ mov x5, x2
153
+ mov x7, #0
154
+ satd_4x4_sve
155
+ umov x6, v0.d0
156
+ add x7, x7, x6
157
+ add x0, x4, #4
158
+ add x2, x5, #4
159
+ satd_4x4_sve
160
+ umov x6, v0.d0
161
+ add x7, x7, x6
162
+.rept 2
163
+ sub x0, x0, #4
164
+ sub x2, x2, #4
165
+ mov x4, x0
166
+ mov x5, x2
167
+ satd_4x4_sve
168
+ umov x6, v0.d0
169
+ add x7, x7, x6
170
+ add x0, x4, #4
171
+ add x2, x5, #4
172
+ satd_4x4_sve
173
+ umov x6, v0.d0
174
+ add x7, x7, x6
175
+.endr
176
+ mov x0, x7
177
+ ret
178
+endfunc
179
+
180
+.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
181
+ mov x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
182
+ ld1b {z0.h}, p0/z, x0
183
+ ld1b {z1.h}, p0/z, x0, x11
184
+ ld1b {z2.h}, p0/z, x2
185
+ ld1b {z3.h}, p0/z, x2, x11
186
+ add x0, x0, x1
187
+ add x2, x2, x3
188
+ ld1b {z4.h}, p0/z, x0
189
+ ld1b {z5.h}, p0/z, x0, x11
190
+ ld1b {z6.h}, p0/z, x2
191
+ ld1b {z7.h}, p0/z, x2, x11
192
+ add x0, x0, x1
193
+ add x2, x2, x3
194
+ ld1b {z29.h}, p0/z, x0
195
+ ld1b {z9.h}, p0/z, x0, x11
196
+ ld1b {z10.h}, p0/z, x2
197
+ ld1b {z11.h}, p0/z, x2, x11
198
+ add x0, x0, x1
199
+ add x2, x2, x3
200
+ ld1b {z12.h}, p0/z, x0
201
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "pixel-util-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
41
+function PFX(pixel_var_8x8_sve2)
42
+ ptrue p0.h, vl8
43
+ ld1b {z0.h}, p0/z, x0
44
+ add x0, x0, x1
45
+ mul z31.h, z0.h, z0.h
46
+ uaddlp v1.4s, v31.8h
47
+.rept 7
48
+ ld1b {z4.h}, p0/z, x0
49
+ add x0, x0, x1
50
+ add z0.h, z0.h, z4.h
51
+ mul z31.h, z4.h, z4.h
52
+ uadalp z1.s, p0/m, z31.h
53
+.endr
54
+ uaddlv s0, v0.8h
55
+ uaddlv d1, v1.4s
56
+ fmov w0, s0
57
+ fmov x1, d1
58
+ orr x0, x0, x1, lsl #32
59
+ ret
60
+endfunc
61
+
62
+function PFX(pixel_var_16x16_sve2)
63
+ rdvl x9, #1
64
+ cmp x9, #16
65
+ bgt .vl_gt_16_pixel_var_16x16
66
+ pixel_var_start
67
+ mov w12, #16
68
+.loop_var_16_sve2:
69
+ sub w12, w12, #1
70
+ ld1 {v4.16b}, x0, x1
71
+ pixel_var_1 v4
72
+ cbnz w12, .loop_var_16_sve2
73
+ pixel_var_end
74
+ ret
75
+.vl_gt_16_pixel_var_16x16:
76
+ ptrue p0.h, vl16
77
+ mov z0.d, #0
78
+.rept 16
79
+ ld1b {z4.h}, p0/z, x0
80
+ add x0, x0, x1
81
+ add z0.h, z0.h, z4.h
82
+ mul z30.h, z4.h, z4.h
83
+ uadalp z1.s, p0/m, z30.h
84
+.endr
85
+ uaddv d0, p0, z0.h
86
+ uaddv d1, p0, z1.s
87
+ fmov w0, s0
88
+ fmov x1, d1
89
+ orr x0, x0, x1, lsl #32
90
+ ret
91
+endfunc
92
+
93
+function PFX(pixel_var_32x32_sve2)
94
+ rdvl x9, #1
95
+ cmp x9, #16
96
+ bgt .vl_gt_16_pixel_var_32x32
97
+ pixel_var_start
98
+ mov w12, #32
99
+.loop_var_32_sve2:
100
+ sub w12, w12, #1
101
+ ld1 {v4.16b-v5.16b}, x0, x1
102
+ pixel_var_1 v4
103
+ pixel_var_1 v5
104
+ cbnz w12, .loop_var_32_sve2
105
+ pixel_var_end
106
+ ret
107
+.vl_gt_16_pixel_var_32x32:
108
+ cmp x9, #48
109
+ bgt .vl_gt_48_pixel_var_32x32
110
+ ptrue p0.b, vl32
111
+ mov z0.d, #0
112
+ mov z1.d, #0
113
+.rept 32
114
+ ld1b {z4.b}, p0/z, x0
115
+ add x0, x0, x1
116
+ uaddwb z0.h, z0.h, z4.b
117
+ uaddwt z0.h, z0.h, z4.b
118
+ umullb z28.h, z4.b, z4.b
119
+ umullt z29.h, z4.b, z4.b
120
+ uadalp z1.s, p0/m, z28.h
121
+ uadalp z1.s, p0/m, z29.h
122
+.endr
123
+ uaddv d0, p0, z0.h
124
+ uaddv d1, p0, z1.s
125
+ fmov w0, s0
126
+ fmov x1, d1
127
+ orr x0, x0, x1, lsl #32
128
+ ret
129
+.vl_gt_48_pixel_var_32x32:
130
+ ptrue p0.h, vl32
131
+ mov z0.d, #0
132
+ mov z1.d, #0
133
+.rept 32
134
+ ld1b {z4.h}, p0/z, x0
135
+ add x0, x0, x1
136
+ add z0.h, z0.h, z4.h
137
+ mul z28.h, z4.h, z4.h
138
+ uadalp z1.s, p0/m, z28.h
139
+.endr
140
+ uaddv d0, p0, z0.h
141
+ uaddv d1, p0, z1.s
142
+ fmov w0, s0
143
+ fmov x1, d1
144
+ orr x0, x0, x1, lsl #32
145
+ ret
146
+endfunc
147
+
148
+function PFX(pixel_var_64x64_sve2)
149
+ rdvl x9, #1
150
+ cmp x9, #16
151
+ bgt .vl_gt_16_pixel_var_64x64
152
+ pixel_var_start
153
+ mov w12, #64
154
+.loop_var_64_sve2:
155
+ sub w12, w12, #1
156
+ ld1 {v4.16b-v7.16b}, x0, x1
157
+ pixel_var_1 v4
158
+ pixel_var_1 v5
159
+ pixel_var_1 v6
160
+ pixel_var_1 v7
161
+ cbnz w12, .loop_var_64_sve2
162
+ pixel_var_end
163
+ ret
164
+.vl_gt_16_pixel_var_64x64:
165
+ cmp x9, #48
166
+ bgt .vl_gt_48_pixel_var_64x64
167
+ ptrue p0.b, vl32
168
+ mov z0.d, #0
169
+ mov z2.d, #0
170
+.rept 64
171
+ ld1b {z4.b}, p0/z, x0
172
+ ld1b {z5.b}, p0/z, x0, #1, mul vl
173
+ add x0, x0, x1
174
+ uaddwb z0.h, z0.h, z4.b
175
+ uaddwt z0.h, z0.h, z4.b
176
+ uaddwb z0.h, z0.h, z5.b
177
+ uaddwt z0.h, z0.h, z5.b
178
+ umullb z24.h, z4.b, z4.b
179
+ umullt z25.h, z4.b, z4.b
180
+ umullb z26.h, z5.b, z5.b
181
+ umullt z27.h, z5.b, z5.b
182
+ uadalp z2.s, p0/m, z24.h
183
+ uadalp z2.s, p0/m, z25.h
184
+ uadalp z2.s, p0/m, z26.h
185
+ uadalp z2.s, p0/m, z27.h
186
+.endr
187
+ uaddv d0, p0, z0.h
188
+ uaddv d1, p0, z2.s
189
+ fmov w0, s0
190
+ fmov x1, d1
191
+ orr x0, x0, x1, lsl #32
192
+ ret
193
+.vl_gt_48_pixel_var_64x64:
194
+ cmp x9, #112
195
+ bgt .vl_gt_112_pixel_var_64x64
196
+ ptrue p0.b, vl64
197
+ mov z0.d, #0
198
+ mov z1.d, #0
199
+.rept 64
200
+ ld1b {z4.b}, p0/z, x0
201
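The pixel_var kernels above (and their NEON counterparts in pixel-util.S below) return the block's pixel sum in the low 32 bits and the sum of squares in the high 32 bits of a single uint64_t. A scalar C++ sketch of the 8x8 case, with an illustrative name:

    #include <cstdint>

    // Scalar sketch of pixel_var_8x8: pack sum and sum-of-squares into one
    // 64-bit value, as in "return sum + ((uint64_t)sqr << 32)".
    static uint64_t pixelVar8x8Sketch(const uint8_t *pix, intptr_t stride)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < 8; y++, pix += stride)
            for (int x = 0; x < 8; x++)
            {
                sum += pix[x];
                sqr += pix[x] * pix[x];
            }
        return sum + ((uint64_t)sqr << 32);
    }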
x265_3.5.tar.gz/source/common/aarch64/pixel-util.S -> x265_3.6.tar.gz/source/common/aarch64/pixel-util.S
Changed
201
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Yimeng Su <yimeng.su@huawei.com>
7
* Hongbin Liu <liuhongbin1@huawei.com>
8
+ * Sebastian Pop <spop@amazon.com>
9
*
10
* This program is free software; you can redistribute it and/or modify
11
* it under the terms of the GNU General Public License as published by
12
13
*****************************************************************************/
14
15
#include "asm.S"
16
+#include "pixel-util-common.S"
17
18
+#ifdef __APPLE__
19
+.section __RODATA,__rodata
20
+#else
21
.section .rodata
22
+#endif
23
24
.align 4
25
26
.text
27
28
+// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
29
+function PFX(pixel_var_8x8_neon)
30
+ ld1 {v4.8b}, x0, x1 // pixx
31
+ uxtl v0.8h, v4.8b // sum = pixx
32
+ umull v1.8h, v4.8b, v4.8b
33
+ uaddlp v1.4s, v1.8h // sqr = pixx * pixx
34
+
35
+.rept 7
36
+ ld1 {v4.8b}, x0, x1 // pixx
37
+ umull v31.8h, v4.8b, v4.8b
38
+ uaddw v0.8h, v0.8h, v4.8b // sum += pixx
39
+ uadalp v1.4s, v31.8h // sqr += pixx * pixx
40
+.endr
41
+ uaddlv s0, v0.8h
42
+ uaddlv d1, v1.4s
43
+ fmov w0, s0
44
+ fmov x1, d1
45
+ orr x0, x0, x1, lsl #32 // return sum + ((uint64_t)sqr << 32);
46
+ ret
47
+endfunc
48
+
49
+function PFX(pixel_var_16x16_neon)
50
+ pixel_var_start
51
+ mov w12, #16
52
+.loop_var_16:
53
+ sub w12, w12, #1
54
+ ld1 {v4.16b}, x0, x1
55
+ pixel_var_1 v4
56
+ cbnz w12, .loop_var_16
57
+ pixel_var_end
58
+ ret
59
+endfunc
60
+
61
+function PFX(pixel_var_32x32_neon)
62
+ pixel_var_start
63
+ mov w12, #32
64
+.loop_var_32:
65
+ sub w12, w12, #1
66
+ ld1 {v4.16b-v5.16b}, x0, x1
67
+ pixel_var_1 v4
68
+ pixel_var_1 v5
69
+ cbnz w12, .loop_var_32
70
+ pixel_var_end
71
+ ret
72
+endfunc
73
+
74
+function PFX(pixel_var_64x64_neon)
75
+ pixel_var_start
76
+ mov w12, #64
77
+.loop_var_64:
78
+ sub w12, w12, #1
79
+ ld1 {v4.16b-v7.16b}, x0, x1
80
+ pixel_var_1 v4
81
+ pixel_var_1 v5
82
+ pixel_var_1 v6
83
+ pixel_var_1 v7
84
+ cbnz w12, .loop_var_64
85
+ pixel_var_end
86
+ ret
87
+endfunc
88
+
89
+// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
90
+function PFX(getResidual4_neon)
91
+ lsl x4, x3, #1
92
+.rept 2
93
+ ld1 {v0.8b}, x0, x3
94
+ ld1 {v1.8b}, x1, x3
95
+ ld1 {v2.8b}, x0, x3
96
+ ld1 {v3.8b}, x1, x3
97
+ usubl v4.8h, v0.8b, v1.8b
98
+ usubl v5.8h, v2.8b, v3.8b
99
+ st1 {v4.8b}, x2, x4
100
+ st1 {v5.8b}, x2, x4
101
+.endr
102
+ ret
103
+endfunc
104
+
105
+function PFX(getResidual8_neon)
106
+ lsl x4, x3, #1
107
+.rept 4
108
+ ld1 {v0.8b}, x0, x3
109
+ ld1 {v1.8b}, x1, x3
110
+ ld1 {v2.8b}, x0, x3
111
+ ld1 {v3.8b}, x1, x3
112
+ usubl v4.8h, v0.8b, v1.8b
113
+ usubl v5.8h, v2.8b, v3.8b
114
+ st1 {v4.16b}, x2, x4
115
+ st1 {v5.16b}, x2, x4
116
+.endr
117
+ ret
118
+endfunc
119
+
120
+function PFX(getResidual16_neon)
121
+ lsl x4, x3, #1
122
+.rept 8
123
+ ld1 {v0.16b}, x0, x3
124
+ ld1 {v1.16b}, x1, x3
125
+ ld1 {v2.16b}, x0, x3
126
+ ld1 {v3.16b}, x1, x3
127
+ usubl v4.8h, v0.8b, v1.8b
128
+ usubl2 v5.8h, v0.16b, v1.16b
129
+ usubl v6.8h, v2.8b, v3.8b
130
+ usubl2 v7.8h, v2.16b, v3.16b
131
+ st1 {v4.8h-v5.8h}, x2, x4
132
+ st1 {v6.8h-v7.8h}, x2, x4
133
+.endr
134
+ ret
135
+endfunc
136
+
137
+function PFX(getResidual32_neon)
138
+ lsl x4, x3, #1
139
+ mov w12, #4
140
+.loop_residual_32:
141
+ sub w12, w12, #1
142
+.rept 4
143
+ ld1 {v0.16b-v1.16b}, x0, x3
144
+ ld1 {v2.16b-v3.16b}, x1, x3
145
+ ld1 {v4.16b-v5.16b}, x0, x3
146
+ ld1 {v6.16b-v7.16b}, x1, x3
147
+ usubl v16.8h, v0.8b, v2.8b
148
+ usubl2 v17.8h, v0.16b, v2.16b
149
+ usubl v18.8h, v1.8b, v3.8b
150
+ usubl2 v19.8h, v1.16b, v3.16b
151
+ usubl v20.8h, v4.8b, v6.8b
152
+ usubl2 v21.8h, v4.16b, v6.16b
153
+ usubl v22.8h, v5.8b, v7.8b
154
+ usubl2 v23.8h, v5.16b, v7.16b
155
+ st1 {v16.8h-v19.8h}, x2, x4
156
+ st1 {v20.8h-v23.8h}, x2, x4
157
+.endr
158
+ cbnz w12, .loop_residual_32
159
+ ret
160
+endfunc
161
+
162
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
163
+function PFX(pixel_sub_ps_4x4_neon)
164
+ lsl x1, x1, #1
165
+.rept 2
166
+ ld1 {v0.8b}, x2, x4
167
+ ld1 {v1.8b}, x3, x5
168
+ ld1 {v2.8b}, x2, x4
169
+ ld1 {v3.8b}, x3, x5
170
+ usubl v4.8h, v0.8b, v1.8b
171
+ usubl v5.8h, v2.8b, v3.8b
172
+ st1 {v4.4h}, x0, x1
173
+ st1 {v5.4h}, x0, x1
174
+.endr
175
+ ret
176
+endfunc
177
+
178
+function PFX(pixel_sub_ps_8x8_neon)
179
+ lsl x1, x1, #1
180
+.rept 4
181
+ ld1 {v0.8b}, x2, x4
182
+ ld1 {v1.8b}, x3, x5
183
+ ld1 {v2.8b}, x2, x4
184
+ ld1 {v3.8b}, x3, x5
185
+ usubl v4.8h, v0.8b, v1.8b
186
+ usubl v5.8h, v2.8b, v3.8b
187
+ st1 {v4.8h}, x0, x1
188
+ st1 {v5.8h}, x0, x1
189
+.endr
190
+ ret
191
+endfunc
192
+
193
+function PFX(pixel_sub_ps_16x16_neon)
194
+ lsl x1, x1, #1
195
+.rept 8
196
+ ld1 {v0.16b}, x2, x4
197
+ ld1 {v1.16b}, x3, x5
198
+ ld1 {v2.16b}, x2, x4
199
+ ld1 {v3.16b}, x3, x5
200
+ usubl v4.8h, v0.8b, v1.8b
201
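getResidual and pixel_sub_ps above both widen 8-bit pixels and store signed 16-bit differences (usubl/usubl2 per lane). A scalar C++ sketch following the getResidual signature shown in the comment above, where blockSize generalizes the 4/8/16/32 variants:

    #include <cstdint>

    // Scalar sketch of getResidual: residual = fenc - pred, widened to int16.
    static void getResidualSketch(const uint8_t *fenc, const uint8_t *pred,
                                  int16_t *residual, intptr_t stride, int blockSize)
    {
        for (int y = 0; y < blockSize; y++)
        {
            for (int x = 0; x < blockSize; x++)
                residual[x] = (int16_t)(fenc[x] - pred[x]);
            fenc += stride;
            pred += stride;
            residual += stride;
        }
    }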
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch armv8-a
31
+
32
+#ifdef __APPLE__
33
+.section __RODATA,__rodata
34
+#else
35
+.section .rodata
36
+#endif
37
+
38
+.align 4
39
+
40
+.macro SAD_START_4 f
41
+ ld1 {v0.s}0, x0, x1
42
+ ld1 {v0.s}1, x0, x1
43
+ ld1 {v1.s}0, x2, x3
44
+ ld1 {v1.s}1, x2, x3
45
+ \f v16.8h, v0.8b, v1.8b
46
+.endm
47
+
48
+.macro SAD_4 h
49
+.rept \h / 2 - 1
50
+ SAD_START_4 uabal
51
+.endr
52
+.endm
53
+
54
+.macro SAD_START_8 f
55
+ ld1 {v0.8b}, x0, x1
56
+ ld1 {v1.8b}, x2, x3
57
+ ld1 {v2.8b}, x0, x1
58
+ ld1 {v3.8b}, x2, x3
59
+ \f v16.8h, v0.8b, v1.8b
60
+ \f v17.8h, v2.8b, v3.8b
61
+.endm
62
+
63
+.macro SAD_8 h
64
+.rept \h / 2 - 1
65
+ SAD_START_8 uabal
66
+.endr
67
+.endm
68
+
69
+.macro SAD_START_16 f
70
+ ld1 {v0.16b}, x0, x1
71
+ ld1 {v1.16b}, x2, x3
72
+ ld1 {v2.16b}, x0, x1
73
+ ld1 {v3.16b}, x2, x3
74
+ \f v16.8h, v0.8b, v1.8b
75
+ \f\()2 v17.8h, v0.16b, v1.16b
76
+ uabal v16.8h, v2.8b, v3.8b
77
+ uabal2 v17.8h, v2.16b, v3.16b
78
+.endm
79
+
80
+.macro SAD_16 h
81
+.rept \h / 2 - 1
82
+ SAD_START_16 uabal
83
+.endr
84
+.endm
85
+
86
+.macro SAD_START_32
87
+ movi v16.16b, #0
88
+ movi v17.16b, #0
89
+ movi v18.16b, #0
90
+ movi v19.16b, #0
91
+.endm
92
+
93
+.macro SAD_32
94
+ ld1 {v0.16b-v1.16b}, x0, x1
95
+ ld1 {v2.16b-v3.16b}, x2, x3
96
+ ld1 {v4.16b-v5.16b}, x0, x1
97
+ ld1 {v6.16b-v7.16b}, x2, x3
98
+ uabal v16.8h, v0.8b, v2.8b
99
+ uabal2 v17.8h, v0.16b, v2.16b
100
+ uabal v18.8h, v1.8b, v3.8b
101
+ uabal2 v19.8h, v1.16b, v3.16b
102
+ uabal v16.8h, v4.8b, v6.8b
103
+ uabal2 v17.8h, v4.16b, v6.16b
104
+ uabal v18.8h, v5.8b, v7.8b
105
+ uabal2 v19.8h, v5.16b, v7.16b
106
+.endm
107
+
108
+.macro SAD_END_32
109
+ add v16.8h, v16.8h, v17.8h
110
+ add v17.8h, v18.8h, v19.8h
111
+ add v16.8h, v16.8h, v17.8h
112
+ uaddlv s0, v16.8h
113
+ fmov w0, s0
114
+ ret
115
+.endm
116
+
117
+.macro SAD_START_64
118
+ movi v16.16b, #0
119
+ movi v17.16b, #0
120
+ movi v18.16b, #0
121
+ movi v19.16b, #0
122
+ movi v20.16b, #0
123
+ movi v21.16b, #0
124
+ movi v22.16b, #0
125
+ movi v23.16b, #0
126
+.endm
127
+
128
+.macro SAD_64
129
+ ld1 {v0.16b-v3.16b}, x0, x1
130
+ ld1 {v4.16b-v7.16b}, x2, x3
131
+ ld1 {v24.16b-v27.16b}, x0, x1
132
+ ld1 {v28.16b-v31.16b}, x2, x3
133
+ uabal v16.8h, v0.8b, v4.8b
134
+ uabal2 v17.8h, v0.16b, v4.16b
135
+ uabal v18.8h, v1.8b, v5.8b
136
+ uabal2 v19.8h, v1.16b, v5.16b
137
+ uabal v20.8h, v2.8b, v6.8b
138
+ uabal2 v21.8h, v2.16b, v6.16b
139
+ uabal v22.8h, v3.8b, v7.8b
140
+ uabal2 v23.8h, v3.16b, v7.16b
141
+
142
+ uabal v16.8h, v24.8b, v28.8b
143
+ uabal2 v17.8h, v24.16b, v28.16b
144
+ uabal v18.8h, v25.8b, v29.8b
145
+ uabal2 v19.8h, v25.16b, v29.16b
146
+ uabal v20.8h, v26.8b, v30.8b
147
+ uabal2 v21.8h, v26.16b, v30.16b
148
+ uabal v22.8h, v27.8b, v31.8b
149
+ uabal2 v23.8h, v27.16b, v31.16b
150
+.endm
151
+
152
+.macro SAD_END_64
153
+ add v16.8h, v16.8h, v17.8h
154
+ add v17.8h, v18.8h, v19.8h
155
+ add v16.8h, v16.8h, v17.8h
156
+ uaddlp v16.4s, v16.8h
157
+ add v18.8h, v20.8h, v21.8h
158
+ add v19.8h, v22.8h, v23.8h
159
+ add v17.8h, v18.8h, v19.8h
160
+ uaddlp v17.4s, v17.8h
161
+ add v16.4s, v16.4s, v17.4s
162
+ uaddlv d0, v16.4s
163
+ fmov x0, d0
164
+ ret
165
+.endm
166
+
167
+.macro SAD_START_12
168
+ movrel x12, sad12_mask
169
+ ld1 {v31.16b}, x12
170
+ movi v16.16b, #0
171
+ movi v17.16b, #0
172
+.endm
173
+
174
+.macro SAD_12
175
+ ld1 {v0.16b}, [x0], x1
176
+ and v0.16b, v0.16b, v31.16b
177
+ ld1 {v1.16b}, [x2], x3
178
+ and v1.16b, v1.16b, v31.16b
179
+ ld1 {v2.16b}, [x0], x1
180
+ and v2.16b, v2.16b, v31.16b
181
+ ld1 {v3.16b}, [x2], x3
182
+ and v3.16b, v3.16b, v31.16b
183
+ uabal v16.8h, v0.8b, v1.8b
184
+ uabal2 v17.8h, v0.16b, v1.16b
185
+ uabal v16.8h, v2.8b, v3.8b
186
+ uabal2 v17.8h, v2.16b, v3.16b
187
+.endm
188
+
189
+.macro SAD_END_12
190
+ add v16.8h, v16.8h, v17.8h
191
+ uaddlv s0, v16.8h
192
+ fmov w0, s0
193
+ ret
194
+.endm
195
+
196
+.macro SAD_START_24
197
+ movi v16.16b, #0
198
+ movi v17.16b, #0
199
+ movi v18.16b, #0
200
+ sub x1, x1, #16
201
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "sad-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+.macro SAD_SVE2_16 h
41
+ mov z16.d, #0
42
+ ptrue p0.h, vl16
43
+.rept \h
44
+ ld1b {z0.h}, p0/z, [x0]
46
+ ld1b {z2.h}, p0/z, [x2]
46
+ add x0, x0, x1
47
+ add x2, x2, x3
48
+ uaba z16.h, z0.h, z2.h
49
+.endr
50
+ uaddv d0, p0, z16.h
51
+ fmov w0, s0
52
+ ret
53
+.endm
54
+
55
+.macro SAD_SVE2_32 h
56
+ ptrue p0.b, vl32
57
+.rept \h
58
+ ld1b {z0.b}, p0/z, [x0]
59
+ ld1b {z4.b}, p0/z, [x2]
60
+ add x0, x0, x1
61
+ add x2, x2, x3
62
+ uabalb z16.h, z0.b, z4.b
63
+ uabalt z16.h, z0.b, z4.b
64
+.endr
65
+ uaddv d0, p0, z16.h
66
+ fmov w0, s0
67
+ ret
68
+.endm
69
+
70
+.macro SAD_SVE2_64 h
71
+ cmp x9, #48
72
+ bgt .vl_gt_48_pixel_sad_64x\h
73
+ mov z16.d, #0
74
+ mov z17.d, #0
75
+ mov z18.d, #0
76
+ mov z19.d, #0
77
+ ptrue p0.b, vl32
78
+.rept \h
79
+ ld1b {z0.b}, p0/z, [x0]
80
+ ld1b {z1.b}, p0/z, [x0, #1, mul vl]
81
+ ld1b {z4.b}, p0/z, [x2]
82
+ ld1b {z5.b}, p0/z, [x2, #1, mul vl]
83
+ add x0, x0, x1
84
+ add x2, x2, x3
85
+ uabalb z16.h, z0.b, z4.b
86
+ uabalt z17.h, z0.b, z4.b
87
+ uabalb z18.h, z1.b, z5.b
88
+ uabalt z19.h, z1.b, z5.b
89
+.endr
90
+ add z16.h, z16.h, z17.h
91
+ add z17.h, z18.h, z19.h
92
+ add z16.h, z16.h, z17.h
93
+ uadalp z24.s, p0/m, z16.h
94
+ uaddv d5, p0, z24.s
95
+ fmov x0, d5
96
+ ret
97
+.vl_gt_48_pixel_sad_64x\h\():
98
+ mov z16.d, #0
99
+ mov z17.d, #0
100
+ mov z24.d, #0
101
+ ptrue p0.b, vl64
102
+.rept \h
103
+ ld1b {z0.b}, p0/z, [x0]
104
+ ld1b {z4.b}, p0/z, [x2]
105
+ add x0, x0, x1
106
+ add x2, x2, x3
107
+ uabalb z16.h, z0.b, z4.b
108
+ uabalt z17.h, z0.b, z4.b
109
+.endr
110
+ add z16.h, z16.h, z17.h
111
+ uadalp z24.s, p0/m, z16.h
112
+ uaddv d5, p0, z24.s
113
+ fmov x0, d5
114
+ ret
115
+.endm
116
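A rough C++ model of the vector-length dispatch used by these SVE2 macros; rdvl x9, #1 yields the register width in bytes, and the thresholds below mirror the cmp #16 / cmp #48 branches (everything else is illustrative):

// Pick a code path from the SVE vector length, as the assembly above does.
int sve2_sad_path(int vl_bytes)
{
    if (vl_bytes <= 16)
        return 128;   // same width as NEON: reuse the NEON-style macros
    if (vl_bytes <= 48)
        return 256;   // 256/384-bit registers: 64-pixel rows need two loads
    return 512;       // 512-bit or wider: one predicated load covers the row
}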
+
117
+.macro SAD_SVE2_24 h
118
+ mov z16.d, #0
119
+ mov x10, #24
120
+ mov x11, #0
121
+ whilelt p0.b, x11, x10
122
+.rept \h
123
+ ld1b {z0.b}, p0/z, [x0]
124
+ ld1b {z8.b}, p0/z, [x2]
125
+ add x0, x0, x1
126
+ add x2, x2, x3
127
+ uabalb z16.h, z0.b, z8.b
128
+ uabalt z16.h, z0.b, z8.b
129
+.endr
130
+ uaddv d5, p0, z16.h
131
+ fmov w0, s5
132
+ ret
133
+.endm
134
+
135
+.macro SAD_SVE2_48 h
136
+ cmp x9, #48
137
+ bgt .vl_gt_48_pixel_sad_48x\h
138
+ mov z16.d, #0
139
+ mov z17.d, #0
140
+ mov z18.d, #0
141
+ mov z19.d, #0
142
+ ptrue p0.b, vl32
143
+ ptrue p1.b, vl16
144
+.rept \h
145
+ ld1b {z0.b}, p0/z, [x0]
146
+ ld1b {z1.b}, p1/z, [x0, #1, mul vl]
147
+ ld1b {z8.b}, p0/z, [x2]
148
+ ld1b {z9.b}, p1/z, [x2, #1, mul vl]
149
+ add x0, x0, x1
150
+ add x2, x2, x3
151
+ uabalb z16.h, z0.b, z8.b
152
+ uabalt z17.h, z0.b, z8.b
153
+ uabalb z18.h, z1.b, z9.b
154
+ uabalt z19.h, z1.b, z9.b
155
+.endr
156
+ add z16.h, z16.h, z17.h
157
+ add z17.h, z18.h, z19.h
158
+ add z16.h, z16.h, z17.h
159
+ uaddv d5, p0, z16.h
160
+ fmov w0, s5
161
+ ret
162
+.vl_gt_48_pixel_sad_48x\h\():
163
+ mov z16.d, #0
164
+ mov z17.d, #0
165
+ mov x10, #48
166
+ mov x11, #0
167
+ whilelt p0.b, x11, x10
168
+.rept \h
169
+ ld1b {z0.b}, p0/z, [x0]
170
+ ld1b {z8.b}, p0/z, [x2]
171
+ add x0, x0, x1
172
+ add x2, x2, x3
173
+ uabalb z16.h, z0.b, z8.b
174
+ uabalt z17.h, z0.b, z8.b
175
+.endr
176
+ add z16.h, z16.h, z17.h
177
+ uaddv d5, p0, z16.h
178
+ fmov w0, s5
179
+ ret
180
+.endm
181
+
182
+// Fully unrolled.
183
+.macro SAD_FUNC_SVE2 w, h
184
+function PFX(pixel_sad_\w\()x\h\()_sve2)
185
+ rdvl x9, #1
186
+ cmp x9, #16
187
+ bgt .vl_gt_16_pixel_sad_\w\()x\h
188
+ SAD_START_\w uabdl
189
+ SAD_\w \h
190
+.if \w > 4
191
+ add v16.8h, v16.8h, v17.8h
192
+.endif
193
+ uaddlv s0, v16.8h
194
+ fmov w0, s0
195
+ ret
196
+.vl_gt_16_pixel_sad_\w\()x\h\():
197
+.if \w == 4 || \w == 8 || \w == 12
198
+ SAD_START_\w uabdl
199
+ SAD_\w \h
200
+.if \w > 4
201
x265_3.5.tar.gz/source/common/aarch64/sad-a.S -> x265_3.6.tar.gz/source/common/aarch64/sad-a.S
Changed
201
1
2
/*****************************************************************************
3
- * Copyright (C) 2020 MulticoreWare, Inc
4
+ * Copyright (C) 2020-2021 MulticoreWare, Inc
5
*
6
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
7
+ * Sebastian Pop <spop@amazon.com>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
12
*****************************************************************************/
13
14
#include "asm.S"
15
+#include "sad-a-common.S"
16
17
+#ifdef __APPLE__
18
+.section __RODATA,__rodata
19
+#else
20
.section .rodata
21
+#endif
22
23
.align 4
24
25
.text
26
27
-.macro SAD_X_START_8 x
28
- ld1 {v0.8b}, [x0], x9
29
-.if \x == 3
30
- ld1 {v1.8b}, [x1], x4
31
- ld1 {v2.8b}, [x2], x4
32
- ld1 {v3.8b}, [x3], x4
33
-.elseif \x == 4
34
- ld1 {v1.8b}, [x1], x5
35
- ld1 {v2.8b}, [x2], x5
36
- ld1 {v3.8b}, [x3], x5
37
- ld1 {v4.8b}, [x4], x5
38
-.endif
39
- uabdl v16.8h, v0.8b, v1.8b
40
- uabdl v17.8h, v0.8b, v2.8b
41
- uabdl v18.8h, v0.8b, v3.8b
42
-.if \x == 4
43
- uabdl v19.8h, v0.8b, v4.8b
44
+// Fully unrolled.
45
+.macro SAD_FUNC w, h
46
+function PFX(pixel_sad_\w\()x\h\()_neon)
47
+ SAD_START_\w uabdl
48
+ SAD_\w \h
49
+.if \w > 4
50
+ add v16.8h, v16.8h, v17.8h
51
.endif
52
+ uaddlv s0, v16.8h
53
+ fmov w0, s0
54
+ ret
55
+endfunc
56
+.endm
57
+
58
+// Loop unrolled 4.
59
+.macro SAD_FUNC_LOOP w, h
60
+function PFX(pixel_sad_\w\()x\h\()_neon)
61
+ SAD_START_\w
62
+
63
+ mov w9, #\h/8
64
+.loop_\w\()x\h:
65
+ sub w9, w9, #1
66
+.rept 4
67
+ SAD_\w
68
+.endr
69
+ cbnz w9, .loop_\w\()x\h
70
+
71
+ SAD_END_\w
72
+endfunc
73
.endm
74
75
-.macro SAD_X_8 x
76
- ld1 {v0.8b}, [x0], x9
77
+SAD_FUNC 4, 4
78
+SAD_FUNC 4, 8
79
+SAD_FUNC 4, 16
80
+SAD_FUNC 8, 4
81
+SAD_FUNC 8, 8
82
+SAD_FUNC 8, 16
83
+SAD_FUNC 8, 32
84
+SAD_FUNC 16, 4
85
+SAD_FUNC 16, 8
86
+SAD_FUNC 16, 12
87
+SAD_FUNC 16, 16
88
+SAD_FUNC 16, 32
89
+SAD_FUNC 16, 64
90
+
91
+SAD_FUNC_LOOP 32, 8
92
+SAD_FUNC_LOOP 32, 16
93
+SAD_FUNC_LOOP 32, 24
94
+SAD_FUNC_LOOP 32, 32
95
+SAD_FUNC_LOOP 32, 64
96
+SAD_FUNC_LOOP 64, 16
97
+SAD_FUNC_LOOP 64, 32
98
+SAD_FUNC_LOOP 64, 48
99
+SAD_FUNC_LOOP 64, 64
100
+SAD_FUNC_LOOP 12, 16
101
+SAD_FUNC_LOOP 24, 32
102
+SAD_FUNC_LOOP 48, 64
103
+
104
+// SAD_X3 and SAD_X4 code start
105
+
106
+// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
107
+// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
108
+.macro SAD_X_FUNC x, w, h
109
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
110
+ mov x9, #FENC_STRIDE
111
+
112
+// Make function arguments for x == 3 look like x == 4.
113
.if \x == 3
114
- ld1 {v1.8b}, [x1], x4
115
- ld1 {v2.8b}, [x2], x4
116
- ld1 {v3.8b}, [x3], x4
117
-.elseif \x == 4
118
- ld1 {v1.8b}, [x1], x5
119
- ld1 {v2.8b}, [x2], x5
120
- ld1 {v3.8b}, [x3], x5
121
- ld1 {v4.8b}, [x4], x5
122
+ mov x6, x5
123
+ mov x5, x4
124
.endif
125
- uabal v16.8h, v0.8b, v1.8b
126
- uabal v17.8h, v0.8b, v2.8b
127
- uabal v18.8h, v0.8b, v3.8b
128
-.if \x == 4
129
- uabal v19.8h, v0.8b, v4.8b
130
+
131
+.if \w == 12
132
+ movrel x12, sad12_mask
133
+ ld1 {v31.16b}, x12
134
.endif
135
+
136
+ SAD_X_START_\w \h, \x, uabdl
137
+ SAD_X_\w \h, \x
138
+ SAD_X_END_\w \x
139
+endfunc
140
.endm
141
142
-.macro SAD_X_8xN x, h
143
-function x265_sad_x\x\()_8x\h\()_neon
144
+.macro SAD_X_LOOP x, w, h
145
+function PFX(sad_x\x\()_\w\()x\h\()_neon)
146
mov x9, #FENC_STRIDE
147
- SAD_X_START_8 \x
148
-.rept \h - 1
149
- SAD_X_8 \x
150
-.endr
151
- uaddlv s0, v16.8h
152
- uaddlv s1, v17.8h
153
- uaddlv s2, v18.8h
154
-.if \x == 4
155
- uaddlv s3, v19.8h
156
-.endif
157
158
+// Make function arguments for x == 3 look like x == 4.
159
.if \x == 3
160
- stp s0, s1, [x5]
161
- str s2, [x5, #8]
162
-.elseif \x == 4
163
- stp s0, s1, [x6]
164
- stp s2, s3, [x6, #8]
165
+ mov x6, x5
166
+ mov x5, x4
167
.endif
168
- ret
169
+ SAD_X_START_\w \x
170
+ mov w12, #\h/4
171
+.loop_sad_x\x\()_\w\()x\h:
172
+ sub w12, w12, #1
173
+ .rept 4
174
+ .if \w == 24
175
+ ld1 {v6.16b}, [x0], #16
176
+ ld1 {v7.8b}, [x0], x9
177
+ .elseif \w == 32
178
+ ld1 {v6.16b-v7.16b}, [x0], x9
179
+ .elseif \w == 48
180
+ ld1 {v4.16b-v6.16b}, [x0], x9
181
+ .elseif \w == 64
182
+ ld1 {v4.16b-v7.16b}, [x0], x9
183
+ .endif
184
+ SAD_X_\w x1, v16, v20
185
+ SAD_X_\w x2, v17, v21
186
+ SAD_X_\w x3, v18, v22
187
+ .if \x == 4
188
+ SAD_X_\w x4, v19, v23
189
+ .endif
190
+ .endr
191
+ cbnz w12, .loop_sad_x\x\()_\w\()x\h
192
+ SAD_X_END_\w \x
193
endfunc
194
.endm
195
196
-SAD_X_8xN 3 4
197
-SAD_X_8xN 3 8
198
-SAD_X_8xN 3 16
199
-SAD_X_8xN 3 32
200
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S
Added
39
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+// This file contains the macros written using NEON instruction set
26
+// that are also used by the SVE2 functions
27
+
28
+#include "asm.S"
29
+
30
+.arch armv8-a
31
+
32
+.macro ret_v0_w0
33
+ trn2 v1.2d, v0.2d, v0.2d
34
+ add v0.2s, v0.2s, v1.2s
35
+ addp v0.2s, v0.2s, v0.2s
36
+ fmov w0, s0
37
+ ret
38
+.endm
39
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S
Added
80
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+
27
+.arch armv8-a+sve
28
+
29
+#ifdef __APPLE__
30
+.section __RODATA,__rodata
31
+#else
32
+.section .rodata
33
+#endif
34
+
35
+.align 4
36
+
37
+.text
38
+
39
+function PFX(pixel_sse_pp_4x4_sve)
40
+ ptrue p0.s, vl4
41
+ ld1b {z0.s}, p0/z, [x0]
42
+ ld1b {z17.s}, p0/z, [x2]
43
+ add x0, x0, x1
44
+ add x2, x2, x3
45
+ sub z0.s, p0/m, z0.s, z17.s
46
+ mul z0.s, p0/m, z0.s, z0.s
47
+.rept 3
48
+ ld1b {z16.s}, p0/z, [x0]
49
+ ld1b {z17.s}, p0/z, [x2]
50
+ add x0, x0, x1
51
+ add x2, x2, x3
52
+ sub z16.s, p0/m, z16.s, z17.s
53
+ mla z0.s, p0/m, z16.s, z16.s
54
+.endr
55
+ uaddv d0, p0, z0.s
56
+ fmov w0, s0
57
+ ret
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_sve)
61
+ ptrue p0.s, vl4
62
+ ld1b {z0.s}, p0/z, [x0]
63
+ ld1b {z17.s}, p0/z, [x2]
64
+ add x0, x0, x1
65
+ add x2, x2, x3
66
+ sub z0.s, p0/m, z0.s, z17.s
67
+ mul z0.s, p0/m, z0.s, z0.s
68
+.rept 7
69
+ ld1b {z16.s}, p0/z, [x0]
70
+ ld1b {z17.s}, p0/z, [x2]
71
+ add x0, x0, x1
72
+ add x2, x2, x3
73
+ sub z16.s, p0/m, z16.s, z17.s
74
+ mla z0.s, p0/m, z16.s, z16.s
75
+.endr
76
+ uaddv d0, p0, z0.s
77
+ fmov w0, s0
78
+ ret
79
+endfunc
80
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2022-2023 MulticoreWare, Inc
4
+ *
5
+ * Authors: David Chen <david.chen@myais.com.cn>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm-sve.S"
26
+#include "ssd-a-common.S"
27
+
28
+.arch armv8-a+sve2
29
+
30
+#ifdef __APPLE__
31
+.section __RODATA,__rodata
32
+#else
33
+.section .rodata
34
+#endif
35
+
36
+.align 4
37
+
38
+.text
39
+
40
+function PFX(pixel_sse_pp_32x32_sve2)
41
+ rdvl x9, #1
42
+ cmp x9, #16
43
+ bgt .vl_gt_16_pixel_sse_pp_32x32
44
+ mov w12, #8
45
+ movi v0.16b, #0
46
+ movi v1.16b, #0
47
+.loop_sse_pp_32_sve2:
48
+ sub w12, w12, #1
49
+.rept 4
50
+ ld1 {v16.16b,v17.16b}, [x0], x1
51
+ ld1 {v18.16b,v19.16b}, [x2], x3
52
+ usubl v2.8h, v16.8b, v18.8b
53
+ usubl2 v3.8h, v16.16b, v18.16b
54
+ usubl v4.8h, v17.8b, v19.8b
55
+ usubl2 v5.8h, v17.16b, v19.16b
56
+ smlal v0.4s, v2.4h, v2.4h
57
+ smlal2 v1.4s, v2.8h, v2.8h
58
+ smlal v0.4s, v3.4h, v3.4h
59
+ smlal2 v1.4s, v3.8h, v3.8h
60
+ smlal v0.4s, v4.4h, v4.4h
61
+ smlal2 v1.4s, v4.8h, v4.8h
62
+ smlal v0.4s, v5.4h, v5.4h
63
+ smlal2 v1.4s, v5.8h, v5.8h
64
+.endr
65
+ cbnz w12, .loop_sse_pp_32_sve2
66
+ add v0.4s, v0.4s, v1.4s
67
+ ret_v0_w0
68
+.vl_gt_16_pixel_sse_pp_32x32:
69
+ ptrue p0.b, vl32
70
+ ld1b {z16.b}, p0/z, [x0]
71
+ ld1b {z18.b}, p0/z, [x2]
72
+ add x0, x0, x1
73
+ add x2, x2, x3
74
+ usublb z1.h, z16.b, z18.b
75
+ usublt z2.h, z16.b, z18.b
76
+ smullb z0.s, z1.h, z1.h
77
+ smlalt z0.s, z1.h, z1.h
78
+ smlalb z0.s, z2.h, z2.h
79
+ smlalt z0.s, z2.h, z2.h
80
+.rept 31
81
+ ld1b {z16.b}, p0/z, [x0]
82
+ ld1b {z18.b}, p0/z, [x2]
83
+ add x0, x0, x1
84
+ add x2, x2, x3
85
+ usublb z1.h, z16.b, z18.b
86
+ usublt z2.h, z16.b, z18.b
87
+ smullb z0.s, z1.h, z1.h
88
+ smlalt z0.s, z1.h, z1.h
89
+ smlalb z0.s, z2.h, z2.h
90
+ smlalt z0.s, z2.h, z2.h
91
+.endr
92
+ uaddv d3, p0, z0.s
93
+ fmov w0, s3
94
+ ret
95
+endfunc
96
+
97
+function PFX(pixel_sse_pp_32x64_sve2)
98
+ rdvl x9, #1
99
+ cmp x9, #16
100
+ bgt .vl_gt_16_pixel_sse_pp_32x64
101
+ ptrue p0.b, vl16
102
+ ld1b {z16.b}, p0/z, [x0]
103
+ ld1b {z17.b}, p0/z, [x0, #1, mul vl]
104
+ ld1b {z18.b}, p0/z, [x2]
105
+ ld1b {z19.b}, p0/z, [x2, #1, mul vl]
106
+ add x0, x0, x1
107
+ add x2, x2, x3
108
+ usublb z1.h, z16.b, z18.b
109
+ usublt z2.h, z16.b, z18.b
110
+ usublb z3.h, z17.b, z19.b
111
+ usublt z4.h, z17.b, z19.b
112
+ smullb z20.s, z1.h, z1.h
113
+ smullt z21.s, z1.h, z1.h
114
+ smlalb z20.s, z2.h, z2.h
115
+ smlalt z21.s, z2.h, z2.h
116
+ smlalb z20.s, z3.h, z3.h
117
+ smlalt z21.s, z3.h, z3.h
118
+ smlalb z20.s, z4.h, z4.h
119
+ smlalt z21.s, z4.h, z4.h
120
+.rept 63
121
+ ld1b {z16.b}, p0/z, [x0]
122
+ ld1b {z17.b}, p0/z, [x0, #1, mul vl]
123
+ ld1b {z18.b}, p0/z, [x2]
124
+ ld1b {z19.b}, p0/z, [x2, #1, mul vl]
125
+ add x0, x0, x1
126
+ add x2, x2, x3
127
+ usublb z1.h, z16.b, z18.b
128
+ usublt z2.h, z16.b, z18.b
129
+ usublb z3.h, z17.b, z19.b
130
+ usublt z4.h, z17.b, z19.b
131
+ smlalb z20.s, z1.h, z1.h
132
+ smlalt z21.s, z1.h, z1.h
133
+ smlalb z20.s, z2.h, z2.h
134
+ smlalt z21.s, z2.h, z2.h
135
+ smlalb z20.s, z3.h, z3.h
136
+ smlalt z21.s, z3.h, z3.h
137
+ smlalb z20.s, z4.h, z4.h
138
+ smlalt z21.s, z4.h, z4.h
139
+.endr
140
+ uaddv d3, p0, z20.s
141
+ fmov w0, s3
142
+ uaddv d4, p0, z21.s
143
+ fmov w1, s4
144
+ add w0, w0, w1
145
+ ret
146
+.vl_gt_16_pixel_sse_pp_32x64:
147
+ ptrue p0.b, vl32
148
+ ld1b {z16.b}, p0/z, [x0]
149
+ ld1b {z18.b}, p0/z, [x2]
150
+ add x0, x0, x1
151
+ add x2, x2, x3
152
+ usublb z1.h, z16.b, z18.b
153
+ usublt z2.h, z16.b, z18.b
154
+ smullb z20.s, z1.h, z1.h
155
+ smullt z21.s, z1.h, z1.h
156
+ smlalb z20.s, z2.h, z2.h
157
+ smlalt z21.s, z2.h, z2.h
158
+.rept 63
159
+ ld1b {z16.b}, p0/z, [x0]
160
+ ld1b {z18.b}, p0/z, [x2]
161
+ add x0, x0, x1
162
+ add x2, x2, x3
163
+ usublb z1.h, z16.b, z18.b
164
+ usublt z2.h, z16.b, z18.b
165
+ smlalb z20.s, z1.h, z1.h
166
+ smlalt z21.s, z1.h, z1.h
167
+ smlalb z20.s, z2.h, z2.h
168
+ smlalt z21.s, z2.h, z2.h
169
+.endr
170
+ uaddv d3, p0, z20.s
171
+ fmov w0, s3
172
+ uaddv d4, p0, z21.s
173
+ fmov w1, s4
174
+ add w0, w0, w1
175
+ ret
176
+endfunc
177
+
178
+function PFX(pixel_sse_pp_64x64_sve2)
179
+ rdvl x9, #1
180
+ cmp x9, #16
181
+ bgt .vl_gt_16_pixel_sse_pp_64x64
182
+ mov w12, #16
183
+ movi v0.16b, #0
184
+ movi v1.16b, #0
185
+
186
+.loop_sse_pp_64_sve2:
187
+ sub w12, w12, #1
188
+.rept 4
189
+ ld1 {v16.16b-v19.16b}, [x0], x1
190
+ ld1 {v20.16b-v23.16b}, [x2], x3
191
+
192
+ usubl v2.8h, v16.8b, v20.8b
193
+ usubl2 v3.8h, v16.16b, v20.16b
194
+ usubl v4.8h, v17.8b, v21.8b
195
+ usubl2 v5.8h, v17.16b, v21.16b
196
+ smlal v0.4s, v2.4h, v2.4h
197
+ smlal2 v1.4s, v2.8h, v2.8h
198
+ smlal v0.4s, v3.4h, v3.4h
199
+ smlal2 v1.4s, v3.8h, v3.8h
200
+ smlal v0.4s, v4.4h, v4.4h
201
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2021 MulticoreWare, Inc
4
+ *
5
+ * Authors: Sebastian Pop <spop@amazon.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+#include "ssd-a-common.S"
27
+
28
+#ifdef __APPLE__
29
+.section __RODATA,__rodata
30
+#else
31
+.section .rodata
32
+#endif
33
+
34
+.align 4
35
+
36
+.text
37
+
38
+function PFX(pixel_sse_pp_4x4_neon)
39
+ ld1 {v16.s}[0], [x0], x1
40
+ ld1 {v17.s}[0], [x2], x3
41
+ ld1 {v18.s}[0], [x0], x1
42
+ ld1 {v19.s}[0], [x2], x3
43
+ ld1 {v20.s}[0], [x0], x1
44
+ ld1 {v21.s}[0], [x2], x3
45
+ ld1 {v22.s}[0], [x0], x1
46
+ ld1 {v23.s}[0], [x2], x3
47
+
48
+ usubl v1.8h, v16.8b, v17.8b
49
+ usubl v2.8h, v18.8b, v19.8b
50
+ usubl v3.8h, v20.8b, v21.8b
51
+ usubl v4.8h, v22.8b, v23.8b
52
+
53
+ smull v0.4s, v1.4h, v1.4h
54
+ smlal v0.4s, v2.4h, v2.4h
55
+ smlal v0.4s, v3.4h, v3.4h
56
+ smlal v0.4s, v4.4h, v4.4h
57
+ ret_v0_w0
58
+endfunc
59
+
60
+function PFX(pixel_sse_pp_4x8_neon)
61
+ ld1 {v16.s}[0], [x0], x1
62
+ ld1 {v17.s}[0], [x2], x3
63
+ usubl v1.8h, v16.8b, v17.8b
64
+ ld1 {v16.s}[0], [x0], x1
65
+ ld1 {v17.s}[0], [x2], x3
66
+ smull v0.4s, v1.4h, v1.4h
67
+.rept 6
68
+ usubl v1.8h, v16.8b, v17.8b
69
+ ld1 {v16.s}[0], [x0], x1
70
+ smlal v0.4s, v1.4h, v1.4h
71
+ ld1 {v17.s}[0], [x2], x3
72
+.endr
73
+ usubl v1.8h, v16.8b, v17.8b
74
+ smlal v0.4s, v1.4h, v1.4h
75
+ ret_v0_w0
76
+endfunc
77
+
78
+function PFX(pixel_sse_pp_8x8_neon)
79
+ ld1 {v16.8b}, x0, x1
80
+ ld1 {v17.8b}, x2, x3
81
+ usubl v1.8h, v16.8b, v17.8b
82
+ ld1 {v16.8b}, x0, x1
83
+ smull v0.4s, v1.4h, v1.4h
84
+ smlal2 v0.4s, v1.8h, v1.8h
85
+ ld1 {v17.8b}, x2, x3
86
+
87
+.rept 6
88
+ usubl v1.8h, v16.8b, v17.8b
89
+ ld1 {v16.8b}, x0, x1
90
+ smlal v0.4s, v1.4h, v1.4h
91
+ smlal2 v0.4s, v1.8h, v1.8h
92
+ ld1 {v17.8b}, x2, x3
93
+.endr
94
+ usubl v1.8h, v16.8b, v17.8b
95
+ smlal v0.4s, v1.4h, v1.4h
96
+ smlal2 v0.4s, v1.8h, v1.8h
97
+ ret_v0_w0
98
+endfunc
99
+
100
+function PFX(pixel_sse_pp_8x16_neon)
101
+ ld1 {v16.8b}, x0, x1
102
+ ld1 {v17.8b}, x2, x3
103
+ usubl v1.8h, v16.8b, v17.8b
104
+ ld1 {v16.8b}, x0, x1
105
+ smull v0.4s, v1.4h, v1.4h
106
+ smlal2 v0.4s, v1.8h, v1.8h
107
+ ld1 {v17.8b}, x2, x3
108
+
109
+.rept 14
110
+ usubl v1.8h, v16.8b, v17.8b
111
+ ld1 {v16.8b}, x0, x1
112
+ smlal v0.4s, v1.4h, v1.4h
113
+ smlal2 v0.4s, v1.8h, v1.8h
114
+ ld1 {v17.8b}, x2, x3
115
+.endr
116
+ usubl v1.8h, v16.8b, v17.8b
117
+ smlal v0.4s, v1.4h, v1.4h
118
+ smlal2 v0.4s, v1.8h, v1.8h
119
+ ret_v0_w0
120
+endfunc
121
+
122
+.macro sse_pp_16xN h
123
+function PFX(pixel_sse_pp_16x\h\()_neon)
124
+ ld1 {v16.16b}, x0, x1
125
+ ld1 {v17.16b}, x2, x3
126
+ usubl v1.8h, v16.8b, v17.8b
127
+ usubl2 v2.8h, v16.16b, v17.16b
128
+ ld1 {v16.16b}, x0, x1
129
+ ld1 {v17.16b}, x2, x3
130
+ smull v0.4s, v1.4h, v1.4h
131
+ smlal2 v0.4s, v1.8h, v1.8h
132
+ smlal v0.4s, v2.4h, v2.4h
133
+ smlal2 v0.4s, v2.8h, v2.8h
134
+.rept \h - 2
135
+ usubl v1.8h, v16.8b, v17.8b
136
+ usubl2 v2.8h, v16.16b, v17.16b
137
+ ld1 {v16.16b}, x0, x1
138
+ smlal v0.4s, v1.4h, v1.4h
139
+ smlal2 v0.4s, v1.8h, v1.8h
140
+ ld1 {v17.16b}, x2, x3
141
+ smlal v0.4s, v2.4h, v2.4h
142
+ smlal2 v0.4s, v2.8h, v2.8h
143
+.endr
144
+ usubl v1.8h, v16.8b, v17.8b
145
+ usubl2 v2.8h, v16.16b, v17.16b
146
+ smlal v0.4s, v1.4h, v1.4h
147
+ smlal2 v0.4s, v1.8h, v1.8h
148
+ smlal v0.4s, v2.4h, v2.4h
149
+ smlal2 v0.4s, v2.8h, v2.8h
150
+ ret_v0_w0
151
+endfunc
152
+.endm
153
+
154
+sse_pp_16xN 16
155
+sse_pp_16xN 32
156
+
157
+function PFX(pixel_sse_pp_32x32_neon)
158
+ mov w12, #8
159
+ movi v0.16b, #0
160
+ movi v1.16b, #0
161
+.loop_sse_pp_32:
162
+ sub w12, w12, #1
163
+.rept 4
164
+ ld1 {v16.16b,v17.16b}, [x0], x1
165
+ ld1 {v18.16b,v19.16b}, [x2], x3
166
+ usubl v2.8h, v16.8b, v18.8b
167
+ usubl2 v3.8h, v16.16b, v18.16b
168
+ usubl v4.8h, v17.8b, v19.8b
169
+ usubl2 v5.8h, v17.16b, v19.16b
170
+ smlal v0.4s, v2.4h, v2.4h
171
+ smlal2 v1.4s, v2.8h, v2.8h
172
+ smlal v0.4s, v3.4h, v3.4h
173
+ smlal2 v1.4s, v3.8h, v3.8h
174
+ smlal v0.4s, v4.4h, v4.4h
175
+ smlal2 v1.4s, v4.8h, v4.8h
176
+ smlal v0.4s, v5.4h, v5.4h
177
+ smlal2 v1.4s, v5.8h, v5.8h
178
+.endr
179
+ cbnz w12, .loop_sse_pp_32
180
+ add v0.4s, v0.4s, v1.4s
181
+ ret_v0_w0
182
+endfunc
183
+
184
+function PFX(pixel_sse_pp_32x64_neon)
185
+ mov w12, #16
186
+ movi v0.16b, #0
187
+ movi v1.16b, #0
188
+.loop_sse_pp_32x64:
189
+ sub w12, w12, #1
190
+.rept 4
191
+ ld1 {v16.16b,v17.16b}, [x0], x1
192
+ ld1 {v18.16b,v19.16b}, [x2], x3
193
+ usubl v2.8h, v16.8b, v18.8b
194
+ usubl2 v3.8h, v16.16b, v18.16b
195
+ usubl v4.8h, v17.8b, v19.8b
196
+ usubl2 v5.8h, v17.16b, v19.16b
197
+ smlal v0.4s, v2.4h, v2.4h
198
+ smlal2 v1.4s, v2.8h, v2.8h
199
+ smlal v0.4s, v3.4h, v3.4h
200
+ smlal2 v1.4s, v3.8h, v3.8h
201
x265_3.5.tar.gz/source/common/common.h -> x265_3.6.tar.gz/source/common/common.h
Changed
51
1
2
typedef uint64_t pixel4;
3
typedef int64_t ssum2_t;
4
#define SHIFT_TO_BITPLANE 9
5
-#define HISTOGRAM_BINS 1024
6
#else
7
typedef uint8_t pixel;
8
typedef uint16_t sum_t;
9
10
typedef uint32_t pixel4;
11
typedef int32_t ssum2_t; // Signed sum
12
#define SHIFT_TO_BITPLANE 7
13
-#define HISTOGRAM_BINS 256
14
#endif // if HIGH_BIT_DEPTH
15
16
#if X265_DEPTH < 10
17
18
19
#define MIN_QPSCALE 0.21249999999999999
20
#define MAX_MAX_QPSCALE 615.46574234477100
21
+#define FRAME_BRIGHTNESS_THRESHOLD 50.0 // Min % of pixels in a frame, that are above BRIGHTNESS_THRESHOLD for it to be considered a bright frame
22
+#define FRAME_EDGE_THRESHOLD 10.0 // Min % of edge pixels in a frame, for it to be considered to have high edge density
23
24
25
template<typename T>
26
27
#define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
28
29
#define MAX_NUM_DYN_REFINE (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
30
+#define X265_BYTE 8
31
+
32
+#define MAX_MCSTF_TEMPORAL_WINDOW_LENGTH 8
33
34
namespace X265_NS {
35
36
37
#define x265_unlink(fileName) unlink(fileName)
38
#define x265_rename(oldName, newName) rename(oldName, newName)
39
#endif
40
+/* Close a file */
41
+#define x265_fclose(file) if (file != NULL) fclose(file); file=NULL;
42
+#define x265_fread(val, size, readSize, fileOffset,errorMessage)\
43
+ if (fread(val, size, readSize, fileOffset) != readSize)\
44
+ {\
45
+ x265_log(NULL, X265_LOG_ERROR, errorMessage); \
46
+ return; \
47
+ }
48
int x265_exp2fix8(double x);
49
50
double x265_ssim2dB(double ssim);
51
x265_3.5.tar.gz/source/common/cpu.cpp -> x265_3.6.tar.gz/source/common/cpu.cpp
Changed
58
1
2
* Steve Borho <steve@borho.org>
3
* Hongbin Liu <liuhongbin1@huawei.com>
4
* Yimeng Su <yimeng.su@huawei.com>
5
+ * Josh Dekker <josh@itanimul.li>
6
+ * Jean-Baptiste Kempf <jb@videolan.org>
7
*
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License as published by
10
11
{ "NEON", X265_CPU_NEON },
12
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
13
14
+#elif X265_ARCH_ARM64
15
+ { "NEON", X265_CPU_NEON },
16
+#if defined(HAVE_SVE)
17
+ { "SVE", X265_CPU_SVE },
18
+#endif
19
+#if defined(HAVE_SVE2)
20
+ { "SVE2", X265_CPU_SVE2 },
21
+#endif
22
#elif X265_ARCH_POWER8
23
{ "Altivec", X265_CPU_ALTIVEC },
24
25
26
flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
27
#endif
28
// TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
29
-#elif X265_ARCH_ARM64
30
- flags |= X265_CPU_NEON;
31
#endif // if HAVE_ARMV6
32
return flags;
33
}
34
35
+#elif X265_ARCH_ARM64
36
+
37
+uint32_t cpu_detect(bool benableavx512)
38
+{
39
+ int flags = 0;
40
+
41
+ #if defined(HAVE_SVE2)
42
+ flags |= X265_CPU_SVE2;
43
+ flags |= X265_CPU_SVE;
44
+ flags |= X265_CPU_NEON;
45
+ #elif defined(HAVE_SVE)
46
+ flags |= X265_CPU_SVE;
47
+ flags |= X265_CPU_NEON;
48
+ #elif HAVE_NEON
49
+ flags |= X265_CPU_NEON;
50
+ #endif
51
+
52
+ return flags;
53
+}
54
+
55
#elif X265_ARCH_POWER8
56
57
uint32_t cpu_detect(bool benableavx512)
58
x265_3.5.tar.gz/source/common/frame.cpp -> x265_3.6.tar.gz/source/common/frame.cpp
Changed
102
1
2
m_edgeBitPlane = NULL;
3
m_edgeBitPic = NULL;
4
m_isInsideWindow = 0;
5
+
6
+ // mcstf
7
+ m_isSubSampled = NULL;
8
+ m_mcstf = NULL;
9
+ m_refPicCnt0 = 0;
10
+ m_refPicCnt1 = 0;
11
+ m_nextMCSTF = NULL;
12
+ m_prevMCSTF = NULL;
13
+
14
+ m_tempLayer = 0;
15
+ m_sameLayerRefPic = false;
16
}
17
18
bool Frame::create(x265_param *param, float* quantOffsets)
19
{
20
m_fencPic = new PicYuv;
21
m_param = param;
22
+
23
+ if (m_param->bEnableTemporalFilter)
24
+ {
25
+ m_mcstf = new TemporalFilter;
26
+ m_mcstf->init(param);
27
+
28
+ m_fencPicSubsampled2 = new PicYuv;
29
+ m_fencPicSubsampled4 = new PicYuv;
30
+
31
+ if (!m_fencPicSubsampled2->createScaledPicYUV(param, 2))
32
+ return false;
33
+ if (!m_fencPicSubsampled4->createScaledPicYUV(param, 4))
34
+ return false;
35
+
36
+ CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
37
+ }
38
+
39
CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
40
41
if (param->bCTUInfo)
42
43
return false;
44
}
45
46
+bool Frame::createSubSample()
47
+{
48
+
49
+ m_fencPicSubsampled2 = new PicYuv;
50
+ m_fencPicSubsampled4 = new PicYuv;
51
+
52
+ if (!m_fencPicSubsampled2->createScaledPicYUV(m_param, 2))
53
+ return false;
54
+ if (!m_fencPicSubsampled4->createScaledPicYUV(m_param, 4))
55
+ return false;
56
+ CHECKED_MALLOC_ZERO(m_isSubSampled, int, 1);
57
+ return true;
58
+fail:
59
+ return false;
60
+}
61
+
62
bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
63
{
64
m_encData = new FrameData;
65
66
m_fencPic = NULL;
67
}
68
69
+ if (m_param->bEnableTemporalFilter)
70
+ {
71
+
72
+ if (m_fencPicSubsampled2)
73
+ {
74
+ m_fencPicSubsampled2->destroy();
75
+ delete m_fencPicSubsampled2;
76
+ m_fencPicSubsampled2 = NULL;
77
+ }
78
+
79
+ if (m_fencPicSubsampled4)
80
+ {
81
+ m_fencPicSubsampled4->destroy();
82
+ delete m_fencPicSubsampled4;
83
+ m_fencPicSubsampled4 = NULL;
84
+ }
85
+ delete m_mcstf;
86
+ X265_FREE(m_isSubSampled);
87
+ }
88
+
89
if (m_reconPic)
90
{
91
m_reconPic->destroy();
92
93
X265_FREE(m_addOnPrevChange);
94
m_addOnPrevChange = NULL;
95
}
96
- m_lowres.destroy();
97
+
98
+ m_lowres.destroy(m_param);
99
X265_FREE(m_rcData);
100
101
if (m_param->bDynamicRefine)
102
x265_3.5.tar.gz/source/common/frame.h -> x265_3.6.tar.gz/source/common/frame.h
Changed
60
1
2
#include "common.h"
3
#include "lowres.h"
4
#include "threading.h"
5
+#include "temporalfilter.h"
6
7
namespace X265_NS {
8
// private namespace
9
10
double count4;
11
double offset4;
12
double bufferFillFinal;
13
+ int64_t currentSatd;
14
};
15
16
class Frame
17
18
19
/* Data associated with x265_picture */
20
PicYuv* m_fencPic;
21
+ PicYuv* m_fencPicSubsampled2;
22
+ PicYuv* m_fencPicSubsampled4;
23
+
24
int m_poc;
25
int m_encodeOrder;
26
+ int m_gopOffset;
27
int64_t m_pts; // user provided presentation time stamp
28
int64_t m_reorderedPts;
29
int64_t m_dts;
30
31
bool m_classifyFrame;
32
int m_fieldNum;
33
34
+ /*MCSTF*/
35
+ TemporalFilter* m_mcstf;
36
+ int m_refPicCnt2;
37
+ Frame* m_nextMCSTF; // PicList doubly linked list pointers
38
+ Frame* m_prevMCSTF;
39
+ int* m_isSubSampled;
40
+
41
/* aq-mode 4 : Gaussian, edge and theta frames for edge information */
42
pixel* m_edgePic;
43
pixel* m_gaussianPic;
44
45
46
int m_isInsideWindow;
47
48
+ /*Frame's temporal layer info*/
49
+ uint8_t m_tempLayer;
50
+ int8_t m_gopId;
51
+ bool m_sameLayerRefPic;
52
+
53
Frame();
54
55
bool create(x265_param *param, float* quantOffsets);
56
+ bool createSubSample();
57
bool allocEncodeData(x265_param *param, const SPS& sps);
58
void reinit(const SPS& sps);
59
void destroy();
60
x265_3.5.tar.gz/source/common/framedata.cpp -> x265_3.6.tar.gz/source/common/framedata.cpp
Changed
10
1
2
}
3
else
4
return false;
5
- CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
6
+ CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame + 1);
7
CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
8
reinit(sps);
9
10
x265_3.5.tar.gz/source/common/lowres.cpp -> x265_3.6.tar.gz/source/common/lowres.cpp
Changed
154
1
2
3
using namespace X265_NS;
4
5
+/*
6
+ * Down Sample input picture
7
+ */
8
+static
9
+void frame_lowres_core(const pixel* src0, pixel* dst0,
10
+ intptr_t src_stride, intptr_t dst_stride, int width, int height)
11
+{
12
+ for (int y = 0; y < height; y++)
13
+ {
14
+ const pixel* src1 = src0 + src_stride;
15
+ for (int x = 0; x < width; x++)
16
+ {
17
+ // slower than naive bilinear, but matches asm
18
+#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
19
+ dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
20
+#undef FILTER
21
+ }
22
+ src0 += src_stride * 2;
23
+ dst0 += dst_stride;
24
+ }
25
+}
26
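A standalone C++ check of the FILTER() rounding above: it averages in two rounded stages, which can differ by one from a plain (a + b + c + d + 2) >> 2 and matches what the assembly downscaler produces.

#include <cstdio>

static int filter2x2(int a, int b, int c, int d)
{
    // two rounded pairwise averages, then a rounded average of the pair
    return (((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1;
}

int main()
{
    // 10,10,10,11: staged rounding gives 11, a single rounded mean gives 10.
    std::printf("%d %d\n", filter2x2(10, 10, 10, 11), (10 + 10 + 10 + 11 + 2) >> 2);
    return 0;
}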
+
27
bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
28
{
29
aqPartWidth = partWidth;
30
31
32
size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
33
size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
34
- if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion)
35
+ if (!!param->rc.aqMode || !!param->rc.hevcAq || !!param->bAQMotion || !!param->bEnableWeightedPred || !!param->bEnableWeightedBiPred)
36
{
37
CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
38
CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
39
40
}
41
}
42
43
+ if (param->bHistBasedSceneCut)
44
+ {
45
+ quarterSampleLowResWidth = widthFullRes / 4;
46
+ quarterSampleLowResHeight = heightFullRes / 4;
47
+ quarterSampleLowResOriginX = 16;
48
+ quarterSampleLowResOriginY = 16;
49
+ quarterSampleLowResStrideY = quarterSampleLowResWidth + 2 * quarterSampleLowResOriginY;
50
+
51
+ size_t quarterSampleLowResPlanesize = quarterSampleLowResStrideY * (quarterSampleLowResHeight + 2 * quarterSampleLowResOriginX);
52
+ /* allocate quarter sampled lowres buffers */
53
+ CHECKED_MALLOC_ZERO(quarterSampleLowResBuffer, pixel, quarterSampleLowResPlanesize);
54
+
55
+ // Allocate memory for Histograms
56
+ picHistogram = X265_MALLOC(uint32_t***, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t***));
57
+ picHistogram[0] = X265_MALLOC(uint32_t**, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
58
+ for (uint32_t wd = 1; wd < NUMBER_OF_SEGMENTS_IN_WIDTH; wd++) {
59
+ picHistogram[wd] = picHistogram[0] + wd * NUMBER_OF_SEGMENTS_IN_HEIGHT;
60
+ }
61
+
62
+ for (uint32_t regionInPictureWidthIndex = 0; regionInPictureWidthIndex < NUMBER_OF_SEGMENTS_IN_WIDTH; regionInPictureWidthIndex++)
63
+ {
64
+ for (uint32_t regionInPictureHeightIndex = 0; regionInPictureHeightIndex < NUMBER_OF_SEGMENTS_IN_HEIGHT; regionInPictureHeightIndex++)
65
+ {
66
+ picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex] = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH *sizeof(uint32_t*));
67
+ picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0] = X265_MALLOC(uint32_t, 3 * HISTOGRAM_NUMBER_OF_BINS * sizeof(uint32_t));
68
+ for (uint32_t wd = 1; wd < 3; wd++) {
69
+ picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][wd] = picHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0] + wd * HISTOGRAM_NUMBER_OF_BINS;
70
+ }
71
+ }
72
+ }
73
+ }
74
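From the allocation sizes above, the layout appears to be picHistogram[segX][segY][plane][bin] with a 4x4 grid of picture segments, 3 planes and 256 bins per plane; a small C++ sketch of that indexing (an assumption drawn from the code, not a documented x265 interface):

#include <cstdint>

enum { kSegsW = 4, kSegsH = 4, kPlanes = 3, kBins = 256 };

// One flat block of kPlanes * kBins counters per (segX, segY) segment.
inline uint32_t& histBin(uint32_t**** picHistogram, int segX, int segY, int plane, int bin)
{
    return picHistogram[segX][segY][plane][bin];
}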
+
75
return true;
76
77
fail:
78
return false;
79
}
80
81
-void Lowres::destroy()
82
+void Lowres::destroy(x265_param* param)
83
{
84
X265_FREE(buffer0);
85
if(bEnableHME)
86
87
X265_FREE(invQscaleFactor8x8);
88
X265_FREE(edgeInclined);
89
X265_FREE(qpAqMotionOffset);
90
- X265_FREE(blockVariance);
91
+ if (param->bDynamicRefine || param->bEnableFades)
92
+ X265_FREE(blockVariance);
93
if (maxAQDepth > 0)
94
{
95
for (uint32_t d = 0; d < 4; d++)
96
97
98
delete pAQLayer;
99
}
100
+
101
+ // Histograms
102
+ if (param->bHistBasedSceneCut)
103
+ {
104
+ for (uint32_t segmentInFrameWidthIdx = 0; segmentInFrameWidthIdx < NUMBER_OF_SEGMENTS_IN_WIDTH; segmentInFrameWidthIdx++)
105
+ {
106
+ if (picHistogram[segmentInFrameWidthIdx])
107
+ {
108
+ for (uint32_t segmentInFrameHeightIdx = 0; segmentInFrameHeightIdx < NUMBER_OF_SEGMENTS_IN_HEIGHT; segmentInFrameHeightIdx++)
109
+ {
110
+ if (picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx])
110
+ X265_FREE(picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx][0]);
111
+ X265_FREE(picHistogram[segmentInFrameWidthIdx][segmentInFrameHeightIdx]);
113
+ }
114
+ }
115
+ }
116
+ if (picHistogram)
117
+ X265_FREE(picHistogram[0]);
118
+ X265_FREE(picHistogram);
119
+
120
+ X265_FREE(quarterSampleLowResBuffer);
121
+
122
+ }
123
}
124
// (re) initialize lowres state
125
void Lowres::init(PicYuv *origPic, int poc)
126
127
indB = 0;
128
memset(costEst, -1, sizeof(costEst));
129
memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
130
- interPCostPercDiff = 0.0;
131
- intraCostPercDiff = 0.0;
132
- m_bIsMaxThres = false;
133
- m_bIsHardScenecut = false;
134
135
if (qpAqOffset && invQscaleFactor)
136
memset(costEstAq, -1, sizeof(costEstAq));
137
138
}
139
140
fpelPlane0 = lowresPlane0;
141
+
142
+ if (origPic->m_param->bHistBasedSceneCut)
143
+ {
144
+ // Quarter Sampled Input Picture Formation
145
+ // TO DO: Replace with ASM function
146
+ frame_lowres_core(
147
+ lowresPlane0,
148
+ quarterSampleLowResBuffer + quarterSampleLowResOriginX + quarterSampleLowResOriginY * quarterSampleLowResStrideY,
149
+ lumaStride,
150
+ quarterSampleLowResStrideY,
151
+ widthFullRes / 4, heightFullRes / 4);
152
+ }
153
}
154
x265_3.5.tar.gz/source/common/lowres.h -> x265_3.6.tar.gz/source/common/lowres.h
Changed
73
1
2
namespace X265_NS {
3
// private namespace
4
5
+#define HISTOGRAM_NUMBER_OF_BINS 256
6
+#define NUMBER_OF_SEGMENTS_IN_WIDTH 4
7
+#define NUMBER_OF_SEGMENTS_IN_HEIGHT 4
8
+
9
struct ReferencePlanes
10
{
11
ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); }
12
13
14
int frameNum; // Presentation frame number
15
int sliceType; // Slice type decided by lookahead
16
+ int sliceTypeReq; // Slice type required as per the QP file
17
int width; // width of lowres frame in pixels
18
int lines; // height of lowres frame in pixel lines
19
int leadingBframes; // number of leading B frames for P or I
20
21
double* qpAqOffset; // AQ QP offset values for each 16x16 CU
22
double* qpCuTreeOffset; // cuTree QP offset values for each 16x16 CU
23
double* qpAqMotionOffset;
24
- int* invQscaleFactor; // qScale values for qp Aq Offsets
25
+ int* invQscaleFactor; // qScale values for qp Aq Offsets
26
int* invQscaleFactor8x8; // temporary buffer for qg-size 8
27
uint32_t* blockVariance;
28
uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
29
uint64_t wp_sum[3];
30
double frameVariance;
31
- int* edgeInclined;
32
+ int* edgeInclined;
33
34
35
/* cutree intermediate data */
36
37
uint32_t heightFullRes;
38
uint32_t m_maxCUSize;
39
uint32_t m_qgSize;
40
-
41
+
42
uint16_t* propagateCost;
43
double weightedCostDeltaX265_BFRAME_MAX + 2;
44
ReferencePlanes weightedRefX265_BFRAME_MAX + 2;
45
+
46
/* For hist-based scenecut */
47
- bool m_bIsMaxThres;
48
- double interPCostPercDiff;
49
- double intraCostPercDiff;
50
- bool m_bIsHardScenecut;
51
+ int quarterSampleLowResWidth; // width of 1/4 lowres frame in pixels
52
+ int quarterSampleLowResHeight; // height of 1/4 lowres frame in pixels
53
+ int quarterSampleLowResStrideY;
54
+ int quarterSampleLowResOriginX;
55
+ int quarterSampleLowResOriginY;
56
+ pixel *quarterSampleLowResBuffer;
57
+ bool bHistScenecutAnalyzed;
58
+
59
+ uint16_t picAvgVariance;
60
+ uint16_t picAvgVarianceCb;
61
+ uint16_t picAvgVarianceCr;
62
+
63
+ uint32_t ****picHistogram;
64
+ uint64_t averageIntensityPerSegmentNUMBER_OF_SEGMENTS_IN_WIDTHNUMBER_OF_SEGMENTS_IN_HEIGHT3;
65
+ uint8_t averageIntensity3;
66
67
bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
68
- void destroy();
69
+ void destroy(x265_param* param);
70
void init(PicYuv *origPic, int poc);
71
};
72
}
73
x265_3.5.tar.gz/source/common/mv.h -> x265_3.6.tar.gz/source/common/mv.h
Changed
10
1
2
{
3
return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y;
4
}
5
+
6
+ void set(int32_t _x, int32_t _y) { x = _x; y = _y; }
7
};
8
}
9
10
x265_3.5.tar.gz/source/common/param.cpp -> x265_3.6.tar.gz/source/common/param.cpp
Changed
201
1
2
param->bAnnexB = 1;
3
param->bRepeatHeaders = 0;
4
param->bEnableAccessUnitDelimiters = 0;
5
+ param->bEnableEndOfBitstream = 0;
6
+ param->bEnableEndOfSequence = 0;
7
param->bEmitHRDSEI = 0;
8
param->bEmitInfoSEI = 1;
9
param->bEmitHDRSEI = 0; /*Deprecated*/
10
11
param->keyframeMax = 250;
12
param->gopLookahead = 0;
13
param->bOpenGOP = 1;
14
+ param->craNal = 0;
15
param->bframes = 4;
16
param->lookaheadDepth = 20;
17
param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
18
param->bBPyramid = 1;
19
param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
20
- param->edgeTransitionThreshold = 0.03;
21
param->bHistBasedSceneCut = 0;
22
param->lookaheadSlices = 8;
23
param->lookaheadThreads = 0;
24
25
param->bEnableHRDConcatFlag = 0;
26
param->bEnableFades = 0;
27
param->bEnableSceneCutAwareQp = 0;
28
- param->fwdScenecutWindow = 500;
29
- param->fwdRefQpDelta = 5;
30
- param->fwdNonRefQpDelta = param->fwdRefQpDelta + (SLICE_TYPE_DELTA * param->fwdRefQpDelta);
31
- param->bwdScenecutWindow = 100;
32
- param->bwdRefQpDelta = -1;
33
- param->bwdNonRefQpDelta = -1;
34
+ param->fwdMaxScenecutWindow = 1200;
35
+ param->bwdMaxScenecutWindow = 600;
36
+ for (int i = 0; i < 6; i++)
37
+ {
38
+ int deltas[6] = { 5, 4, 3, 2, 1, 0 };
39
+
40
+ param->fwdScenecutWindow[i] = 200;
41
+ param->fwdRefQpDelta[i] = deltas[i];
42
+ param->fwdNonRefQpDelta[i] = param->fwdRefQpDelta[i] + (SLICE_TYPE_DELTA * param->fwdRefQpDelta[i]);
43
+
44
+ param->bwdScenecutWindow[i] = 100;
45
+ param->bwdRefQpDelta[i] = -1;
46
+ param->bwdNonRefQpDelta[i] = -1;
47
+ }
48
49
/* Intra Coding Tools */
50
param->bEnableConstrainedIntra = 0;
51
52
param->rc.rfConstantMin = 0;
53
param->rc.bStatRead = 0;
54
param->rc.bStatWrite = 0;
55
+ param->rc.dataShareMode = X265_SHARE_MODE_FILE;
56
param->rc.statFileName = NULL;
57
+ param->rc.sharedMemName = NULL;
58
+ param->rc.bEncFocusedFramesOnly = 0;
59
param->rc.complexityBlur = 20;
60
param->rc.qblur = 0.5;
61
param->rc.zoneCount = 0;
62
63
param->maxLuma = PIXEL_MAX;
64
param->log2MaxPocLsb = 8;
65
param->maxSlices = 1;
66
+ param->videoSignalTypePreset = NULL;
67
68
/*Conformance window*/
69
param->confWinRightOffset = 0;
70
71
param->bEnableSvtHevc = 0;
72
param->svtHevcParam = NULL;
73
74
+ /* MCSTF */
75
+ param->bEnableTemporalFilter = 0;
76
+ param->temporalFilterStrength = 0.95;
77
+
78
#ifdef SVT_HEVC
79
param->svtHevcParam = svtParam;
80
svt_param_default(param);
81
#endif
82
+ /* Film grain characteristics model filename */
83
+ param->filmGrain = NULL;
84
+ param->bEnableSBRC = 0;
85
}
86
87
int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
88
89
#define atof(str) x265_atof(str, bError)
90
#define atobool(str) (x265_atobool(str, bError))
91
92
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value)
93
+{
94
+ bool bError = false;
95
+ char nameBuf[64];
96
+ if (!name)
97
+ return X265_PARAM_BAD_NAME;
98
+ // skip -- prefix if provided
99
+ if (name[0] == '-' && name[1] == '-')
100
+ name += 2;
101
+ // s/_/-/g
102
+ if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_'))
103
+ {
104
+ char *c;
105
+ strcpy(nameBuf, name);
106
+ while ((c = strchr(nameBuf, '_')) != 0)
107
+ *c = '-';
108
+ name = nameBuf;
109
+ }
110
+ if (!value)
111
+ value = "true";
112
+ else if (value[0] == '=')
113
+ value++;
114
+#define OPT(STR) else if (!strcmp(name, STR))
115
+ if (0);
116
+ OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = x265_atoi(value, bError);
117
+ OPT("masking-strength") bError = parseMaskingStrength(p, value);
118
+ else
119
+ return X265_PARAM_BAD_NAME;
120
+#undef OPT
121
+ return bError ? X265_PARAM_BAD_VALUE : 0;
122
+}
123
+
124
+
125
+/* internal versions of string-to-int with additional error checking */
126
+#undef atoi
127
+#undef atof
128
+#define atoi(str) x265_atoi(str, bError)
129
+#define atof(str) x265_atof(str, bError)
130
+#define atobool(str) (x265_atobool(str, bError))
131
+
132
int x265_zone_param_parse(x265_param* p, const char* name, const char* value)
133
{
134
bool bError = false;
135
136
{
137
bError = false;
138
p->scenecutThreshold = atoi(value);
139
- p->bHistBasedSceneCut = 0;
140
}
141
}
142
- OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value);
143
+ OPT("temporal-layers") p->bEnableTemporalSubLayers = atoi(value);
144
OPT("keyint") p->keyframeMax = atoi(value);
145
OPT("min-keyint") p->keyframeMin = atoi(value);
146
OPT("rc-lookahead") p->lookaheadDepth = atoi(value);
147
148
int pass = x265_clip3(0, 3, atoi(value));
149
p->rc.bStatWrite = pass & 1;
150
p->rc.bStatRead = pass & 2;
151
+ p->rc.dataShareMode = X265_SHARE_MODE_FILE;
152
}
153
OPT("stats") p->rc.statFileName = strdup(value);
154
OPT("scaling-list") p->scalingLists = strdup(value);
155
156
OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
157
OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
158
OPT("scenecut-bias") p->scenecutBias = atof(value);
159
- OPT("hist-scenecut")
160
- {
161
- p->bHistBasedSceneCut = atobool(value);
162
- if (bError)
163
- {
164
- bError = false;
165
- p->bHistBasedSceneCut = 0;
166
- }
167
- if (p->bHistBasedSceneCut)
168
- {
169
- bError = false;
170
- p->scenecutThreshold = 0;
171
- }
172
- }
173
- OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
174
+ OPT("hist-scenecut") p->bHistBasedSceneCut = atobool(value);
175
OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
176
OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
177
OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
178
179
OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value);
180
OPT("aq-motion") p->bAQMotion = atobool(value);
181
OPT("dynamic-rd") p->dynamicRd = atof(value);
182
+ OPT("cra-nal") p->craNal = atobool(value);
183
OPT("analysis-reuse-level")
184
{
185
p->analysisReuseLevel = atoi(value);
186
187
}
188
OPT("fades") p->bEnableFades = atobool(value);
189
OPT("scenecut-aware-qp") p->bEnableSceneCutAwareQp = atoi(value);
190
- OPT("masking-strength")
191
- {
192
- int window1;
193
- double refQpDelta1, nonRefQpDelta1;
194
-
195
- if (p->bEnableSceneCutAwareQp == FORWARD)
196
- {
197
- if (3 == sscanf(value, "%d,%lf,%lf", &window1, &refQpDelta1, &nonRefQpDelta1))
198
- {
199
- if (window1 > 0)
200
- p->fwdScenecutWindow = window1;
201
x265_3.5.tar.gz/source/common/param.h -> x265_3.6.tar.gz/source/common/param.h
Changed
17
1
2
void getParamAspectRatio(x265_param *p, int& width, int& height);
3
bool parseLambdaFile(x265_param *param);
4
void x265_copy_params(x265_param* dst, x265_param* src);
5
+bool parseMaskingStrength(x265_param* p, const char* value);
6
7
/* this table is kept internal to avoid confusion, since log level indices start at -1 */
8
static const char * const logLevelNames = { "none", "error", "warning", "info", "debug", "full", 0 };
9
10
int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
11
int x265_param_apply_profile(x265_param *, const char *profile);
12
int x265_param_parse(x265_param *p, const char *name, const char *value);
13
+int x265_scenecut_aware_qp_param_parse(x265_param* p, const char* name, const char* value);
14
int x265_zone_param_parse(x265_param* p, const char* name, const char* value);
15
#define PARAM_NS X265_NS
16
#endif
17
x265_3.5.tar.gz/source/common/piclist.cpp -> x265_3.6.tar.gz/source/common/piclist.cpp
Changed
134
1
2
m_count++;
3
}
4
5
+void PicList::pushFrontMCSTF(Frame& curFrame)
6
+{
7
+ X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
8
+ curFrame.m_nextMCSTF = m_start;
9
+ curFrame.m_prevMCSTF = NULL;
10
+
11
+ if (m_count)
12
+ {
13
+ m_start->m_prevMCSTF = &curFrame;
14
+ m_start = &curFrame;
15
+ }
16
+ else
17
+ {
18
+ m_start = m_end = &curFrame;
19
+ }
20
+ m_count++;
21
+
22
+}
23
+
24
void PicList::pushBack(Frame& curFrame)
25
{
26
X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list
27
28
m_count++;
29
}
30
31
+void PicList::pushBackMCSTF(Frame& curFrame)
32
+{
33
+ X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list
34
+ curFrame.m_nextMCSTF = NULL;
35
+ curFrame.m_prevMCSTF = m_end;
36
+
37
+ if (m_count)
38
+ {
39
+ m_end->m_nextMCSTF = &curFrame;
40
+ m_end = &curFrame;
41
+ }
42
+ else
43
+ {
44
+ m_start = m_end = &curFrame;
45
+ }
46
+ m_count++;
47
+}
48
+
49
Frame *PicList::popFront()
50
{
51
if (m_start)
52
53
return curFrame;
54
}
55
56
+Frame* PicList::getPOCMCSTF(int poc)
57
+{
58
+ Frame *curFrame = m_start;
59
+ while (curFrame && curFrame->m_poc != poc)
60
+ curFrame = curFrame->m_nextMCSTF;
61
+ return curFrame;
62
+}
63
+
64
Frame *PicList::popBack()
65
{
66
if (m_end)
67
68
return NULL;
69
}
70
71
+Frame *PicList::popBackMCSTF()
72
+{
73
+ if (m_end)
74
+ {
75
+ Frame* temp = m_end;
76
+ m_count--;
77
+
78
+ if (m_count)
79
+ {
80
+ m_end = m_end->m_prevMCSTF;
81
+ m_end->m_nextMCSTF = NULL;
82
+ }
83
+ else
84
+ {
85
+ m_start = m_end = NULL;
86
+ }
87
+ temp->m_nextMCSTF = temp->m_prevMCSTF = NULL;
88
+ return temp;
89
+ }
90
+ else
91
+ return NULL;
92
+}
93
+
94
Frame* PicList::getCurFrame(void)
95
{
96
Frame *curFrame = m_start;
97
98
99
curFrame.m_next = curFrame.m_prev = NULL;
100
}
101
+
102
+void PicList::removeMCSTF(Frame& curFrame)
103
+{
104
+#if _DEBUG
105
+ Frame *tmp = m_start;
106
+ while (tmp && tmp != &curFrame)
107
+ {
108
+ tmp = tmp->m_nextMCSTF;
109
+ }
110
+
111
+ X265_CHECK(tmp == &curFrame, "framelist: pic being removed was not in list\n"); // verify pic is in this list
112
+#endif
113
+
114
+ m_count--;
115
+ if (m_count)
116
+ {
117
+ if (m_start == &curFrame)
118
+ m_start = curFrame.m_nextMCSTF;
119
+ if (m_end == &curFrame)
120
+ m_end = curFrame.m_prevMCSTF;
121
+
122
+ if (curFrame.m_nextMCSTF)
123
+ curFrame.m_nextMCSTF->m_prevMCSTF = curFrame.m_prevMCSTF;
124
+ if (curFrame.m_prevMCSTF)
125
+ curFrame.m_prevMCSTF->m_nextMCSTF = curFrame.m_nextMCSTF;
126
+ }
127
+ else
128
+ {
129
+ m_start = m_end = NULL;
130
+ }
131
+
132
+ curFrame.m_nextMCSTF = curFrame.m_prevMCSTF = NULL;
133
+}
134
x265_3.5.tar.gz/source/common/piclist.h -> x265_3.6.tar.gz/source/common/piclist.h
Changed
33
1
2
3
/** Push picture to end of the list */
4
void pushBack(Frame& pic);
5
+ void pushBackMCSTF(Frame& pic);
6
7
/** Push picture to beginning of the list */
8
void pushFront(Frame& pic);
9
+ void pushFrontMCSTF(Frame& pic);
10
11
/** Pop picture from end of the list */
12
Frame* popBack();
13
+ Frame* popBackMCSTF();
14
15
/** Pop picture from beginning of the list */
16
Frame* popFront();
17
18
/** Find frame with specified POC */
19
Frame* getPOC(int poc);
20
+ /* Find next MCSTF frame with specified POC */
21
+ Frame* getPOCMCSTF(int poc);
22
23
/** Get the current Frame from the list **/
24
Frame* getCurFrame(void);
25
26
/** Remove picture from list */
27
void remove(Frame& pic);
28
+ /* Remove MCSTF picture from list */
29
+ void removeMCSTF(Frame& pic);
30
31
Frame* first() { return m_start; }
32
33
x265_3.5.tar.gz/source/common/picyuv.cpp -> x265_3.6.tar.gz/source/common/picyuv.cpp
Changed
60
1
2
return false;
3
}
4
5
+/*Copy pixels from the picture buffer of a frame to picture buffer of another frame*/
6
+void PicYuv::copyFromFrame(PicYuv* source)
7
+{
8
+ uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
9
+
10
+ int maxHeight = numCuInHeight * m_param->maxCUSize;
11
+ memcpy(m_picBuf0, source->m_picBuf0, sizeof(pixel)* m_stride * (maxHeight + (m_lumaMarginY * 2)));
12
+ m_picOrg0 = m_picBuf0 + m_lumaMarginY * m_stride + m_lumaMarginX;
13
+
14
+ if (m_picCsp != X265_CSP_I400)
15
+ {
16
+ memcpy(m_picBuf1, source->m_picBuf1, sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
17
+ memcpy(m_picBuf2, source->m_picBuf2, sizeof(pixel)* m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
18
+
19
+ m_picOrg1 = m_picBuf1 + m_chromaMarginY * m_strideC + m_chromaMarginX;
20
+ m_picOrg2 = m_picBuf2 + m_chromaMarginY * m_strideC + m_chromaMarginX;
21
+ }
22
+ else
23
+ {
24
+ m_picBuf1 = m_picBuf2 = NULL;
25
+ m_picOrg1 = m_picOrg2 = NULL;
26
+ }
27
+}
28
+
29
+bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)
30
+{
31
+ m_param = param;
32
+ m_picWidth = m_param->sourceWidth / scaleFactor;
33
+ m_picHeight = m_param->sourceHeight / scaleFactor;
34
+
35
+ m_picCsp = m_param->internalCsp;
36
+ m_hChromaShift = CHROMA_H_SHIFT(m_picCsp);
37
+ m_vChromaShift = CHROMA_V_SHIFT(m_picCsp);
38
+
39
+ uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;
40
+ uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
41
+
42
+ m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction
43
+ m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction
44
+ m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
45
+
46
+ int maxHeight = numCuInHeight * param->maxCUSize;
47
+ CHECKED_MALLOC_ZERO(m_picBuf0, pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
48
+ m_picOrg0 = m_picBuf0 + m_lumaMarginY * m_stride + m_lumaMarginX;
49
+ m_picBuf1 = m_picBuf2 = NULL;
50
+ m_picOrg1 = m_picOrg2 = NULL;
51
+ return true;
52
+
53
+fail:
54
+ return false;
55
+}
56
+
57
int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
58
{
59
m_picWidth = picWidth;
60
x265_3.5.tar.gz/source/common/picyuv.h -> x265_3.6.tar.gz/source/common/picyuv.h
Changed
15
1
2
PicYuv();
3
4
bool create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
5
+ bool createScaledPicYUV(x265_param* param, uint8_t scaleFactor);
6
bool createOffsets(const SPS& sps);
7
void destroy();
8
int getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
9
10
void copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
11
+ void copyFromFrame(PicYuv* source);
12
13
intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetCctuAddr + m_buOffsetCabsPartIdx; }
14
15
x265_3.5.tar.gz/source/common/pixel.cpp -> x265_3.6.tar.gz/source/common/pixel.cpp
Changed
51
1
2
{
3
int satd = 0;
4
5
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
6
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
7
pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
8
#endif
9
10
11
{
12
int satd = 0;
13
14
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
15
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
16
pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
17
#endif
18
19
20
}
21
}
22
23
+static
24
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
25
+{
26
+ for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)
27
+ {
28
+ const pixel *inRow = src0;
29
+ const pixel *inRowBelow = src0 + src_stride;
30
+ pixel *target = dst0;
31
+ for (int x = 0; x < width; x++)
32
+ {
33
+ target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;
34
+ inRow += 2;
35
+ inRowBelow += 2;
36
+ }
37
+ }
38
+}
39
+
40
/* structural similarity metric */
41
static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
42
{
43
44
p.cu[BLOCK_16x16].normFact = normFact_c;
45
p.cu[BLOCK_32x32].normFact = normFact_c;
46
p.cu[BLOCK_64x64].normFact = normFact_c;
47
+ /* SubSample Luma*/
48
+ p.frameSubSampleLuma = frame_subsample_luma;
49
}
50
}
51
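For reference, the new frame_subsample_luma C routine above halves the luma plane in each dimension by averaging every 2x2 neighbourhood with intermediate rounding; the width/height arguments describe the subsampled output. The following is a minimal standalone sketch of that arithmetic, not the x265 code itself: the 8-bit pixel typedef, the subsample_luma helper name and the 4x4 test frame are illustrative assumptions (in x265 the pixel type depends on HIGH_BIT_DEPTH and the routine is reached through p.frameSubSampleLuma).

    // Minimal sketch of the 2x2 box downsampling shown in the diff above.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    typedef uint8_t pixel;

    static void subsample_luma(const pixel* src, pixel* dst, ptrdiff_t srcStride,
                               ptrdiff_t dstStride, int dstWidth, int dstHeight)
    {
        for (int y = 0; y < dstHeight; y++, src += 2 * srcStride, dst += dstStride)
        {
            const pixel* row = src;
            const pixel* rowBelow = src + srcStride;
            for (int x = 0; x < dstWidth; x++)
            {
                // average each 2x2 block with intermediate rounding, as in the diff
                dst[x] = (pixel)((((row[0] + rowBelow[0] + 1) >> 1) +
                                  ((row[1] + rowBelow[1] + 1) >> 1) + 1) >> 1);
                row += 2;
                rowBelow += 2;
            }
        }
    }

    int main()
    {
        const int W = 4, H = 4;
        std::vector<pixel> src = { 10, 20, 30, 40,
                                   50, 60, 70, 80,
                                   90, 100, 110, 120,
                                   130, 140, 150, 160 };
        std::vector<pixel> dst((W / 2) * (H / 2));
        subsample_luma(src.data(), dst.data(), W, W / 2, W / 2, H / 2);
        for (size_t i = 0; i < dst.size(); i++)
            printf("%d ", dst[i]);   // prints: 35 55 115 135
        return 0;
    }

The SIMD variants registered in asm-primitives.cpp (sse2/ssse3/avx/avx2/xop) provide the same reduction for the MCSTF lookahead path.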
x265_3.5.tar.gz/source/common/ppc/intrapred_altivec.cpp -> x265_3.6.tar.gz/source/common/ppc/intrapred_altivec.cpp
Changed
10
1
2
#include <assert.h>
3
#include <math.h>
4
#include <cmath>
5
-#include <linux/types.h>
6
+#include <sys/types.h>
7
#include <stdlib.h>
8
#include <stdio.h>
9
#include <stdint.h>
10
x265_3.5.tar.gz/source/common/primitives.h -> x265_3.6.tar.gz/source/common/primitives.h
Changed
28
1
2
typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
3
typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
4
typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
5
+/* SubSampling Luma */
6
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
/* Function pointers to optimized encoder primitives. Each pointer can reference
8
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
9
struct EncoderPrimitives
10
11
12
downscale_t frameInitLowres;
13
downscale_t frameInitLowerRes;
14
+ /* Sub Sample Luma */
15
+ downscaleluma_t frameSubSampleLuma;
16
cutree_propagate_cost propagateCost;
17
cutree_fix8_unpack fix8Unpack;
18
cutree_fix8_pack fix8Pack;
19
20
21
#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
22
extern "C" {
23
-#include "aarch64/pixel-util.h"
24
+#include "aarch64/fun-decls.h"
25
}
26
#endif
27
28
x265_3.6.tar.gz/source/common/ringmem.cpp
Added
201
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#include "ringmem.h"
26
+
27
+#ifndef _WIN32
28
+#include <sys/mman.h>
29
+#endif ////< _WIN32
30
+
31
+#ifdef _WIN32
32
+#define X265_SHARED_MEM_NAME "Local\\_x265_shr_mem_"
33
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME "_x265_semW_"
34
+#define X265_SEMAPHORE_RINGMEM_READER_NAME "_x265_semR_"
35
+#else /* POSIX / pthreads */
36
+#define X265_SHARED_MEM_NAME "/tmp/_x265_shr_mem_"
37
+#define X265_SEMAPHORE_RINGMEM_WRITER_NAME "/tmp/_x265_semW_"
38
+#define X265_SEMAPHORE_RINGMEM_READER_NAME "/tmp/_x265_semR_"
39
+#endif
40
+
41
+#define RINGMEM_ALLIGNMENT 64
42
+
43
+namespace X265_NS {
44
+ RingMem::RingMem()
45
+ : m_initialized(false)
46
+ , m_protectRW(false)
47
+ , m_itemSize(0)
48
+ , m_itemCnt(0)
49
+ , m_dataPool(NULL)
50
+ , m_shrMem(NULL)
51
+#ifdef _WIN32
52
+ , m_handle(NULL)
53
+#else //_WIN32
54
+ , m_filepath(NULL)
55
+#endif //_WIN32
56
+ , m_writeSem(NULL)
57
+ , m_readSem(NULL)
58
+ {
59
+ }
60
+
61
+
62
+ RingMem::~RingMem()
63
+ {
64
+ }
65
+
66
+ bool RingMem::skipRead(int32_t cnt) {
67
+ if (!m_initialized)
68
+ {
69
+ return false;
70
+ }
71
+
72
+ if (m_protectRW)
73
+ {
74
+ for (int i = 0; i < cnt; i++)
75
+ {
76
+ m_readSem->take();
77
+ }
78
+ }
79
+
80
+ ATOMIC_ADD(&m_shrMem->m_read, cnt);
81
+
82
+ if (m_protectRW)
83
+ {
84
+ m_writeSem->give(cnt);
85
+ }
86
+
87
+ return true;
88
+ }
89
+
90
+ bool RingMem::skipWrite(int32_t cnt) {
91
+ if (!m_initialized)
92
+ {
93
+ return false;
94
+ }
95
+
96
+ if (m_protectRW)
97
+ {
98
+ for (int i = 0; i < cnt; i++)
99
+ {
100
+ m_writeSem->take();
101
+ }
102
+ }
103
+
104
+ ATOMIC_ADD(&m_shrMem->m_write, cnt);
105
+
106
+ if (m_protectRW)
107
+ {
108
+ m_readSem->give(cnt);
109
+ }
110
+
111
+ return true;
112
+ }
113
+
114
+ ///< initialize
115
+ bool RingMem::init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW)
116
+ {
117
+ ///< check parameters
118
+ if (itemSize <= 0 || itemCnt <= 0 || NULL == name)
119
+ {
120
+ ///< invalid parameters
121
+ return false;
122
+ }
123
+
124
+ if (!m_initialized)
125
+ {
126
+ ///< formating names
127
+ char nameBuf[MAX_SHR_NAME_LEN] = { 0 };
128
+
129
+ ///< shared memory name
130
+ snprintf(nameBuf, sizeof(nameBuf) - 1, "%s%s", X265_SHARED_MEM_NAME, name);
131
+
132
+ ///< create or open shared memory
133
+ bool newCreated = false;
134
+
135
+ ///< calculate the size of the shared memory
136
+ int32_t shrMemSize = (itemSize * itemCnt + sizeof(ShrMemCtrl) + RINGMEM_ALLIGNMENT - 1) & ~(RINGMEM_ALLIGNMENT - 1);
137
+
138
+#ifdef _WIN32
139
+ HANDLE h = OpenFileMappingA(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, nameBuf);
140
+ if (!h)
141
+ {
142
+ h = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, shrMemSize, nameBuf);
143
+
144
+ if (!h)
145
+ {
146
+ return false;
147
+ }
148
+
149
+ newCreated = true;
150
+ }
151
+
152
+ void *pool = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
153
+
154
+ ///< should not close the handle here, otherwise the OpenFileMapping would fail
155
+ //CloseHandle(h);
156
+ m_handle = h;
157
+
158
+ if (!pool)
159
+ {
160
+ return false;
161
+ }
162
+
163
+#else /* POSIX / pthreads */
164
+ mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
165
+ int flag = O_RDWR;
166
+ int shrfd = -1;
167
+ if ((shrfd = open(nameBuf, flag, mode)) < 0)
168
+ {
169
+ flag |= O_CREAT;
170
+
171
+ shrfd = open(nameBuf, flag, mode);
172
+ if (shrfd < 0)
173
+ {
174
+ return false;
175
+ }
176
+ newCreated = true;
177
+
178
+ lseek(shrfd, shrMemSize - 1, SEEK_SET);
179
+
180
+ if (-1 == write(shrfd, "\0", 1))
181
+ {
182
+ close(shrfd);
183
+ return false;
184
+ }
185
+
186
+ if (lseek(shrfd, 0, SEEK_END) < shrMemSize)
187
+ {
188
+ close(shrfd);
189
+ return false;
190
+ }
191
+ }
192
+
193
+ void *pool = mmap(0,
194
+ shrMemSize,
195
+ PROT_READ | PROT_WRITE,
196
+ MAP_SHARED,
197
+ shrfd,
198
+ 0);
199
+
200
+ close(shrfd);
201
x265_3.6.tar.gz/source/common/ringmem.h
Added
92
1
2
+/*****************************************************************************
3
+ * Copyright (C) 2013-2017 MulticoreWare, Inc
4
+ *
5
+ * Authors: liwei <liwei@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_RINGMEM_H
26
+#define X265_RINGMEM_H
27
+
28
+#include "common.h"
29
+#include "threading.h"
30
+
31
+#if _MSC_VER
32
+#define snprintf _snprintf
33
+#define strdup _strdup
34
+#endif
35
+
36
+namespace X265_NS {
37
+
38
+#define MAX_SHR_NAME_LEN 256
39
+
40
+ class RingMem {
41
+ public:
42
+ RingMem();
43
+ ~RingMem();
44
+
45
+ bool skipRead(int32_t cnt);
46
+
47
+ bool skipWrite(int32_t cnt);
48
+
49
+ ///< initialize
50
+ ///< protectRW: if use the semaphore the protect the write and read operation.
51
+ bool init(int32_t itemSize, int32_t itemCnt, const char *name, bool protectRW = false);
52
+ ///< finalize
53
+ void release();
54
+
55
+ typedef void(*fnRWSharedData)(void *dst, void *src, int32_t size);
56
+
57
+ ///< data read
58
+ bool readNext(void* dst, fnRWSharedData callback);
59
+ ///< data write
60
+ bool writeData(void *data, fnRWSharedData callback);
61
+
62
+ private:
63
+ bool m_initialized;
64
+ bool m_protectRW;
65
+
66
+ int32_t m_itemSize;
67
+ int32_t m_itemCnt;
68
+ ///< data pool
69
+ void *m_dataPool;
70
+ typedef struct {
71
+ ///< index to write
72
+ int32_t m_write;
73
+ ///< index to read
74
+ int32_t m_read;
75
+
76
+ }ShrMemCtrl;
77
+
78
+ ShrMemCtrl *m_shrMem;
79
+#ifdef _WIN32
80
+ void *m_handle;
81
+#else // _WIN32
82
+ char *m_filepath;
83
+#endif // _WIN32
84
+
85
+ ///< Semaphores
86
+ NamedSemaphore *m_writeSem;
87
+ NamedSemaphore *m_readSem;
88
+ };
89
+};
90
+
91
+#endif // ifndef X265_RINGMEM_H
92
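A minimal usage sketch for the RingMem class declared above, based only on the signatures in ringmem.h. The SharedItem payload, the copy callback and the "demo_ring" name are illustrative assumptions; this class appears to back the shared-memory data exchange selected via the new rc.sharedMemName parameter.

    #include <cstring>
    #include "common.h"
    #include "ringmem.h"

    using namespace X265_NS;

    struct SharedItem { int32_t poc; double qScale; };

    // fnRWSharedData callback: plain copy in and out of the shared pool
    static void copyItem(void* dst, void* src, int32_t size)
    {
        memcpy(dst, src, size);
    }

    static bool ringDemo()
    {
        RingMem ring;
        // 8 items of sizeof(SharedItem) bytes, semaphore-protected for cross-process use
        if (!ring.init((int32_t)sizeof(SharedItem), 8, "demo_ring", true))
            return false;

        SharedItem out = { 0, 1.0 };
        bool ok = ring.writeData(&out, copyItem);

        SharedItem in;
        ok = ok && ring.readNext(&in, copyItem);

        ring.release();
        return ok;
    }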
x265_3.5.tar.gz/source/common/slice.h -> x265_3.6.tar.gz/source/common/slice.h
Changed
35
1
2
HRDInfo hrdParameters;
3
ProfileTierLevel ptl;
4
uint32_t maxTempSubLayers;
5
- uint32_t numReorderPics;
6
- uint32_t maxDecPicBuffering;
7
- uint32_t maxLatencyIncrease;
8
+ uint32_t numReorderPics[MAX_T_LAYERS];
9
+ uint32_t maxDecPicBuffering[MAX_T_LAYERS];
10
+ uint32_t maxLatencyIncrease[MAX_T_LAYERS];
11
};
12
13
struct Window
14
15
uint32_t maxAMPDepth;
16
17
uint32_t maxTempSubLayers; // max number of Temporal Sub layers
18
- uint32_t maxDecPicBuffering; // these are dups of VPS values
19
- uint32_t maxLatencyIncrease;
20
- int numReorderPics;
21
+ uint32_t maxDecPicBuffering[MAX_T_LAYERS]; // these are dups of VPS values
22
+ uint32_t maxLatencyIncrease[MAX_T_LAYERS];
23
+ int numReorderPics[MAX_T_LAYERS];
24
25
RPS spsrps[MAX_NUM_SHORT_TERM_RPS];
26
int spsrpsNum;
27
28
int m_iNumRPSInSPS;
29
const x265_param *m_param;
30
int m_fieldNum;
31
+ Frame* m_mcstfRefFrameList[2][MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
32
33
Slice()
34
{
35
x265_3.6.tar.gz/source/common/temporalfilter.cpp
Added
201
1
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+#include "common.h"
25
+#include "temporalfilter.h"
26
+#include "primitives.h"
27
+
28
+#include "frame.h"
29
+#include "slice.h"
30
+#include "framedata.h"
31
+#include "analysis.h"
32
+
33
+using namespace X265_NS;
34
+
35
+void OrigPicBuffer::addPicture(Frame* inFrame)
36
+{
37
+ m_mcstfPicList.pushFrontMCSTF(*inFrame);
38
+}
39
+
40
+void OrigPicBuffer::addEncPicture(Frame* inFrame)
41
+{
42
+ m_mcstfOrigPicFreeList.pushFrontMCSTF(*inFrame);
43
+}
44
+
45
+void OrigPicBuffer::addEncPictureToPicList(Frame* inFrame)
46
+{
47
+ m_mcstfOrigPicList.pushFrontMCSTF(*inFrame);
48
+}
49
+
50
+OrigPicBuffer::~OrigPicBuffer()
51
+{
52
+ while (!m_mcstfOrigPicList.empty())
53
+ {
54
+ Frame* curFrame = m_mcstfOrigPicList.popBackMCSTF();
55
+ curFrame->destroy();
56
+ delete curFrame;
57
+ }
58
+
59
+ while (!m_mcstfOrigPicFreeList.empty())
60
+ {
61
+ Frame* curFrame = m_mcstfOrigPicFreeList.popBackMCSTF();
62
+ curFrame->destroy();
63
+ delete curFrame;
64
+ }
65
+}
66
+
67
+void OrigPicBuffer::setOrigPicList(Frame* inFrame, int frameCnt)
68
+{
69
+ Slice* slice = inFrame->m_encData->m_slice;
70
+ uint8_t j = 0;
71
+ for (int iterPOC = (inFrame->m_poc - inFrame->m_mcstf->m_range);
72
+ iterPOC <= (inFrame->m_poc + inFrame->m_mcstf->m_range); iterPOC++)
73
+ {
74
+ if (iterPOC != inFrame->m_poc)
75
+ {
76
+ if (iterPOC < 0)
77
+ continue;
78
+ if (iterPOC >= frameCnt)
79
+ break;
80
+
81
+ Frame *iterFrame = m_mcstfPicList.getPOCMCSTF(iterPOC);
82
+ X265_CHECK(iterFrame, "Reference frame not found in OPB");
83
+ if (iterFrame != NULL)
84
+ {
85
+ slice->m_mcstfRefFrameList[1][j] = iterFrame;
86
+ iterFrame->m_refPicCnt[1]--;
87
+ }
88
+
89
+ iterFrame = m_mcstfOrigPicList.getPOCMCSTF(iterPOC);
90
+ if (iterFrame != NULL)
91
+ {
92
+
93
+ slice->m_mcstfRefFrameList[1][j] = iterFrame;
94
+
95
+ iterFrame->m_refPicCnt[1]--;
96
+ Frame *cFrame = m_mcstfOrigPicList.getPOCMCSTF(inFrame->m_poc);
97
+ X265_CHECK(cFrame, "Reference frame not found in encoded OPB");
98
+ cFrame->m_refPicCnt[1]--;
99
+ }
100
+ j++;
101
+ }
102
+ }
103
+}
104
+
105
+void OrigPicBuffer::recycleOrigPicList()
106
+{
107
+ Frame *iterFrame = m_mcstfPicList.first();
108
+
109
+ while (iterFrame)
110
+ {
111
+ Frame *curFrame = iterFrame;
112
+ iterFrame = iterFrame->m_nextMCSTF;
113
+ if (!curFrame->m_refPicCnt[1])
114
+ {
115
+ m_mcstfPicList.removeMCSTF(*curFrame);
116
+ iterFrame = m_mcstfPicList.first();
117
+ }
118
+ }
119
+
120
+ iterFrame = m_mcstfOrigPicList.first();
121
+
122
+ while (iterFrame)
123
+ {
124
+ Frame *curFrame = iterFrame;
125
+ iterFrame = iterFrame->m_nextMCSTF;
126
+ if (!curFrame->m_refPicCnt[1])
127
+ {
128
+ m_mcstfOrigPicList.removeMCSTF(*curFrame);
129
+ *curFrame->m_isSubSampled = false;
130
+ m_mcstfOrigPicFreeList.pushFrontMCSTF(*curFrame);
131
+ iterFrame = m_mcstfOrigPicList.first();
132
+ }
133
+ }
134
+}
135
+
136
+void OrigPicBuffer::addPictureToFreelist(Frame* inFrame)
137
+{
138
+ m_mcstfOrigPicFreeList.pushBack(*inFrame);
139
+}
140
+
141
+TemporalFilter::TemporalFilter()
142
+{
143
+ m_sourceWidth = 0;
144
+ m_sourceHeight = 0,
145
+ m_QP = 0;
146
+ m_sliceTypeConfig = 3;
147
+ m_numRef = 0;
148
+ m_useSADinME = 1;
149
+
150
+ m_range = 2;
151
+ m_chromaFactor = 0.55;
152
+ m_sigmaMultiplier = 9.0;
153
+ m_sigmaZeroPoint = 10.0;
154
+ m_motionVectorFactor = 16;
155
+}
156
+
157
+void TemporalFilter::init(const x265_param* param)
158
+{
159
+ m_param = param;
160
+ m_bitDepth = param->internalBitDepth;
161
+ m_sourceWidth = param->sourceWidth;
162
+ m_sourceHeight = param->sourceHeight;
163
+ m_internalCsp = param->internalCsp;
164
+ m_numComponents = (m_internalCsp != X265_CSP_I400) ? MAX_NUM_COMPONENT : 1;
165
+
166
+ m_metld = new MotionEstimatorTLD;
167
+
168
+ predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
169
+}
170
+
171
+int TemporalFilter::createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param)
172
+{
173
+ CHECKED_MALLOC_ZERO(refFrame->mvs, MV, sizeof(MV)* ((m_sourceWidth ) / 4) * ((m_sourceHeight ) / 4));
174
+ refFrame->mvsStride = m_sourceWidth / 4;
175
+ CHECKED_MALLOC_ZERO(refFrame->mvs0, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
176
+ refFrame->mvsStride0 = m_sourceWidth / 16;
177
+ CHECKED_MALLOC_ZERO(refFrame->mvs1, MV, sizeof(MV)* ((m_sourceWidth ) / 16) * ((m_sourceHeight ) / 16));
178
+ refFrame->mvsStride1 = m_sourceWidth / 16;
179
+ CHECKED_MALLOC_ZERO(refFrame->mvs2, MV, sizeof(MV)* ((m_sourceWidth ) / 16)*((m_sourceHeight ) / 16));
180
+ refFrame->mvsStride2 = m_sourceWidth / 16;
181
+
182
+ CHECKED_MALLOC_ZERO(refFrame->noise, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
183
+ CHECKED_MALLOC_ZERO(refFrame->error, int, sizeof(int) * ((m_sourceWidth) / 4) * ((m_sourceHeight) / 4));
184
+
185
+ refFrame->slicetype = X265_TYPE_AUTO;
186
+
187
+ refFrame->compensatedPic = new PicYuv;
188
+ refFrame->compensatedPic->create(param, true);
189
+
190
+ return 1;
191
+fail:
192
+ return 0;
193
+}
194
+
195
+int TemporalFilter::motionErrorLumaSAD(
196
+ PicYuv *orig,
197
+ PicYuv *buffer,
198
+ int x,
199
+ int y,
200
+ int dx,
201
x265_3.6.tar.gz/source/common/temporalfilter.h
Added
187
1
2
+/*****************************************************************************
3
+* Copyright (C) 2013-2021 MulticoreWare, Inc
4
+*
5
+ * Authors: Ashok Kumar Mishra <ashok@multicorewareinc.com>
6
+ *
7
+* This program is free software; you can redistribute it and/or modify
8
+* it under the terms of the GNU General Public License as published by
9
+* the Free Software Foundation; either version 2 of the License, or
10
+* (at your option) any later version.
11
+*
12
+* This program is distributed in the hope that it will be useful,
13
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+* GNU General Public License for more details.
16
+*
17
+* You should have received a copy of the GNU General Public License
18
+* along with this program; if not, write to the Free Software
19
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20
+*
21
+* This program is also available under a commercial proprietary license.
22
+* For more information, contact us at license @ x265.com.
23
+*****************************************************************************/
24
+
25
+#ifndef X265_TEMPORAL_FILTER_H
26
+#define X265_TEMPORAL_FILTER_H
27
+
28
+#include "x265.h"
29
+#include "picyuv.h"
30
+#include "mv.h"
31
+#include "piclist.h"
32
+#include "yuv.h"
33
+#include "motion.h"
34
+
35
+const int s_interpolationFilter[16][8] =
36
+{
37
+ { 0, 0, 0, 64, 0, 0, 0, 0 }, //0
38
+ { 0, 1, -3, 64, 4, -2, 0, 0 }, //1 -->-->
39
+ { 0, 1, -6, 62, 9, -3, 1, 0 }, //2 -->
40
+ { 0, 2, -8, 60, 14, -5, 1, 0 }, //3 -->-->
41
+ { 0, 2, -9, 57, 19, -7, 2, 0 }, //4
42
+ { 0, 3, -10, 53, 24, -8, 2, 0 }, //5 -->-->
43
+ { 0, 3, -11, 50, 29, -9, 2, 0 }, //6 -->
44
+ { 0, 3, -11, 44, 35, -10, 3, 0 }, //7 -->-->
45
+ { 0, 1, -7, 38, 38, -7, 1, 0 }, //8
46
+ { 0, 3, -10, 35, 44, -11, 3, 0 }, //9 -->-->
47
+ { 0, 2, -9, 29, 50, -11, 3, 0 }, //10-->
48
+ { 0, 2, -8, 24, 53, -10, 3, 0 }, //11-->-->
49
+ { 0, 2, -7, 19, 57, -9, 2, 0 }, //12
50
+ { 0, 1, -5, 14, 60, -8, 2, 0 }, //13-->-->
51
+ { 0, 1, -3, 9, 62, -6, 1, 0 }, //14-->
52
+ { 0, 0, -2, 4, 64, -3, 1, 0 } //15-->-->
53
+};
54
+
55
+const double s_refStrengths[3][4] =
56
+{ // abs(POC offset)
57
+ // 1, 2 3 4
58
+ {0.85, 0.57, 0.41, 0.33}, // m_range * 2
59
+ {1.13, 0.97, 0.81, 0.57}, // m_range
60
+ {0.30, 0.30, 0.30, 0.30} // otherwise
61
+};
62
+
63
+namespace X265_NS {
64
+ class OrigPicBuffer
65
+ {
66
+ public:
67
+ PicList m_mcstfPicList;
68
+ PicList m_mcstfOrigPicFreeList;
69
+ PicList m_mcstfOrigPicList;
70
+
71
+ ~OrigPicBuffer();
72
+ void addPicture(Frame*);
73
+ void addEncPicture(Frame*);
74
+ void setOrigPicList(Frame*, int);
75
+ void recycleOrigPicList();
76
+ void addPictureToFreelist(Frame*);
77
+ void addEncPictureToPicList(Frame*);
78
+ };
79
+
80
+ struct MotionEstimatorTLD
81
+ {
82
+ MotionEstimate me;
83
+
84
+ MotionEstimatorTLD()
85
+ {
86
+ me.init(X265_CSP_I400);
87
+ me.setQP(X265_LOOKAHEAD_QP);
88
+ }
89
+
90
+ ~MotionEstimatorTLD() {}
91
+ };
92
+
93
+ struct TemporalFilterRefPicInfo
94
+ {
95
+ PicYuv* picBuffer;
96
+ PicYuv* picBufferSubSampled2;
97
+ PicYuv* picBufferSubSampled4;
98
+ MV* mvs;
99
+ MV* mvs0;
100
+ MV* mvs1;
101
+ MV* mvs2;
102
+ uint32_t mvsStride;
103
+ uint32_t mvsStride0;
104
+ uint32_t mvsStride1;
105
+ uint32_t mvsStride2;
106
+ int* error;
107
+ int* noise;
108
+
109
+ int16_t origOffset;
110
+ bool isFilteredFrame;
111
+ PicYuv* compensatedPic;
112
+
113
+ int* isSubsampled;
114
+
115
+ int slicetype;
116
+ };
117
+
118
+ class TemporalFilter
119
+ {
120
+ public:
121
+ TemporalFilter();
122
+ ~TemporalFilter() {}
123
+
124
+ void init(const x265_param* param);
125
+
126
+ //private:
127
+ // Private static member variables
128
+ const x265_param *m_param;
129
+ int32_t m_bitDepth;
130
+ int m_range;
131
+ uint8_t m_numRef;
132
+ double m_chromaFactor;
133
+ double m_sigmaMultiplier;
134
+ double m_sigmaZeroPoint;
135
+ int m_motionVectorFactor;
136
+ int m_padding;
137
+
138
+ // Private member variables
139
+
140
+ int m_sourceWidth;
141
+ int m_sourceHeight;
142
+ int m_QP;
143
+
144
+ int m_internalCsp;
145
+ int m_numComponents;
146
+ uint8_t m_sliceTypeConfig;
147
+
148
+ MotionEstimatorTLD* m_metld;
149
+ Yuv predPUYuv;
150
+ int m_useSADinME;
151
+
152
+ int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);
153
+
154
+ void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);
155
+
156
+ void motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int bs,
157
+ MV *previous = 0, uint32_t prevmvStride = 0, int factor = 1);
158
+
159
+ void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,
160
+ MV *previous, uint32_t prevMvStride, int factor, int* minError);
161
+
162
+ int motionErrorLumaSSD(PicYuv *orig,
163
+ PicYuv *buffer,
164
+ int x,
165
+ int y,
166
+ int dx,
167
+ int dy,
168
+ int bs,
169
+ int besterror = 8 * 8 * 1024 * 1024);
170
+
171
+ int motionErrorLumaSAD(PicYuv *orig,
172
+ PicYuv *buffer,
173
+ int x,
174
+ int y,
175
+ int dx,
176
+ int dy,
177
+ int bs,
178
+ int besterror = 8 * 8 * 1024 * 1024);
179
+
180
+ void destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame);
181
+
182
+ void applyMotion(MV *mvs, uint32_t mvsStride, PicYuv *input, PicYuv *output);
183
+
184
+ };
185
+}
186
+#endif
187
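A sketch of how the s_refStrengths table above appears intended to be read, going only by its inline comments: the row is chosen by how many MCTF references are in use relative to m_range, the column by the absolute POC offset of the reference (the 16x8 s_interpolationFilter table holds the 1/16-pel 8-tap filters used when compensating those references). The actual selection happens inside TemporalFilter::bilateralFilter and may differ in detail; 'range', 'numRefs' and the helper below are illustrative only.

    #include <algorithm>
    #include <cstdlib>
    #include <cstdio>

    static const double refStrengths[3][4] =
    {
        { 0.85, 0.57, 0.41, 0.33 }, // numRefs == 2 * range
        { 1.13, 0.97, 0.81, 0.57 }, // numRefs == range
        { 0.30, 0.30, 0.30, 0.30 }  // otherwise
    };

    static double refStrength(int range, int numRefs, int pocOffset)
    {
        int row = (numRefs == 2 * range) ? 0 : (numRefs == range) ? 1 : 2;
        int col = std::min(std::max(std::abs(pocOffset), 1), 4) - 1; // offsets 1..4
        return refStrengths[row][col];
    }

    int main()
    {
        printf("%.2f\n", refStrength(2, 4, -1)); // 0.85: full window, nearest frame
        printf("%.2f\n", refStrength(2, 2, 2));  // 0.97: half window, offset 2
        return 0;
    }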
x265_3.5.tar.gz/source/common/threading.h -> x265_3.6.tar.gz/source/common/threading.h
Changed
201
1
2
*
3
* Authors: Steve Borho <steve@borho.org>
4
* Min Chen <chenm003@163.com>
5
+ liwei <liwei@multicorewareinc.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
10
int m_val;
11
};
12
13
+class NamedSemaphore
14
+{
15
+public:
16
+ NamedSemaphore() : m_sem(NULL)
17
+ {
18
+ }
19
+
20
+ ~NamedSemaphore()
21
+ {
22
+ }
23
+
24
+ bool create(const char* name, const int initcnt, const int maxcnt)
25
+ {
26
+ if(!m_sem)
27
+ {
28
+ m_sem = CreateSemaphoreA(NULL, initcnt, maxcnt, name);
29
+ }
30
+ return m_sem != NULL;
31
+ }
32
+
33
+ bool give(const int32_t cnt)
34
+ {
35
+ return ReleaseSemaphore(m_sem, (LONG)cnt, NULL) != FALSE;
36
+ }
37
+
38
+ bool take(const uint32_t time_out = INFINITE)
39
+ {
40
+ int32_t rt = WaitForSingleObject(m_sem, time_out);
41
+ return rt != WAIT_TIMEOUT && rt != WAIT_FAILED;
42
+ }
43
+
44
+ void release()
45
+ {
46
+ CloseHandle(m_sem);
47
+ m_sem = NULL;
48
+ }
49
+
50
+private:
51
+ HANDLE m_sem;
52
+};
53
+
54
#else /* POSIX / pthreads */
55
56
typedef pthread_t ThreadHandle;
57
58
int m_val;
59
};
60
61
+#define TIMEOUT_INFINITE 0xFFFFFFFF
62
+
63
+class NamedSemaphore
64
+{
65
+public:
66
+ NamedSemaphore()
67
+ : m_sem(NULL)
68
+#ifndef __APPLE__
69
+ , m_name(NULL)
70
+#endif //__APPLE__
71
+ {
72
+ }
73
+
74
+ ~NamedSemaphore()
75
+ {
76
+ }
77
+
78
+ bool create(const char* name, const int initcnt, const int maxcnt)
79
+ {
80
+ bool ret = false;
81
+
82
+ if (initcnt >= maxcnt)
83
+ {
84
+ return false;
85
+ }
86
+
87
+#ifdef __APPLE__
88
+ do
89
+ {
90
+ int32_t pshared = name != NULL ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
91
+
92
+ m_sem = (mac_sem_t *)malloc(sizeof(mac_sem_t));
93
+ if (!m_sem)
94
+ {
95
+ break;
96
+ }
97
+
98
+ if (pthread_mutexattr_init(&m_sem->mutexAttr))
99
+ {
100
+ break;
101
+ }
102
+
103
+ if (pthread_mutexattr_setpshared(&m_sem->mutexAttr, pshared))
104
+ {
105
+ break;
106
+ }
107
+
108
+ if (pthread_condattr_init(&m_sem->condAttr))
109
+ {
110
+ break;
111
+ }
112
+
113
+ if (pthread_condattr_setpshared(&m_sem->condAttr, pshared))
114
+ {
115
+ break;
116
+ }
117
+
118
+ if (pthread_mutex_init(&m_sem->mutex, &m_sem->mutexAttr))
119
+ {
120
+ break;
121
+ }
122
+
123
+ if (pthread_cond_init(&m_sem->cond, &m_sem->condAttr))
124
+ {
125
+ break;
126
+ }
127
+
128
+ m_sem->curCnt = initcnt;
129
+ m_sem->maxCnt = maxcnt;
130
+
131
+ ret = true;
132
+ } while (0);
133
+
134
+ if (!ret)
135
+ {
136
+ release();
137
+ }
138
+
139
+#else //__APPLE__
140
+ m_sem = sem_open(name, O_CREAT | O_EXCL, 0666, initcnt);
141
+ if (m_sem != SEM_FAILED)
142
+ {
143
+ m_name = strdup(name);
144
+ ret = true;
145
+ }
146
+ else
147
+ {
148
+ if (EEXIST == errno)
149
+ {
150
+ m_sem = sem_open(name, 0);
151
+ if (m_sem != SEM_FAILED)
152
+ {
153
+ m_name = strdup(name);
154
+ ret = true;
155
+ }
156
+ }
157
+ }
158
+#endif //__APPLE__
159
+
160
+ return ret;
161
+ }
162
+
163
+ bool give(const int32_t cnt)
164
+ {
165
+ if (!m_sem)
166
+ {
167
+ return false;
168
+ }
169
+
170
+#ifdef __APPLE__
171
+ if (pthread_mutex_lock(&m_sem->mutex))
172
+ {
173
+ return false;
174
+ }
175
+
176
+ int oldCnt = m_sem->curCnt;
177
+ m_sem->curCnt += cnt;
178
+ if (m_sem->curCnt > m_sem->maxCnt)
179
+ {
180
+ m_sem->curCnt = m_sem->maxCnt;
181
+ }
182
+
183
+ bool ret = true;
184
+ if (!oldCnt)
185
+ {
186
+ ret = 0 == pthread_cond_broadcast(&m_sem->cond);
187
+ }
188
+
189
+ if (pthread_mutex_unlock(&m_sem->mutex))
190
+ {
191
+ return false;
192
+ }
193
+
194
+ return ret;
195
+#else //__APPLE__
196
+ int ret = 0;
197
+ int32_t curCnt = cnt;
198
+ while (curCnt-- && !ret) {
199
+ ret = sem_post(m_sem);
200
+ }
201
x265_3.5.tar.gz/source/common/threadpool.cpp -> x265_3.6.tar.gz/source/common/threadpool.cpp
Changed
10
1
2
/* limit threads based on param->numaPools
3
* For windows because threads can't be allocated to live across sockets
4
* changing the default behavior to be per-socket pools -- FIXME */
5
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 || HAVE_LIBNUMA
7
if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
8
{
9
char poolString[50] = "";
10
x265_3.5.tar.gz/source/common/version.cpp -> x265_3.6.tar.gz/source/common/version.cpp
Changed
10
1
2
#define ONOS "Unk-OS"
3
#endif
4
5
-#if X86_64
6
+#if defined(_LP64) || defined(_WIN64)
7
#define BITS "64 bit"
8
#else
9
#define BITS "32 bit"
10
x265_3.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_3.6.tar.gz/source/common/x86/asm-primitives.cpp
Changed
85
1
2
3
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
4
p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
5
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
6
// TODO: the planecopy_sp is really planecopy_SC now, must be fix it
7
//p.planecopy_sp = PFX(downShift_16_sse2);
8
p.planecopy_sp_shl = PFX(upShift_16_sse2);
9
10
{
11
ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
12
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
13
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
14
15
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
16
ALL_LUMA_PU(satd, pixel_satd, ssse3);
17
18
p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
19
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
20
p.propagateCost = PFX(mbtree_propagate_cost_avx);
21
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
22
}
23
if (cpuMask & X265_CPU_XOP)
24
{
25
26
LUMA_VAR(xop);
27
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
28
p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
29
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
30
}
31
if (cpuMask & X265_CPU_AVX2)
32
{
33
34
35
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
36
p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
37
+
38
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
39
+
40
p.propagateCost = PFX(mbtree_propagate_cost_avx2);
41
p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
42
p.fix8Pack = PFX(cutree_fix8_pack_avx2);
43
44
//p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
45
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
46
p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
47
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
48
49
ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
50
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
51
52
ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
53
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
54
55
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
56
+
57
ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
58
ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
59
ASSIGN2(p.pu[LUMA_8x16].convert_p2s, filterPixelToShort_8x16_ssse3);
60
61
p.frameInitLowres = PFX(frame_init_lowres_core_avx);
62
p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
63
p.propagateCost = PFX(mbtree_propagate_cost_avx);
64
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
65
}
66
if (cpuMask & X265_CPU_XOP)
67
{
68
69
p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
70
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
71
p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
72
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
73
74
}
75
#if X86_64
76
77
p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
78
p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
79
80
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
81
+
82
if (cpuMask & X265_CPU_BMI2)
83
{
84
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
85
x265_3.5.tar.gz/source/common/x86/const-a.asm -> x265_3.6.tar.gz/source/common/x86/const-a.asm
Changed
10
1
2
const pw_2000, times 16 dw 0x2000
3
const pw_8000, times 8 dw 0x8000
4
const pw_3fff, times 16 dw 0x3fff
5
-const pw_32_0, times 4 dw 32,
6
+const pw_32_0, times 4 dw 32
7
times 4 dw 0
8
const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1)
9
10
x265_3.5.tar.gz/source/common/x86/h-ipfilter8.asm -> x265_3.6.tar.gz/source/common/x86/h-ipfilter8.asm
Changed
20
1
2
ALIGN 32
3
interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
4
5
+ALIGN 32
6
+const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
7
+
8
SECTION .text
9
10
cextern pw_1
11
12
13
RET
14
15
-ALIGN 32
16
-const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
17
18
%macro FILTER_H4_w6 3
19
movu %1, [srcq - 1]
20
x265_3.5.tar.gz/source/common/x86/mc-a2.asm -> x265_3.6.tar.gz/source/common/x86/mc-a2.asm
Changed
201
1
2
FRAME_INIT_LOWRES
3
%endif
4
5
+%macro SUBSAMPLEFILT8x4 7
6
+ mova %3, r0+%7
7
+ mova %4, r0+r2+%7
8
+ pavgb %3, %4
9
+ pavgb %4, r0+r2*2+%7
10
+ PALIGNR %1, %3, 1, m6
11
+ PALIGNR %2, %4, 1, m6
12
+%if cpuflag(xop)
13
+ pavgb %1, %3
14
+ pavgb %2, %4
15
+%else
16
+ pavgb %1, %3
17
+ pavgb %2, %4
18
+ psrlw %5, %1, 8
19
+ psrlw %6, %2, 8
20
+ pand %1, m7
21
+ pand %2, m7
22
+%endif
23
+%endmacro
24
+
25
+%macro SUBSAMPLEFILT32x4U 1
26
+ movu m1, r0+r2
27
+ pavgb m0, m1, r0
28
+ movu m3, r0+r2+1
29
+ pavgb m2, m3, r0+1
30
+ pavgb m1, r0+r2*2
31
+ pavgb m3, r0+r2*2+1
32
+ pavgb m0, m2
33
+ pavgb m1, m3
34
+
35
+ movu m3, r0+r2+mmsize
36
+ pavgb m2, m3, r0+mmsize
37
+ movu m5, r0+r2+1+mmsize
38
+ pavgb m4, m5, r0+1+mmsize
39
+ pavgb m2, m4
40
+
41
+ pshufb m0, m7
42
+ pshufb m2, m7
43
+ punpcklqdq m0, m0, m2
44
+ vpermq m0, m0, q3120
45
+ movu %1, m0
46
+%endmacro
47
+
48
+%macro SUBSAMPLEFILT16x2 3
49
+ mova m3, r0+%3+mmsize
50
+ mova m2, r0+%3
51
+ pavgb m3, r0+%3+r2+mmsize
52
+ pavgb m2, r0+%3+r2
53
+ PALIGNR %1, m3, 1, m6
54
+ pavgb %1, m3
55
+ PALIGNR m3, m2, 1, m6
56
+ pavgb m3, m2
57
+%if cpuflag(xop)
58
+ vpperm m3, m3, %1, m6
59
+%else
60
+ pand m3, m7
61
+ pand %1, m7
62
+ packuswb m3, %1
63
+%endif
64
+ mova %2, m3
65
+ mova %1, m2
66
+%endmacro
67
+
68
+%macro SUBSAMPLEFILT8x2U 2
69
+ mova m2, r0+%2
70
+ pavgb m2, r0+%2+r2
71
+ mova m0, r0+%2+1
72
+ pavgb m0, r0+%2+r2+1
73
+ pavgb m1, m3
74
+ pavgb m0, m2
75
+ pand m1, m7
76
+ pand m0, m7
77
+ packuswb m0, m1
78
+ mova %1, m0
79
+%endmacro
80
+
81
+%macro SUBSAMPLEFILT8xU 2
82
+ mova m3, r0+%2+8
83
+ mova m2, r0+%2
84
+ pavgw m3, r0+%2+r2+8
85
+ pavgw m2, r0+%2+r2
86
+ movu m1, r0+%2+10
87
+ movu m0, r0+%2+2
88
+ pavgw m1, r0+%2+r2+10
89
+ pavgw m0, r0+%2+r2+2
90
+ pavgw m1, m3
91
+ pavgw m0, m2
92
+ psrld m3, m1, 16
93
+ pand m1, m7
94
+ pand m0, m7
95
+ packssdw m0, m1
96
+ movu %1, m0
97
+%endmacro
98
+
99
+%macro SUBSAMPLEFILT8xA 3
100
+ movu m3, r0+%3+mmsize
101
+ movu m2, r0+%3
102
+ pavgw m3, r0+%3+r2+mmsize
103
+ pavgw m2, r0+%3+r2
104
+ PALIGNR %1, m3, 2, m6
105
+ pavgw %1, m3
106
+ PALIGNR m3, m2, 2, m6
107
+ pavgw m3, m2
108
+%if cpuflag(xop)
109
+ vpperm m3, m3, %1, m6
110
+%else
111
+ pand m3, m7
112
+ pand %1, m7
113
+ packssdw m3, %1
114
+%endif
115
+%if cpuflag(avx2)
116
+ vpermq m3, m3, q3120
117
+%endif
118
+ movu %2, m3
119
+ movu %1, m2
120
+%endmacro
121
+
122
+;-----------------------------------------------------------------------------
123
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
124
+; intptr_t src_stride, intptr_t dst_stride, int width, int height )
125
+;-----------------------------------------------------------------------------
126
+
127
+%macro FRAME_SUBSAMPLE_LUMA 0
128
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
129
+%if HIGH_BIT_DEPTH
130
+ shl dword r3m, 1
131
+ FIX_STRIDES r2
132
+ shl dword r4m, 1
133
+%endif
134
+%if mmsize >= 16
135
+ add dword r4m, mmsize-1
136
+ and dword r4m, ~(mmsize-1)
137
+%endif
138
+ ; src += 2*(height-1)*stride + 2*width
139
+ mov r6d, r5m
140
+ dec r6d
141
+ imul r6d, r2d
142
+ add r6d, r4m
143
+ lea r0, r0+r6*2
144
+ ; dst += (height-1)*stride + width
145
+ mov r6d, r5m
146
+ dec r6d
147
+ imul r6d, r3m
148
+ add r6d, r4m
149
+ add r1, r6
150
+ ; gap = stride - width
151
+ mov r6d, r3m
152
+ sub r6d, r4m
153
+ PUSH r6
154
+ %define dst_gap rsp+gprsize
155
+ mov r6d, r2d
156
+ sub r6d, r4m
157
+ shl r6d, 1
158
+ PUSH r6
159
+ %define src_gap rsp
160
+%if HIGH_BIT_DEPTH
161
+%if cpuflag(xop)
162
+ mova m6, deinterleave_shuf32a
163
+ mova m7, deinterleave_shuf32b
164
+%else
165
+ pcmpeqw m7, m7
166
+ psrld m7, 16
167
+%endif
168
+.vloop:
169
+ mov r6d, r4m
170
+%ifnidn cpuname, mmx2
171
+ movu m0, r0
172
+ movu m1, r0+r2
173
+ pavgw m0, m1
174
+ pavgw m1, r0+r2*2
175
+%endif
176
+.hloop:
177
+ sub r0, mmsize*2
178
+ sub r1, mmsize
179
+%ifidn cpuname, mmx2
180
+ SUBSAMPLEFILT8xU r1, 0
181
+%else
182
+ SUBSAMPLEFILT8xA m0, r1, 0
183
+%endif
184
+ sub r6d, mmsize
185
+ jg .hloop
186
+%else ; !HIGH_BIT_DEPTH
187
+%if cpuflag(avx2)
188
+ mova m7, deinterleave_shuf
189
+%elif cpuflag(xop)
190
+ mova m6, deinterleave_shuf32a
191
+ mova m7, deinterleave_shuf32b
192
+%else
193
+ pcmpeqb m7, m7
194
+ psrlw m7, 8
195
+%endif
196
+.vloop:
197
+ mov r6d, r4m
198
+%ifnidn cpuname, mmx2
199
+%if mmsize <= 16
200
+ mova m0, r0
201
x265_3.5.tar.gz/source/common/x86/mc.h -> x265_3.6.tar.gz/source/common/x86/mc.h
Changed
19
1
2
3
#undef LOWRES
4
5
+#define SUBSAMPLELUMA(cpu) \
6
+ void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);
7
+SUBSAMPLELUMA(mmx2)
8
+SUBSAMPLELUMA(sse2)
9
+SUBSAMPLELUMA(ssse3)
10
+SUBSAMPLELUMA(avx)
11
+SUBSAMPLELUMA(avx2)
12
+SUBSAMPLELUMA(xop)
13
+
14
+#undef SUBSAMPLELUMA
15
+
16
#define PROPAGATE_COST(cpu) \
17
void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
18
const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
19
x265_3.5.tar.gz/source/common/x86/x86inc.asm -> x265_3.6.tar.gz/source/common/x86/x86inc.asm
Changed
96
1
2
%endif
3
%endmacro
4
5
-%macro DEFINE_ARGS_INTERNAL 3+
6
- %ifnum %2
7
- DEFINE_ARGS %3
8
- %elif %1 == 4
9
- DEFINE_ARGS %2
10
- %elif %1 > 4
11
- DEFINE_ARGS %2, %3
12
- %endif
13
-%endmacro
14
-
15
%if WIN64 ; Windows x64 ;=================================================
16
17
DECLARE_REG 0, rcx
18
19
DECLARE_REG 13, R12, 112
20
DECLARE_REG 14, R13, 120
21
22
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
23
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
24
%assign num_args %1
25
%assign regs_used %2
26
ASSERT regs_used >= num_args
27
28
WIN64_SPILL_XMM %3
29
%endif
30
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
31
- DEFINE_ARGS_INTERNAL %0, %4, %5
32
+ %if %0 > 4
33
+ %ifnum %4
34
+ DEFINE_ARGS %5
35
+ %else
36
+ DEFINE_ARGS %4, %5
37
+ %endif
38
+ %elifnnum %4
39
+ DEFINE_ARGS %4
40
+ %endif
41
%endmacro
42
43
%macro WIN64_PUSH_XMM 0
44
45
DECLARE_REG 13, R12, 64
46
DECLARE_REG 14, R13, 72
47
48
-%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, stack_size, arg_names...
49
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
50
%assign num_args %1
51
%assign regs_used %2
52
%assign xmm_regs_used %3
53
54
PUSH_IF_USED 9, 10, 11, 12, 13, 14
55
ALLOC_STACK %4
56
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
57
- DEFINE_ARGS_INTERNAL %0, %4, %5
58
+ %if %0 > 4
59
+ %ifnum %4
60
+ DEFINE_ARGS %5
61
+ %else
62
+ DEFINE_ARGS %4, %5
63
+ %endif
64
+ %elifnnum %4
65
+ DEFINE_ARGS %4
66
+ %endif
67
%endmacro
68
69
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
70
71
72
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
73
74
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names...
75
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, stack_size, arg_names...
76
%assign num_args %1
77
%assign regs_used %2
78
ASSERT regs_used >= num_args
79
80
PUSH_IF_USED 3, 4, 5, 6
81
ALLOC_STACK %4
82
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
83
- DEFINE_ARGS_INTERNAL %0, %4, %5
84
+ %if %0 > 4
85
+ %ifnum %4
86
+ DEFINE_ARGS %5
87
+ %else
88
+ DEFINE_ARGS %4, %5
89
+ %endif
90
+ %elifnnum %4
91
+ DEFINE_ARGS %4
92
+ %endif
93
%endmacro
94
95
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
96
x265_3.5.tar.gz/source/common/x86/x86util.asm -> x265_3.6.tar.gz/source/common/x86/x86util.asm
Changed
13
1
2
%elif %1==2
3
%if mmsize==8
4
SBUTTERFLY dq, %3, %4, %5
5
- %else
6
+ %elif %0==6
7
TRANS q, ORDER, %3, %4, %5, %6
8
+ %else
9
+ TRANS q, ORDER, %3, %4, %5
10
%endif
11
%elif %1==4
12
SBUTTERFLY qdq, %3, %4, %5
13
x265_3.5.tar.gz/source/encoder/analysis.cpp -> x265_3.6.tar.gz/source/encoder/analysis.cpp
Changed
10
1
2
qp += distortionData->offset[ctu.m_cuAddr];
3
}
4
5
- if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
6
+ if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
7
{
8
int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
9
if (ctu.m_slice->m_sliceType == I_SLICE)
10
x265_3.5.tar.gz/source/encoder/api.cpp -> x265_3.6.tar.gz/source/encoder/api.cpp
Changed
50
1
2
memcpy(zoneParam, param, sizeof(x265_param));
3
for (int i = 0; i < param->rc.zonefileCount; i++)
4
{
5
- param->rc.zones[i].startFrame = -1;
6
encoder->configureZone(zoneParam, param->rc.zones[i].zoneParam);
7
}
8
9
10
if (numEncoded < 0)
11
encoder->m_aborted = true;
12
13
+ if ((!encoder->m_numDelayedPic && !numEncoded) && (encoder->m_param->bEnableEndOfSequence || encoder->m_param->bEnableEndOfBitstream))
14
+ {
15
+ Bitstream bs;
16
+ encoder->getEndNalUnits(encoder->m_nalList, bs);
17
+ *pp_nal = &encoder->m_nalList.m_nal[0];
18
+ if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
19
+ }
20
+
21
return numEncoded;
22
}
23
24
25
&PARAM_NS::x265_param_free,
26
&PARAM_NS::x265_param_default,
27
&PARAM_NS::x265_param_parse,
28
+ &PARAM_NS::x265_scenecut_aware_qp_param_parse,
29
&PARAM_NS::x265_param_apply_profile,
30
&PARAM_NS::x265_param_default_preset,
31
&x265_picture_alloc,
32
33
if (param->csvLogLevel)
34
{
35
fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
36
+ if (!!param->bEnableTemporalSubLayers)
37
+ fprintf(csvfp, "Temporal Sub Layer ID, ");
38
if (param->csvLogLevel >= 2)
39
fprintf(csvfp, "I/P cost ratio, ");
40
if (param->rc.rateControlMode == X265_RC_CRF)
41
42
const x265_frame_stats* frameStats = &pic->frameData;
43
fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
44
frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
45
+ if (!!param->bEnableTemporalSubLayers)
46
+ fprintf(param->csvfpt, "%d,", frameStats->tLayer);
47
if (param->csvLogLevel >= 2)
48
fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
49
if (param->rc.rateControlMode == X265_RC_CRF)
50
x265_3.5.tar.gz/source/encoder/dpb.cpp -> x265_3.6.tar.gz/source/encoder/dpb.cpp
Changed
201
1
2
{
3
Frame *curFrame = iterFrame;
4
iterFrame = iterFrame->m_next;
5
- if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
6
+ bool isMCSTFReferenced = false;
7
+
8
+ if (curFrame->m_param->bEnableTemporalFilter)
9
+ isMCSTFReferenced = !!(curFrame->m_refPicCnt[1]);
10
+
11
+ if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced)
12
{
13
curFrame->m_bChromaExtended = false;
14
15
+ if (curFrame->m_param->bEnableTemporalFilter)
16
+ *curFrame->m_isSubSampled = false;
17
+
18
// Reset column counter
19
X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
20
X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
21
22
{
23
newFrame->m_encData->m_bHasReferences = false;
24
25
+ newFrame->m_tempLayer = (newFrame->m_param->bEnableTemporalSubLayers && !m_bTemporalSublayer) ? 1 : newFrame->m_tempLayer;
26
// Adjust NAL type for unreferenced B frames (change from _R "referenced"
27
// to _N "non-referenced" NAL unit type)
28
switch (slice->m_nalUnitType)
29
{
30
case NAL_UNIT_CODED_SLICE_TRAIL_R:
31
- slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
32
+ slice->m_nalUnitType = newFrame->m_param->bEnableTemporalSubLayers ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
33
break;
34
case NAL_UNIT_CODED_SLICE_RADL_R:
35
slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
36
37
38
m_picList.pushFront(*newFrame);
39
40
+ if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag())
41
+ {
42
+ switch (slice->m_nalUnitType)
43
+ {
44
+ case NAL_UNIT_CODED_SLICE_TRAIL_R:
45
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TRAIL_N;
46
+ break;
47
+ case NAL_UNIT_CODED_SLICE_RADL_R:
48
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
49
+ break;
50
+ case NAL_UNIT_CODED_SLICE_RASL_R:
51
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
52
+ break;
53
+ default:
54
+ break;
55
+ }
56
+ }
57
// Do decoding refresh marking if any
58
decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
59
60
- computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
61
-
62
+ computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering[newFrame->m_tempLayer]);
63
+ bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? true : false;
64
// Mark pictures in m_piclist as unreferenced if they are not included in RPS
65
- applyReferencePictureSet(&slice->m_rps, pocCurr);
66
+ applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic);
67
+
68
+
69
+ if (m_bTemporalSublayer && newFrame->m_tempLayer > 0
70
+ && !(slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_N // Check if not a leading picture
71
+ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RADL_R
72
+ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_N
73
+ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R)
74
+ )
75
+ {
76
+ if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1))
77
+ {
78
+ if (getTemporalLayerNonReferenceFlag())
79
+ {
80
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N;
81
+ }
82
+ else
83
+ {
84
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R;
85
+ }
86
+ }
87
+ else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer))
88
+ {
89
+ bool isSTSA = true;
90
+ int id = newFrame->m_gopOffset % x265_gop_ra_length[newFrame->m_gopId];
91
+ for (int ii = id; (ii < x265_gop_ra_length[newFrame->m_gopId] && isSTSA == true); ii++)
92
+ {
93
+ int tempIdRef = x265_gop_ra[newFrame->m_gopId][ii].layer;
94
+ if (tempIdRef == newFrame->m_tempLayer)
95
+ {
96
+ for (int jj = 0; jj < slice->m_rps.numberOfPositivePictures + slice->m_rps.numberOfNegativePictures; jj++)
97
+ {
98
+ if (slice->m_rps.bUsed[jj])
99
+ {
100
+ int refPoc = x265_gop_ra[newFrame->m_gopId][ii].poc_offset + slice->m_rps.deltaPOC[jj];
101
+ int kk = 0;
102
+ for (kk = 0; kk < x265_gop_ra_length[newFrame->m_gopId]; kk++)
103
+ {
104
+ if (x265_gop_ra[newFrame->m_gopId][kk].poc_offset == refPoc)
105
+ {
106
+ break;
107
+ }
108
+ }
109
+ if (x265_gop_ra[newFrame->m_gopId][kk].layer >= newFrame->m_tempLayer)
110
+ {
111
+ isSTSA = false;
112
+ break;
113
+ }
114
+ }
115
+ }
116
+ }
117
+ }
118
+ if (isSTSA == true)
119
+ {
120
+ if (getTemporalLayerNonReferenceFlag())
121
+ {
122
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N;
123
+ }
124
+ else
125
+ {
126
+ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_R;
127
+ }
128
+ }
129
+ }
130
+ }
131
132
if (slice->m_sliceType != I_SLICE)
133
slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures);
134
135
}
136
}
137
138
-void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
139
+void DPB::computeRPS(int curPoc, int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
140
{
141
unsigned int poci = 0, numNeg = 0, numPos = 0;
142
143
144
{
145
if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
146
{
147
- if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc))
148
+ if ((!m_bTemporalSublayer || (iterPic->m_tempLayer <= tempId)) && ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)))
149
{
150
rps->poc[poci] = iterPic->m_poc;
151
rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
152
153
rps->sortDeltaPOC();
154
}
155
156
+bool DPB::getTemporalLayerNonReferenceFlag()
157
+{
158
+ Frame* curFrame = m_picList.first();
159
+ if (curFrame->m_encData->m_bHasReferences)
160
+ {
161
+ curFrame->m_sameLayerRefPic = true;
162
+ return false;
163
+ }
164
+ else
165
+ return true;
166
+}
167
+
168
/* Marking reference pictures when an IDR/CRA is encountered. */
169
void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
170
{
171
172
}
173
174
/** Function for applying picture marking based on the Reference Picture Set */
175
-void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
176
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture)
177
{
178
// loop through all pictures in the reference picture buffer
179
Frame* iterFrame = m_picList.first();
180
181
}
182
if (!referenced)
183
iterFrame->m_encData->m_bHasReferences = false;
184
+
185
+ if (m_bTemporalSublayer)
186
+ {
187
+ //check that pictures of higher temporal layers are not used
188
+ assert(referenced == 0 || iterFrame->m_encData->m_bHasReferences == false || iterFrame->m_tempLayer <= tempId);
189
+
190
+ //check that pictures of higher or equal temporal layer are not in the RPS if the current picture is a TSA picture
191
+ if (isTSAPicture)
192
+ {
193
+ assert(referenced == 0 || iterFrame->m_tempLayer < tempId);
194
+ }
195
+ //check that pictures marked as temporal layer non-reference pictures are not used for reference
196
+ if (iterFrame->m_tempLayer == tempId)
197
+ {
198
+ assert(referenced == 0 || iterFrame->m_sameLayerRefPic == true);
199
+ }
200
+ }
201
x265_3.5.tar.gz/source/encoder/dpb.h -> x265_3.6.tar.gz/source/encoder/dpb.h
Changed
35
1
2
int m_lastIDR;
3
int m_pocCRA;
4
int m_bOpenGOP;
5
+ int m_craNal;
6
int m_bhasLeadingPicture;
7
bool m_bRefreshPending;
8
bool m_bTemporalSublayer;
9
10
m_bRefreshPending = false;
11
m_frameDataFreeList = NULL;
12
m_bOpenGOP = param->bOpenGOP;
13
- m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
14
+ m_craNal = param->craNal;
15
+ m_bTemporalSublayer = (param->bEnableTemporalSubLayers > 2);
16
}
17
18
~DPB();
19
20
21
protected:
22
23
- void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
24
+ void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
25
26
- void applyReferencePictureSet(RPS *rps, int curPoc);
27
+ void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture);
28
+ bool getTemporalLayerNonReferenceFlag();
29
void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
30
+ bool isTemporalLayerSwitchingPoint(int curPoc, int tempId);
31
+ bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId);
32
33
NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
34
};
35
x265_3.5.tar.gz/source/encoder/encoder.cpp -> x265_3.6.tar.gz/source/encoder/encoder.cpp
Changed
201
1
2
{
3
{ 1, 1, 1, 1, 1, 5, 1, 2, 2, 2, 50 },
4
{ 1, 1, 1, 1, 1, 5, 0, 16, 9, 9, 81 },
5
- { 1, 1, 1, 1, 1, 5, 0, 1, 1, 1, 82 }
6
+ { 1, 1, 1, 1, 1, 5, 0, 1, 1, 1, 82 },
7
+ { 1, 1, 1, 1, 1, 5, 0, 18, 9, 9, 84 }
8
+};
9
+
10
+typedef struct
11
+{
12
+ int bEnableVideoSignalTypePresentFlag;
13
+ int bEnableColorDescriptionPresentFlag;
14
+ int bEnableChromaLocInfoPresentFlag;
15
+ int colorPrimaries;
16
+ int transferCharacteristics;
17
+ int matrixCoeffs;
18
+ int bEnableVideoFullRangeFlag;
19
+ int chromaSampleLocTypeTopField;
20
+ int chromaSampleLocTypeBottomField;
21
+ const char* systemId;
22
+}VideoSignalTypePresets;
23
+
24
+VideoSignalTypePresets vstPresets[] =
25
+{
26
+ {1, 1, 1, 6, 6, 6, 0, 0, 0, "BT601_525"},
27
+ {1, 1, 1, 5, 6, 5, 0, 0, 0, "BT601_626"},
28
+ {1, 1, 1, 1, 1, 1, 0, 0, 0, "BT709_YCC"},
29
+ {1, 1, 0, 1, 1, 0, 0, 0, 0, "BT709_RGB"},
30
+ {1, 1, 1, 9, 14, 1, 0, 2, 2, "BT2020_YCC_NCL"},
31
+ {1, 1, 0, 9, 16, 9, 0, 0, 0, "BT2020_RGB"},
32
+ {1, 1, 1, 9, 16, 9, 0, 2, 2, "BT2100_PQ_YCC"},
33
+ {1, 1, 1, 9, 16, 14, 0, 2, 2, "BT2100_PQ_ICTCP"},
34
+ {1, 1, 0, 9, 16, 0, 0, 0, 0, "BT2100_PQ_RGB"},
35
+ {1, 1, 1, 9, 18, 9, 0, 2, 2, "BT2100_HLG_YCC"},
36
+ {1, 1, 0, 9, 18, 0, 0, 0, 0, "BT2100_HLG_RGB"},
37
+ {1, 1, 0, 1, 1, 0, 1, 0, 0, "FR709_RGB"},
38
+ {1, 1, 0, 9, 14, 0, 1, 0, 0, "FR2020_RGB"},
39
+ {1, 1, 1, 12, 1, 6, 1, 1, 1, "FRP3D65_YCC"}
40
};
41
}
42
43
44
m_threadPool = NULL;
45
m_analysisFileIn = NULL;
46
m_analysisFileOut = NULL;
47
+ m_filmGrainIn = NULL;
48
m_naluFile = NULL;
49
m_offsetEmergency = NULL;
50
m_iFrameNum = 0;
51
52
m_prevTonemapPayload.payload = NULL;
53
m_startPoint = 0;
54
m_saveCTUSize = 0;
55
- m_edgePic = NULL;
56
- m_edgeHistThreshold = 0;
57
- m_chromaHistThreshold = 0.0;
58
- m_scaledEdgeThreshold = 0.0;
59
- m_scaledChromaThreshold = 0.0;
60
m_zoneIndex = 0;
61
+ m_origPicBuffer = 0;
62
}
63
64
inline char *strcatFilename(const char *input, const char *suffix)
65
66
}
67
}
68
69
- if (m_param->bHistBasedSceneCut)
70
- {
71
- m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
72
- uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
73
- m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
74
- m_edgeHistThreshold = m_param->edgeTransitionThreshold;
75
- m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD);
76
- m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
77
- m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
78
- if (m_param->sourceBitDepth != m_param->internalBitDepth)
79
- {
80
- int size = m_param->sourceWidth * m_param->sourceHeight;
81
- int hshift = CHROMA_H_SHIFT(m_param->internalCsp);
82
- int vshift = CHROMA_V_SHIFT(m_param->internalCsp);
83
- int widthC = m_param->sourceWidth >> hshift;
84
- int heightC = m_param->sourceHeight >> vshift;
85
-
86
- m_inputPic[0] = X265_MALLOC(pixel, size);
87
- if (m_param->internalCsp != X265_CSP_I400)
88
- {
89
- for (int j = 1; j < 3; j++)
90
- {
91
- m_inputPic[j] = X265_MALLOC(pixel, widthC * heightC);
92
- }
93
- }
94
- }
95
- }
96
-
97
// Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
98
if (rows == 1 || cols < 3)
99
{
100
101
lookAheadThreadPool[i].start();
102
m_lookahead->m_numPools = pools;
103
m_dpb = new DPB(m_param);
104
+
105
+ if (m_param->bEnableTemporalFilter)
106
+ m_origPicBuffer = new OrigPicBuffer();
107
+
108
m_rateControl = new RateControl(*m_param, this);
109
if (!m_param->bResetZoneConfig)
110
{
111
112
}
113
}
114
}
115
+ if (m_param->filmGrain)
116
+ {
117
+ m_filmGrainIn = x265_fopen(m_param->filmGrain, "rb");
118
+ if (!m_filmGrainIn)
119
+ {
120
+ x265_log_file(NULL, X265_LOG_ERROR, "Failed to open film grain characteristics binary file %s\n", m_param->filmGrain);
121
+ }
122
+ }
123
+
124
m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
125
m_aborted |= parseLambdaFile(m_param);
126
127
128
}
129
}
130
131
- if (m_param->bHistBasedSceneCut)
132
- {
133
- if (m_edgePic != NULL)
134
- {
135
- X265_FREE_ZERO(m_edgePic);
136
- }
137
-
138
- if (m_param->sourceBitDepth != m_param->internalBitDepth)
139
- {
140
- X265_FREE_ZERO(m_inputPic[0]);
141
- if (m_param->internalCsp != X265_CSP_I400)
142
- {
143
- for (int i = 1; i < 3; i++)
144
- {
145
- X265_FREE_ZERO(m_inputPic[i]);
146
- }
147
- }
148
- }
149
- }
150
-
151
for (int i = 0; i < m_param->frameNumThreads; i++)
152
{
153
if (m_frameEncoder[i])
154
155
delete zoneReadCount;
156
delete zoneWriteCount;
157
}
158
+
159
+ if (m_param->bEnableTemporalFilter)
160
+ delete m_origPicBuffer;
161
+
162
if (m_rateControl)
163
{
164
m_rateControl->destroy();
165
166
}
167
if (m_naluFile)
168
fclose(m_naluFile);
169
+ if (m_filmGrainIn)
170
+ x265_fclose(m_filmGrainIn);
171
172
#ifdef SVT_HEVC
173
X265_FREE(m_svtAppData);
174
175
/* release string arguments that were strdup'd */
176
free((char*)m_param->rc.lambdaFileName);
177
free((char*)m_param->rc.statFileName);
178
+ free((char*)m_param->rc.sharedMemName);
179
free((char*)m_param->analysisReuseFileName);
180
free((char*)m_param->scalingLists);
181
free((char*)m_param->csvfn);
182
183
free((char*)m_param->toneMapFile);
184
free((char*)m_param->analysisSave);
185
free((char*)m_param->analysisLoad);
186
+ free((char*)m_param->videoSignalTypePreset);
187
PARAM_NS::x265_param_free(m_param);
188
}
189
}
190
191
dest->planes[2] = (char*)dest->planes[1] + src->stride[1] * (src->height >> x265_cli_csps[src->colorSpace].height[1]);
192
}
193
194
-bool Encoder::computeHistograms(x265_picture *pic)
195
+bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType)
196
{
197
- pixel *src = NULL, *planeV = NULL, *planeU = NULL;
198
- uint32_t widthC, heightC;
199
- int hshift, vshift;
200
-
201
x265_3.5.tar.gz/source/encoder/encoder.h -> x265_3.6.tar.gz/source/encoder/encoder.h
Changed
72
1
2
#include "nal.h"
3
#include "framedata.h"
4
#include "svt.h"
5
+#include "temporalfilter.h"
6
#ifdef ENABLE_HDR10_PLUS
7
#include "dynamicHDR10/hdr10plus.h"
8
#endif
9
10
int m_bToneMap; // Enables tone-mapping
11
int m_enableNal;
12
13
- /* For histogram based scene-cut detection */
14
- pixel* m_edgePic;
15
- pixel* m_inputPic[3];
16
- int32_t m_curYUVHist[3][HISTOGRAM_BINS];
17
- int32_t m_prevYUVHist[3][HISTOGRAM_BINS];
18
- int32_t m_curEdgeHist[2];
19
- int32_t m_prevEdgeHist[2];
20
- uint32_t m_planeSizes[3];
21
- double m_edgeHistThreshold;
22
- double m_chromaHistThreshold;
23
- double m_scaledEdgeThreshold;
24
- double m_scaledChromaThreshold;
25
-
26
#ifdef ENABLE_HDR10_PLUS
27
const hdr10plus_api *m_hdr10plus_api;
28
uint8_t **m_cim;
29
30
31
ThreadSafeInteger* zoneReadCount;
32
ThreadSafeInteger* zoneWriteCount;
33
+ /* Film grain model file */
34
+ FILE* m_filmGrainIn;
35
+ OrigPicBuffer* m_origPicBuffer;
36
37
Encoder();
38
~Encoder()
39
40
41
void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
42
43
+ void getEndNalUnits(NALList& list, Bitstream& bs);
44
+
45
void fetchStats(x265_stats* stats, size_t statsSizeBytes);
46
47
void printSummary();
48
49
50
void copyPicture(x265_picture *dest, const x265_picture *src);
51
52
- bool computeHistograms(x265_picture *pic);
53
- void computeHistogramSAD(double *maxUVNormalizedSAD, double *edgeNormalizedSAD, int curPoc);
54
- double normalizeRange(int32_t value, int32_t minValue, int32_t maxValue, double rangeStart, double rangeEnd);
55
- void findSceneCuts(x265_picture *pic, bool& bDup, double m_maxUVSADVal, double m_edgeSADVal, bool& isMaxThres, bool& isHardSC);
56
-
57
void initRefIdx();
58
void analyseRefIdx(int *numRefIdx);
59
void updateRefIdx();
60
61
62
void configureDolbyVisionParams(x265_param* p);
63
64
+ void configureVideoSignalTypePreset(x265_param* p);
65
+
66
+ bool isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType);
67
+ bool generateMcstfRef(Frame* frameEnc, FrameEncoder* currEncoder);
68
+
69
protected:
70
71
void initVPS(VPS *vps);
72
x265_3.5.tar.gz/source/encoder/entropy.cpp -> x265_3.6.tar.gz/source/encoder/entropy.cpp
Changed
41
1
2
3
for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
4
{
5
- WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]");
6
- WRITE_UVLC(vps.numReorderPics, "vps_num_reorder_pics[i]");
7
- WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1[i]");
8
+ WRITE_UVLC(vps.maxDecPicBuffering[i] - 1, "vps_max_dec_pic_buffering_minus1[i]");
9
+ WRITE_UVLC(vps.numReorderPics[i], "vps_num_reorder_pics[i]");
10
+ WRITE_UVLC(vps.maxLatencyIncrease[i] + 1, "vps_max_latency_increase_plus1[i]");
11
}
12
13
WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
14
15
16
for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
17
{
18
- WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
19
- WRITE_UVLC(sps.numReorderPics, "sps_num_reorder_pics[i]");
20
- WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
21
+ WRITE_UVLC(sps.maxDecPicBuffering[i] - 1, "sps_max_dec_pic_buffering_minus1[i]");
22
+ WRITE_UVLC(sps.numReorderPics[i], "sps_num_reorder_pics[i]");
23
+ WRITE_UVLC(sps.maxLatencyIncrease[i] + 1, "sps_max_latency_increase_plus1[i]");
24
}
25
26
WRITE_UVLC(sps.log2MinCodingBlockSize - 3, "log2_min_coding_block_size_minus3");
27
28
29
if (maxTempSubLayers > 1)
30
{
31
- WRITE_FLAG(0, "sub_layer_profile_present_flagi");
32
- WRITE_FLAG(0, "sub_layer_level_present_flagi");
33
+ for(int i = 0; i < maxTempSubLayers - 1; i++)
34
+ {
35
+ WRITE_FLAG(0, "sub_layer_profile_present_flagi");
36
+ WRITE_FLAG(0, "sub_layer_level_present_flagi");
37
+ }
38
for (int i = maxTempSubLayers - 1; i < 8 ; i++)
39
WRITE_CODE(0, 2, "reserved_zero_2bits");
40
}
41
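
The WRITE_UVLC calls above now emit the per-sub-layer DPB parameters with unsigned Exp-Golomb (ue(v)) coding. For reference, a minimal standalone sketch of ue(v) encoding, kept separate from x265's Bitstream API (names here are illustrative, not upstream code):

    #include <cstdint>
    #include <vector>

    // ue(v): value v is written as floor(log2(v + 1)) leading zeros followed by
    // the binary form of (v + 1), MSB first: 0 -> "1", 1 -> "010", 2 -> "011", 3 -> "00100".
    static void writeUvlc(std::vector<bool>& bits, uint32_t v)
    {
        uint32_t codeNum = v + 1;
        int len = 0;
        for (uint32_t tmp = codeNum; tmp > 1; tmp >>= 1)
            len++;                                      // floor(log2(codeNum))
        for (int i = 0; i < len; i++)
            bits.push_back(false);                      // leading zeros
        for (int i = len; i >= 0; i--)
            bits.push_back(((codeNum >> i) & 1) != 0);  // codeNum, MSB first
    }
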
x265_3.5.tar.gz/source/encoder/frameencoder.cpp -> x265_3.6.tar.gz/source/encoder/frameencoder.cpp
Changed
200
1
2
#include "common.h"
3
#include "slicetype.h"
4
#include "nal.h"
5
+#include "temporalfilter.h"
6
7
namespace X265_NS {
8
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
9
10
delete m_rce.picTimingSEI;
11
delete m_rce.hrdTiming;
12
}
13
+
14
+ if (m_param->bEnableTemporalFilter)
15
+ {
16
+ delete m_frameEncTF->m_metld;
17
+
18
+ for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
19
+ m_frameEncTF->destroyRefPicInfo(&m_mcstfRefList[i]);
20
+
21
+ delete m_frameEncTF;
22
+ }
23
}
24
25
bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
26
27
m_sliceAddrBits = (uint16_t)(tmp + 1);
28
}
29
30
+ if (m_param->bEnableTemporalFilter)
31
+ {
32
+ m_frameEncTF = new TemporalFilter();
33
+ if (m_frameEncTF)
34
+ m_frameEncTF->init(m_param);
35
+
36
+ for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
37
+ ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefList[i], m_param);
38
+ }
39
+
40
return ok;
41
}
42
43
44
m_ssimCnt = 0;
45
memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
46
47
- if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
48
+ if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
49
{
50
int height = m_frame->m_fencPic->m_picHeight;
51
int width = m_frame->m_fencPic->m_picWidth;
52
53
* unit) */
54
Slice* slice = m_frame->m_encData->m_slice;
55
56
+ if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc)
57
+ {
58
+ m_bs.resetBits();
59
+ m_nalList.serialize(NAL_UNIT_EOS, m_bs);
60
+ }
61
+
62
if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
63
{
64
m_bs.resetBits();
65
66
int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
67
m_rce.newQp = qp;
68
69
+ if (m_param->bEnableTemporalFilter)
70
+ {
71
+ m_frameEncTF->m_QP = qp;
72
+ m_frameEncTF->bilateralFilter(m_frame, m_mcstfRefList, m_param->temporalFilterStrength);
73
+ }
74
+
75
if (m_nr)
76
{
77
if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
78
79
// wait after removal of the access unit with the most recent
80
// buffering period SEI message
81
sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
82
- sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
83
+ sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics[m_frame->m_tempLayer] + poc - m_rce.encodeOrder;
84
}
85
86
sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
87
88
m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
89
m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
90
}
91
-
92
+ /* Write Film grain characteristics if present */
93
+ if (this->m_top->m_filmGrainIn)
94
+ {
95
+ FilmGrainCharacteristics m_filmGrain;
96
+ /* Read the Film grain model file */
97
+ readModel(&m_filmGrain, this->m_top->m_filmGrainIn);
98
+ m_filmGrain.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
99
+ }
100
/* Write user SEI */
101
for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
102
{
103
104
if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
105
collectDynDataFrame();
106
107
+ if (m_param->bEnableTemporalFilter && m_top->isFilterThisframe(m_frame->m_mcstf->m_sliceTypeConfig, m_frame->m_lowres.sliceType))
108
+ {
109
+ //Reset the MCSTF context in Frame Encoder and Frame
110
+ for (int i = 0; i < (m_frameEncTF->m_range << 1); i++)
111
+ {
112
+ memset(m_mcstfRefList[i].mvs0, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
114
+ memset(m_mcstfRefList[i].mvs1, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
115
+ memset(m_mcstfRefList[i].mvs2, 0, sizeof(MV) * ((m_param->sourceWidth / 16) * (m_param->sourceHeight / 16)));
116
+ memset(m_mcstfRefList[i].mvs, 0, sizeof(MV) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
117
+ memset(m_mcstfRefList[i].noise, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
118
+ memset(m_mcstfRefList[i].error, 0, sizeof(int) * ((m_param->sourceWidth / 4) * (m_param->sourceHeight / 4)));
118
+
119
+ m_frame->m_mcstf->m_numRef = 0;
120
+ }
121
+ }
122
+
123
+
124
if (m_param->rc.bStatWrite)
125
{
126
int totalI = 0, totalP = 0, totalSkip = 0;
127
128
129
m_bs.writeByteAlignment();
130
131
- m_nalList.serialize(slice->m_nalUnitType, m_bs);
132
+ m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
133
}
134
}
135
else
136
137
m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
138
m_bs.writeByteAlignment();
139
140
- m_nalList.serialize(slice->m_nalUnitType, m_bs);
141
+ m_nalList.serialize(slice->m_nalUnitType, m_bs, (!!m_param->bEnableTemporalSubLayers ? m_frame->m_tempLayer + 1 : (1 + (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N))));
142
}
143
144
if (m_param->decodedPictureHashSEI)
145
146
m_nr->nrOffsetDenoisecat0 = 0;
147
}
148
}
149
+
150
+void FrameEncoder::readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain)
151
+{
152
+ char const* errorMessage = "Error reading FilmGrain characteristics\n";
153
+ FilmGrain m_fg;
154
+ x265_fread((char* )&m_fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, filmgrain, errorMessage);
155
+ m_filmGrain->m_filmGrainCharacteristicsCancelFlag = m_fg.m_filmGrainCharacteristicsCancelFlag;
156
+ m_filmGrain->m_filmGrainCharacteristicsPersistenceFlag = m_fg.m_filmGrainCharacteristicsPersistenceFlag;
157
+ m_filmGrain->m_filmGrainModelId = m_fg.m_filmGrainModelId;
158
+ m_filmGrain->m_separateColourDescriptionPresentFlag = m_fg.m_separateColourDescriptionPresentFlag;
159
+ if (m_filmGrain->m_separateColourDescriptionPresentFlag)
160
+ {
161
+ ColourDescription m_clr;
162
+ x265_fread((char* )&m_clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, filmgrain, errorMessage);
163
+ m_filmGrain->m_filmGrainBitDepthLumaMinus8 = m_clr.m_filmGrainBitDepthLumaMinus8;
164
+ m_filmGrain->m_filmGrainBitDepthChromaMinus8 = m_clr.m_filmGrainBitDepthChromaMinus8;
165
+ m_filmGrain->m_filmGrainFullRangeFlag = m_clr.m_filmGrainFullRangeFlag;
166
+ m_filmGrain->m_filmGrainColourPrimaries = m_clr.m_filmGrainColourPrimaries;
167
+ m_filmGrain->m_filmGrainTransferCharacteristics = m_clr.m_filmGrainTransferCharacteristics;
168
+ m_filmGrain->m_filmGrainMatrixCoeffs = m_clr.m_filmGrainMatrixCoeffs;
169
+ }
170
+ FGPresent m_present;
171
+ x265_fread((char* )&m_present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, filmgrain, errorMessage);
172
+ m_filmGrain->m_blendingModeId = m_present.m_blendingModeId;
173
+ m_filmGrain->m_log2ScaleFactor = m_present.m_log2ScaleFactor;
174
+ m_filmGrain->m_compModel[0].bPresentFlag = m_present.m_presentFlag[0];
175
+ m_filmGrain->m_compModel[1].bPresentFlag = m_present.m_presentFlag[1];
176
+ m_filmGrain->m_compModel[2].bPresentFlag = m_present.m_presentFlag[2];
177
+ for (int i = 0; i < MAX_NUM_COMPONENT; i++)
178
+ {
179
+ if (m_filmGrain->m_compModel[i].bPresentFlag)
180
+ {
181
+ x265_fread((char* )(&m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1), sizeof(uint8_t), 1, filmgrain, errorMessage);
182
+ x265_fread((char* )(&m_filmGrain->m_compModel[i].numModelValues), sizeof(uint8_t), 1, filmgrain, errorMessage);
183
+ m_filmGrain->m_compModel[i].intensityValues = (FilmGrainCharacteristics::CompModelIntensityValues* ) malloc(sizeof(FilmGrainCharacteristics::CompModelIntensityValues) * (m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1+1)) ;
184
+ for (int j = 0; j <= m_filmGrain->m_compModel[i].m_filmGrainNumIntensityIntervalMinus1; j++)
185
+ {
186
+ x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalLowerBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
187
+ x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].intensityIntervalUpperBound), sizeof(uint8_t), 1, filmgrain, errorMessage);
188
+ m_filmGrain->m_compModel[i].intensityValues[j].compModelValue = (int* ) malloc(sizeof(int) * (m_filmGrain->m_compModel[i].numModelValues));
189
+ for (int k = 0; k < m_filmGrain->m_compModel[i].numModelValues; k++)
190
+ {
191
+ x265_fread((char* )(&m_filmGrain->m_compModel[i].intensityValues[j].compModelValue[k]), sizeof(int), 1, filmgrain, errorMessage);
192
+ }
193
+ }
194
+ }
195
+ }
196
+}
197
#if ENABLE_LIBVMAF
198
void FrameEncoder::vmafFrameLevelScore()
199
{
200
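
readModel() above consumes a raw binary dump of film-grain parameters (the file passed via "--film-grain <filename>"). A minimal sketch of a matching writer, assuming the FilmGrain / ColourDescription / FGPresent helpers declared in frameencoder.h (next hunk) and assuming their in-memory layout matches the byte counts used by the x265_fread calls; illustrative only, not part of x265:

    #include <cstdio>
    #include <cstdint>

    // Hypothetical generator for the film-grain model file; field order mirrors readModel().
    static void writeModelSketch(FILE* f, const FilmGrain& fg, const ColourDescription& clr, const FGPresent& present)
    {
        fwrite(&fg, sizeof(bool) * 3 + sizeof(uint8_t), 1, f);           // cancel/persistence/separate flags + model id
        if (fg.m_separateColourDescriptionPresentFlag)
            fwrite(&clr, sizeof(bool) + sizeof(uint8_t) * 5, 1, f);      // colour description block
        fwrite(&present, sizeof(bool) * 3 + sizeof(uint8_t) * 2, 1, f);  // blending mode, log2 scale factor, present flags
        // Per-component intensity intervals and model values would follow here,
        // matching the nested loops in readModel().
    }
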
x265_3.5.tar.gz/source/encoder/frameencoder.h -> x265_3.6.tar.gz/source/encoder/frameencoder.h
Changed
63
1
2
#include "ratecontrol.h"
3
#include "reference.h"
4
#include "nal.h"
5
+#include "temporalfilter.h"
6
7
namespace X265_NS {
8
// private x265 namespace
9
10
}
11
};
12
13
+/*Film grain characteristics*/
14
+struct FilmGrain
15
+{
16
+ bool m_filmGrainCharacteristicsCancelFlag;
17
+ bool m_filmGrainCharacteristicsPersistenceFlag;
18
+ bool m_separateColourDescriptionPresentFlag;
19
+ uint8_t m_filmGrainModelId;
20
+ uint8_t m_blendingModeId;
21
+ uint8_t m_log2ScaleFactor;
22
+};
23
+
24
+struct ColourDescription
25
+{
26
+ bool m_filmGrainFullRangeFlag;
27
+ uint8_t m_filmGrainBitDepthLumaMinus8;
28
+ uint8_t m_filmGrainBitDepthChromaMinus8;
29
+ uint8_t m_filmGrainColourPrimaries;
30
+ uint8_t m_filmGrainTransferCharacteristics;
31
+ uint8_t m_filmGrainMatrixCoeffs;
32
+};
33
+
34
+struct FGPresent
35
+{
36
+ uint8_t m_blendingModeId;
37
+ uint8_t m_log2ScaleFactor;
38
+ bool m_presentFlag[3];
39
+};
40
+
41
// Manages the wave-front processing of a single encoding frame
42
class FrameEncoder : public WaveFront, public Thread
43
{
44
45
FrameFilter m_frameFilter;
46
NALList m_nalList;
47
48
+ // initialization for mcstf
49
+ TemporalFilter* m_frameEncTF;
50
+ TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH];
51
+
52
class WeightAnalysis : public BondedTaskGroup
53
{
54
public:
55
56
void collectDynDataFrame();
57
void computeAvgTrainingData();
58
void collectDynDataRow(CUData& ctu, FrameStats* rowStats);
59
+ void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain);
60
};
61
}
62
63
x265_3.5.tar.gz/source/encoder/level.cpp -> x265_3.6.tar.gz/source/encoder/level.cpp
Changed
86
1
2
* for intra-only profiles (vps.ptl.intraConstraintFlag) */
3
vps.ptl.lowerBitRateConstraintFlag = true;
4
5
- vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
6
+ vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
7
8
if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
9
{
10
11
12
/* The value of sps_max_dec_pic_buffering_minus1 HighestTid + 1 shall be less than
13
* or equal to MaxDpbSize */
14
- if (vps.maxDecPicBuffering > maxDpbSize)
15
+ if (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize)
16
continue;
17
18
/* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
19
20
}
21
22
/* The value of NumPocTotalCurr shall be less than or equal to 8 */
23
- int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
24
- if (numPocTotalCurr > 8)
25
+ int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics[vps.maxTempSubLayers - 1];
26
+ if (numPocTotalCurr > 10)
27
{
28
x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levels[i].name);
29
vps.ptl.profileIdc = Profile::NONE;
30
31
* circumstances it will be quite noisy */
32
bool enforceLevel(x265_param& param, VPS& vps)
33
{
34
- vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
35
- vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
36
+ vps.maxTempSubLayers = !!param.bEnableTemporalSubLayers ? param.bEnableTemporalSubLayers : 1;
37
+ for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
38
+ {
39
+ vps.numReorderPics[i] = (i == 0) ? ((param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes) : i;
40
+ vps.maxDecPicBuffering[i] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[i] + 2, (uint32_t)param.maxNumReferences) + 1);
41
+ }
42
43
+ if (!!param.bEnableTemporalSubLayers)
44
+ {
45
+ for (int i = 0; i < MAX_T_LAYERS - 1; i++)
46
+ {
47
+ // a lower layer can not have higher value of numReorderPics than a higher layer
48
+ if (vps.numReorderPics[i + 1] < vps.numReorderPics[i])
49
+ {
50
+ vps.numReorderPics[i + 1] = vps.numReorderPics[i];
51
+ }
52
+ // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
53
+ if (vps.numReorderPics[i] > vps.maxDecPicBuffering[i] - 1)
54
+ {
55
+ vps.maxDecPicBuffering[i] = vps.numReorderPics[i] + 1;
56
+ }
57
+ // a lower layer can not have higher value of maxDecPicBuffering than a higher layer
58
+ if (vps.maxDecPicBuffering[i + 1] < vps.maxDecPicBuffering[i])
59
+ {
60
+ vps.maxDecPicBuffering[i + 1] = vps.maxDecPicBuffering[i];
61
+ }
62
+ }
63
+
64
+ // the value of numReorderPics[i] shall be in the range of 0 to maxDecPicBuffering[i] - 1, inclusive
65
+ if (vps.numReorderPics[MAX_T_LAYERS - 1] > vps.maxDecPicBuffering[MAX_T_LAYERS - 1] - 1)
66
+ {
67
+ vps.maxDecPicBuffering[MAX_T_LAYERS - 1] = vps.numReorderPics[MAX_T_LAYERS - 1] + 1;
68
+ }
69
+ }
70
/* no level specified by user, just auto-detect from the configuration */
71
if (param.levelIdc <= 0)
72
return true;
73
74
}
75
76
int savedRefCount = param.maxNumReferences;
77
- while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
78
+ while (vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] > maxDpbSize && param.maxNumReferences > 1)
79
{
80
param.maxNumReferences--;
81
- vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
82
+ vps.maxDecPicBuffering[vps.maxTempSubLayers - 1] = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics[vps.maxTempSubLayers - 1] + 1, (uint32_t)param.maxNumReferences) + 1);
83
}
84
if (param.maxNumReferences != savedRefCount)
85
x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
86
x265_3.5.tar.gz/source/encoder/motion.cpp -> x265_3.6.tar.gz/source/encoder/motion.cpp
Changed
33
1
2
X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
3
}
4
5
+/* Called by lookahead, luma only, no use of PicYuv */
6
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
7
+{
8
+ partEnum = partitionFromSizes(pwidth, pheight);
9
+ X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
10
+ sad = primitives.pu[partEnum].sad;
11
+ ads = primitives.pu[partEnum].ads;
12
+ satd = primitives.pu[partEnum].satd;
13
+ sad_x3 = primitives.pu[partEnum].sad_x3;
14
+ sad_x4 = primitives.pu[partEnum].sad_x4;
15
+
16
+
17
+ blockwidth = pwidth;
18
+ blockOffset = offset;
19
+ absPartIdx = ctuAddr = -1;
20
+
21
+ /* Search params */
22
+ searchMethod = method;
23
+ subpelRefine = refine;
24
+
25
+ /* copy PU block into cache */
26
+ primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
27
+ X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
28
+}
29
+
30
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
31
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
32
{
33
x265_3.5.tar.gz/source/encoder/motion.h -> x265_3.6.tar.gz/source/encoder/motion.h
Changed
10
1
2
void init(int csp);
3
4
/* Methods called at slice setup */
5
-
6
+ void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
7
void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
8
void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
9
10
x265_3.5.tar.gz/source/encoder/nal.cpp -> x265_3.6.tar.gz/source/encoder/nal.cpp
Changed
19
1
2
other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
3
}
4
5
-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
6
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
7
{
8
static const char startCodePrefix[] = { 0, 0, 0, 1 };
9
10
11
* nuh_reserved_zero_6bits 6-bits
12
* nuh_temporal_id_plus1 3-bits */
13
out[bytes++] = (uint8_t)nalUnitType << 1;
14
- out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
15
+ out[bytes++] = temporalID;
16
17
/* 7.4.1 ...
18
* Within the NAL unit, the following three-byte sequences shall not occur at
19
x265_3.5.tar.gz/source/encoder/nal.h -> x265_3.6.tar.gz/source/encoder/nal.h
Changed
10
1
2
3
void takeContents(NALList& other);
4
5
- void serialize(NalUnitType nalUnitType, const Bitstream& bs);
6
+ void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
7
8
uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
9
};
10
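
serialize() now carries the temporal layer into the second byte of the two-byte HEVC NAL unit header (nuh_temporal_id_plus1). For reference, the packing it performs (layout per Rec. ITU-T H.265, 7.3.1.2; function and variable names here are illustrative):

    #include <cstdint>

    // forbidden_zero_bit(1) | nal_unit_type(6) | nuh_layer_id(6) | nuh_temporal_id_plus1(3)
    static void packNalHeader(uint8_t out[2], uint8_t nalUnitType, uint8_t temporalIdPlus1)
    {
        out[0] = (uint8_t)(nalUnitType << 1); // forbidden bit 0, type in bits 6..1, layer-id MSB 0
        out[1] = temporalIdPlus1;             // layer-id low bits 0, temporal id plus 1 in bits 2..0
    }
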
x265_3.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_3.6.tar.gz/source/encoder/ratecontrol.cpp
Changed
201
1
2
#define BR_SHIFT 6
3
#define CPB_SHIFT 4
4
5
+#define SHARED_DATA_ALIGNMENT 4 ///< 4 byte, 32bit
6
+#define CUTREE_SHARED_MEM_NAME "cutree"
7
+#define GOP_CNT_CU_TREE 3
8
+
9
using namespace X265_NS;
10
11
/* Amortize the partial cost of I frames over the next N frames */
12
13
return output;
14
}
15
16
+typedef struct CUTreeSharedDataItem
17
+{
18
+ uint8_t *type;
19
+ uint16_t *stats;
20
+}CUTreeSharedDataItem;
21
+
22
+void static ReadSharedCUTreeData(void *dst, void *src, int32_t size)
23
+{
24
+ CUTreeSharedDataItem *statsDst = reinterpret_cast<CUTreeSharedDataItem *>(dst);
25
+ uint8_t *typeSrc = reinterpret_cast<uint8_t *>(src);
26
+ *statsDst->type = *typeSrc;
27
+
28
+ ///< for memory alignment, the type will take 32bit in the shared memory
29
+ int32_t offset = (sizeof(*statsDst->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
30
+ uint16_t *statsSrc = reinterpret_cast<uint16_t *>(typeSrc + offset);
31
+ memcpy(statsDst->stats, statsSrc, size - offset);
32
+}
33
+
34
+void static WriteSharedCUTreeData(void *dst, void *src, int32_t size)
35
+{
36
+ CUTreeSharedDataItem *statsSrc = reinterpret_cast<CUTreeSharedDataItem *>(src);
37
+ uint8_t *typeDst = reinterpret_cast<uint8_t *>(dst);
38
+ *typeDst = *statsSrc->type;
39
+
40
+ ///< for memory alignment, the type will take 32bit in the shared memory
41
+ int32_t offset = (sizeof(*statsSrc->type) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
42
+ uint16_t *statsDst = reinterpret_cast<uint16_t *>(typeDst + offset);
43
+ memcpy(statsDst, statsSrc->stats, size - offset);
44
+}
45
+
46
+
47
inline double qScale2bits(RateControlEntry *rce, double qScale)
48
{
49
if (qScale < 0.1)
50
51
m_lastAbrResetPoc = -1;
52
m_statFileOut = NULL;
53
m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
54
+ m_cutreeShrMem = NULL;
55
m_rce2Pass = NULL;
56
m_encOrder = NULL;
57
m_lastBsliceSatdCost = 0;
58
59
m_initVbv = false;
60
m_singleFrameVbv = 0;
61
m_rateTolerance = 1.0;
62
+ m_encodedSegmentBits = 0;
63
+ m_segDur = 0;
64
65
if (m_param->rc.vbvBufferSize)
66
{
67
68
m_cuTreeStats.qpBuffer[i] = NULL;
69
}
70
71
-bool RateControl::init(const SPS& sps)
72
+bool RateControl::initCUTreeSharedMem()
73
{
74
- if (m_isVbv && !m_initVbv)
75
- {
76
- /* We don't support changing the ABR bitrate right now,
77
- * so if the stream starts as CBR, keep it CBR. */
78
- if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
79
+ if (!m_cutreeShrMem) {
80
+ m_cutreeShrMem = new RingMem();
81
+ if (!m_cutreeShrMem)
82
{
83
- m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
84
- x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
85
- m_param->rc.vbvBufferSize);
86
+ return false;
87
}
88
- int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
89
- int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
90
91
- if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
92
+ ///< now cutree data form at most 3 gops would be stored in the shared memory at the same time
93
+ int32_t itemSize = (sizeof(uint8_t) + SHARED_DATA_ALIGNMENT - 1) & ~(SHARED_DATA_ALIGNMENT - 1);
94
+ if (m_param->rc.qgSize == 8)
95
{
96
- const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
97
- vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
98
- vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
99
+ itemSize += sizeof(uint16_t) * m_ncu * 4;
100
}
101
- m_bufferRate = vbvMaxBitrate / m_fps;
102
- m_vbvMaxRate = vbvMaxBitrate;
103
- m_bufferSize = vbvBufferSize;
104
- m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
105
+ else
106
+ {
107
+ itemSize += sizeof(uint16_t) * m_ncu;
108
+ }
109
+
110
+ int32_t itemCnt = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
111
+ itemCnt *= GOP_CNT_CU_TREE;
112
113
- if (m_param->rc.vbvBufferInit > 1.)
114
- m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
115
- if (m_param->vbvBufferEnd > 1.)
116
- m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
117
- if (m_param->vbvEndFrameAdjust > 1.)
118
- m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
119
- m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
120
- m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
121
- m_bufferFillActual = m_bufferFillFinal;
122
- m_bufferExcess = 0;
123
- m_minBufferFill = m_param->minVbvFullness / 100;
124
- m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
125
- m_initVbv = true;
126
+ char shrname[MAX_SHR_NAME_LEN] = { 0 };
127
+ strcpy(shrname, m_param->rc.sharedMemName);
128
+ strcat(shrname, CUTREE_SHARED_MEM_NAME);
129
+
130
+ if (!m_cutreeShrMem->init(itemSize, itemCnt, shrname))
131
+ {
132
+ return false;
133
+ }
134
}
135
136
+ return true;
137
+}
138
+
139
+void RateControl::initVBV(const SPS& sps)
140
+{
141
+ /* We don't support changing the ABR bitrate right now,
142
+ * so if the stream starts as CBR, keep it CBR. */
143
+ if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps))
144
+ {
145
+ m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps);
146
+ x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
147
+ m_param->rc.vbvBufferSize);
148
+ }
149
+ int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
150
+ int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
151
+
152
+ if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
153
+ {
154
+ const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
155
+ vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
156
+ vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
157
+ }
158
+ m_bufferRate = vbvMaxBitrate / m_fps;
159
+ m_vbvMaxRate = vbvMaxBitrate;
160
+ m_bufferSize = vbvBufferSize;
161
+ m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize;
162
+
163
+ if (m_param->rc.vbvBufferInit > 1.)
164
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
165
+ if (m_param->vbvBufferEnd > 1.)
166
+ m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
167
+ if (m_param->vbvEndFrameAdjust > 1.)
168
+ m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
169
+ m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
170
+ m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
171
+ m_bufferFillActual = m_bufferFillFinal;
172
+ m_bufferExcess = 0;
173
+ m_minBufferFill = m_param->minVbvFullness / 100;
174
+ m_maxBufferFill = 1 - (m_param->maxVbvFullness / 100);
175
+ m_initVbv = true;
176
+}
177
+
178
+bool RateControl::init(const SPS& sps)
179
+{
180
+ if (m_isVbv && !m_initVbv)
181
+ initVBV(sps);
182
+
183
if (!m_param->bResetZoneConfig && (m_relativeComplexity == NULL))
184
{
185
m_relativeComplexity = X265_MALLOC(double, m_param->reconfigWindowSize);
186
187
188
m_totalBits = 0;
189
m_encodedBits = 0;
190
+ m_encodedSegmentBits = 0;
191
m_framesDone = 0;
192
+ m_segDur = 0;
193
m_residualCost = 0;
194
m_partialResidualCost = 0;
195
m_amortizeFraction = 0.85;
196
197
/* Load stat file and init 2pass algo */
198
if (m_param->rc.bStatRead)
199
{
200
- m_expectedBitsSum = 0;
201
x265_3.5.tar.gz/source/encoder/ratecontrol.h -> x265_3.6.tar.gz/source/encoder/ratecontrol.h
Changed
90
1
2
3
#include "common.h"
4
#include "sei.h"
5
+#include "ringmem.h"
6
7
namespace X265_NS {
8
// encoder namespace
9
10
#define MIN_AMORTIZE_FRACTION 0.2
11
#define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
12
13
-/*Scenecut Aware QP*/
14
-#define WINDOW1_DELTA 1.0 /* The offset for the frames coming in the window-1*/
15
-#define WINDOW2_DELTA 0.7 /* The offset for the frames coming in the window-2*/
16
-#define WINDOW3_DELTA 0.4 /* The offset for the frames coming in the window-3*/
17
-
18
struct Predictor
19
{
20
double coeffMin;
21
22
Predictor rowPreds32;
23
Predictor* rowPred2;
24
25
+ int64_t currentSatd;
26
int64_t lastSatd; /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
27
int64_t leadingNoBSatd;
28
int64_t rowTotalBits; /* update cplxrsum and totalbits at the end of 2 rows */
29
30
double rowCplxrSum;
31
double qpNoVbv;
32
double bufferFill;
33
+ double bufferFillFinal;
34
+ double bufferFillActual;
35
double targetFill;
36
bool vbvEndAdj;
37
double frameDuration;
38
39
double m_qCompress;
40
int64_t m_totalBits; /* total bits used for already encoded frames (after ammortization) */
41
int64_t m_encodedBits; /* bits used for encoded frames (without ammortization) */
42
+ int64_t m_encodedSegmentBits; /* bits used for encoded frames in a segment*/
43
+ double m_segDur;
44
double m_fps;
45
int64_t m_satdCostWindow[50];
46
int64_t m_encodedBitsWindow[50];
47
48
FILE* m_statFileOut;
49
FILE* m_cutreeStatFileOut;
50
FILE* m_cutreeStatFileIn;
51
+ ///< store the cutree data in memory instead of file
52
+ RingMem *m_cutreeShrMem;
53
double m_lastAccumPNorm;
54
double m_expectedBitsSum; /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
55
int64_t m_predictedBits;
56
57
RateControl(x265_param& p, Encoder *enc);
58
bool init(const SPS& sps);
59
void initHRD(SPS& sps);
60
+ void initVBV(const SPS& sps);
61
void reconfigureRC();
62
63
void setFinalFrameCount(int count);
64
65
int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
66
bool initPass2();
67
68
+ bool initCUTreeSharedMem();
69
+ void skipCUTreeSharedMemRead(int32_t cnt);
70
+
71
double forwardMasking(Frame* curFrame, double q);
72
double backwardMasking(Frame* curFrame, double q);
73
74
75
double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
76
double tuneAbrQScaleFromFeedback(double qScale);
77
double tuneQScaleForZone(RateControlEntry *rce, double qScale); // Tune qScale to adhere to zone budget
78
+ double tuneQscaleForSBRC(Frame* curFrame, double q); // Tune qScale to adhere to segment budget
79
void accumPQpUpdate();
80
81
int getPredictorType(int lowresSliceType, int sliceType);
82
83
double tuneQScaleForGrain(double rcOverflow);
84
void splitdeltaPOC(char deltapoc, RateControlEntry *rce);
85
void splitbUsed(char deltapoc, RateControlEntry *rce);
86
+ void checkAndResetCRF(RateControlEntry* rce);
87
};
88
}
89
#endif // ifndef X265_RATECONTROL_H
90
x265_3.5.tar.gz/source/encoder/sei.cpp -> x265_3.6.tar.gz/source/encoder/sei.cpp
Changed
10
1
2
{
3
if (nalUnitType != NAL_UNIT_UNSPECIFIED)
4
bs.writeByteAlignment();
5
- list.serialize(nalUnitType, bs);
6
+ list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
7
}
8
}
9
10
x265_3.5.tar.gz/source/encoder/sei.h -> x265_3.6.tar.gz/source/encoder/sei.h
Changed
103
1
2
}
3
};
4
5
+/* Film grain characteristics */
6
+class FilmGrainCharacteristics : public SEI
7
+{
8
+ public:
9
+
10
+ FilmGrainCharacteristics()
11
+ {
12
+ m_payloadType = FILM_GRAIN_CHARACTERISTICS;
13
+ m_payloadSize = 0;
14
+ }
15
+
16
+ struct CompModelIntensityValues
17
+ {
18
+ uint8_t intensityIntervalLowerBound;
19
+ uint8_t intensityIntervalUpperBound;
20
+ int* compModelValue;
21
+ };
22
+
23
+ struct CompModel
24
+ {
25
+ bool bPresentFlag;
26
+ uint8_t numModelValues;
27
+ uint8_t m_filmGrainNumIntensityIntervalMinus1;
28
+ CompModelIntensityValues* intensityValues;
29
+ };
30
+
31
+ CompModel m_compModel[MAX_NUM_COMPONENT];
32
+ bool m_filmGrainCharacteristicsPersistenceFlag;
33
+ bool m_filmGrainCharacteristicsCancelFlag;
34
+ bool m_separateColourDescriptionPresentFlag;
35
+ bool m_filmGrainFullRangeFlag;
36
+ uint8_t m_filmGrainModelId;
37
+ uint8_t m_blendingModeId;
38
+ uint8_t m_log2ScaleFactor;
39
+ uint8_t m_filmGrainBitDepthLumaMinus8;
40
+ uint8_t m_filmGrainBitDepthChromaMinus8;
41
+ uint8_t m_filmGrainColourPrimaries;
42
+ uint8_t m_filmGrainTransferCharacteristics;
43
+ uint8_t m_filmGrainMatrixCoeffs;
44
+
45
+ void writeSEI(const SPS&)
46
+ {
47
+ WRITE_FLAG(m_filmGrainCharacteristicsCancelFlag, "film_grain_characteristics_cancel_flag");
48
+
49
+ if (!m_filmGrainCharacteristicsCancelFlag)
50
+ {
51
+ WRITE_CODE(m_filmGrainModelId, 2, "film_grain_model_id");
52
+ WRITE_FLAG(m_separateColourDescriptionPresentFlag, "separate_colour_description_present_flag");
53
+ if (m_separateColourDescriptionPresentFlag)
54
+ {
55
+ WRITE_CODE(m_filmGrainBitDepthLumaMinus8, 3, "film_grain_bit_depth_luma_minus8");
56
+ WRITE_CODE(m_filmGrainBitDepthChromaMinus8, 3, "film_grain_bit_depth_chroma_minus8");
57
+ WRITE_FLAG(m_filmGrainFullRangeFlag, "film_grain_full_range_flag");
58
+ WRITE_CODE(m_filmGrainColourPrimaries, X265_BYTE, "film_grain_colour_primaries");
59
+ WRITE_CODE(m_filmGrainTransferCharacteristics, X265_BYTE, "film_grain_transfer_characteristics");
60
+ WRITE_CODE(m_filmGrainMatrixCoeffs, X265_BYTE, "film_grain_matrix_coeffs");
61
+ }
62
+ WRITE_CODE(m_blendingModeId, 2, "blending_mode_id");
63
+ WRITE_CODE(m_log2ScaleFactor, 4, "log2_scale_factor");
64
+ for (uint8_t c = 0; c < 3; c++)
65
+ {
66
+ WRITE_FLAG(m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0, "comp_model_present_flag[c]");
67
+ }
68
+ for (uint8_t c = 0; c < 3; c++)
69
+ {
70
+ if (m_compModel[c].bPresentFlag && m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 > 0 && m_compModel[c].numModelValues > 0)
71
+ {
72
+ assert(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1 <= 256);
73
+ assert(m_compModel[c].numModelValues <= X265_BYTE);
74
+ WRITE_CODE(m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 , X265_BYTE, "num_intensity_intervals_minus1[c]");
75
+ WRITE_CODE(m_compModel[c].numModelValues - 1, 3, "num_model_values_minus1[c]");
76
+ for (uint8_t interval = 0; interval < m_compModel[c].m_filmGrainNumIntensityIntervalMinus1 + 1; interval++)
77
+ {
78
+ WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalLowerBound, X265_BYTE, "intensity_interval_lower_bound[c][i]");
79
+ WRITE_CODE(m_compModel[c].intensityValues[interval].intensityIntervalUpperBound, X265_BYTE, "intensity_interval_upper_bound[c][i]");
80
+ for (uint8_t j = 0; j < m_compModel[c].numModelValues; j++)
81
+ {
82
+ WRITE_SVLC(m_compModel[c].intensityValues[interval].compModelValue[j],"comp_model_value[c][i]");
83
+ }
84
+ }
85
+ }
86
+ }
87
+ WRITE_FLAG(m_filmGrainCharacteristicsPersistenceFlag, "film_grain_characteristics_persistence_flag");
88
+ }
89
+ if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
90
+ {
91
+ WRITE_FLAG(1, "payload_bit_equal_to_one");
92
+ while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
93
+ {
94
+ WRITE_FLAG(0, "payload_bit_equal_to_zero");
95
+ }
96
+ }
97
+ }
98
+};
99
+
100
static const uint32_t ISO_IEC_11578_LEN = 16;
101
102
class SEIuserDataUnregistered : public SEI
103
x265_3.5.tar.gz/source/encoder/slicetype.cpp -> x265_3.6.tar.gz/source/encoder/slicetype.cpp
Changed
201
1
2
3
namespace X265_NS {
4
5
+uint32_t acEnergyVarHist(uint64_t sum_ssd, int shift)
6
+{
7
+ uint32_t sum = (uint32_t)sum_ssd;
8
+ uint32_t ssd = (uint32_t)(sum_ssd >> 32);
9
+
10
+ return ssd - ((uint64_t)sum * sum >> shift);
11
+}
12
+
13
bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
14
{
15
intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
16
17
{
18
for (int colNum = 0; colNum < width; colNum++)
19
{
20
- if ((rowNum >= 2) && (colNum >= 2) && (rowNum != height - 2) && (colNum != width - 2)) //Ignoring the border pixels of the picture
21
+ if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Ignoring the border pixels of the picture
22
{
23
/* 5x5 Gaussian filter
24
2 4 5 4 2
25
26
if (param->rc.aqMode == X265_AQ_EDGE)
27
edgeFilter(curFrame, param);
28
29
- if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
30
+ if (param->rc.aqMode == X265_AQ_EDGE && param->recursionSkipMode == EDGE_BASED_RSKIP)
31
{
32
pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
33
primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
34
35
m_countPreLookahead = 0;
36
#endif
37
38
- memset(m_histogram, 0, sizeof(m_histogram));
39
+ m_accHistDiffRunningAvgCb = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
40
+ m_accHistDiffRunningAvgCb[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
41
+ memset(m_accHistDiffRunningAvgCb[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
42
+ for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
43
+ m_accHistDiffRunningAvgCb[w] = m_accHistDiffRunningAvgCb[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
44
+ }
45
+
46
+ m_accHistDiffRunningAvgCr = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
47
+ m_accHistDiffRunningAvgCr[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
48
+ memset(m_accHistDiffRunningAvgCr[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
49
+ for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
50
+ m_accHistDiffRunningAvgCr[w] = m_accHistDiffRunningAvgCr[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
51
+ }
52
+
53
+ m_accHistDiffRunningAvg = X265_MALLOC(uint32_t*, NUMBER_OF_SEGMENTS_IN_WIDTH * sizeof(uint32_t*));
54
+ m_accHistDiffRunningAvg[0] = X265_MALLOC(uint32_t, NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
55
+ memset(m_accHistDiffRunningAvg[0], 0, sizeof(uint32_t) * NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT);
56
+ for (uint32_t w = 1; w < NUMBER_OF_SEGMENTS_IN_WIDTH; w++) {
57
+ m_accHistDiffRunningAvg[w] = m_accHistDiffRunningAvg[0] + w * NUMBER_OF_SEGMENTS_IN_HEIGHT;
58
+ }
59
+
60
+ m_resetRunningAvg = true;
61
+
62
+ m_segmentCountThreshold = (uint32_t)(((float)((NUMBER_OF_SEGMENTS_IN_WIDTH * NUMBER_OF_SEGMENTS_IN_HEIGHT) * 50) / 100) + 0.5);
63
+
64
+ if (m_param->bEnableTemporalSubLayers > 2)
65
+ {
66
+ switch (m_param->bEnableTemporalSubLayers)
67
+ {
68
+ case 3:
69
+ m_gopId = 0;
70
+ break;
71
+ case 4:
72
+ m_gopId = 1;
73
+ break;
74
+ case 5:
75
+ m_gopId = 2;
76
+ break;
77
+ default:
78
+ break;
79
+ }
80
+ }
81
}
82
83
#if DETAILED_CU_STATS
84
85
m_pooli.stopWorkers();
86
}
87
}
88
+
89
void Lookahead::destroy()
90
{
91
// these two queues will be empty unless the encode was aborted
92
93
default:
94
return;
95
}
96
- if (!m_param->analysisLoad || !m_param->bDisableLookahead)
97
+ if (!curFrame->m_param->analysisLoad || !curFrame->m_param->bDisableLookahead)
98
{
99
X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
100
101
- if (m_param->rc.cuTree && !m_param->rc.bStatRead)
102
+ if (curFrame->m_param->rc.cuTree && !curFrame->m_param->rc.bStatRead)
103
/* update row satds based on cutree offsets */
104
curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
105
- else if (!m_param->analysisLoad || m_param->scaleFactor || m_param->bAnalysisType == HEVC_INFO)
106
+ else if (!curFrame->m_param->analysisLoad || curFrame->m_param->scaleFactor || curFrame->m_param->bAnalysisType == HEVC_INFO)
107
{
108
- if (m_param->rc.aqMode)
109
+ if (curFrame->m_param->rc.aqMode)
110
curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
111
else
112
curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
113
}
114
- if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
115
+ if (curFrame->m_param->rc.vbvBufferSize && curFrame->m_param->rc.vbvMaxBitrate)
116
{
117
/* aggregate lowres row satds to CTU resolution */
118
curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
119
uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
120
- uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
121
- uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
122
+ uint32_t scale = curFrame->m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
123
+ uint32_t numCuInHeight = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) / curFrame->m_param->maxCUSize;
124
uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
125
double *qp_offset = 0;
126
/* Factor in qpoffsets based on Aq/Cutree in CU costs */
127
- if (m_param->rc.aqMode || m_param->bAQMotion)
128
- qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
129
+ if (curFrame->m_param->rc.aqMode || curFrame->m_param->bAQMotion)
130
+ qp_offset = (frames[b]->sliceType == X265_TYPE_B || !curFrame->m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
131
132
for (uint32_t row = 0; row < numCuInHeight; row++)
133
{
134
135
if (qp_offset)
136
{
137
double qpOffset;
138
- if (m_param->rc.qgSize == 8)
139
+ if (curFrame->m_param->rc.qgSize == 8)
140
qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
141
qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
142
qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
143
144
int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
145
curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
146
}
147
- if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
148
+ if (curFrame->m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
149
for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
150
diff += curFrame->m_lowres.intraCost[lowresCuIdx] - lowresCuCost;
151
curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
152
153
}
154
}
155
156
+uint32_t LookaheadTLD::calcVariance(pixel* inpSrc, intptr_t stride, intptr_t blockOffset, uint32_t plane)
157
+{
158
+ pixel* src = inpSrc + blockOffset;
159
+
160
+ uint32_t var;
161
+ if (!plane)
162
+ var = acEnergyVarHist(primitives.cu[BLOCK_8x8].var(src, stride), 6);
163
+ else
164
+ var = acEnergyVarHist(primitives.cu[BLOCK_4x4].var(src, stride), 4);
165
+
166
+ x265_emms();
167
+ return var;
168
+}
169
+
170
+/*
171
+** Compute Block and Picture Variance, Block Mean for all blocks in the picture
172
+*/
173
+void LookaheadTLD::computePictureStatistics(Frame *curFrame)
174
+{
175
+ int maxCol = curFrame->m_fencPic->m_picWidth;
176
+ int maxRow = curFrame->m_fencPic->m_picHeight;
177
+ intptr_t inpStride = curFrame->m_fencPic->m_stride;
178
+
179
+ // Variance
180
+ uint64_t picTotVariance = 0;
181
+ uint32_t variance;
182
+
183
+ uint64_t blockXY = 0;
184
+ pixel* src = curFrame->m_fencPic->m_picOrg[0];
185
+
186
+ for (int blockY = 0; blockY < maxRow; blockY += 8)
187
+ {
188
+ uint64_t rowVariance = 0;
189
+ for (int blockX = 0; blockX < maxCol; blockX += 8)
190
+ {
191
+ intptr_t blockOffsetLuma = blockX + (blockY * inpStride);
192
+
193
+ variance = calcVariance(
194
+ src,
195
+ inpStride,
196
+ blockOffsetLuma, 0);
197
+
198
+ rowVariance += variance;
199
+ blockXY++;
200
+ }
201
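
acEnergyVarHist() above unpacks the 64-bit value returned by the block variance primitive (pixel sum in the low 32 bits, sum of squares in the high 32 bits) and computes ssd - sum*sum / N with N = 1 << shift (shift 6 for an 8x8 luma block, 4 for a 4x4 chroma block). A scalar sketch of the same packing and unpacking, assuming 8-bit pixels (illustrative, not the optimized primitive):

    #include <cstdint>
    #include <cstddef>

    // Pack sum (low 32 bits) and sum of squares (high 32 bits) for one size x size block.
    static uint64_t blockSumAndSsd(const uint8_t* src, ptrdiff_t stride, int size)
    {
        uint32_t sum = 0, ssd = 0;
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
            {
                uint32_t p = src[y * stride + x];
                sum += p;
                ssd += p * p;
            }
        return ((uint64_t)ssd << 32) | sum;
    }

    // Mirror of acEnergyVarHist(): AC energy = ssd - sum*sum / N, with N = 1 << shift.
    static uint32_t acEnergy(uint64_t packed, int shift)
    {
        uint32_t sum = (uint32_t)packed;
        uint32_t ssd = (uint32_t)(packed >> 32);
        return ssd - (uint32_t)(((uint64_t)sum * sum) >> shift);
    }
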
x265_3.5.tar.gz/source/encoder/slicetype.h -> x265_3.6.tar.gz/source/encoder/slicetype.h
Changed
110
1
2
#define EDGE_INCLINATION 45
3
#define TEMPORAL_SCENECUT_THRESHOLD 50
4
5
+#define X265_ABS(a) (((a) < 0) ? (-(a)) : (a))
6
+
7
+#define PICTURE_DIFF_VARIANCE_TH 390
8
+#define PICTURE_VARIANCE_TH 1500
9
+#define LOW_VAR_SCENE_CHANGE_TH 2250
10
+#define HIGH_VAR_SCENE_CHANGE_TH 3500
11
+
12
+#define PICTURE_DIFF_VARIANCE_CHROMA_TH 10
13
+#define PICTURE_VARIANCE_CHROMA_TH 20
14
+#define LOW_VAR_SCENE_CHANGE_CHROMA_TH 2250/4
15
+#define HIGH_VAR_SCENE_CHANGE_CHROMA_TH 3500/4
16
+
17
+#define FLASH_TH 1.5
18
+#define FADE_TH 4
19
+#define INTENSITY_CHANGE_TH 4
20
+
21
+#define NUM64x64INPIC(w,h) ((w*h)>> (MAX_LOG2_CU_SIZE<<1))
22
+
23
#if HIGH_BIT_DEPTH
24
#define EDGE_THRESHOLD 1023.0
25
#else
26
27
28
~LookaheadTLD() { X265_FREE(wbuffer[0]); }
29
30
+ void collectPictureStatistics(Frame *curFrame);
31
+ void computeIntensityHistogramBinsLuma(Frame *curFrame, uint64_t *sumAvgIntensityTotalSegmentsLuma);
32
+
33
+ void computeIntensityHistogramBinsChroma(
34
+ Frame *curFrame,
35
+ uint64_t *sumAverageIntensityCb,
36
+ uint64_t *sumAverageIntensityCr);
37
+
38
+ void calculateHistogram(
39
+ pixel *inputSrc,
40
+ uint32_t inputWidth,
41
+ uint32_t inputHeight,
42
+ intptr_t stride,
43
+ uint8_t dsFactor,
44
+ uint32_t *histogram,
45
+ uint64_t *sum);
46
+
47
+ void computePictureStatistics(Frame *curFrame);
48
+
49
+ uint32_t calcVariance(pixel* src, intptr_t stride, intptr_t blockOffset, uint32_t plane);
50
+
51
void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
52
+ void calcFrameSegment(Frame *curFrame);
53
void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
54
55
void weightsAnalyse(Lowres& fenc, Lowres& ref);
56
57
58
/* pre-lookahead */
59
int m_fullQueueSize;
60
- int m_histogram[X265_BFRAME_MAX + 1];
61
int m_lastKeyframe;
62
int m_8x8Width;
63
int m_8x8Height;
64
65
bool m_isFadeIn;
66
uint64_t m_fadeCount;
67
int m_fadeStart;
68
+
69
+ uint32_t **m_accHistDiffRunningAvgCb;
70
+ uint32_t **m_accHistDiffRunningAvgCr;
71
+ uint32_t **m_accHistDiffRunningAvg;
72
+
73
+ bool m_resetRunningAvg;
74
+ uint32_t m_segmentCountThreshold;
75
+
76
+ int8_t m_gopId;
77
+
78
Lookahead(x265_param *param, ThreadPool *pool);
79
#if DETAILED_CU_STATS
80
int64_t m_slicetypeDecideElapsedTime;
81
82
83
void getEstimatedPictureCost(Frame *pic);
84
void setLookaheadQueue();
85
+ int findSliceType(int poc);
86
87
protected:
88
89
90
/* called by slicetypeAnalyse() to make slice decisions */
91
bool scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
92
bool scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
93
+
94
+ bool histBasedScenecut(Lowres **frames, int p0, int p1, int numFrames);
95
+ bool detectHistBasedSceneChange(Lowres **frames, int p0, int p1, int p2);
96
+
97
void slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
98
int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
99
int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
100
101
102
/* called by getEstimatedPictureCost() to finalize cuTree costs */
103
int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
104
+ /*Compute index for positioning B-Ref frames*/
105
+ void placeBref(Frame** frames, int start, int end, int num, int *brefs);
106
+ void compCostBref(Lowres **frame, int start, int end, int num);
107
};
108
109
class PreLookaheadGroup : public BondedTaskGroup
110
x265_3.5.tar.gz/source/output/output.cpp -> x265_3.6.tar.gz/source/output/output.cpp
Changed
19
1
2
3
using namespace X265_NS;
4
5
-ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp)
6
+ReconFile* ReconFile::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth)
7
{
8
const char * s = strrchr(fname, '.');
9
10
if (s && !strcmp(s, ".y4m"))
11
- return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp);
12
+ return new Y4MOutput(fname, width, height, bitdepth, fpsNum, fpsDenom, csp, sourceBitDepth);
13
else
14
- return new YUVOutput(fname, width, height, bitdepth, csp);
15
+ return new YUVOutput(fname, width, height, bitdepth, csp, sourceBitDepth);
16
}
17
18
OutputFile* OutputFile::open(const char *fname, InputFileInfo& inputInfo)
19
x265_3.5.tar.gz/source/output/output.h -> x265_3.6.tar.gz/source/output/output.h
Changed
10
1
2
ReconFile() {}
3
4
static ReconFile* open(const char *fname, int width, int height, uint32_t bitdepth,
5
- uint32_t fpsNum, uint32_t fpsDenom, int csp);
6
+ uint32_t fpsNum, uint32_t fpsDenom, int csp, int sourceBitDepth);
7
8
virtual bool isFail() const = 0;
9
10
x265_3.5.tar.gz/source/output/y4m.cpp -> x265_3.6.tar.gz/source/output/y4m.cpp
Changed
145
1
2
using namespace X265_NS;
3
using namespace std;
4
5
-Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp)
6
+Y4MOutput::Y4MOutput(const char* filename, int w, int h, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputdepth)
7
: width(w)
8
, height(h)
9
+ , bitDepth(bitdepth)
10
, colorSpace(csp)
11
, frameSize(0)
12
+ , inputDepth(inputdepth)
13
{
14
ofs.open(filename, ios::binary | ios::out);
15
buf = new char[width];
16
17
18
if (ofs)
19
{
20
- ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
21
+ if (bitDepth == 10)
22
+ ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p10" << " XYSCSS = " << cf << "P10" << "\n";
23
+ else if (bitDepth == 12)
24
+ ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "p12" << " XYSCSS = " << cf << "P12" << "\n";
25
+ else
26
+ ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n";
27
+
28
header = ofs.tellp();
29
}
30
31
32
bool Y4MOutput::writePicture(const x265_picture& pic)
33
{
34
std::ofstream::pos_type outPicPos = header;
35
- outPicPos += (uint64_t)pic.poc * (6 + frameSize);
36
+ if (pic.bitDepth > 8)
37
+ outPicPos += (uint64_t)(pic.poc * (6 + frameSize * 2));
38
+ else
39
+ outPicPos += (uint64_t)pic.poc * (6 + frameSize);
40
ofs.seekp(outPicPos);
41
ofs << "FRAME\n";
42
43
-#if HIGH_BIT_DEPTH
44
- if (pic.bitDepth > 8 && pic.poc == 0)
45
- x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
46
-#else
47
- if (pic.bitDepth > 8 && pic.poc == 0)
48
- x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
49
-#endif
50
+ if (inputDepth > 8)
51
+ {
52
+ if (pic.bitDepth == 8 && pic.poc == 0)
53
+ x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
54
+ }
55
56
X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
57
58
-#if HIGH_BIT_DEPTH
59
-
60
- // encoder gave us short pixels, downshift, then write
61
- X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
62
- int shift = pic.bitDepth - 8;
63
- for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
64
+ if (inputDepth > 8)//if HIGH_BIT_DEPTH
65
{
66
- uint16_t *src = (uint16_t*)pic.planes[i];
67
- for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
68
+ if (pic.bitDepth == 8)
69
{
70
- for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
71
- buf[w] = (char)(src[w] >> shift);
72
-
73
- ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
74
- src += pic.stride[i] / sizeof(*src);
75
+ // encoder gave us short pixels, downshift, then write
76
+ X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
77
+ int shift = pic.bitDepth - 8;
78
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
79
+ {
80
+ char *src = (char*)pic.planesi;
81
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
82
+ {
83
+ for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
84
+ bufw = (char)(srcw >> shift);
85
+
86
+ ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
87
+ src += pic.stridei / sizeof(*src);
88
+ }
89
+ }
90
+ }
91
+ else
92
+ {
93
+ X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
94
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
95
+ {
96
+ uint16_t *src = (uint16_t*)pic.planesi;
97
+ for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
98
+ {
99
+ ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
100
+ src += pic.stridei / sizeof(*src);
101
+ }
102
+ }
103
}
104
}
105
-
106
-#else // if HIGH_BIT_DEPTH
107
-
108
- X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
109
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
110
+ else if (inputDepth == 8 && pic.bitDepth > 8)
111
{
112
- char *src = (char*)pic.planesi;
113
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
114
+ X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n");
115
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
116
{
117
- ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
118
- src += pic.stridei / sizeof(*src);
119
+ uint16_t* src = (uint16_t*)pic.planesi;
120
+ for (int h = 0; h < (height * 1) >> x265_cli_cspscolorSpace.heighti; h++)
121
+ {
122
+ ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
123
+ src += pic.stridei / sizeof(*src);
124
+ }
125
+ }
126
+ }
127
+ else
128
+ {
129
+ X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n");
130
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
131
+ {
132
+ char *src = (char*)pic.planesi;
133
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
134
+ {
135
+ ofs.write(src, width >> x265_cli_cspscolorSpace.widthi);
136
+ src += pic.stridei / sizeof(*src);
137
+ }
138
}
139
}
140
-
141
-#endif // if HIGH_BIT_DEPTH
142
143
return true;
144
}
145
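
With the change above, high-bit-depth reconstructions keep their native depth and the Y4M stream header carries an explicit pixel-format tag instead of forcing 8-bit output. For illustration, the header line the ofs << chain would produce for a 1920x1080, 25 fps, 4:2:0, 10-bit recon (cf == "420"), shown as a standalone snippet:

    #include <iostream>

    int main()
    {
        // Mirrors the 10-bit branch of the header writer above.
        std::cout << "YUV4MPEG2 W1920 H1080 F25:1 Ip C420p10 XYSCSS = 420P10\n";
        return 0;
    }
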
x265_3.5.tar.gz/source/output/y4m.h -> x265_3.6.tar.gz/source/output/y4m.h
Changed
25
1
2
3
int height;
4
5
+ uint32_t bitDepth;
6
+
7
int colorSpace;
8
9
uint32_t frameSize;
10
11
+ int inputDepth;
12
+
13
std::ofstream ofs;
14
15
std::ofstream::pos_type header;
16
17
18
public:
19
20
- Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp);
21
+ Y4MOutput(const char *filename, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp, int inputDepth);
22
23
virtual ~Y4MOutput();
24
25
x265_3.5.tar.gz/source/output/yuv.cpp -> x265_3.6.tar.gz/source/output/yuv.cpp
Changed
107
1
2
using namespace X265_NS;
3
using namespace std;
4
5
-YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp)
6
+YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp, int inputdepth)
7
: width(w)
8
, height(h)
9
, depth(d)
10
, colorSpace(csp)
11
, frameSize(0)
12
+ , inputDepth(inputdepth)
13
{
14
ofs.open(filename, ios::binary | ios::out);
15
buf = new char[width];
16
17
X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
18
X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
19
20
-#if HIGH_BIT_DEPTH
21
- if (depth == 8)
22
+ if (inputDepth > 8)
23
{
24
- int shift = pic.bitDepth - 8;
25
- ofs.seekp((std::streamoff)fileOffset);
26
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
27
- {
28
- uint16_t *src = (uint16_t*)pic.planesi;
29
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
30
- {
31
- for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
32
- bufw = (char)(srcw >> shift);
33
+ if (depth == 8)
34
+ {
35
+ int shift = pic.bitDepth - 8;
36
+ ofs.seekp((std::streamoff)fileOffset);
37
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
38
+ {
39
+ uint16_t *src = (uint16_t*)pic.planesi;
40
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
41
+ {
42
+ for (int w = 0; w < width >> x265_cli_cspscolorSpace.widthi; w++)
43
+ bufw = (char)(srcw >> shift);
44
45
- ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
46
- src += pic.stridei / sizeof(*src);
47
- }
48
- }
49
+ ofs.write(buf, width >> x265_cli_cspscolorSpace.widthi);
50
+ src += pic.stridei / sizeof(*src);
51
+ }
52
+ }
53
+ }
54
+ else
55
+ {
56
+ ofs.seekp((std::streamoff)(fileOffset * 2));
57
+ for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
58
+ {
59
+ uint16_t *src = (uint16_t*)pic.planesi;
60
+ for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
61
+ {
62
+ ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
63
+ src += pic.stridei / sizeof(*src);
64
+ }
65
+ }
66
+ }
67
}
68
else
69
{
70
- ofs.seekp((std::streamoff)(fileOffset * 2));
71
- for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++)
72
- {
73
- uint16_t *src = (uint16_t*)pic.planesi;
74
- for (int h = 0; h < height >> x265_cli_cspscolorSpace.heighti; h++)
75
- {
76
- ofs.write((const char*)src, (width * 2) >> x265_cli_cspscolorSpace.widthi);
77
- src += pic.stridei / sizeof(*src);
78
- }
79
- }
80
+ ofs.seekp((std::streamoff)fileOffset);
81
+        for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
82
+ {
83
+            char *src = (char*)pic.planes[i];
84
+            for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
85
+ {
86
+                ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
87
+                src += pic.stride[i] / sizeof(*src);
88
+ }
89
+ }
90
}
91
-#else // if HIGH_BIT_DEPTH
92
- ofs.seekp((std::streamoff)fileOffset);
93
-    for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
94
- {
95
-        char *src = (char*)pic.planes[i];
96
-        for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
97
- {
98
-            ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]);
99
-            src += pic.stride[i] / sizeof(*src);
100
- }
101
- }
102
-
103
-#endif // if HIGH_BIT_DEPTH
104
105
return true;
106
}
107
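For orientation, the new inputDepth > 8 / depth == 8 branch of the writePicture path above simply shifts every 16-bit sample down to the output depth and writes one row at a time. The stand-alone sketch below isolates that loop in plain C++; it is illustrative only and not x265 code, and the helper name writePlane16to8 and the stride-in-samples convention are assumptions made for the example.

#include <cstdint>
#include <fstream>
#include <vector>

// Hypothetical helper mirroring the shift-and-write loop in the hunk above.
static void writePlane16to8(std::ofstream& ofs, const uint16_t* src,
                            intptr_t strideInSamples, int planeWidth,
                            int planeHeight, int shift /* e.g. bitDepth - 8 */)
{
    std::vector<char> row(planeWidth);
    for (int h = 0; h < planeHeight; h++)
    {
        for (int w = 0; w < planeWidth; w++)
            row[w] = (char)(src[w] >> shift);   // drop the extra precision
        ofs.write(row.data(), planeWidth);      // one 8-bit row per iteration
        src += strideInSamples;                 // advance one picture row
    }
}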
x265_3.5.tar.gz/source/output/yuv.h -> x265_3.6.tar.gz/source/output/yuv.h
Changed
18
1
2
3
uint32_t frameSize;
4
5
+ int inputDepth;
6
+
7
char *buf;
8
9
std::ofstream ofs;
10
11
public:
12
13
- YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp);
14
+ YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp, int inputDepth);
15
16
virtual ~YUVOutput();
17
18
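With the widened constructor above, a caller now supplies both the reconstruction bit depth and the source bit depth; the corresponding CLI change appears further down in the x265cli.cpp hunk, where ReconFile::open() gains a param->sourceBitDepth argument. The following is only a hedged sketch of a direct call site, with placeholder names and an include path that may need adjusting:

#include "output/yuv.h"   // header shown above; adjust to your include setup

using namespace X265_NS;  // YUVOutput lives in the x265 private namespace, as in yuv.cpp

// Hypothetical call site: reconDepth is the depth of the reconstructed samples,
// sourceDepth the depth of the original input (the CLI passes param->sourceBitDepth).
static YUVOutput* openReconWriter(const char* path, int width, int height,
                                  uint32_t reconDepth, int csp, int sourceDepth)
{
    return new YUVOutput(path, width, height, reconDepth, csp, sourceDepth);
}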
x265_3.5.tar.gz/source/test/CMakeLists.txt -> x265_3.6.tar.gz/source/test/CMakeLists.txt
Changed
24
1
2
3
# add ARM assembly files
4
if(ARM OR CROSS_COMPILE_ARM)
5
- if(NOT ARM64)
6
- enable_language(ASM)
7
- set(NASM_SRC checkasm-arm.S)
8
- add_custom_command(
9
- OUTPUT checkasm-arm.obj
10
- COMMAND ${CMAKE_CXX_COMPILER}
11
- ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
12
- DEPENDS checkasm-arm.S)
13
- endif()
14
+ enable_language(ASM)
15
+ set(NASM_SRC checkasm-arm.S)
16
+ add_custom_command(
17
+ OUTPUT checkasm-arm.obj
18
+ COMMAND ${CMAKE_CXX_COMPILER}
19
+ ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
20
+ DEPENDS checkasm-arm.S)
21
endif(ARM OR CROSS_COMPILE_ARM)
22
23
# add PowerPC assembly files
24
x265_3.5.tar.gz/source/test/pixelharness.cpp -> x265_3.6.tar.gz/source/test/pixelharness.cpp
Changed
63
1
2
return true;
3
}
4
5
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
6
+{
7
+    ALIGN_VAR_16(pixel, ref_destf[32 * 32]);
8
+    ALIGN_VAR_16(pixel, opt_destf[32 * 32]);
9
+
10
+ intptr_t src_stride = 64;
11
+ intptr_t dst_stride = 32;
12
+ int bx = 32;
13
+ int by = 32;
14
+ int j = 0;
15
+ for (int i = 0; i < ITERS; i++)
16
+ {
17
+ int index = i % TEST_CASES;
18
+        ref(pixel_test_buff[index] + j, ref_destf, src_stride, dst_stride, bx, by);
19
+        checked(opt, pixel_test_buff[index] + j, opt_destf, src_stride, dst_stride, bx, by);
20
+
21
+ if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
22
+ return false;
23
+
24
+ reportfail();
25
+ j += INCR;
26
+ }
27
+
28
+ return true;
29
+}
30
+
31
bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
32
{
33
    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
34
35
}
36
}
37
38
+ if (opt.frameSubSampleLuma)
39
+ {
40
+ if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))
41
+ {
42
+ printf("SubSample Luma failed!\n");
43
+ return false;
44
+ }
45
+ }
46
+
47
if (opt.scale1D_128to64NONALIGNED)
48
{
49
if (!check_scale1D_pp(ref.scale1D_128to64NONALIGNED, opt.scale1D_128to64NONALIGNED))
50
51
REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
52
}
53
54
+ if (opt.frameSubSampleLuma)
55
+ {
56
+ HEADER0("downscaleluma");
57
+ REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);
58
+ }
59
+
60
if (opt.scale1D_128to64NONALIGNED)
61
{
62
HEADER0("scale1D_128to64");
63
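The new check_downscaleluma_t test and the "downscaleluma" speed report above only compare the C reference primitive against the optimized kernel; they do not show what frameSubSampleLuma actually computes. For orientation, a downscaler with the same argument order as the harness calls (src, dst, srcStride, dstStride, width, height) could look like the sketch below. This is an illustrative 2x2 box average, not the actual x265 primitive, and the pixel typedef assumes an 8-bit build.

#include <cstdint>

typedef uint8_t pixel;  // 10/12-bit builds would use a 16-bit pixel type

// Illustrative half-resolution luma downscaler (2x2 box average).
static void subsampleLumaRef(const pixel* src, pixel* dst,
                             intptr_t srcStride, intptr_t dstStride,
                             int dstWidth, int dstHeight)
{
    for (int y = 0; y < dstHeight; y++)
    {
        const pixel* row0 = src + 2 * y * srcStride;  // even source row
        const pixel* row1 = row0 + srcStride;         // odd source row
        for (int x = 0; x < dstWidth; x++)
            dst[y * dstStride + x] = (pixel)((row0[2 * x] + row0[2 * x + 1] +
                                              row1[2 * x] + row1[2 * x + 1] + 2) >> 2);
    }
}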
x265_3.5.tar.gz/source/test/pixelharness.h -> x265_3.6.tar.gz/source/test/pixelharness.h
Changed
9
1
2
bool check_integral_inith(integralh_t ref, integralh_t opt);
3
bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
4
bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
5
+ bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
6
7
public:
8
9
x265_3.5.tar.gz/source/test/rate-control-tests.txt -> x265_3.6.tar.gz/source/test/rate-control-tests.txt
Changed
10
1
2
112_1920x1080_25.yuv,--preset ultrafast --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd --strict-cbr
3
Traffic_4096x2048_30.yuv,--preset superfast --bitrate 20000 --vbv-maxrate 20000 --vbv-bufsize 20000 --repeat-headers --strict-cbr
4
Traffic_4096x2048_30.yuv,--preset faster --bitrate 8000 --vbv-maxrate 8000 --vbv-bufsize 6000 --aud --repeat-headers --no-open-gop --hrd --pmode --pme
5
-News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers
6
+News-4k.y4m,--preset veryfast --bitrate 3000 --vbv-maxrate 5000 --vbv-bufsize 5000 --repeat-headers --temporal-layers 3
7
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 18000 --vbv-bufsize 20000 --vbv-maxrate 18000 --strict-cbr
8
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --bitrate 8000 --vbv-bufsize 12000 --vbv-maxrate 10000 --tune grain
9
big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
10
x265_3.5.tar.gz/source/test/regression-tests.txt -> x265_3.6.tar.gz/source/test/regression-tests.txt
Changed
91
1
2
BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
3
BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
4
BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
5
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
6
+BasketballDrive_1920x1080_50.y4m,--preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --bitrate 7000 --limit-modes
7
BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
8
BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
9
-BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
10
+BasketballDrive_1920x1080_50.y4m,--preset slower --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --bitrate 7000 --limit-tu 0
11
BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
12
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
13
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --crf 18 --tskip-fast --limit-tu 2
14
BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
15
Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
16
Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
17
18
Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
19
CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
20
CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
21
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
22
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers 2 --tune grain
23
CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
24
CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
25
CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
26
27
CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
28
CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
29
CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
30
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
31
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers 2 --repeat-headers --limit-refs 2
32
CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
33
CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
34
CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --aq-mode 3 --aq-strength 1.5 --aq-motion --bitrate 5000
35
36
CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --hevc-aq --no-cutree --qg-size 16
37
DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
38
DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
39
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
40
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers 2 --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
41
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
42
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
43
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
44
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
45
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
46
FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
47
FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
48
FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
49
50
ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
51
Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
52
Kimono1_1920x1080_24_400.yuv,--preset superfast --qp 28 --zones 0,139,q=32
53
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
54
-sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
55
-sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
56
crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
57
crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
58
-crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
59
-crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
60
+crowd_run_1920x1080_50.yuv, --preset ultrafast --video-signal-type-preset BT2100_PQ_YCC:BT2100x108n0005
61
+crowd_run_1920x1080_50.yuv, --preset ultrafast --eob --eos
62
63
# Main12 intraCost overflow bug test
64
720p50_parkrun_ter.y4m,--preset medium
65
66
67
#scaled save/load test
68
crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
69
-crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
70
-crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
71
+crowd_run_1080p50.y4m,--preset superfast --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
72
+crowd_run_1080p50.y4m,--preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
73
crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
74
-RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
75
+RaceHorses_416x240_30.y4m,--preset slow --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --ctu 64 --analysis-load x265_analysis_2.dat --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
76
ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-save-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-load-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat --refine-intra 4 --refine-inter 2
77
#save/load with ctu distortion refinement
78
CrowdRun_1920x1080_50_10bit_422.yuv,--no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --refine-ctu-distortion 1 --bitrate 7000::--no-cutree --analysis-load x265_analysis.dat --refine-ctu-distortion 1 --bitrate 7000 --analysis-load-reuse-level 5
79
#segment encoding
80
BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
81
82
+#Test FG SEI message addition
83
+#OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune grain --film-grain "OldTownCross_1920x1080_50_10bit_422.bin"
84
+#RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --signhide --colormatrix bt709 --film-grain "RaceHorses_416x240_30_10bit.bin"
85
+
86
+#Temporal layers tests
87
+ducks_take_off_420_720p50.y4m,--preset slow --temporal-layers 3 --b-adapt 0
88
+parkrun_ter_720p50.y4m,--preset medium --temporal-layers 4 --b-adapt 0
89
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --temporal-layers 5 --b-adapt 0
90
# vim: tw=200
91
x265_3.5.tar.gz/source/test/save-load-tests.txt -> x265_3.6.tar.gz/source/test/save-load-tests.txt
Changed
16
1
2
# not auto-detected.
3
crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
4
crowd_run_540p50.y4m, --preset ultrafast --no-cutree --analysis-save x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_1080p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
5
-crowd_run_1080p50.y4m, --preset superfast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
6
-crowd_run_1080p50.y4m, --preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
7
-crowd_run_1080p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
8
+crowd_run_1080p50.y4m, --preset superfast --analysis-save x265_analysis.dat --analysis-save-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --analysis-load x265_analysis.dat --analysis-load-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
9
+crowd_run_1080p50.y4m, --preset fast --analysis-save x265_analysis.dat --analysis-save-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --analysis-load x265_analysis.dat --analysis-load-reuse-level 5 --scale-factor 2 --qp 18
10
+crowd_run_1080p50.y4m, --preset medium --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --analysis-load x265_analysis.dat --analysis-load-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
11
RaceHorses_416x240_30.y4m, --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m, --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
12
-crowd_run_540p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m, --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m, --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
13
+crowd_run_540p50.y4m, --preset veryslow --analysis-save x265_analysis_540.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m, --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m, --preset veryslow --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m, --preset veryslow --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m, --preset veryslow --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
14
crowd_run_540p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_540.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
15
News-4k.y4m, --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
16
x265_3.5.tar.gz/source/test/smoke-tests.txt -> x265_3.6.tar.gz/source/test/smoke-tests.txt
Changed
9
1
2
# Main12 intraCost overflow bug test
3
720p50_parkrun_ter.y4m,--preset medium
4
720p50_parkrun_ter.y4m,--preset=fast --hevc-aq --no-cutree
5
+# Test FG SEI message addition
6
+# CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --weightp --keyint -1 --film-grain "CrowdRun_1920x1080_50_10bit_444.bin"
7
+# DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 --film-grain "DucksAndLegs_1920x1080_60_10bit_422.bin"
8
+# NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset=superfast --bitrate 10000 --sao --limit-sao --cll --max-cll "1000,400" --film-grain "NebutaFestival_2560x1600_60_10bit_crop.bin"
9
x265_3.5.tar.gz/source/test/testbench.cpp -> x265_3.6.tar.gz/source/test/testbench.cpp
Changed
43
1
2
{ "AVX512", X265_CPU_AVX512 },
3
{ "ARMv6", X265_CPU_ARMV6 },
4
{ "NEON", X265_CPU_NEON },
5
+ { "SVE2", X265_CPU_SVE2 },
6
+ { "SVE", X265_CPU_SVE },
7
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
{ "", 0 },
9
};
10
11
12
EncoderPrimitives asmprim;
13
memset(&asmprim, 0, sizeof(asmprim));
14
-        setupAssemblyPrimitives(asmprim, test_arch[i].flag);
15
-
16
-#if X265_ARCH_ARM64
17
- /* Temporary workaround because luma_vsp assembly primitive has not been completed
18
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
19
- * Otherwise, segment fault occurs. */
20
-        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
21
-#endif
22
23
+        setupAssemblyPrimitives(asmprim, test_arch[i].flag);
24
setupAliasPrimitives(asmprim);
25
memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
26
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
27
28
#if X265_ARCH_X86
29
setupInstrinsicPrimitives(optprim, cpuid);
30
#endif
31
- setupAssemblyPrimitives(optprim, cpuid);
32
33
-#if X265_ARCH_ARM64
34
- /* Temporary workaround because luma_vsp assembly primitive has not been completed
35
- * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
36
- * Otherwise, segment fault occurs. */
37
- setupAliasCPrimitives(cprim, optprim, cpuid);
38
-#endif
39
+ setupAssemblyPrimitives(optprim, cpuid);
40
41
/* Note that we do not setup aliases for performance tests, that would be
42
* redundant. The testbench only verifies they are correctly aliased */
43
x265_3.5.tar.gz/source/test/testharness.h -> x265_3.6.tar.gz/source/test/testharness.h
Changed
48
1
2
#include <x86intrin.h>
3
#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
#include <arm_neon.h>
5
-#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
6
+#else
7
/* fallback for older GCC/MinGW */
8
static inline uint32_t __rdtsc(void)
9
{
10
11
#if X265_ARCH_X86
12
asm volatile("rdtsc" : "=a" (a) ::"edx");
13
#elif X265_ARCH_ARM
14
-#if X265_ARCH_ARM64
15
- asm volatile("mrs %0, cntvct_el0" : "=r"(a));
16
-#else
17
    // TO-DO: verify the following inline asm to get cpu Timestamp Counter for ARM arch
18
// asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
19
20
// TO-DO: replace clock() function with appropriate ARM cpu instructions
21
a = clock();
22
-#endif
23
+#elif X265_ARCH_ARM64
24
+ asm volatile("mrs %0, cntvct_el0" : "=r"(a));
25
#endif
26
return a;
27
}
28
29
x265_emms(); \
30
float optperf = (10.0f * cycles / runs) / 4; \
31
float refperf = (10.0f * refcycles / refruns) / 4; \
32
- printf("\t%3.2fx ", refperf / optperf); \
33
- printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \
34
+ printf(" | \t%3.2fx | ", refperf / optperf); \
35
+ printf("\t %-8.2lf | \t %-8.2lf\n", optperf, refperf); \
36
}
37
38
extern "C" {
39
40
* needs an explicit asm check because it only sometimes crashes in normal use. */
41
intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
42
float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
43
-#elif X265_ARCH_ARM == 0
44
+#elif (X265_ARCH_ARM == 0 && X265_ARCH_ARM64 == 0)
45
#define PFX(stack_pagealign)(func, align) func()
46
#endif
47
48
x265_3.5.tar.gz/source/x265.cpp -> x265_3.6.tar.gz/source/x265.cpp
Changed
18
1
2
3
int ret = 0;
4
5
+    if (cliopt[0].scenecutAwareQpConfig)
6
+ {
7
+        if (!cliopt[0].parseScenecutAwareQpConfig())
8
+ {
9
+ x265_log(NULL, X265_LOG_ERROR, "Unable to parse scenecut aware qp config file \n");
10
+            fclose(cliopt[0].scenecutAwareQpConfig);
11
+            cliopt[0].scenecutAwareQpConfig = NULL;
12
+ }
13
+ }
14
+
15
AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
16
int threadsActive = abrEnc->m_numActiveEncodes.get();
17
while (threadsActive)
18
x265_3.5.tar.gz/source/x265.h -> x265_3.6.tar.gz/source/x265.h
Changed
201
1
2
#define X265_H
3
#include <stdint.h>
4
#include <stdio.h>
5
+#include <sys/stat.h>
6
#include "x265_config.h"
7
#ifdef __cplusplus
8
extern "C" {
9
10
NAL_UNIT_CODED_SLICE_TRAIL_N = 0,
11
NAL_UNIT_CODED_SLICE_TRAIL_R,
12
NAL_UNIT_CODED_SLICE_TSA_N,
13
- NAL_UNIT_CODED_SLICE_TLA_R,
14
+ NAL_UNIT_CODED_SLICE_TSA_R,
15
NAL_UNIT_CODED_SLICE_STSA_N,
16
NAL_UNIT_CODED_SLICE_STSA_R,
17
NAL_UNIT_CODED_SLICE_RADL_N,
18
19
double vmafFrameScore;
20
double bufferFillFinal;
21
double unclippedBufferFillFinal;
22
+ uint8_t tLayer;
23
} x265_frame_stats;
24
25
typedef struct x265_ctu_info_t
26
27
/* ARM */
28
#define X265_CPU_ARMV6 0x0000001
29
#define X265_CPU_NEON 0x0000002 /* ARM NEON */
30
+#define X265_CPU_SVE2 0x0000008 /* ARM SVE2 */
31
+#define X265_CPU_SVE             0x0000010 /* ARM SVE */
32
#define X265_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
33
34
/* IBM Power8 */
35
36
#define SLICE_TYPE_DELTA 0.3 /* The offset decremented or incremented for P-frames or b-frames respectively*/
37
#define BACKWARD_WINDOW 1 /* Scenecut window before a scenecut */
38
#define FORWARD_WINDOW 2 /* Scenecut window after a scenecut */
39
+#define BWD_WINDOW_DELTA 0.4
40
+
41
+#define X265_MAX_GOP_CONFIG 3
42
+#define X265_MAX_GOP_LENGTH 16
43
+#define MAX_T_LAYERS 7
44
+
45
+#define X265_IPRATIO_STRENGTH 1.43
46
47
typedef struct x265_cli_csp
48
{
49
50
typedef struct x265_zone
51
{
52
int startFrame, endFrame; /* range of frame numbers */
53
+ int keyframeMax; /* it store the default/user defined keyframeMax value*/
54
int bForceQp; /* whether to use qp vs bitrate factor */
55
int qp;
56
float bitrateFactor;
57
58
59
static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } };
60
61
+typedef struct x265_temporal_layer {
62
+ int poc_offset; /* POC offset */
63
+ int8_t layer; /* Current layer */
64
+ int8_t qp_offset; /* QP offset */
65
+} x265_temporal_layer;
66
+
67
+static const int8_t x265_temporal_layer_bframes[MAX_T_LAYERS] = {-1, -1, 3, 7, 15, -1, -1};
68
+
69
+static const int8_t x265_gop_ra_length[X265_MAX_GOP_CONFIG] = { 4, 8, 16};
70
+static const x265_temporal_layer x265_gop_ra[X265_MAX_GOP_CONFIG][X265_MAX_GOP_LENGTH] = {
71
+ {
72
+ {
73
+ 4,
74
+ 0,
75
+ 1,
76
+ },
77
+ {
78
+ 2,
79
+ 1,
80
+ 5,
81
+ },
82
+ {
83
+ 1,
84
+ 2,
85
+ 3,
86
+ },
87
+ {
88
+ 3,
89
+ 2,
90
+ 5,
91
+ },
92
+ {
93
+ -1,
94
+ -1,
95
+ -1,
96
+ },
97
+ {
98
+ -1,
99
+ -1,
100
+ -1,
101
+ },
102
+ {
103
+ -1,
104
+ -1,
105
+ -1,
106
+ },
107
+ {
108
+ -1,
109
+ -1,
110
+ -1,
111
+ },
112
+ {
113
+ -1,
114
+ -1,
115
+ -1,
116
+ },
117
+ {
118
+ -1,
119
+ -1,
120
+ -1,
121
+ },
122
+ {
123
+ -1,
124
+ -1,
125
+ -1,
126
+ },
127
+ {
128
+ -1,
129
+ -1,
130
+ -1,
131
+ },
132
+ {
133
+ -1,
134
+ -1,
135
+ -1,
136
+ },
137
+ {
138
+ -1,
139
+ -1,
140
+ -1,
141
+ },
142
+ {
143
+ -1,
144
+ -1,
145
+ -1,
146
+ },
147
+ {
148
+ -1,
149
+ -1,
150
+ -1,
151
+ }
152
+ },
153
+
154
+ {
155
+ {
156
+ 8,
157
+ 0,
158
+ 1,
159
+ },
160
+ {
161
+ 4,
162
+ 1,
163
+ 5,
164
+ },
165
+ {
166
+ 2,
167
+ 2,
168
+ 4,
169
+ },
170
+ {
171
+ 1,
172
+ 3,
173
+ 5,
174
+ },
175
+ {
176
+ 3,
177
+ 3,
178
+ 2,
179
+ },
180
+ {
181
+ 6,
182
+ 2,
183
+ 5,
184
+ },
185
+ {
186
+ 5,
187
+ 3,
188
+ 4,
189
+ },
190
+ {
191
+ 7,
192
+ 3,
193
+ 5,
194
+ },
195
+ {
196
+ -1,
197
+ -1,
198
+ -1,
199
+ },
200
+ {
201
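The x265_temporal_layer struct and the x265_gop_ra / x265_gop_ra_length tables added above describe, for each random-access mini-GOP size (4, 8 or 16 frames), the POC offset, temporal layer and QP offset of every frame in coding order; this is the data behind the new Hierarchical-B / --temporal-layers support. The sketch below is illustrative only, not encoder code, and simply assumes x265.h is on the include path:

#include <cstdio>
#include "x265.h"

// Dump the hierarchical-B structure of one mini-GOP configuration
// (config 0 -> 4 frames, 1 -> 8 frames, 2 -> 16 frames).
static void printMiniGopLayers(int config)
{
    for (int i = 0; i < x265_gop_ra_length[config]; i++)
    {
        const x265_temporal_layer& entry = x265_gop_ra[config][i];
        printf("coding position %d: POC offset %d, layer %d, QP offset %d\n",
               i, entry.poc_offset, (int)entry.layer, (int)entry.qp_offset);
    }
}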
x265_3.5.tar.gz/source/x265cli.cpp -> x265_3.6.tar.gz/source/x265cli.cpp
Changed
201
1
2
#include "x265cli.h"
3
#include "svt.h"
4
5
-#define START_CODE 0x00000001
6
-#define START_CODE_BYTES 4
7
+#define START_CODE 0x00000001
8
+#define START_CODE_BYTES 4
9
10
#ifdef __cplusplus
11
namespace X265_NS {
12
13
H0(" --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
14
H0("\nSlice decision options:\n");
15
H0(" --no-open-gop Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
16
+    H0("   --cra-nal                     Force nal type to CRA to all frames except the first frame, works only with keyint 1. Default %s\n", OPT(param->craNal));
17
H0("-I/--keyint <integer> Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
18
H0("-i/--min-keyint <integer> Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
19
H0(" --gop-lookahead <integer> Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
20
21
H1(" --scenecut-bias <0..100.0> Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
22
H0(" --hist-scenecut Enables histogram based scene-cut detection using histogram based algorithm.\n");
23
H0(" --no-hist-scenecut Disables histogram based scene-cut detection using histogram based algorithm.\n");
24
- H1(" --hist-threshold <0.0..1.0> Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
25
H0(" --no-fades Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
26
H1(" --scenecut-aware-qp <0..3> Enable increasing QP for frames inside the scenecut window around scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
27
H1(" 0 - Disabled\n");
28
29
H1(" 2 - Backward masking\n");
30
H1(" 3 - Bidirectional masking\n");
31
H1(" --masking-strength <string> Comma separated values which specify the duration and offset for the QP increment for inter-frames when scenecut-aware-qp is enabled.\n");
32
+ H1(" --scenecut-qp-config <file> File containing scenecut-aware-qp mode, window duration and offsets settings required for the masking. Works only with --pass 2\n");
33
H0(" --radl <integer> Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
34
H0(" --intra-refresh Use Periodic Intra Refresh instead of IDR frames\n");
35
H0(" --rc-lookahead <integer> Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
36
37
H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
38
H0(" --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
39
H0(" --no-aq-motion Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
40
+ H1(" --no-sbrc Enables the segment based rate control. Default %s\n", OPT(param->bEnableSBRC));
41
H0(" --qg-size <int> Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
42
H0(" --no-cutree Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
43
H0(" --no-rc-grain Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
44
45
H1(" q=<integer> (force QP)\n");
46
H1(" or b=<float> (bitrate multiplier)\n");
47
H0(" --zonefile <filename> Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
48
+ H0(" --no-zonefile-rc-init This allow to use rate-control history across zones in zonefile.\n");
49
H1(" --lambda-file <string> Specify a file containing replacement values for the lambda tables\n");
50
H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
51
H1(" Blank lines and lines starting with hash(#) are ignored\n");
52
53
H0(" --master-display <string> SMPTE ST 2086 master display color volume info SEI (HDR)\n");
54
H0(" format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
55
H0(" --max-cll <string> Specify content light level info SEI as \"cll,fall\" (HDR).\n");
56
+ H0(" --video-signal-type-preset <string> Specify combinations of color primaries, transfer characteristics, color matrix, range of luma and chroma signals, and chroma sample location\n");
57
+ H0(" format: <system-id>:<color-volume>\n");
58
+ H0(" This has higher precedence than individual VUI parameters. If any individual VUI option is specified together with this,\n");
59
+ H0(" which changes the values set corresponding to the system-id or color-volume, it will be discarded.\n");
60
+ H0(" The color-volume can be used only with the system-id options BT2100_PQ_YCC, BT2100_PQ_ICTCP, and BT2100_PQ_RGB.\n");
61
+ H0(" system-id options and their corresponding values:\n");
62
+ H0(" BT601_525: --colorprim smpte170m --transfer smpte170m --colormatrix smpte170m --range limited --chromaloc 0\n");
63
+ H0(" BT601_626: --colorprim bt470bg --transfer smpte170m --colormatrix bt470bg --range limited --chromaloc 0\n");
64
+ H0(" BT709_YCC: --colorprim bt709 --transfer bt709 --colormatrix bt709 --range limited --chromaloc 0\n");
65
+ H0(" BT709_RGB: --colorprim bt709 --transfer bt709 --colormatrix gbr --range limited\n");
66
+ H0(" BT2020_YCC_NCL: --colorprim bt2020 --transfer bt2020-10 --colormatrix bt709 --range limited --chromaloc 2\n");
67
+ H0(" BT2020_RGB: --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited\n");
68
+ H0(" BT2100_PQ_YCC: --colorprim bt2020 --transfer smpte2084 --colormatrix bt2020nc --range limited --chromaloc 2\n");
69
+ H0(" BT2100_PQ_ICTCP: --colorprim bt2020 --transfer smpte2084 --colormatrix ictcp --range limited --chromaloc 2\n");
70
+ H0(" BT2100_PQ_RGB: --colorprim bt2020 --transfer smpte2084 --colormatrix gbr --range limited\n");
71
+ H0(" BT2100_HLG_YCC: --colorprim bt2020 --transfer arib-std-b67 --colormatrix bt2020nc --range limited --chromaloc 2\n");
72
+ H0(" BT2100_HLG_RGB: --colorprim bt2020 --transfer arib-std-b67 --colormatrix gbr --range limited\n");
73
+ H0(" FR709_RGB: --colorprim bt709 --transfer bt709 --colormatrix gbr --range full\n");
74
+ H0(" FR2020_RGB: --colorprim bt2020 --transfer bt2020-10 --colormatrix gbr --range full\n");
75
+ H0(" FRP3D65_YCC: --colorprim smpte432 --transfer bt709 --colormatrix smpte170m --range full --chromaloc 1\n");
76
+ H0(" color-volume options and their corresponding values:\n");
77
+ H0(" P3D65x1000n0005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,5)\n");
78
+ H0(" P3D65x4000n005: --master-display G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(40000000,50)\n");
79
+ H0(" BT2100x108n0005: --master-display G(8500,39850)B(6550,2300)R(34000,146000)WP(15635,16450)L(10000000,1)\n");
80
H0(" --no-cll Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
81
H0(" --no-hdr10 Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
82
H0(" --no-hdr-opt Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
83
84
H0(" --no-repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
85
H0(" --no-info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
86
H0(" --no-hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
87
- H0(" --no-idr-recovery-sei Emit recovery point infor SEI at each IDR frame \n");
88
- H0(" --no-temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
89
+ H0(" --no-idr-recovery-sei Emit recovery point infor SEI at each IDR frame \n");
90
+ H0(" --temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
91
H0(" --no-aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
92
+ H0(" --no-eob Emit end of bitstream nal unit at the end of the bitstream. Default %s\n", OPT(param->bEnableEndOfBitstream));
93
+ H0(" --no-eos Emit end of sequence nal unit at the end of every coded video sequence. Default %s\n", OPT(param->bEnableEndOfSequence));
94
H1(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
95
H0(" --atc-sei <integer> Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
96
H0(" --pic-struct <integer> Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
97
98
H0(" --lowpass-dct Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
99
H0(" --no-frame-dup Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
100
H0(" --dup-threshold <integer> PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
101
+ H0(" --no-mcstf Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
102
#ifdef SVT_HEVC
103
H0(" --nosvt Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
104
H0(" --no-svt-hme Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
105
106
H1(" 2 - unable to open encoder\n");
107
H1(" 3 - unable to generate stream headers\n");
108
H1(" 4 - encoder abort\n");
109
+ H0("\nSEI Message Options\n");
110
+ H0(" --film-grain <filename> File containing Film Grain Characteristics to be written as a SEI Message\n");
111
+
112
#undef OPT
113
#undef H0
114
#undef H1
115
116
117
    memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
118
119
+ if (zonefileCount == 0)
120
+        globalParam->rc.zones[zonefileCount].keyframeMax = globalParam->keyframeMax;
121
+
122
for (optind = 0;;)
123
{
124
int long_options_index = -1;
125
126
return true;
127
}
128
}
129
+ OPT("scenecut-qp-config")
130
+ {
131
+ this->scenecutAwareQpConfig = x265_fopen(optarg, "rb");
132
+ if (!this->scenecutAwareQpConfig)
133
+ x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
134
+ }
135
OPT("zonefile")
136
{
137
this->zoneFile = x265_fopen(optarg, "rb");
138
if (!this->zoneFile)
139
x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
140
}
141
+ OPT("no-zonefile-rc-init") this->param->bNoResetZoneConfig = true;
142
OPT("fullhelp")
143
{
144
param->logLevel = X265_LOG_FULL;
145
146
if (reconFileBitDepth == 0)
147
reconFileBitDepth = param->internalBitDepth;
148
this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
149
- param->fpsNum, param->fpsDenom, param->internalCsp);
150
+ param->fpsNum, param->fpsDenom, param->internalCsp, param->sourceBitDepth);
151
if (this->recon->isFail())
152
{
153
x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
154
155
param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
156
for (int i = 0; i < param->rc.zonefileCount; i++)
157
{
158
+        param->rc.zones[i].startFrame = -1;
159
while (fgets(line, sizeof(line), zoneFile))
160
{
161
if (*line == '#' || (strcmp(line, "\r\n") == 0))
162
163
return 1;
164
}
165
166
- /* Parse the RPU file and extract the RPU corresponding to the current picture
167
- * and fill the rpu field of the input picture */
168
- int CLIOptions::rpuParser(x265_picture * pic)
169
- {
170
- uint8_t byteVal;
171
- uint32_t code = 0;
172
- int bytesRead = 0;
173
- pic->rpu.payloadSize = 0;
174
-
175
- if (!pic->pts)
176
- {
177
- while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
178
- code = (code << 8) | byteVal;
179
-
180
- if (code != START_CODE)
181
- {
182
- x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
183
- return 1;
184
- }
185
- }
186
-
187
- bytesRead = 0;
188
- while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
189
- {
190
- code = (code << 8) | byteVal;
191
- if (bytesRead++ < 3)
192
- continue;
193
- if (bytesRead >= 1024)
194
- {
195
- x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
196
- return 1;
197
- }
198
-
199
- if (code != START_CODE)
200
-            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
201
x265_3.5.tar.gz/source/x265cli.h -> x265_3.6.tar.gz/source/x265cli.h
Changed
104
1
2
{ "no-fast-intra", no_argument, NULL, 0 },
3
{ "no-open-gop", no_argument, NULL, 0 },
4
{ "open-gop", no_argument, NULL, 0 },
5
+ { "cra-nal", no_argument, NULL, 0 },
6
{ "keyint", required_argument, NULL, 'I' },
7
{ "min-keyint", required_argument, NULL, 'i' },
8
{ "gop-lookahead", required_argument, NULL, 0 },
9
10
{ "scenecut-bias", required_argument, NULL, 0 },
11
{ "hist-scenecut", no_argument, NULL, 0},
12
{ "no-hist-scenecut", no_argument, NULL, 0},
13
- { "hist-threshold", required_argument, NULL, 0},
14
{ "fades", no_argument, NULL, 0 },
15
{ "no-fades", no_argument, NULL, 0 },
16
{ "scenecut-aware-qp", required_argument, NULL, 0 },
17
18
{ "qp", required_argument, NULL, 'q' },
19
{ "aq-mode", required_argument, NULL, 0 },
20
{ "aq-strength", required_argument, NULL, 0 },
21
+ { "sbrc", no_argument, NULL, 0 },
22
+ { "no-sbrc", no_argument, NULL, 0 },
23
{ "rc-grain", no_argument, NULL, 0 },
24
{ "no-rc-grain", no_argument, NULL, 0 },
25
{ "ipratio", required_argument, NULL, 0 },
26
27
{ "crop-rect", required_argument, NULL, 0 }, /* DEPRECATED */
28
{ "master-display", required_argument, NULL, 0 },
29
{ "max-cll", required_argument, NULL, 0 },
30
+ {"video-signal-type-preset", required_argument, NULL, 0 },
31
{ "min-luma", required_argument, NULL, 0 },
32
{ "max-luma", required_argument, NULL, 0 },
33
{ "log2-max-poc-lsb", required_argument, NULL, 8 },
34
35
{ "repeat-headers", no_argument, NULL, 0 },
36
{ "aud", no_argument, NULL, 0 },
37
{ "no-aud", no_argument, NULL, 0 },
38
+ { "eob", no_argument, NULL, 0 },
39
+ { "no-eob", no_argument, NULL, 0 },
40
+ { "eos", no_argument, NULL, 0 },
41
+ { "no-eos", no_argument, NULL, 0 },
42
{ "info", no_argument, NULL, 0 },
43
{ "no-info", no_argument, NULL, 0 },
44
{ "zones", required_argument, NULL, 0 },
45
{ "qpfile", required_argument, NULL, 0 },
46
{ "zonefile", required_argument, NULL, 0 },
47
+ { "no-zonefile-rc-init", no_argument, NULL, 0 },
48
{ "lambda-file", required_argument, NULL, 0 },
49
{ "b-intra", no_argument, NULL, 0 },
50
{ "no-b-intra", no_argument, NULL, 0 },
51
52
{ "dynamic-refine", no_argument, NULL, 0 },
53
{ "no-dynamic-refine", no_argument, NULL, 0 },
54
{ "strict-cbr", no_argument, NULL, 0 },
55
- { "temporal-layers", no_argument, NULL, 0 },
56
- { "no-temporal-layers", no_argument, NULL, 0 },
57
+ { "temporal-layers", required_argument, NULL, 0 },
58
{ "qg-size", required_argument, NULL, 0 },
59
{ "recon-y4m-exec", required_argument, NULL, 0 },
60
{ "analyze-src-pics", no_argument, NULL, 0 },
61
62
{ "frame-dup", no_argument, NULL, 0 },
63
{ "no-frame-dup", no_argument, NULL, 0 },
64
{ "dup-threshold", required_argument, NULL, 0 },
65
+ { "mcstf", no_argument, NULL, 0 },
66
+ { "no-mcstf", no_argument, NULL, 0 },
67
#ifdef SVT_HEVC
68
{ "svt", no_argument, NULL, 0 },
69
{ "no-svt", no_argument, NULL, 0 },
70
71
{ "abr-ladder", required_argument, NULL, 0 },
72
{ "min-vbv-fullness", required_argument, NULL, 0 },
73
{ "max-vbv-fullness", required_argument, NULL, 0 },
74
+ { "scenecut-qp-config", required_argument, NULL, 0 },
75
+ { "film-grain", required_argument, NULL, 0 },
76
{ 0, 0, 0, 0 },
77
{ 0, 0, 0, 0 },
78
{ 0, 0, 0, 0 },
79
80
FILE* qpfile;
81
FILE* zoneFile;
82
FILE* dolbyVisionRpu; /* File containing Dolby Vision BL RPU metadata */
83
+ FILE* scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
84
const char* reconPlayCmd;
85
const x265_api* api;
86
x265_param* param;
87
88
qpfile = NULL;
89
zoneFile = NULL;
90
dolbyVisionRpu = NULL;
91
+ scenecutAwareQpConfig = NULL;
92
reconPlayCmd = NULL;
93
api = NULL;
94
param = NULL;
95
96
bool parseQPFile(x265_picture &pic_org);
97
bool parseZoneFile();
98
int rpuParser(x265_picture * pic);
99
+ bool parseScenecutAwareQpConfig();
100
+ bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
101
};
102
#ifdef __cplusplus
103
}
104
x265_3.5.tar.gz/x265Version.txt -> x265_3.6.tar.gz/x265Version.txt
Changed
8
1
2
#Attribute: Values
3
-repositorychangeset: f0c1022b6
4
+repositorychangeset: aa7f602f7
5
releasetagdistance: 1
6
-releasetag: 3.5
7
+releasetag: 3.6
8