We truncated the diff of some files because they were too big.
If you want to see the full diff for every file, click here.
Changes of Revision 30
x265.changes
Changed
x
1
2
-------------------------------------------------------------------
3
+Tue Oct 9 20:03:53 UTC 2018 - aloisio@gmx.com
4
+
5
+- Update to version 2.9
6
+ New features:
7
+ * Support for chunked encoding
8
+ + :option:`--chunk-start and --chunk-end`
9
+ + Frames preceding first frame of chunk in display order
10
+ will be encoded, however, they will be discarded in the
11
+ bitstream.
12
+ + Frames following last frame of the chunk in display order
13
+ will be used in taking lookahead decisions, but, they will
14
+ not be encoded.
15
+ + This feature can be enabled only in closed GOP structures.
16
+ Default disabled.
17
+ * Support for HDR10+ version 1 SEI messages.
18
+ Encoder enhancements:
19
+ * Create API function for allocating and freeing
20
+ x265_analysis_data.
21
+ * CEA 608/708 support: Read SEI messages from text file and
22
+ encode it using userSEI message.
23
+ Bug fixes:
24
+ * Disable noise reduction when vbv is enabled.
25
+ * Support minLuma and maxLuma values changed by the
26
+ commandline.
27
+ version 2.8
28
+ New features:
29
+ * :option:`--asm avx512` used to enable AVX-512 in x265.
30
+ Default disabled.
31
+ + For 4K main10 high-quality encoding, we are seeing good
32
+ gains; for other resolutions and presets, we don't
33
+ recommend using this setting for now.
34
+ * :option:`--dynamic-refine` dynamically switches between
35
+ different inter refine levels. Default disabled.
36
+ + It is recommended to use :option:`--refine-intra 4` with
37
+ dynamic refinement for a better trade-off between encode
38
+ efficiency and performance than using static refinement.
39
+ * :option:`--single-sei`
40
+ + Encode SEI messages in a single NAL unit instead of
41
+ multiple NAL units. Default disabled.
42
+ * :option:`--max-ausize-factor` controls the maximum AU size
43
+ defined in HEVC specification.
44
+ + It represents the percentage of maximum AU size used.
45
+ Default is 1.
46
+ * VMAF (Video Multi-Method Assessment Fusion)
47
+ + Added VMAF support for objective quality measurement of a
48
+ video sequence.
49
+ + Enable cmake option ENABLE_LIBVMAF to report per frame and
50
+ aggregate VMAF score. The frame level VMAF score does not
51
+ include temporal scores.
52
+ + This is supported only on linux for now.
53
+ Encoder enhancements:
54
+ * Introduced refine-intra level 4 to improve quality.
55
+ * Support for HLG-graded content and pic_struct in SEI message.
56
+ Bug Fixes:
57
+ * Fix 32 bit build error (using CMAKE GUI) in Linux.
58
+ * Fix 32 bit build error for asm primitives.
59
+ * Fix build error on mac OS.
60
+ * Fix VBV Lookahead in analysis load to achieve target bitrate.
61
+
62
+- Added x265-fix_enable512.patch
63
+
64
+-------------------------------------------------------------------
65
Fri May 4 22:21:57 UTC 2018 - zaitor@opensuse.org
66
67
- Build with nasm >= 2.13 for openSUSE Leap 42.3 and SLE-12, since
68
x265.spec
Changed
83
1
2
# based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
4
Name: x265
5
-%define soname 151
6
+%define soname 165
7
%define libname lib%{name}
8
%define libsoname %{libname}-%{soname}
9
-Version: 2.7
10
+Version: 2.9
11
Release: 0
12
License: GPL-2.0+
13
Summary: A free h265/HEVC encoder - encoder binary
14
15
Source0: https://bitbucket.org/multicoreware/x265/downloads/%{name}_%{version}.tar.gz
16
Patch0: arm.patch
17
Patch1: x265.pkgconfig.patch
18
+Patch2: x265-fix_enable512.patch
19
BuildRequires: gcc
20
BuildRequires: gcc-c++
21
BuildRequires: cmake >= 2.8.8
22
BuildRequires: pkg-config
23
BuildRequires: nasm >= 2.13
24
-%if 0%{?suse_version} > 1310
25
%ifarch x86_64
26
BuildRequires: libnuma-devel >= 2.0.9
27
%endif
28
-%endif
29
-BuildRoot: %{_tmppath}/%{name}-%{version}-build
30
31
%description
32
x265 is a free library for encoding next-generation H265/HEVC video
33
34
35
%description -n %{libname}-devel
36
x265 is a free library for encoding next-generation H265/HEVC video
37
-streams.
38
+streams.
39
40
%prep
41
%setup -q -n %{name}_%{version}
42
%patch0 -p1
43
%patch1 -p1
44
+%patch2 -p1
45
46
sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
47
48
49
%build
50
-%if 0%{?suse_version} < 1330
51
+%if 0%{?suse_version} < 1500
52
cd source
53
%else
54
%define __builddir ./source/build
55
56
make %{?_smp_mflags}
57
58
%install
59
-%if 0%{?suse_version} < 1330
60
+%if 0%{?suse_version} < 1500
61
cd source
62
%endif
63
%cmake_install
64
65
%postun -n %{libsoname} -p /sbin/ldconfig
66
67
%files -n %{libsoname}
68
-%defattr(0644,root,root)
69
%{_libdir}/%{libname}.so.%{soname}*
70
71
-%files
72
-%defattr(0755,root,root)
73
+%files
74
%{_bindir}/%{name}
75
76
%files -n %{libname}-devel
77
-%defattr(0644,root,root)
78
+%license COPYING
79
+%doc readme.rst
80
%{_includedir}/%{name}.h
81
%{_includedir}/%{name}_config.h
82
%{_libdir}/pkgconfig/%{name}.pc
83
x265-fix_enable512.patch
Added
27
1
2
+--- a/source/common/cpu.cpp
3
++++ b/source/common/cpu.cpp
4
+@@ -110,6 +110,11 @@ const cpu_name_t cpu_names[] =
5
+ { "", 0 },
6
+ };
7
+
8
++bool detect512()
9
++{
10
++ return(enable512);
11
++}
12
++
13
+ #if X265_ARCH_X86
14
+
15
+ extern "C" {
16
+@@ -123,10 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr);
17
+ #pragma warning(disable: 4309) // truncation of constant value
18
+ #endif
19
+
20
+-bool detect512()
21
+-{
22
+- return(enable512);
23
+-}
24
+ uint32_t cpu_detect(bool benableavx512 )
25
+ {
26
+
27
x265_2.7.tar.gz/.hg_archival.txt -> x265_2.9.tar.gz/.hg_archival.txt
Changed
8
1
2
repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: e41a9bf2bac4a7af2bec2bbadf91e63752d320ef
4
+node: f9681d731f2e56c2ca185cec10daece5939bee07
5
branch: stable
6
-tag: 2.7
7
+tag: 2.9
8
x265_2.7.tar.gz/.hgtags -> x265_2.9.tar.gz/.hgtags
Changed
7
1
2
e7a4dd48293b7956d4a20df257d23904cc78e376 2.4
3
64b2d0bf45a52511e57a6b7299160b961ca3d51c 2.5
4
0e9ea76945c89962cd46cee6537586e2054b2935 2.6
5
+e41a9bf2bac4a7af2bec2bbadf91e63752d320ef 2.7
6
+a158a3a029663133455268e2a63ae6b0af2df720 2.8
7
x265_2.7.tar.gz/doc/reST/api.rst -> x265_2.9.tar.gz/doc/reST/api.rst
Changed
51
1
2
* returns negative on error, 0 access unit were output.*/
3
int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
4
5
+**x265_alloc_analysis_data()** may be used to allocate memory for the x265_analysis_data::
6
+
7
+ /* x265_alloc_analysis_data:
8
+ * Allocate memory for the x265_analysis_data object's internal structures. */
9
+ void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis);
10
+
11
+**x265_free_analysis_data()** may be used to free memory for the x265_analysis_data::
12
+
13
+ /* x265_free_analysis_data:
14
+ * Free the allocated memory for x265_analysis_data object's internal structures. */
15
+ void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis);
16
+
17
Pictures
18
========
19
20
21
* release library static allocations, reset configured CTU size */
22
void x265_cleanup(void);
23
24
+VMAF (Video Multi-Method Assessment Fusion)
25
+==========================================
26
+
27
+If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame
28
+and aggregate VMAF score for the given input and dump the scores in csv file.
29
+The user also needs to specify the :option:`--recon` in command line to get the VMAF scores.
30
+
31
+ /* x265_calculate_vmafScore:
32
+ * returns VMAF score for the input video.
33
+ * This api must be called only after encoding was done. */
34
+ double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
35
+
36
+ /* x265_calculate_vmaf_framelevelscore:
37
+ * returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
38
+ double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
39
+
40
+.. Note::
41
42
+ When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
43
+ also set ENABLE_SHARED to OFF to prevent build problems.
44
+ We only need the static library from these builds.
45
+
46
+ Binaries built with Windows will not have VMAF support.
47
+
48
Multi-library Interface
49
=======================
50
51
x265_2.7.tar.gz/doc/reST/cli.rst -> x265_2.9.tar.gz/doc/reST/cli.rst
Changed
201
1
2
2. unable to open encoder
3
3. unable to generate stream headers
4
4. encoder abort
5
-
6
+
7
Logging/Statistic Options
8
=========================
9
10
11
**BufferFill** Bits available for the next frame. Includes bits carried
12
over from the current frame.
13
14
+ **BufferFillFinal** Buffer bits available after removing the frame out of CPB.
15
+
16
**Latency** Latency in terms of number of frames between when the frame
17
was given in and when the frame is given out.
18
19
20
21
.. option:: --csv-log-level <integer>
22
23
- Controls the level of detail (and size) of --csv log files
24
-
25
- 0. summary **(default)**
26
- 1. frame level logging
27
- 2. frame level logging with performance statistics
28
+ Controls the level of detail (and size) of --csv log files
29
+
30
+ 0. summary **(default)**
31
+ 1. frame level logging
32
+ 2. frame level logging with performance statistics
33
34
.. option:: --ssim, --no-ssim
35
36
37
"*" - same as default
38
"none" - no thread pools are created, only frame parallelism possible
39
"-" - same as "none"
40
- "10" - allocate one pool, using up to 10 cores on node 0
41
+ "10" - allocate one pool, using up to 10 cores on all available nodes
42
"-,+" - allocate one pool, using all cores on node 1
43
"+,-,+" - allocate one pool, using only cores on nodes 0 and 2
44
"+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
45
46
47
**CLI ONLY**
48
49
+.. option:: --chunk-start <integer>
50
+
51
+ First frame of the chunk. Frames preceding this in display order will
52
+ be encoded, however, they will be discarded in the bitstream. This
53
+ feature can be enabled only in closed GOP structures.
54
+ Default 0 (disabled).
55
+
56
+.. option:: --chunk-end <integer>
57
+
58
+ Last frame of the chunk. Frames following this in display order will be
59
+ used in taking lookahead decisions, but, they will not be encoded.
60
+ This feature can be enabled only in closed GOP structures.
61
+ Default 0 (disabled).
62
+
63
Profile, Level, Tier
64
====================
65
66
67
encoding options, the encoder will attempt to modify/set the right
68
encode specifications. If the encoder is unable to do so, this option
69
will be turned OFF. Highly experimental.
70
-
71
+
72
Default: disabled
73
-
74
+
75
.. note::
76
77
:option:`--profile`, :option:`--level-idc`, and
78
79
Default 3.
80
81
.. option:: --limit-modes, --no-limit-modes
82
-
83
+
84
When enabled, limit-modes will limit modes analyzed for each CU using cost
85
metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
86
and/or :option:`--amp` are enabled, this feature will use motion cost
87
88
89
Default: enabled, disabled for :option:`--tune grain`
90
91
+.. option:: --splitrd-skip, --no-splitrd-skip
92
+
93
+ Enable skipping split RD analysis when sum of split CU rdCost larger than one
94
+ split CU rdCost for Intra CU. Default disabled.
95
+
96
.. option:: --fast-intra, --no-fast-intra
97
98
Perform an initial scan of every fifth intra angular mode, then
99
100
101
Note that --analysis-reuse-level must be paired with analysis-reuse-mode.
102
103
- +--------------+------------------------------------------+
104
- | Level | Description |
105
- +==============+==========================================+
106
- | 1 | Lookahead information |
107
- +--------------+------------------------------------------+
108
- | 2 to 4 | Level 1 + intra/inter modes, ref's |
109
- +--------------+------------------------------------------+
110
- | 5,6 and 9 | Level 2 + rect-amp |
111
- +--------------+------------------------------------------+
112
- | 7 | Level 5 + AVC size CU refinement |
113
- +--------------+------------------------------------------+
114
- | 8 | Level 5 + AVC size Full CU analysis-info |
115
- +--------------+------------------------------------------+
116
- | 10 | Level 5 + Full CU analysis-info |
117
- +--------------+------------------------------------------+
118
+ +--------------+------------------------------------------+
119
+ | Level | Description |
120
+ +==============+==========================================+
121
+ | 1 | Lookahead information |
122
+ +--------------+------------------------------------------+
123
+ | 2 to 4 | Level 1 + intra/inter modes, ref's |
124
+ +--------------+------------------------------------------+
125
+ | 5 and 6 | Level 2 + rect-amp |
126
+ +--------------+------------------------------------------+
127
+ | 7 | Level 5 + AVC size CU refinement |
128
+ +--------------+------------------------------------------+
129
+ | 8 and 9 | Level 5 + AVC size Full CU analysis-info |
130
+ +--------------+------------------------------------------+
131
+ | 10 | Level 5 + Full CU analysis-info |
132
+ +--------------+------------------------------------------+
133
134
.. option:: --refine-mv-type <string>
135
136
- Reuse MV information received through API call. Currently receives information for AVC size and the accepted
137
- string input is "avc". Default is disabled.
138
+ Reuse MV information received through API call. Currently receives information for AVC size and the accepted
139
+ string input is "avc". Default is disabled.
140
141
.. option:: --scale-factor
142
143
- Factor by which input video is scaled down for analysis save mode.
144
- This option should be coupled with analysis-reuse-mode option, --analysis-reuse-level 10.
145
- The ctu size of load should be double the size of save. Default 0.
146
+ Factor by which input video is scaled down for analysis save mode.
147
+ This option should be coupled with analysis-reuse-mode option,
148
+ --analysis-reuse-level 10. The ctu size of load can either be the
149
+ same as that of save or double the size of save. Default 0.
150
+
151
+.. option:: --refine-intra <0..4>
152
153
-.. option:: --refine-intra <0..3>
154
-
155
Enables refinement of intra blocks in current encode.
156
157
Level 0 - Forces both mode and depth from the save encode.
158
159
160
Level 3 - Perform analysis of intra modes for depth reused from first encode.
161
162
- Default 0.
163
+ Level 4 - Does not reuse any analysis information - redo analysis for the intra block.
164
165
+ Default 0.
166
+
167
.. option:: --refine-inter <0..3>
168
169
Enables refinement of inter blocks in current encode.
170
171
172
Default 0.
173
174
+.. option:: --dynamic-refine, --no-dynamic-refine
175
+
176
+ Dynamically switches :option:`--refine-inter` levels 0-3 based on the content and
177
+ the encoder settings. It is recommended to use :option:`--refine-intra` 4 with dynamic
178
+ refinement. Default disabled.
179
+
180
.. option:: --refine-mv
181
182
Enables refinement of motion vector for scaled video. Evaluates the best
183
motion vector by searching the surrounding eight integer and subpel pixel
184
- positions.
185
+ positions.
186
187
Options which affect the transform unit quad-tree, sometimes referred to
188
as the residual quad-tree (RQT).
189
190
quad-tree begins at the same depth of the coded tree unit, but if the
191
maximum TU size is smaller than the CU size then transform QT begins
192
at the depth of the max-tu-size. Default: 32.
193
-
194
+
195
.. option:: --dynamic-rd <0..4>
196
-
197
+
198
Increases the RD level at points where quality drops due to VBV rate
199
control enforcement. The number of CUs for which the RD is reconfigured
200
is determined based on the strength. Strength 1 gives the best FPS,
201
x265_2.7.tar.gz/doc/reST/presets.rst -> x265_2.9.tar.gz/doc/reST/presets.rst
Changed
13
1
2
that strictly minimises QP fluctuations across frames, while still allowing
3
the encoder to hit bitrate targets and VBV buffer limits (with a slightly
4
higher margin of error than normal). It is highly recommended that this
5
-algorithm is used only through the :option:`--tune` *grain* feature.
6
+algorithm is used only through the :option:`--tune` *grain* feature.
7
+Overriding the `--tune` *grain* settings might result in grain strobing, especially
8
+when enabling features like :option:`--aq-mode` and :option:`--cutree` that modify
9
+per-block QPs within a given frame.
10
11
Fast Decode
12
~~~~~~~~~~~
13
x265_2.7.tar.gz/doc/reST/releasenotes.rst -> x265_2.9.tar.gz/doc/reST/releasenotes.rst
Changed
71
1
2
Release Notes
3
*************
4
5
+Version 2.9
6
+===========
7
+
8
+Release date - 05/10/2018
9
+
10
+New features
11
+-------------
12
+1. Support for chunked encoding
13
+
14
+ :option:`--chunk-start and --chunk-end`
15
+ Frames preceding first frame of chunk in display order will be encoded, however, they will be discarded in the bitstream.
16
+ Frames following last frame of the chunk in display order will be used in taking lookahead decisions, but, they will not be encoded.
17
+ This feature can be enabled only in closed GOP structures. Default disabled.
18
+
19
+2. Support for HDR10+ version 1 SEI messages.
20
+
21
+Encoder enhancements
22
+--------------------
23
+1. Create API function for allocating and freeing x265_analysis_data.
24
+2. CEA 608/708 support: Read SEI messages from text file and encode it using userSEI message.
25
+
26
+Bug fixes
27
+---------
28
+1. Disable noise reduction when vbv is enabled.
29
+2. Support minLuma and maxLuma values changed by the commandline.
30
+
31
+Version 2.8
32
+===========
33
+
34
+Release date - 21/05/2018
35
+
36
+New features
37
+-------------
38
+1. :option:`--asm avx512` used to enable AVX-512 in x265. Default disabled.
39
+ For 4K main10 high-quality encoding, we are seeing good gains; for other resolutions and presets, we don't recommend using this setting for now.
40
+
41
+2. :option:`--dynamic-refine` dynamically switches between different inter refine levels. Default disabled.
42
+ It is recommended to use :option:`--refine-intra 4` with dynamic refinement for a better trade-off between encode efficiency and performance than using static refinement.
43
+
44
+3. :option:`--single-sei`
45
+ Encode SEI messages in a single NAL unit instead of multiple NAL units. Default disabled.
46
+
47
+4. :option:`--max-ausize-factor` controls the maximum AU size defined in HEVC specification.
48
+ It represents the percentage of maximum AU size used. Default is 1.
49
+
50
+5. VMAF (Video Multi-Method Assessment Fusion)
51
+ Added VMAF support for objective quality measurement of a video sequence.
52
+ Enable cmake option ENABLE_LIBVMAF to report per frame and aggregate VMAF score. The frame level VMAF score does not include temporal scores.
53
+ This is supported only on linux for now.
54
+
55
+Encoder enhancements
56
+--------------------
57
+1. Introduced refine-intra level 4 to improve quality.
58
+2. Support for HLG-graded content and pic_struct in SEI message.
59
+
60
+Bug Fixes
61
+---------
62
+1. Fix 32 bit build error (using CMAKE GUI) in Linux.
63
+2. Fix 32 bit build error for asm primitives.
64
+3. Fix build error on mac OS.
65
+4. Fix VBV Lookahead in analysis load to achieve target bitrate.
66
+
67
+
68
Version 2.7
69
===========
70
71
x265_2.7.tar.gz/source/CMakeLists.txt -> x265_2.9.tar.gz/source/CMakeLists.txt
Changed
57
1
2
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
# X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 151)
6
+set(X265_BUILD 165)
7
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
"${PROJECT_BINARY_DIR}/x265.def")
9
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
11
if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
12
set(X86 1)
13
add_definitions(-DX265_ARCH_X86=1)
14
- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
15
+ if(CMAKE_CXX_FLAGS STREQUAL "-m32")
16
+ message(STATUS "Detected x86 target processor")
17
+ elseif("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
set(X64 1)
19
add_definitions(-DX86_64=1)
20
message(STATUS "Detected x86_64 target processor")
21
- else()
22
- message(STATUS "Detected x86 target processor")
23
endif()
24
elseif(POWERMATCH GREATER "-1")
25
message(STATUS "Detected POWER target processor")
26
27
if(NO_ATOMICS)
28
add_definitions(-DNO_ATOMICS=1)
29
endif(NO_ATOMICS)
30
+ find_library(VMAF vmaf)
31
+ option(ENABLE_LIBVMAF "Enable VMAF" OFF)
32
+ if(ENABLE_LIBVMAF)
33
+ add_definitions(-DENABLE_LIBVMAF)
34
+ endif()
35
endif(UNIX)
36
37
if(X64 AND NOT WIN32)
38
39
if(EXTRA_LIB)
40
target_link_libraries(x265-static ${EXTRA_LIB})
41
endif()
42
+if(ENABLE_LIBVMAF)
43
+ target_link_libraries(x265-static ${VMAF})
44
+endif()
45
install(TARGETS x265-static
46
LIBRARY DESTINATION ${LIB_INSTALL_DIR}
47
ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
48
49
ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
50
endif()
51
install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include)
52
-if(WIN32)
53
+if((WIN32 AND ENABLE_CLI) OR (WIN32 AND ENABLE_SHARED))
54
if(MSVC_IDE)
55
install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug)
56
install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo)
57
x265_2.7.tar.gz/source/common/common.cpp -> x265_2.9.tar.gz/source/common/common.cpp
Changed
10
1
2
#endif
3
}
4
5
-#define X265_ALIGNBYTES 32
6
+#define X265_ALIGNBYTES 64
7
8
#if _WIN32
9
#if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)
10
x265_2.7.tar.gz/source/common/common.h -> x265_2.9.tar.gz/source/common/common.h
Changed
26
1
2
#define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8)))
3
#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
4
#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
5
+#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
6
#if defined(__MINGW32__)
7
#define fseeko fseeko64
8
#define ftello ftello64
9
10
#define ALIGN_VAR_8(T, var) __declspec(align(8)) T var
11
#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
12
#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
13
+#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
14
#define fseeko _fseeki64
15
#define ftello _ftelli64
16
#endif // if defined(__GNUC__)
17
18
#define START_CODE_OVERHEAD 3
19
#define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
20
21
+#define MAX_NUM_DYN_REFINE (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
22
+
23
namespace X265_NS {
24
25
enum { SAO_NUM_OFFSET = 4 };
26
x265_2.7.tar.gz/source/common/cpu.cpp -> x265_2.9.tar.gz/source/common/cpu.cpp
Changed
200
1
2
#endif // if X265_ARCH_ARM
3
4
namespace X265_NS {
5
+static bool enable512 = false;
6
const cpu_name_t cpu_names[] =
7
{
8
#if X265_ARCH_X86
9
-#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
10
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
11
{ "MMX2", MMX2 },
12
{ "MMXEXT", MMX2 },
13
{ "SSE", MMX2 | X265_CPU_SSE },
14
15
{ "BMI2", AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
16
#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
17
{ "AVX2", AVX2},
18
+ { "AVX512", AVX2 | X265_CPU_AVX512 },
19
#undef AVX2
20
#undef AVX
21
#undef SSE2
22
#undef MMX2
23
{ "Cache32", X265_CPU_CACHELINE_32 },
24
{ "Cache64", X265_CPU_CACHELINE_64 },
25
- { "SlowCTZ", X265_CPU_SLOW_CTZ },
26
{ "SlowAtom", X265_CPU_SLOW_ATOM },
27
{ "SlowPshufb", X265_CPU_SLOW_PSHUFB },
28
{ "SlowPalignr", X265_CPU_SLOW_PALIGNR },
29
30
/* cpu-a.asm */
31
int PFX(cpu_cpuid_test)(void);
32
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
33
-void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
34
+uint64_t PFX(cpu_xgetbv)(int xcr);
35
}
36
37
#if defined(_MSC_VER)
38
#pragma warning(disable: 4309) // truncation of constant value
39
#endif
40
41
-uint32_t cpu_detect(void)
42
+bool detect512()
43
+{
44
+ return(enable512);
45
+}
46
+uint32_t cpu_detect(bool benableavx512 )
47
{
48
- uint32_t cpu = 0;
49
50
+ uint32_t cpu = 0;
51
uint32_t eax, ebx, ecx, edx;
52
uint32_t vendor[4] = { 0 };
53
uint32_t max_extended_cap, max_basic_cap;
54
+ uint64_t xcr0 = 0;
55
56
#if !X86_64
57
if (!PFX(cpu_cpuid_test)())
58
return 0;
59
#endif
60
61
- PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
62
- max_basic_cap = eax;
63
+ PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
64
if (max_basic_cap == 0)
65
return 0;
66
67
68
return cpu;
69
if (edx & 0x02000000)
70
cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
71
- if (edx & 0x00008000)
72
- cpu |= X265_CPU_CMOV;
73
- else
74
- return cpu;
75
if (edx & 0x04000000)
76
cpu |= X265_CPU_SSE2;
77
if (ecx & 0x00000001)
78
cpu |= X265_CPU_SSE3;
79
if (ecx & 0x00000200)
80
- cpu |= X265_CPU_SSSE3;
81
+ cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
82
if (ecx & 0x00080000)
83
cpu |= X265_CPU_SSE4;
84
if (ecx & 0x00100000)
85
cpu |= X265_CPU_SSE42;
86
- /* Check OXSAVE and AVX bits */
87
- if ((ecx & 0x18000000) == 0x18000000)
88
+
89
+ if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
90
{
91
/* Check for OS support */
92
- PFX(cpu_xgetbv)(0, &eax, &edx);
93
- if ((eax & 0x6) == 0x6)
94
+ xcr0 = PFX(cpu_xgetbv)(0);
95
+ if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
96
{
97
+ if (ecx & 0x10000000)
98
cpu |= X265_CPU_AVX;
99
if (ecx & 0x00001000)
100
cpu |= X265_CPU_FMA3;
101
102
{
103
PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
104
/* AVX2 requires OS support, but BMI1/2 don't. */
105
- if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
106
- cpu |= X265_CPU_AVX2;
107
if (ebx & 0x00000008)
108
- {
109
cpu |= X265_CPU_BMI1;
110
- if (ebx & 0x00000100)
111
- cpu |= X265_CPU_BMI2;
112
+ if (ebx & 0x00000100)
113
+ cpu |= X265_CPU_BMI2;
114
+
115
+ if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
116
+ {
117
+ if (ebx & 0x00000020)
118
+ cpu |= X265_CPU_AVX2;
119
+ if (benableavx512)
120
+ {
121
+ if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
122
+ {
123
+ if ((ebx & 0xD0030000) == 0xD0030000)
124
+ {
125
+ cpu |= X265_CPU_AVX512;
126
+ enable512 = true;
127
+ }
128
+ }
129
+ }
130
}
131
}
132
133
- if (cpu & X265_CPU_SSSE3)
134
- cpu |= X265_CPU_SSE2_IS_FAST;
135
-
136
PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
137
max_extended_cap = eax;
138
139
140
{
141
if (edx & 0x00400000)
142
cpu |= X265_CPU_MMX2;
143
- if (!(cpu & X265_CPU_LZCNT))
144
- cpu |= X265_CPU_SLOW_CTZ;
145
if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
146
cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
147
}
148
149
int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
150
if (family == 6)
151
{
152
- /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
153
- * theoretically support sse2, but it's significantly slower than mmx for
154
- * almost all of x264's functions, so let's just pretend they don't. */
155
- if (model == 9 || model == 13 || model == 14)
156
- {
157
- cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
158
- X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
159
- }
160
/* Detect Atom CPU */
161
- else if (model == 28)
162
+ if (model == 28)
163
{
164
cpu |= X265_CPU_SLOW_ATOM;
165
- cpu |= X265_CPU_SLOW_CTZ;
166
cpu |= X265_CPU_SLOW_PSHUFB;
167
}
168
169
170
int PFX(cpu_fast_neon_mrc_test)(void);
171
}
172
173
-uint32_t cpu_detect(void)
174
+uint32_t cpu_detect(bool benableavx512)
175
{
176
int flags = 0;
177
178
179
180
#elif X265_ARCH_POWER8
181
182
-uint32_t cpu_detect(void)
183
+uint32_t cpu_detect(bool benableavx512)
184
{
185
#if HAVE_ALTIVEC
186
return X265_CPU_ALTIVEC;
187
188
189
#else // if X265_ARCH_POWER8
190
191
-uint32_t cpu_detect(void)
192
+uint32_t cpu_detect(bool benableavx512)
193
{
194
return 0;
195
}
196
197
#endif // if X265_ARCH_X86
198
}
199
+
200
x265_2.7.tar.gz/source/common/cpu.h -> x265_2.9.tar.gz/source/common/cpu.h
Changed
19
1
2
#define X265_CPU_H
3
4
#include "common.h"
5
-
6
/* All assembly functions are prefixed with X265_NS (macro expanded) */
7
#define PFX3(prefix, name) prefix ## _ ## name
8
#define PFX2(prefix, name) PFX3(prefix, name)
9
10
#endif
11
12
namespace X265_NS {
13
-uint32_t cpu_detect(void);
14
+uint32_t cpu_detect(bool);
15
+bool detect512();
16
17
struct cpu_name_t
18
{
19
x265_2.7.tar.gz/source/common/cudata.cpp -> x265_2.9.tar.gz/source/common/cudata.cpp
Changed
29
1
2
dir |= (1 << list);
3
candMvField[count][list].mv = colmv;
4
candMvField[count][list].refIdx = refIdx;
5
- if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && m_log2CUSize[0] < 4)
6
- {
7
- MV dist(MAX_MV, MAX_MV);
8
- candMvField[count][list].mv = dist;
9
- }
10
}
11
}
12
13
14
15
int curRefPOC = m_slice->m_refPOCList[picList][refIdx];
16
int curPOC = m_slice->m_poc;
17
-
18
- if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && (m_log2CUSize[0] < 4))
19
- {
20
- MV dist(MAX_MV, MAX_MV);
21
- pmv[numMvc++] = amvpCand[num++] = dist;
22
- }
23
- else
24
- pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC);
25
+ pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC);
26
}
27
}
28
29
x265_2.7.tar.gz/source/common/cudata.h -> x265_2.9.tar.gz/source/common/cudata.h
Changed
27
1
2
uint64_t m_fAc_den[3];
3
uint64_t m_fDc_den[3];
4
5
+ /* Feature values per CTU for dynamic refinement */
6
+ uint64_t* m_collectCURd;
7
+ uint32_t* m_collectCUVariance;
8
+ uint32_t* m_collectCUCount;
9
+
10
CUData();
11
12
void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance);
13
14
coeff_t* trCoeffMemBlock;
15
MV* mvMemBlock;
16
sse_t* distortionMemBlock;
17
+ uint64_t* dynRefineRdBlock;
18
+ uint32_t* dynRefCntBlock;
19
+ uint32_t* dynRefVarBlock;
20
21
- CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL; }
22
+ CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL;
23
+ dynRefineRdBlock = NULL; dynRefCntBlock = NULL; dynRefVarBlock = NULL;}
24
25
bool create(uint32_t depth, uint32_t csp, uint32_t numInstances, const x265_param& param)
26
{
27
x265_2.7.tar.gz/source/common/dct.cpp -> x265_2.9.tar.gz/source/common/dct.cpp
Changed
130
1
2
sum += sbacGetEntropyBits(mstate, firstC2Flag);
3
}
4
}
5
-
6
return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
7
}
8
+template<int log2TrSize>
9
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
10
+{
11
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
12
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
13
+ const uint32_t trSize = 1 << log2TrSize;
14
+
15
+ for (int y = 0; y < MLS_CG_SIZE; y++)
16
+ {
17
+ for (int x = 0; x < MLS_CG_SIZE; x++)
18
+ {
19
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
20
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
21
+ *totalUncodedCost += costUncoded[blkPos + x];
22
+ *totalRdCost += costUncoded[blkPos + x];
23
+ }
24
+ blkPos += trSize;
25
+ }
26
+}
27
+template<int log2TrSize>
28
+static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
29
+{
30
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
31
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
32
+ const uint32_t trSize = 1 << log2TrSize;
33
+ int max = X265_MAX(0, (2 * transformShift + 1));
34
+
35
+ for (int y = 0; y < MLS_CG_SIZE; y++)
36
+ {
37
+ for (int x = 0; x < MLS_CG_SIZE; x++)
38
+ {
39
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
40
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
41
+
42
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
43
+
44
+ /* when no residual coefficient is coded, predicted coef == recon coef */
45
+ costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
46
+
47
+ *totalUncodedCost += costUncoded[blkPos + x];
48
+ *totalRdCost += costUncoded[blkPos + x];
49
+ }
50
+ blkPos += trSize;
51
+ }
52
+}
53
+template<int log2TrSize>
54
+static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
55
+{
56
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
57
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
58
+ const uint32_t trSize = 1 << log2TrSize;
59
+
60
+ for (int y = 0; y < MLS_CG_SIZE; y++)
61
+ {
62
+ for (int x = 0; x < MLS_CG_SIZE; x++)
63
+ {
64
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
65
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
66
+ *totalUncodedCost += costUncoded[blkPos + x];
67
+ *totalRdCost += costUncoded[blkPos + x];
68
+ }
69
+ blkPos += trSize;
70
+ }
71
+}
72
+template<int log2TrSize>
73
+static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
74
+{
75
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
76
+
77
+ const uint32_t trSize = 1 << log2TrSize;
78
+ int max = X265_MAX(0, (2 * transformShift + 1));
79
+
80
+ for (int y = 0; y < MLS_CG_SIZE; y++)
81
+ {
82
+ for (int x = 0; x < MLS_CG_SIZE; x++)
83
+ {
84
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
85
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
86
+ costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
87
+ *totalUncodedCost += costUncoded[blkPos + x];
88
+ *totalRdCost += costUncoded[blkPos + x];
89
+ }
90
+ blkPos += trSize;
91
+ }
92
+}
93
94
namespace X265_NS {
95
// x265 private namespace
96
-
97
void setupDCTPrimitives_c(EncoderPrimitives& p)
98
{
99
p.dequant_scaling = dequant_scaling_c;
100
p.dequant_normal = dequant_normal_c;
101
p.quant = quant_c;
102
p.nquant = nquant_c;
103
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_c<2>;
104
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>;
105
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
106
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
107
+ p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
108
+ p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
109
+ p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
110
+ p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
111
p.dst4x4 = dst4_c;
112
p.cu[BLOCK_4x4].dct = dct4_c;
113
p.cu[BLOCK_8x8].dct = dct8_c;
114
115
p.cu[BLOCK_8x8].copy_cnt = copy_count<8>;
116
p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
117
p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
118
-
119
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
120
+ p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
121
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
122
+ p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
123
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
124
+ p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
125
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
126
+ p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
127
p.scanPosLast = scanPosLast_c;
128
p.findPosFirstLast = findPosFirstLast_c;
129
p.costCoeffNxN = costCoeffNxN_c;
130
x265_2.7.tar.gz/source/common/frame.cpp -> x265_2.9.tar.gz/source/common/frame.cpp
Changed
56
1
2
m_addOnDepth = NULL;
3
m_addOnCtuInfo = NULL;
4
m_addOnPrevChange = NULL;
5
+ m_classifyFrame = false;
6
}
7
8
bool Frame::create(x265_param *param, float* quantOffsets)
9
10
m_analysisData.wt = NULL;
11
m_analysisData.intraData = NULL;
12
m_analysisData.interData = NULL;
13
- m_analysis2Pass.analysisFramedata = NULL;
14
+ m_analysisData.distortionData = NULL;
15
}
16
17
- if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize))
18
+ if (param->bDynamicRefine)
19
+ {
20
+ int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
21
+ CHECKED_MALLOC_ZERO(m_classifyRd, uint64_t, size);
22
+ CHECKED_MALLOC_ZERO(m_classifyVariance, uint64_t, size);
23
+ CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
24
+ }
25
+
26
+ if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
27
{
28
X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
29
m_numRows = (m_fencPic->m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
30
31
32
if (quantOffsets)
33
{
34
- int32_t cuCount;
35
- if (param->rc.qgSize == 8)
36
- cuCount = m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes;
37
- else
38
- cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
39
+ int32_t cuCount = (param->rc.qgSize == 8) ? m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes :
40
+ m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
41
m_quantOffsets = new float[cuCount];
42
}
43
return true;
44
45
}
46
m_lowres.destroy();
47
X265_FREE(m_rcData);
48
+
49
+ if (m_param->bDynamicRefine)
50
+ {
51
+ X265_FREE_ZERO(m_classifyRd);
52
+ X265_FREE_ZERO(m_classifyVariance);
53
+ X265_FREE_ZERO(m_classifyCount);
54
+ }
55
}
56
x265_2.7.tar.gz/source/common/frame.h -> x265_2.9.tar.gz/source/common/frame.h
Changed
24
1
2
Frame* m_prev;
3
x265_param* m_param; // Points to the latest param set for the frame.
4
x265_analysis_data m_analysisData;
5
- x265_analysis_2Pass m_analysis2Pass;
6
RcStats* m_rcData;
7
8
Event m_copyMVType;
9
10
uint8_t** m_addOnDepth;
11
uint8_t** m_addOnCtuInfo;
12
int** m_addOnPrevChange;
13
+
14
+ /* Average feature values of frames being considered for classification */
15
+ uint64_t* m_classifyRd;
16
+ uint64_t* m_classifyVariance;
17
+ uint32_t* m_classifyCount;
18
+
19
+ bool m_classifyFrame;
20
+
21
Frame();
22
23
bool create(x265_param *param, float* quantOffsets);
24
x265_2.7.tar.gz/source/common/framedata.cpp -> x265_2.9.tar.gz/source/common/framedata.cpp
Changed
53
1
2
if (param.rc.bStatWrite)
3
m_spsrps = const_cast<RPS*>(sps.spsrps);
4
bool isallocated = m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame, param);
5
+ if (m_param->bDynamicRefine)
6
+ {
7
+ CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefineRdBlock, uint64_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
8
+ CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefCntBlock, uint32_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
9
+ CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefVarBlock, uint32_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
10
+ }
11
if (isallocated)
12
+ {
13
for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
14
+ {
15
+ if (m_param->bDynamicRefine)
16
+ {
17
+ m_picCTU[ctuAddr].m_collectCURd = m_cuMemPool.dynRefineRdBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
18
+ m_picCTU[ctuAddr].m_collectCUVariance = m_cuMemPool.dynRefVarBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
19
+ m_picCTU[ctuAddr].m_collectCUCount = m_cuMemPool.dynRefCntBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
20
+ }
21
m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param, ctuAddr);
22
+ }
23
+ }
24
else
25
return false;
26
CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
27
28
{
29
memset(m_cuStat, 0, sps.numCUsInFrame * sizeof(*m_cuStat));
30
memset(m_rowStat, 0, sps.numCuInHeight * sizeof(*m_rowStat));
31
+ if (m_param->bDynamicRefine)
32
+ {
33
+ memset(m_picCTU->m_collectCURd, 0, MAX_NUM_DYN_REFINE * sizeof(uint64_t));
34
+ memset(m_picCTU->m_collectCUVariance, 0, MAX_NUM_DYN_REFINE * sizeof(uint32_t));
35
+ memset(m_picCTU->m_collectCUCount, 0, MAX_NUM_DYN_REFINE * sizeof(uint32_t));
36
+ }
37
}
38
39
void FrameData::destroy()
40
41
42
m_cuMemPool.destroy();
43
44
+ if (m_param->bDynamicRefine)
45
+ {
46
+ X265_FREE(m_cuMemPool.dynRefineRdBlock);
47
+ X265_FREE(m_cuMemPool.dynRefCntBlock);
48
+ X265_FREE(m_cuMemPool.dynRefVarBlock);
49
+ }
50
X265_FREE(m_cuStat);
51
X265_FREE(m_rowStat);
52
for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
53
x265_2.7.tar.gz/source/common/framedata.h -> x265_2.9.tar.gz/source/common/framedata.h
Changed
61
1
2
uint64_t cntInterPu[NUM_CU_DEPTH][INTER_MODES - 1];
3
uint64_t cntMergePu[NUM_CU_DEPTH][INTER_MODES - 1];
4
5
+ /* Feature values per row for dynamic refinement */
6
+ uint64_t rowRdDyn[MAX_NUM_DYN_REFINE];
7
+ uint32_t rowVarDyn[MAX_NUM_DYN_REFINE];
8
+ uint32_t rowCntDyn[MAX_NUM_DYN_REFINE];
9
+
10
FrameStats()
11
{
12
memset(this, 0, sizeof(FrameStats));
13
14
inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
15
};
16
17
-/* Stores intra analysis data for a single frame. This struct needs better packing */
18
-struct analysis_intra_data
19
-{
20
- uint8_t* depth;
21
- uint8_t* modes;
22
- char* partSizes;
23
- uint8_t* chromaModes;
24
-};
25
-
26
-/* Stores inter analysis data for a single frame */
27
-struct analysis_inter_data
28
-{
29
- int32_t* ref;
30
- uint8_t* depth;
31
- uint8_t* modes;
32
- uint8_t* partSize;
33
- uint8_t* mergeFlag;
34
- uint8_t* interDir;
35
- uint8_t* mvpIdx[2];
36
- int8_t* refIdx[2];
37
- MV* mv[2];
38
- int64_t* sadCost;
39
-};
40
-
41
-struct analysis2PassFrameData
42
-{
43
- uint8_t* depth;
44
- MV* m_mv[2];
45
- int* mvpIdx[2];
46
- int32_t* ref[2];
47
- uint8_t* modes;
48
- sse_t* distortion;
49
- sse_t* ctuDistortion;
50
- double* scaledDistortion;
51
- double averageDistortion;
52
- double sdDistortion;
53
- uint32_t highDistortionCtuCount;
54
- uint32_t lowDistortionCtuCount;
55
- double* offset;
56
- double* threshold;
57
-};
58
-
59
}
60
#endif // ifndef X265_FRAMEDATA_H
61
x265_2.7.tar.gz/source/common/ipfilter.cpp -> x265_2.9.tar.gz/source/common/ipfilter.cpp
Changed
41
1
2
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
3
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
4
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
5
- p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
6
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
7
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
8
9
#define CHROMA_422(W, H) \
10
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
11
12
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
13
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
14
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
15
- p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
16
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
17
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
18
19
#define CHROMA_444(W, H) \
20
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
21
22
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
23
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
24
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
25
- p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
26
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
27
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
28
29
#define LUMA(W, H) \
30
p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \
31
32
p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \
33
p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \
34
p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \
35
- p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
36
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
37
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_c<W, H>;
38
39
void setupFilterPrimitives_c(EncoderPrimitives& p)
40
{
41
x265_2.7.tar.gz/source/common/lowres.cpp -> x265_2.9.tar.gz/source/common/lowres.cpp
Changed
66
1
2
3
using namespace X265_NS;
4
5
-bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled, uint32_t qgSize)
6
+bool Lowres::create(x265_param* param, PicYuv *origPic, uint32_t qgSize)
7
{
8
isLowres = true;
9
- bframes = _bframes;
10
+ bframes = param->bframes;
11
width = origPic->m_picWidth / 2;
12
lines = origPic->m_picHeight / 2;
13
lumaStride = width + 2 * origPic->m_lumaMarginX;
14
15
maxBlocksInRowFullRes = maxBlocksInRow * 2;
16
maxBlocksInColFullRes = maxBlocksInCol * 2;
17
int cuCount = maxBlocksInRow * maxBlocksInCol;
18
- int cuCountFullRes;
19
- if (qgSize == 8)
20
- cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
21
- else
22
- cuCountFullRes = cuCount;
23
+ int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
24
25
/* rounding the width to multiple of lowres CU size */
26
width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
27
28
29
size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
30
size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
31
- if (bAQEnabled)
32
+ if (!!param->rc.aqMode)
33
{
34
CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
35
- CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
36
CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
37
CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes);
38
- CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
39
if (qgSize == 8)
40
CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount);
41
}
42
+ if (origPic->m_param->bAQMotion)
43
+ CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
44
+ if (origPic->m_param->bDynamicRefine)
45
+ CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
46
CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
47
48
/* allocate lowres buffers */
49
50
X265_FREE(lowresMvCosts[1][i]);
51
}
52
X265_FREE(qpAqOffset);
53
- X265_FREE(qpAqMotionOffset);
54
X265_FREE(invQscaleFactor);
55
X265_FREE(qpCuTreeOffset);
56
X265_FREE(propagateCost);
57
- X265_FREE(blockVariance);
58
X265_FREE(invQscaleFactor8x8);
59
+ X265_FREE(qpAqMotionOffset);
60
+ X265_FREE(blockVariance);
61
}
62
-
63
// (re) initialize lowres state
64
void Lowres::init(PicYuv *origPic, int poc)
65
{
66
x265_2.7.tar.gz/source/common/lowres.h -> x265_2.9.tar.gz/source/common/lowres.h
Changed
35
1
2
int qmvy = qmv.y + (qmv.y & 1);
3
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
4
pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
5
- primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
6
+ primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
7
return buf;
8
}
9
else
10
11
int qmvy = qmv.y + (qmv.y & 1);
12
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
13
pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
14
- primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
15
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
16
return comp(fenc, FENC_STRIDE, subpelbuf, 8);
17
}
18
else
19
20
uint32_t* blockVariance;
21
uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
22
uint64_t wp_sum[3];
23
- uint64_t frameVariance;
24
25
/* cutree intermediate data */
26
uint16_t* propagateCost;
27
double weightedCostDelta[X265_BFRAME_MAX + 2];
28
ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
29
-
30
- bool create(PicYuv *origPic, int _bframes, bool bAqEnabled, uint32_t qgSize);
31
+ bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
32
void destroy();
33
void init(PicYuv *origPic, int poc);
34
};
35
x265_2.7.tar.gz/source/common/param.cpp -> x265_2.9.tar.gz/source/common/param.cpp
Changed
201
1
2
memset(param, 0, sizeof(x265_param));
3
4
/* Applying default values to all elements in the param structure */
5
- param->cpuid = X265_NS::cpu_detect();
6
+ param->cpuid = X265_NS::cpu_detect(false);
7
param->bEnableWavefront = 1;
8
param->frameNumThreads = 0;
9
10
11
param->bEmitHRDSEI = 0;
12
param->bEmitInfoSEI = 1;
13
param->bEmitHDRSEI = 0;
14
+ param->bEmitIDRRecoverySEI = 0;
15
16
/* CU definitions */
17
param->maxCUSize = 64;
18
19
param->lookaheadThreads = 0;
20
param->scenecutBias = 5.0;
21
param->radl = 0;
22
+ param->chunkStart = 0;
23
+ param->chunkEnd = 0;
24
+
25
/* Intra Coding Tools */
26
param->bEnableConstrainedIntra = 0;
27
param->bEnableStrongIntraSmoothing = 1;
28
29
param->bEnableSAO = 1;
30
param->bSaoNonDeblocked = 0;
31
param->bLimitSAO = 0;
32
+
33
/* Coding Quality */
34
param->cbQpOffset = 0;
35
param->crQpOffset = 0;
36
37
param->scaleFactor = 0;
38
param->intraRefine = 0;
39
param->interRefine = 0;
40
+ param->bDynamicRefine = 0;
41
param->mvRefine = 0;
42
param->bUseAnalysisFile = 1;
43
param->csvfpt = NULL;
44
param->forceFlush = 0;
45
param->bDisableLookahead = 0;
46
param->bCopyPicToFrame = 1;
47
+ param->maxAUSizeFactor = 1;
48
+ param->naluFile = NULL;
49
50
/* DCT Approximations */
51
param->bLowPassDct = 0;
52
param->bMVType = 0;
53
+ param->bSingleSeiNal = 0;
54
+
55
+ /* SEI messages */
56
+ param->preferredTransferCharacteristics = -1;
57
+ param->pictureStructure = -1;
58
}
59
60
int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
61
62
if (0) ;
63
OPT("asm")
64
{
65
+#if X265_ARCH_X86
66
+ if (!strcasecmp(value, "avx512"))
67
+ {
68
+ p->cpuid = X265_NS::cpu_detect(true);
69
+ if (!(p->cpuid & X265_CPU_AVX512))
70
+ x265_log(p, X265_LOG_WARNING, "AVX512 is not supported\n");
71
+ }
72
+ else
73
+ {
74
+ if (bValueWasNull)
75
+ p->cpuid = atobool(value);
76
+ else
77
+ p->cpuid = parseCpuName(value, bError, false);
78
+ }
79
+#else
80
if (bValueWasNull)
81
p->cpuid = atobool(value);
82
else
83
- p->cpuid = parseCpuName(value, bError);
84
+ p->cpuid = parseCpuName(value, bError, false);
85
+#endif
86
}
87
OPT("fps")
88
{
89
90
OPT("limit-sao") p->bLimitSAO = atobool(value);
91
OPT("dhdr10-info") p->toneMapFile = strdup(value);
92
OPT("dhdr10-opt") p->bDhdr10opt = atobool(value);
93
+ OPT("idr-recovery-sei") p->bEmitIDRRecoverySEI = atobool(value);
94
OPT("const-vbv") p->rc.bEnableConstVbv = atobool(value);
95
OPT("ctu-info") p->bCTUInfo = atoi(value);
96
OPT("scale-factor") p->scaleFactor = atoi(value);
97
98
OPT("refine-mv")p->mvRefine = atobool(value);
99
OPT("force-flush")p->forceFlush = atoi(value);
100
OPT("splitrd-skip") p->bEnableSplitRdSkip = atobool(value);
101
- OPT("lowpass-dct") p->bLowPassDct = atobool(value);
102
+ OPT("lowpass-dct") p->bLowPassDct = atobool(value);
103
OPT("vbv-end") p->vbvBufferEnd = atof(value);
104
OPT("vbv-end-fr-adj") p->vbvEndFrameAdjust = atof(value);
105
OPT("copy-pic") p->bCopyPicToFrame = atobool(value);
106
107
{
108
bError = true;
109
}
110
- }
111
+ }
112
OPT("gop-lookahead") p->gopLookahead = atoi(value);
113
OPT("analysis-save") p->analysisSave = strdup(value);
114
OPT("analysis-load") p->analysisLoad = strdup(value);
115
OPT("radl") p->radl = atoi(value);
116
+ OPT("max-ausize-factor") p->maxAUSizeFactor = atof(value);
117
+ OPT("dynamic-refine") p->bDynamicRefine = atobool(value);
118
+ OPT("single-sei") p->bSingleSeiNal = atobool(value);
119
+ OPT("atc-sei") p->preferredTransferCharacteristics = atoi(value);
120
+ OPT("pic-struct") p->pictureStructure = atoi(value);
121
+ OPT("chunk-start") p->chunkStart = atoi(value);
122
+ OPT("chunk-end") p->chunkEnd = atoi(value);
123
+ OPT("nalu-file") p->naluFile = strdup(value);
124
else
125
return X265_PARAM_BAD_NAME;
126
}
127
128
* false || no - disabled
129
* integer bitmap value
130
* comma separated list of SIMD names, eg: SSE4.1,XOP */
131
-int parseCpuName(const char* value, bool& bError)
132
+int parseCpuName(const char* value, bool& bError, bool bEnableavx512)
133
{
134
if (!value)
135
{
136
137
if (isdigit(value[0]))
138
cpu = x265_atoi(value, bError);
139
else
140
- cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect() : 0;
141
+ cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect(bEnableavx512) : 0;
142
143
if (bError)
144
{
145
146
"Supported values for bCTUInfo are 0, 1, 2, 4, 6");
147
CHECK(param->interRefine > 3 || param->interRefine < 0,
148
"Invalid refine-inter value, refine-inter levels 0 to 3 supported");
149
- CHECK(param->intraRefine > 3 || param->intraRefine < 0,
150
+ CHECK(param->intraRefine > 4 || param->intraRefine < 0,
151
"Invalid refine-intra value, refine-intra levels 0 to 3 supported");
152
+ CHECK(param->maxAUSizeFactor < 0.5 || param->maxAUSizeFactor > 1.0,
153
+ "Supported factor for controlling max AU size is from 0.5 to 1");
154
#if !X86_64
155
CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
156
"SEA motion search does not support resolutions greater than 480p in 32 bit build");
157
158
if (param->masteringDisplayColorVolume || param->maxFALL || param->maxCLL)
159
param->bEmitHDRSEI = 1;
160
161
+ bool isSingleSEI = (param->bRepeatHeaders
162
+ || param->bEmitHRDSEI
163
+ || param->bEmitInfoSEI
164
+ || param->bEmitHDRSEI
165
+ || param->bEmitIDRRecoverySEI
166
+ || !!param->interlaceMode
167
+ || param->preferredTransferCharacteristics > 1
168
+ || param->toneMapFile
169
+ || param->naluFile);
170
+
171
+ if (!isSingleSEI && param->bSingleSeiNal)
172
+ {
173
+ param->bSingleSeiNal = 0;
174
+ x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
175
+ }
176
return check_failed;
177
}
178
179
180
TOOLVAL(param->bCTUInfo, "ctu-info=%d");
181
if (param->bMVType == AVC_INFO)
182
TOOLOPT(param->bMVType, "refine-mv-type=avc");
183
+ TOOLOPT(param->bDynamicRefine, "dynamic-refine");
184
if (param->maxSlices > 1)
185
TOOLVAL(param->maxSlices, "slices=%d");
186
if (param->bEnableLoopFilter)
187
188
TOOLOPT(!param->bSaoNonDeblocked && param->bEnableSAO, "sao");
189
TOOLOPT(param->rc.bStatWrite, "stats-write");
190
TOOLOPT(param->rc.bStatRead, "stats-read");
191
+ TOOLOPT(param->bSingleSeiNal, "single-sei");
192
#if ENABLE_HDR10_PLUS
193
TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
194
#endif
195
196
s += sprintf(s, " input-res=%dx%d", p->sourceWidth - padx, p->sourceHeight - pady);
197
s += sprintf(s, " interlace=%d", p->interlaceMode);
198
s += sprintf(s, " total-frames=%d", p->totalFrames);
199
+ if (p->chunkStart)
200
+ s += sprintf(s, " chunk-start=%d", p->chunkStart);
201
x265_2.7.tar.gz/source/common/param.h -> x265_2.9.tar.gz/source/common/param.h
Changed
10
1
2
char* x265_param2string(x265_param *param, int padx, int pady);
3
int x265_atoi(const char *str, bool& bError);
4
double x265_atof(const char *str, bool& bError);
5
-int parseCpuName(const char *value, bool& bError);
6
+int parseCpuName(const char *value, bool& bError, bool bEnableavx512);
7
void setParamAspectRatio(x265_param *p, int width, int height);
8
void getParamAspectRatio(x265_param *p, int& width, int& height);
9
bool parseLambdaFile(x265_param *param);
10
x265_2.7.tar.gz/source/common/picyuv.cpp -> x265_2.9.tar.gz/source/common/picyuv.cpp
Changed
21
1
2
pixel *uPic = m_picOrg[1];
3
pixel *vPic = m_picOrg[2];
4
5
+ if(param.minLuma != 0 || param.maxLuma != PIXEL_MAX)
6
+ {
7
+ for (int r = 0; r < height; r++)
8
+ {
9
+ for (int c = 0; c < width; c++)
10
+ {
11
+ yPic[c] = X265_MIN(yPic[c], (pixel)param.maxLuma);
12
+ yPic[c] = X265_MAX(yPic[c], (pixel)param.minLuma);
13
+ }
14
+ yPic += m_stride;
15
+ }
16
+ }
17
+ yPic = m_picOrg[0];
18
if (param.csvLogLevel >= 2 || param.maxCLL || param.maxFALL)
19
{
20
for (int r = 0; r < height; r++)
21
x265_2.7.tar.gz/source/common/picyuv.h -> x265_2.9.tar.gz/source/common/picyuv.h
Changed
9
1
2
pixel m_maxChromaVLevel;
3
pixel m_minChromaVLevel;
4
double m_avgChromaVLevel;
5
+ double m_vmafScore;
6
x265_param *m_param;
7
8
PicYuv();
9
x265_2.7.tar.gz/source/common/pixel.cpp -> x265_2.9.tar.gz/source/common/pixel.cpp
Changed
102
1
2
static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
3
{
4
for (int i = 0; i < count; i++)
5
- dst[i] = (uint16_t)(src[i] * 256.0);
6
+ dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);
7
}
8
9
static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
10
11
{
12
#define LUMA_PU(W, H) \
13
p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
14
- p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
15
+ p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
16
+ p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
17
p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
18
p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
19
p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
20
- p.pu[LUMA_ ## W ## x ## H].pixelavg_pp = pixelavg_pp<W, H>;
21
-
22
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp<W, H>; \
23
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp<W, H>;
24
#define LUMA_CU(W, H) \
25
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
26
- p.cu[BLOCK_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
27
+ p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
28
+ p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>; \
29
p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
30
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
31
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
32
- p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \
33
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_c<W>; \
34
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_c<W>; \
35
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
36
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
37
- p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
38
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl<W>; \
39
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl<W>; \
40
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
41
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \
42
p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \
43
- p.cu[BLOCK_ ## W ## x ## H].ssd_s = pixel_ssd_s_c<W>; \
44
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s[NONALIGNED] = pixel_ssd_s_c<W>; \
45
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s[ALIGNED] = pixel_ssd_s_c<W>; \
46
p.cu[BLOCK_ ## W ## x ## H].var = pixel_var<W>; \
47
- p.cu[BLOCK_ ## W ## x ## H].calcresidual = getResidual<W>; \
48
+ p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED] = getResidual<W>; \
49
+ p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED] = getResidual<W>; \
50
p.cu[BLOCK_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
51
p.cu[BLOCK_ ## W ## x ## H].sse_ss = sse<W, H, int16_t, int16_t>;
52
53
54
p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
55
56
#define CHROMA_PU_420(W, H) \
57
- p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg<W, H>; \
58
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
59
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
60
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
61
62
CHROMA_PU_420(2, 2);
63
64
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
65
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
66
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
67
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
68
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
69
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>;
70
71
CHROMA_CU_420(2, 2)
72
CHROMA_CU_420(4, 4)
73
74
p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
75
76
#define CHROMA_PU_422(W, H) \
77
- p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg<W, H>; \
78
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
79
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
80
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
81
82
CHROMA_PU_422(2, 4);
83
84
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
85
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
86
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
87
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
88
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
89
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>;
90
91
CHROMA_CU_422(2, 4)
92
CHROMA_CU_422(4, 8)
93
94
p.weight_pp = weight_pp_c;
95
p.weight_sp = weight_sp_c;
96
97
- p.scale1D_128to64 = scale1D_128to64;
98
+ p.scale1D_128to64[NONALIGNED] = p.scale1D_128to64[ALIGNED] = scale1D_128to64;
99
p.scale2D_64to32 = scale2D_64to32;
100
p.frameInitLowres = frame_init_lowres_core;
101
p.ssim_4x4x2_core = ssim_4x4x2_core;
102
x265_2.7.tar.gz/source/common/predict.cpp -> x265_2.9.tar.gz/source/common/predict.cpp
Changed
72
1
2
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
3
cu.clipMv(mv0);
4
5
- if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
6
+ if (cu.m_slice->m_pps->bUseWeightPred && wp0->wtPresent)
7
{
8
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
9
{
10
11
pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
12
pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
13
14
- if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
15
+ if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
16
{
17
/* biprediction weighting */
18
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
19
20
predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
21
}
22
23
- if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
24
+ if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
25
addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
26
else
27
predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);
28
29
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
30
cu.clipMv(mv0);
31
32
- if (pwp0 && pwp0->bPresentFlag)
33
+ if (pwp0 && pwp0->wtPresent)
34
{
35
ShortYuv& shortYuv = m_predShortYuv[0];
36
37
38
/* uniprediction to L1 */
39
X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");
40
41
- if (pwp1 && pwp1->bPresentFlag)
42
+ if (pwp1 && pwp1->wtPresent)
43
{
44
ShortYuv& shortYuv = m_predShortYuv[0];
45
46
47
int yFrac = mv.y & 3;
48
49
if (!(yFrac | xFrac))
50
- primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
51
+ {
52
+ bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
53
+ bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
54
+ primitives.pu[partEnum].convert_p2s[srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck](src, srcStride, dst, dstStride);
55
+ }
56
else if (!yFrac)
57
primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
58
else if (!xFrac)
59
60
61
if (!(yFrac | xFrac))
62
{
63
- primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
64
- primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
65
+ bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
66
+ bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
67
+ primitives.chroma[m_csp].pu[partEnum].p2s[refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC](refCb, refStride, dstCb, dstStride);
68
+ primitives.chroma[m_csp].pu[partEnum].p2s[refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC](refCr, refStride, dstCr, dstStride);
69
}
70
else if (!yFrac)
71
{
72
x265_2.7.tar.gz/source/common/primitives.cpp -> x265_2.9.tar.gz/source/common/primitives.cpp
Changed
25
1
2
for (int i = 0; i < NUM_PU_SIZES; i++)
3
{
4
p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
5
- p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg;
6
+ p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED] = p.pu[i].addAvg[NONALIGNED];
7
+ p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
8
p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
9
- p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s;
10
+ p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED] = p.pu[i].convert_p2s[NONALIGNED];
11
+ p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
12
}
13
14
for (int i = 0; i < NUM_CU_SIZES; i++)
15
16
p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d;
17
p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
18
p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
19
- p.chroma[X265_CSP_I444].cu[i].add_ps = p.cu[i].add_ps;
20
+ p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED] = p.cu[i].add_ps[NONALIGNED];
21
+ p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
22
p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
23
p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
24
p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
25
x265_2.7.tar.gz/source/common/primitives.h -> x265_2.9.tar.gz/source/common/primitives.h
Changed
117
1
2
NUM_CU_SIZES
3
};
4
5
+enum AlignPrimitive
6
+{
7
+ NONALIGNED,
8
+ ALIGNED,
9
+ NUM_ALIGNMENT_TYPES
10
+};
11
+
12
enum { NUM_TR_SIZE = 4 }; // TU are 4x4, 8x8, 16x16, and 32x32
13
14
15
16
17
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
18
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
19
-
20
+typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
21
+typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
22
+typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos);
23
+typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
24
/* Function pointers to optimized encoder primitives. Each pointer can reference
25
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
26
struct EncoderPrimitives
27
28
filter_sp_t luma_vsp;
29
filter_ss_t luma_vss;
30
filter_hv_pp_t luma_hvpp; // combines hps + vsp
31
-
32
- pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264)
33
- addAvg_t addAvg; // bidir motion compensation, uses 16bit values
34
-
35
+ pixelavg_pp_t pixelavg_pp[NUM_ALIGNMENT_TYPES]; // quick bidir using pixels (borrowed from x264)
36
+ addAvg_t addAvg[NUM_ALIGNMENT_TYPES]; // bidir motion compensation, uses 16bit values
37
copy_pp_t copy_pp;
38
- filter_p2s_t convert_p2s;
39
+ filter_p2s_t convert_p2s[NUM_ALIGNMENT_TYPES];
40
}
41
pu[NUM_PU_SIZES];
42
43
44
dct_t standard_dct; // original dct function, used by lowpass_dct
45
dct_t lowpass_dct; // lowpass dct approximation
46
47
- calcresidual_t calcresidual;
48
+ calcresidual_t calcresidual[NUM_ALIGNMENT_TYPES];
49
pixel_sub_ps_t sub_ps;
50
- pixel_add_ps_t add_ps;
51
- blockfill_s_t blockfill_s; // block fill, for DC transforms
52
+ pixel_add_ps_t add_ps[NUM_ALIGNMENT_TYPES];
53
+ blockfill_s_t blockfill_s[NUM_ALIGNMENT_TYPES]; // block fill, for DC transforms
54
copy_cnt_t copy_cnt; // copy coeff while counting non-zero
55
count_nonzero_t count_nonzero;
56
cpy2Dto1D_shl_t cpy2Dto1D_shl;
57
cpy2Dto1D_shr_t cpy2Dto1D_shr;
58
- cpy1Dto2D_shl_t cpy1Dto2D_shl;
59
+ cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_ALIGNMENT_TYPES];
60
cpy1Dto2D_shr_t cpy1Dto2D_shr;
61
-
62
copy_sp_t copy_sp;
63
copy_ps_t copy_ps;
64
copy_ss_t copy_ss;
65
66
pixel_sse_t sse_pp; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
67
pixel_sse_ss_t sse_ss; // Sum of Square Error (short, short) fenc alignment not assumed
68
pixelcmp_t psy_cost_pp; // difference in AC energy between two pixel blocks
69
- pixel_ssd_s_t ssd_s; // Sum of Square Error (residual coeff to self)
70
+ pixel_ssd_s_t ssd_s[NUM_ALIGNMENT_TYPES]; // Sum of Square Error (residual coeff to self)
71
pixelcmp_t sa8d; // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
72
-
73
transpose_t transpose; // transpose pixel block; for use with intra all-angs
74
intra_allangs_t intra_pred_allangs;
75
intra_filter_t intra_filter;
76
intra_pred_t intra_pred[NUM_INTRA_MODE];
77
+ nonPsyRdoQuant_t nonPsyRdoQuant;
78
+ psyRdoQuant_t psyRdoQuant;
79
+ psyRdoQuant_t1 psyRdoQuant_1p;
80
+ psyRdoQuant_t2 psyRdoQuant_2p;
81
}
82
cu[NUM_CU_SIZES];
83
-
84
/* These remaining primitives work on either fixed block sizes or take
85
* block dimensions as arguments and thus do not belong in either the PU or
86
* the CU arrays */
87
88
dequant_scaling_t dequant_scaling;
89
dequant_normal_t dequant_normal;
90
denoiseDct_t denoiseDct;
91
- scale1D_t scale1D_128to64;
92
+ scale1D_t scale1D_128to64[NUM_ALIGNMENT_TYPES];
93
scale2D_t scale2D_64to32;
94
95
ssim_4x4x2_core_t ssim_4x4x2_core;
96
97
filter_ss_t filter_vss;
98
filter_pp_t filter_hpp;
99
filter_hps_t filter_hps;
100
- addAvg_t addAvg;
101
+ addAvg_t addAvg[NUM_ALIGNMENT_TYPES];
102
copy_pp_t copy_pp;
103
- filter_p2s_t p2s;
104
+ filter_p2s_t p2s[NUM_ALIGNMENT_TYPES];
105
106
}
107
pu[NUM_PU_SIZES];
108
109
pixelcmp_t sa8d; // if chroma CU is not multiple of 8x8, will use satd
110
pixel_sse_t sse_pp;
111
pixel_sub_ps_t sub_ps;
112
- pixel_add_ps_t add_ps;
113
+ pixel_add_ps_t add_ps[NUM_ALIGNMENT_TYPES];
114
115
copy_ps_t copy_ps;
116
copy_sp_t copy_sp;
117
x265_2.7.tar.gz/source/common/quant.cpp -> x265_2.9.tar.gz/source/common/quant.cpp
Changed
163
1
2
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
3
{
4
const uint32_t sizeIdx = log2TrSize - 2;
5
-
6
if (cu.m_tqBypass[0])
7
{
8
- primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
9
+ primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
10
return;
11
}
12
-
13
// Values need to pass as input parameter in dequant
14
int rem = m_qpParam[ttype].rem;
15
int per = m_qpParam[ttype].per;
16
17
if (transformShift > 0)
18
primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
19
else
20
- primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
21
+ primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
22
#endif
23
}
24
else
25
26
const int add_2nd = 1 << (shift_2nd - 1);
27
28
int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
29
- primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
30
+ primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val);
31
return;
32
}
33
34
35
X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
36
if (!numSig)
37
return 0;
38
-
39
const uint32_t trSize = 1 << log2TrSize;
40
int64_t lambda2 = m_qpParam[ttype].lambda2;
41
- const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
42
-
43
+ int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
44
/* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
45
* scale applied that must be removed during unquant. Note that in real dequant there is clipping
46
* at several stages. We skip the clipping for simplicity when measuring RD cost */
47
48
for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
49
{
50
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
51
-
52
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
53
uint32_t blkPos = codeParams.scan[scanPosBase];
54
-
55
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
56
- for (int y = 0; y < MLS_CG_SIZE; y++)
57
+ bool enable512 = detect512();
58
+ if (enable512)
59
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
60
+ else
61
{
62
- for (int x = 0; x < MLS_CG_SIZE; x++)
63
- {
64
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
65
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
66
-
67
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
68
-
69
- /* when no residual coefficient is coded, predicted coef == recon coef */
70
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
71
-
72
- totalUncodedCost += costUncoded[blkPos + x];
73
- totalRdCost += costUncoded[blkPos + x];
74
- }
75
- blkPos += trSize;
76
+ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
77
+ primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
78
}
79
}
80
}
81
82
for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
83
{
84
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
85
-
86
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
87
uint32_t blkPos = codeParams.scan[scanPosBase];
88
-
89
- for (int y = 0; y < MLS_CG_SIZE; y++)
90
- {
91
- for (int x = 0; x < MLS_CG_SIZE; x++)
92
- {
93
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
94
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
95
-
96
- totalUncodedCost += costUncoded[blkPos + x];
97
- totalRdCost += costUncoded[blkPos + x];
98
- }
99
- blkPos += trSize;
100
- }
101
+ primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
102
}
103
}
104
-
105
static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
106
{
107
// patternSigCtx = 0
108
109
// TODO: does we need zero-coeff cost?
110
const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
111
uint32_t blkPos = codeParams.scan[scanPosBase];
112
-
113
if (usePsyMask)
114
{
115
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
116
+ bool enable512 = detect512();
117
+
118
+ if (enable512)
119
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
120
+ else
121
+ {
122
+ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
123
+ primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
124
+ }
125
+ blkPos = codeParams.scan[scanPosBase];
126
for (int y = 0; y < MLS_CG_SIZE; y++)
127
{
128
for (int x = 0; x < MLS_CG_SIZE; x++)
129
{
130
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
131
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
132
-
133
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
134
-
135
- /* when no residual coefficient is coded, predicted coef == recon coef */
136
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
137
-
138
- totalUncodedCost += costUncoded[blkPos + x];
139
- totalRdCost += costUncoded[blkPos + x];
140
-
141
const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
142
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
143
X265_CHECK(trSize > 4, "trSize check failure\n");
144
145
else
146
{
147
// non-psy path
148
+ primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
149
+ blkPos = codeParams.scan[scanPosBase];
150
for (int y = 0; y < MLS_CG_SIZE; y++)
151
{
152
for (int x = 0; x < MLS_CG_SIZE; x++)
153
{
154
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
155
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
156
-
157
- totalUncodedCost += costUncoded[blkPos + x];
158
- totalRdCost += costUncoded[blkPos + x];
159
-
160
const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
161
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
162
X265_CHECK(trSize > 4, "trSize check failure\n");
163
x265_2.7.tar.gz/source/common/slice.cpp -> x265_2.9.tar.gz/source/common/slice.cpp
Changed
10
1
2
for (int yuv = 0; yuv < 3; yuv++)
3
{
4
WeightParam& wp = m_weightPredTable[l][i][yuv];
5
- wp.bPresentFlag = false;
6
+ wp.wtPresent = 0;
7
wp.log2WeightDenom = 0;
8
wp.inputWeight = 1;
9
wp.inputOffset = 0;
10
x265_2.7.tar.gz/source/common/slice.h -> x265_2.9.tar.gz/source/common/slice.h
Changed
37
1
2
uint32_t log2WeightDenom;
3
int inputWeight;
4
int inputOffset;
5
- bool bPresentFlag;
6
+ int wtPresent;
7
8
/* makes a non-h265 weight (i.e. fix7), into an h265 weight */
9
void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
10
11
(w).inputWeight = (s); \
12
(w).log2WeightDenom = (d); \
13
(w).inputOffset = (o); \
14
- (w).bPresentFlag = (b); \
15
+ (w).wtPresent = (b); \
16
}
17
18
class Slice
19
20
bool getRapPicFlag() const
21
{
22
return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
23
+ || m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP
24
|| m_nalUnitType == NAL_UNIT_CODED_SLICE_CRA;
25
}
26
-
27
bool getIdrPicFlag() const
28
{
29
- return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL;
30
+ return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
31
+ || m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP;
32
}
33
-
34
bool isIRAP() const { return m_nalUnitType >= 16 && m_nalUnitType <= 23; }
35
36
bool isIntra() const { return m_sliceType == I_SLICE; }
37
x265_2.7.tar.gz/source/common/x86/asm-primitives.cpp -> x265_2.9.tar.gz/source/common/x86/asm-primitives.cpp
Changed
201
1
2
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_ ## cpu); \
3
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_ ## cpu); \
4
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_ ## cpu)
5
-
6
#define PIXEL_AVG(cpu) \
7
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_ ## cpu); \
8
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_ ## cpu); \
9
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_ ## cpu); \
10
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_ ## cpu); \
11
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_ ## cpu); \
12
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_ ## cpu); \
13
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_ ## cpu); \
14
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_ ## cpu); \
15
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_ ## cpu); \
16
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_ ## cpu); \
17
- p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_ ## cpu); \
18
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_ ## cpu); \
19
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_ ## cpu); \
20
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_ ## cpu); \
21
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_ ## cpu); \
22
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_ ## cpu); \
23
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_ ## cpu); \
24
- p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_ ## cpu); \
25
- p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_ ## cpu); \
26
- p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_ ## cpu); \
27
- p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_ ## cpu); \
28
- p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_ ## cpu);
29
-
30
+ p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
31
+ p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
32
+ p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
33
+ p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
34
+ p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
35
+ p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
36
+ p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
37
+ p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
38
+ p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
39
+ p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x8_ ## cpu); \
40
+ p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
41
+ p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
42
+ p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
43
+ p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
44
+ p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
45
+ p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x8_ ## cpu); \
46
+ p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x4_ ## cpu); \
47
+ p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
48
+ p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x32_ ## cpu); \
49
+ p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x16_ ## cpu); \
50
+ p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x8_ ## cpu); \
51
+ p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x4_ ## cpu); \
52
+ p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
53
+ p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
54
+ p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
55
+ p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
56
+ p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
57
+ p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
58
+ p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
59
+ p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
60
+ p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
61
+ p.pu[LUMA_32x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x8_ ## cpu); \
62
+ p.pu[LUMA_24x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
63
+ p.pu[LUMA_16x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
64
+ p.pu[LUMA_16x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
65
+ p.pu[LUMA_16x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
66
+ p.pu[LUMA_16x12].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
67
+ p.pu[LUMA_16x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x8_ ## cpu); \
68
+ p.pu[LUMA_16x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x4_ ## cpu); \
69
+ p.pu[LUMA_12x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
70
+ p.pu[LUMA_8x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x32_ ## cpu); \
71
+ p.pu[LUMA_8x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x16_ ## cpu); \
72
+ p.pu[LUMA_8x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x8_ ## cpu); \
73
+ p.pu[LUMA_8x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x4_ ## cpu);
74
#define PIXEL_AVG_W4(cpu) \
75
- p.pu[LUMA_4x4].pixelavg_pp = PFX(pixel_avg_4x4_ ## cpu); \
76
- p.pu[LUMA_4x8].pixelavg_pp = PFX(pixel_avg_4x8_ ## cpu); \
77
- p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_4x16_ ## cpu);
78
-
79
+ p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x4_ ## cpu); \
80
+ p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x8_ ## cpu); \
81
+ p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x16_ ## cpu); \
82
+ p.pu[LUMA_4x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x4_ ## cpu); \
83
+ p.pu[LUMA_4x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x8_ ## cpu); \
84
+ p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x16_ ## cpu);
85
#define CHROMA_420_FILTERS(cpu) \
86
ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
87
ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
88
89
90
#define LUMA_PIXELSUB(cpu) \
91
p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_ ## cpu); \
92
- p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_ ## cpu); \
93
+ p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_ ## cpu); \
94
+ p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_ ## cpu); \
95
ALL_LUMA_CU(sub_ps, pixel_sub_ps, cpu); \
96
- ALL_LUMA_CU(add_ps, pixel_add_ps, cpu);
97
+ ALL_LUMA_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
98
+ ALL_LUMA_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
99
100
#define CHROMA_420_PIXELSUB_PS(cpu) \
101
ALL_CHROMA_420_CU(sub_ps, pixel_sub_ps, cpu); \
102
- ALL_CHROMA_420_CU(add_ps, pixel_add_ps, cpu);
103
+ ALL_CHROMA_420_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
104
+ ALL_CHROMA_420_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
105
106
#define CHROMA_422_PIXELSUB_PS(cpu) \
107
ALL_CHROMA_422_CU(sub_ps, pixel_sub_ps, cpu); \
108
- ALL_CHROMA_422_CU(add_ps, pixel_add_ps, cpu);
109
+ ALL_CHROMA_422_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
110
+ ALL_CHROMA_422_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
111
112
#define LUMA_VAR(cpu) ALL_LUMA_CU(var, pixel_var, cpu)
113
114
-#define LUMA_ADDAVG(cpu) ALL_LUMA_PU(addAvg, addAvg, cpu); p.pu[LUMA_4x4].addAvg = PFX(addAvg_4x4_ ## cpu)
115
-#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg, addAvg, cpu);
116
-#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg, addAvg, cpu);
117
+#define LUMA_ADDAVG(cpu) ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, cpu); \
118
+ p.pu[LUMA_4x4].addAvg[NONALIGNED] = PFX(addAvg_4x4_ ## cpu); \
119
+ ALL_LUMA_PU(addAvg[ALIGNED], addAvg, cpu); \
120
+ p.pu[LUMA_4x4].addAvg[ALIGNED] = PFX(addAvg_4x4_ ## cpu)
121
+#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, cpu); \
122
+ ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, cpu)
123
+#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, cpu); \
124
+ ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, cpu)
125
126
#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
127
p.cu[BLOCK_4x4].intra_pred[mode] = PFX(intra_pred_ang4_ ## fno ## _ ## cpu); \
128
129
ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
130
ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu);
131
132
+#define ASSIGN2(func, fname) \
133
+ func[ALIGNED] = PFX(fname); \
134
+ func[NONALIGNED] = PFX(fname)
135
+
136
namespace X265_NS {
137
// private x265 namespace
138
139
140
141
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
142
{
143
-#if !defined(X86_64)
144
-#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
145
-#endif
146
-
147
#if X86_64
148
p.scanPosLast = PFX(scanPosLast_x64);
149
#endif
150
151
CHROMA_422_VERT_FILTERS(_sse2);
152
CHROMA_444_VERT_FILTERS(sse2);
153
154
+#if X86_64
155
ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
156
p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
157
ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
158
p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
159
ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
160
ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
161
+#endif
162
163
p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
164
p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
165
- PIXEL_AVG(sse2);
166
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_sse2);
167
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_sse2);
168
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_sse2);
169
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_sse2);
170
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_sse2);
171
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_sse2);
172
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_sse2);
173
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_sse2);
174
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_sse2);
175
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_sse2);
176
+ ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_sse2);
177
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_sse2);
178
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_sse2);
179
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_sse2);
180
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_sse2);
181
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_sse2);
182
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_sse2);
183
+ ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_sse2);
184
+#if X86_64
185
+ ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_sse2);
186
+ ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_sse2);
187
+ ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_sse2);
188
+ ASSIGN2(p.pu[LUMA_8x4].pixelavg_pp, pixel_avg_8x4_sse2);
189
+#endif
190
PIXEL_AVG_W4(mmx2);
191
LUMA_VAR(sse2);
192
193
194
- ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
195
+ ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
196
+ ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
197
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
198
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
199
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
200
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
201
x265_2.7.tar.gz/source/common/x86/blockcopy8.asm -> x265_2.9.tar.gz/source/common/x86/blockcopy8.asm
Changed
201
1
2
%include "x86inc.asm"
3
%include "x86util.asm"
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
+
8
+ALIGN 64
9
+const shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
10
11
cextern pb_4
12
cextern pb_1
13
14
BLOCKCOPY_PP_W64_H4_avx 64, 48
15
BLOCKCOPY_PP_W64_H4_avx 64, 64
16
17
+;----------------------------------------------------------------------------------------------
18
+; blockcopy_pp avx512 code start
19
+;----------------------------------------------------------------------------------------------
20
+%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
21
+movu m0, [r2]
22
+movu m1, [r2 + r3]
23
+movu m2, [r2 + 2 * r3]
24
+movu m3, [r2 + r4]
25
+
26
+movu [r0] , m0
27
+movu [r0 + r1] , m1
28
+movu [r0 + 2 * r1] , m2
29
+movu [r0 + r5] , m3
30
+%endmacro
31
+
32
+%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0
33
+movu ym0, [r2]
34
+vinserti32x8 m0, [r2 + r3], 1
35
+movu ym1, [r2 + 2 * r3]
36
+vinserti32x8 m1, [r2 + r4], 1
37
+
38
+movu [r0] , ym0
39
+vextracti32x8 [r0 + r1] , m0, 1
40
+movu [r0 + 2 * r1] , ym1
41
+vextracti32x8 [r0 + r5] , m1, 1
42
+%endmacro
43
+
44
+;----------------------------------------------------------------------------------------------
45
+; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
46
+;----------------------------------------------------------------------------------------------
47
+%macro BLOCKCOPY_PP_W64_H4_avx512 1
48
+INIT_ZMM avx512
49
+cglobal blockcopy_pp_64x%1, 4, 6, 4
50
+lea r4, [3 * r3]
51
+lea r5, [3 * r1]
52
+
53
+%rep %1/4 - 1
54
+PROCESS_BLOCKCOPY_PP_64X4_avx512
55
+lea r2, [r2 + 4 * r3]
56
+lea r0, [r0 + 4 * r1]
57
+%endrep
58
+
59
+PROCESS_BLOCKCOPY_PP_64X4_avx512
60
+RET
61
+%endmacro
62
+
63
+BLOCKCOPY_PP_W64_H4_avx512 16
64
+BLOCKCOPY_PP_W64_H4_avx512 32
65
+BLOCKCOPY_PP_W64_H4_avx512 48
66
+BLOCKCOPY_PP_W64_H4_avx512 64
67
+
68
+%macro BLOCKCOPY_PP_W32_H4_avx512 1
69
+INIT_ZMM avx512
70
+cglobal blockcopy_pp_32x%1, 4, 6, 2
71
+ lea r4, [3 * r3]
72
+ lea r5, [3 * r1]
73
+
74
+%rep %1/4 - 1
75
+ PROCESS_BLOCKCOPY_PP_32X4_avx512
76
+ lea r2, [r2 + 4 * r3]
77
+ lea r0, [r0 + 4 * r1]
78
+%endrep
79
+ PROCESS_BLOCKCOPY_PP_32X4_avx512
80
+ RET
81
+%endmacro
82
+
83
+BLOCKCOPY_PP_W32_H4_avx512 8
84
+BLOCKCOPY_PP_W32_H4_avx512 16
85
+BLOCKCOPY_PP_W32_H4_avx512 24
86
+BLOCKCOPY_PP_W32_H4_avx512 32
87
+BLOCKCOPY_PP_W32_H4_avx512 48
88
+BLOCKCOPY_PP_W32_H4_avx512 64
89
+;----------------------------------------------------------------------------------------------
90
+; blockcopy_pp avx512 code end
91
+;----------------------------------------------------------------------------------------------
92
+
93
;-----------------------------------------------------------------------------
94
; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
95
;-----------------------------------------------------------------------------
96
97
98
BLOCKCOPY_SP_W64_H4_avx2 64, 64
99
100
+%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0
101
+ movu m0, [r2]
102
+ movu m1, [r2 + 64]
103
+ movu m2, [r2 + r3]
104
+ movu m3, [r2 + r3 + 64]
105
+
106
+ packuswb m0, m1
107
+ packuswb m2, m3
108
+ vpermq m0, m4, m0
109
+ vpermq m2, m4, m2
110
+ movu [r0], m0
111
+ movu [r0 + r1], m2
112
+
113
+ movu m0, [r2 + 2 * r3]
114
+ movu m1, [r2 + 2 * r3 + 64]
115
+ movu m2, [r2 + r4]
116
+ movu m3, [r2 + r4 + 64]
117
+
118
+ packuswb m0, m1
119
+ packuswb m2, m3
120
+ vpermq m0, m4, m0
121
+ vpermq m2, m4, m2
122
+ movu [r0 + 2 * r1], m0
123
+ movu [r0 + r5], m2
124
+%endmacro
125
+
126
+%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0
127
+ movu m0, [r2]
128
+ movu m1, [r2 + r3]
129
+ movu m2, [r2 + 2 * r3]
130
+ movu m3, [r2 + r4]
131
+
132
+ packuswb m0, m1
133
+ packuswb m2, m3
134
+ vpermq m0, m4, m0
135
+ vpermq m2, m4, m2
136
+ movu [r0], ym0
137
+ vextracti32x8 [r0 + r1], m0, 1
138
+ movu [r0 + 2 * r1], ym2
139
+ vextracti32x8 [r0 + r5], m2, 1
140
+%endmacro
141
+
142
+;-----------------------------------------------------------------------------
143
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
144
+;-----------------------------------------------------------------------------
145
+INIT_ZMM avx512
146
+cglobal blockcopy_sp_64x64, 4, 6, 5
147
+ mova m4, [shuf1_avx512]
148
+ add r3, r3
149
+ lea r4, [3 * r3]
150
+ lea r5, [3 * r1]
151
+
152
+%rep 15
153
+ PROCESS_BLOCKCOPY_SP_64x4_AVX512
154
+ lea r0, [r0 + 4 * r1]
155
+ lea r2, [r2 + 4 * r3]
156
+%endrep
157
+ PROCESS_BLOCKCOPY_SP_64x4_AVX512
158
+ RET
159
+
160
+%macro BLOCKCOPY_SP_32xN_AVX512 1
161
+INIT_ZMM avx512
162
+cglobal blockcopy_sp_32x%1, 4, 6, 5
163
+ mova m4, [shuf1_avx512]
164
+ add r3, r3
165
+ lea r4, [3 * r3]
166
+ lea r5, [3 * r1]
167
+
168
+%rep %1/4 - 1
169
+ PROCESS_BLOCKCOPY_SP_32x4_AVX512
170
+ lea r0, [r0 + 4 * r1]
171
+ lea r2, [r2 + 4 * r3]
172
+%endrep
173
+ PROCESS_BLOCKCOPY_SP_32x4_AVX512
174
+ RET
175
+%endmacro
176
+
177
+BLOCKCOPY_SP_32xN_AVX512 32
178
+BLOCKCOPY_SP_32xN_AVX512 64
179
+
180
;-----------------------------------------------------------------------------
181
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
182
;-----------------------------------------------------------------------------
183
184
movu [r0 + r3 + 32], m0
185
RET
186
187
+;--------------------------------------------------------------------
188
+; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
189
+;--------------------------------------------------------------------
190
+INIT_ZMM avx512
191
+cglobal blockfill_s_32x32, 3, 4, 1
192
+add r1, r1
193
+lea r3, [3 * r1]
194
+movd xm0, r2d
195
+vpbroadcastw m0, xm0
196
+
197
+%rep 8
198
+movu [r0], m0
199
+movu [r0 + r1], m0
200
+movu [r0 + 2 * r1], m0
201
x265_2.7.tar.gz/source/common/x86/blockcopy8.h -> x265_2.9.tar.gz/source/common/x86/blockcopy8.h
Changed
51
1
2
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4
FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
5
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
6
7
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
8
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
9
FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
10
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
11
12
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
13
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
14
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
15
-
16
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
17
+FUNCDEF_TU_S(void, cpy1Dto2D_shl_aligned, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
18
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
19
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
20
FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
21
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
22
23
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
24
FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
25
FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
26
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride);
27
28
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
29
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
30
+FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
31
+FUNCDEF_TU(void, blockfill_s_aligned, avx512, int16_t* dst, intptr_t dstride, int16_t val);
32
33
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
34
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
35
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx512, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
36
37
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx512, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
41
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
42
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
43
FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
44
+FUNCDEF_PU(void, blockcopy_sp, avx512, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
45
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
50
#endif // ifndef X265_I386_PIXEL_H
51
x265_2.7.tar.gz/source/common/x86/const-a.asm -> x265_2.9.tar.gz/source/common/x86/const-a.asm
Changed
10
1
2
3
%include "x86inc.asm"
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
8
;; 8-bit constants
9
10
x265_2.7.tar.gz/source/common/x86/cpu-a.asm -> x265_2.9.tar.gz/source/common/x86/cpu-a.asm
Changed
46
1
2
RET
3
4
;-----------------------------------------------------------------------------
5
-; void cpu_xgetbv( int op, int *eax, int *edx )
6
+; uint64_t cpu_xgetbv( int xcr )
7
;-----------------------------------------------------------------------------
8
-cglobal cpu_xgetbv, 3,7
9
- push r2
10
- push r1
11
- mov ecx, r0d
12
+cglobal cpu_xgetbv
13
+ movifnidn ecx, r0m
14
xgetbv
15
- pop r4
16
- mov [r4], eax
17
- pop r4
18
- mov [r4], edx
19
- RET
20
+%if ARCH_X86_64
21
+ shl rdx, 32
22
+ or rax, rdx
23
+%endif
24
+ ret
25
26
%if ARCH_X86_64
27
28
29
%if WIN64
30
sub rsp, 32 ; shadow space
31
%endif
32
- and rsp, ~31
33
+ and rsp, ~(STACK_ALIGNMENT - 1)
34
mov rax, r0
35
mov r0, r1
36
mov r1, r2
37
38
push ebp
39
mov ebp, esp
40
sub esp, 12
41
- and esp, ~31
42
+ and esp, ~(STACK_ALIGNMENT - 1)
43
mov ecx, [ebp+8]
44
mov edx, [ebp+12]
45
mov [esp], edx
46
x265_2.7.tar.gz/source/common/x86/dct8.asm -> x265_2.9.tar.gz/source/common/x86/dct8.asm
Changed
201
1
2
3
%include "x86inc.asm"
4
%include "x86util.asm"
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
+
8
+tab_dct32: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
9
+ dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
10
+ dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
11
+ dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
12
+ dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
13
+ dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
14
+ dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
15
+ dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
16
+ dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
17
+ dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
18
+ dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
19
+ dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
20
+ dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
21
+ dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
22
+ dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
23
+ dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
24
+ dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
25
+ dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
26
+ dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
27
+ dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
28
+ dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
29
+ dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
30
+ dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
31
+ dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
32
+ dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
33
+ dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
34
+ dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
35
+ dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
36
+ dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
37
+ dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
38
+ dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
39
+ dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
40
+tab_dct16: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
41
+ dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
42
+ dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
43
+ dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
44
+ dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
45
+ dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
46
+ dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
47
+ dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
48
+ dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
49
+ dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
50
+ dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
51
+ dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
52
+ dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
53
+ dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
54
+ dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
55
+ dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
56
+
57
+dct16_shuf_AVX512: dq 0, 1, 8, 9, 4, 5, 12, 13
58
+dct16_shuf1_AVX512: dq 2, 3, 10, 11, 6, 7, 14, 15
59
+dct16_shuf3_AVX512: dq 0, 1, 4, 5, 8, 9, 12, 13
60
+dct16_shuf4_AVX512: dq 2, 3, 6, 7, 10, 11, 14, 15
61
+dct16_shuf2_AVX512: dd 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
62
+
63
+dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
64
+dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
65
+dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
66
+dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
67
+dct16_shuf7_AVX512: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
68
+dct16_shuf9_AVX512: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
69
+
70
+dct32_shuf_AVX512: dd 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20 , 21, 24, 25, 28, 29
71
+dct32_shuf4_AVX512: times 2 dd 0, 4, 8, 12, 0, 4, 8, 12
72
+dct32_shuf5_AVX512: dd 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0
73
+dct32_shuf6_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0
74
+dct32_shuf7_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1
75
+dct32_shuf8_AVX512: dd -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
76
+dct16_shuf5_AVX512: dw 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
77
+dct16_shuf6_AVX512: dw 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
78
+dct16_shuf8_AVX512: dw 20, 0, 4, 2, 28, 8, 6, 10, 22, 16, 12, 18, 30, 24, 14, 26
79
+
80
+dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
81
+dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28
82
+dct32_shuf1_AVX512: dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
83
+dct32_shuf2_AVX512: dw 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 15, 14, 13, 12, 11, 10, 9, 8, 31, 30, 29, 28, 27, 26, 25, 24
84
+dct32_shuf3_AVX512: times 2 dw 0, 8, 16, 24, 2, 10, 18, 26
85
+
86
+dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
87
+dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
88
+
89
tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
90
dw 89, 75, 50, 18, -18, -50, -75, -89
91
dw 83, 36, -36, -83, -83, -36, 36, 83
92
93
dw 36, -83, 83, -36, -36, 83, -83, 36
94
dw 18, -50, 75, -89, 89, -75, 50, -18
95
96
-dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
97
+tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18
98
+ dw 83, 36, -36, -83, 75, -18, -89, -50
99
+ dw 64, -64, -64, 64, 50, -89, 18, 75
100
+ dw 36, -83, 83, -36, 18, -50, 75, -89
101
102
tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
103
dw 90, 87, 80, 70, 57, 43, 25, 9
104
105
dw 18, -50, 75, -89, 89, -75, 50, -18
106
dw 9, -25, 43, -57, 70, -80, 87, -90
107
108
-
109
tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
110
dw -9, -25, -43, -57, -70, -80, -87, -90
111
dw -89, -75, -50, -18, 18, 50, 75, 89
112
113
times 4 dw 50, -89, 18, 75
114
times 4 dw 18, -50, 75, -89
115
116
+avx512_idct8_1: times 8 dw 64, 83, 64, 36
117
+ times 8 dw 64, 36, -64, -83
118
+ times 8 dw 64, -36, -64, 83
119
+ times 8 dw 64, -83, 64, -36
120
+
121
+avx512_idct8_2: times 8 dw 89, 75, 50, 18
122
+ times 8 dw 75, -18, -89, -50
123
+ times 8 dw 50, -89, 18, 75
124
+ times 8 dw 18, -50, 75, -89
125
+
126
+avx512_idct8_3: dw 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
127
+ dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83
128
+ dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83
129
+ dw -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
130
+ dw 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89
131
+ dw 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75
132
+ dw 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50
133
+ dw -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89
134
+
135
idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7
136
137
const idct8_shuf2, times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
138
139
idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
140
141
+
142
+idct8_avx512_shuf3: times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
143
+
144
tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9
145
dw 87, 57, 9, -43, -80, -90, -70, -25
146
dw 80, 9, -70, -87, -25, 57, 90, 43
147
148
idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7
149
150
idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
151
+idct16_shuff2: dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
152
+idct16_shuff3: dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
153
+idct16_shuff4: dd 0, 8, 2, 10, 4, 12, 6, 14
154
+idct16_shuff5: dd 1, 9, 3, 11, 5, 13, 7, 15
155
+
156
+
157
+tab_AVX512_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
158
+ dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
159
+ dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
160
+ dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
161
+
162
+tab_AVX512_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
163
+ dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
164
+ dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
165
+ dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
166
+
167
+idct16_AVX512_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15
168
+
169
+idct16_AVX512_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13
170
+
171
+idct16_AVX512_shuff2: dq 0, 1, 8, 9, 4, 5, 12, 13
172
+idct16_AVX512_shuff3: dq 2, 3, 10, 11, 6, 7, 14, 15
173
+idct16_AVX512_shuff4: dq 4, 5, 12, 13, 0, 1, 8, 9
174
+idct16_AVX512_shuff5: dq 6, 7, 14, 15, 2, 3, 10, 11
175
+idct16_AVX512_shuff6: times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
176
177
tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
178
dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
179
180
dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
181
dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
182
183
+
184
+tab_idct32_AVX512_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 90 ,90 ,88 ,85, 82, 78, 73, 67, 90, 82, 67, 46, 22, -4, -31, -54, 90, 82, 67, 46, 22, -4, -31, -54
185
+ dw 61, 54, 46, 38, 31, 22, 13, 4, 61, 54, 46, 38, 31, 22, 13, 4, -73, -85, -90, -88, -78, -61, -38, -13, -73, -85, -90, -88, -78, -61, -38, -13
186
+ dw 88, 67, 31, -13, -54, -82, -90, -78, 88, 67, 31, -13, -54, -82, -90, -78, 85, 46, -13, -67, -90, -73, -22, 38, 85, 46, -13, -67, -90, -73, -22, 38
187
+ dw -46, -4, 38, 73, 90, 85, 61, 22, -46, -4, 38, 73, 90, 85, 61, 22, 82, 88, 54, -4, -61, -90, -78, -31, 82, 88, 54, -4, -61, -90, -78, -31
188
+ dw 82, 22, -54, -90, -61, 13, 78, 85, 82, 22, -54, -90, -61, 13, 78, 85, 78, -4, -82, -73, 13, 85, 67, -22, 78, -4, -82, -73, 13, 85, 67, -22
189
+ dw 31, -46, -90, -67, 4, 73, 88, 38, 31, -46, -90, -67, 4, 73, 88, 38, -88, -61, 31, 90, 54, -38, -90, -46, -88, -61, 31, 90, 54, -38, -90, -46
190
+ dw 73, -31, -90, -22, 78, 67, -38, -90, 73, -31, -90, -22, 78, 67, -38, -90, 67, -54, -78, 38, 85, -22, -90, 4, 67, -54, -78, 38, 85, -22, -90, 4
191
+ dw -13, 82, 61, -46, -88, -4, 85, 54, -13, 82, 61, -46, -88, -4, 85, 54, 90, 13, -88, -31, 82, 46, -73, -61, 90, 13, -88, -31, 82, 46, -73, -61
192
+
193
+tab_idct32_AVX512_5: dw 4, -13, 22, -31, 38, -46, 54, -61, 4, -13, 22, -31, 38, -46, 54, -61, 13, -38, 61, -78, 88, -90, 85, -73, 13, -38, 61, -78, 88, -90, 85, -73
194
+ dw 67, -73, 78, -82, 85, -88, 90, -90, 67, -73, 78, -82, 85, -88, 90, -90, 54, -31, 4, 22, -46, 67, -82, 90, 54, -31, 4, 22, -46, 67, -82, 90
195
+ dw 22, -61, 85, -90, 73, -38, -4, 46, 22, -61, 85, -90, 73, -38, -4, 46, 31, -78, 90, -61, 4, 54, -88, 82, 31, -78, 90, -61, 4, 54, -88, 82
196
+ dw -78, 90, -82, 54, -13, -31, 67, -88, -78, 90, -82, 54, -13, -31, 67, -88, -38, -22, 73, -90, 67, -13, -46, 85, -38, -22, 73, -90, 67, -13, -46, 85
197
+ dw 38, -88, 73, -4, -67, 90, -46, -31, 38, -88, 73, -4, -67, 90, -46, -31, 46, -90, 38, 54, -90, 31, 61, -88, 46, -90, 38, 54, -90, 31, 61, -88
198
+ dw 85, -78, 13, 61, -90, 54, 22, -82, 85, -78, 13, 61, -90, 54, 22, -82, 22, 67, -85, 13, 73, -82, 4, 78, 22, 67, -85, 13, 73, -82, 4, 78
199
+ dw 54, -85, -4, 88, -46, -61, 82, 13, 54, -85, -4, 88, -46, -61, 82, 13, 61, -73, -46, 82, 31, -88, -13, 90, 61, -73, -46, 82, 31, -88, -13, 90
200
+ dw -90, 38, 67, -78, -22, 90, -31, -73, -90, 38, 67, -78, -22, 90, -31, -73, -4, -90, 22, 85, -38, -78, 54, 67, -4, -90, 22, 85, -38, -78, 54, 67
201
x265_2.7.tar.gz/source/common/x86/dct8.h -> x265_2.9.tar.gz/source/common/x86/dct8.h
Changed
26
1
2
FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
3
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
4
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
5
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
6
+FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
7
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
8
+FUNCDEF_TU_S2(void, psyRdoQuant_1p, avx2, int16_t* m_resiDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, uint32_t blkPos);
9
+FUNCDEF_TU_S2(void, psyRdoQuant_2p, avx2, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
10
11
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
12
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
13
14
void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
15
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
16
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
17
-
18
+void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
19
+void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
20
+void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
21
+void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
22
+void PFX(idct32_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
23
+void PFX(dct32_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
24
+void PFX(dct16_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
25
#endif // ifndef X265_DCT8_H
26
x265_2.7.tar.gz/source/common/x86/h-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/h-ipfilter16.asm
Changed
201
1
2
3
h_pd_524800: times 8 dd 524800
4
5
-tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
6
+h_tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
7
dw -1, 4, -10, 58, 17, -5, 1, 0
8
dw -1, 4, -11, 40, 40, -11, 4, -1
9
dw 0, 1, -5, 17, 58, -10, 4, -1
10
11
db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
12
13
const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
14
- db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
15
-
16
+ db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
17
+
18
+ALIGN 64
19
+interp8_hpp_shuf1_load_avx512: times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
20
+interp8_hpp_shuf2_load_avx512: times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
21
+interp8_hpp_shuf1_store_avx512: times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
22
+
23
SECTION .text
24
cextern pd_8
25
cextern pd_32
26
27
add r3d, r3d
28
29
%ifdef PIC
30
- lea r6, [tab_LumaCoeff]
31
+ lea r6, [h_tab_LumaCoeff]
32
mova m0, [r6 + r4]
33
%else
34
- mova m0, [tab_LumaCoeff + r4]
35
+ mova m0, [h_tab_LumaCoeff + r4]
36
%endif
37
38
%ifidn %3, pp
39
40
;------------------------------------------------------------------------------------------------------------
41
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
42
;------------------------------------------------------------------------------------------------------------
43
- FILTER_HOR_LUMA_sse2 4, 4, pp
44
+%if ARCH_X86_64
45
+ FILTER_HOR_LUMA_sse2 4, 4, pp
46
FILTER_HOR_LUMA_sse2 4, 8, pp
47
FILTER_HOR_LUMA_sse2 4, 16, pp
48
FILTER_HOR_LUMA_sse2 8, 4, pp
49
50
FILTER_HOR_LUMA_sse2 64, 32, ps
51
FILTER_HOR_LUMA_sse2 64, 48, ps
52
FILTER_HOR_LUMA_sse2 64, 64, ps
53
+%endif
54
55
;-----------------------------------------------------------------------------
56
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
57
58
add r3, r3
59
60
%ifdef PIC
61
- lea r6, [tab_LumaCoeff]
62
+ lea r6, [h_tab_LumaCoeff]
63
mova m0, [r6 + r4]
64
%else
65
- mova m0, [tab_LumaCoeff + r4]
66
+ mova m0, [h_tab_LumaCoeff + r4]
67
%endif
68
69
%ifidn %3, pp
70
71
shl r4d, 4
72
73
%ifdef PIC
74
- lea r6, [tab_LumaCoeff]
75
+ lea r6, [h_tab_LumaCoeff]
76
mova m0, [r6 + r4]
77
%else
78
- mova m0, [tab_LumaCoeff + r4]
79
+ mova m0, [h_tab_LumaCoeff + r4]
80
%endif
81
82
%ifidn %3, pp
83
84
shl r4d, 4
85
86
%ifdef PIC
87
- lea r6, [tab_LumaCoeff]
88
+ lea r6, [h_tab_LumaCoeff]
89
mova m0, [r6 + r4]
90
%else
91
- mova m0, [tab_LumaCoeff + r4]
92
+ mova m0, [h_tab_LumaCoeff + r4]
93
%endif
94
%ifidn %3, pp
95
mova m1, [INTERP_OFFSET_PP]
96
97
shl r4d, 4
98
99
%ifdef PIC
100
- lea r6, [tab_LumaCoeff]
101
+ lea r6, [h_tab_LumaCoeff]
102
mova m0, [r6 + r4]
103
%else
104
- mova m0, [tab_LumaCoeff + r4]
105
+ mova m0, [h_tab_LumaCoeff + r4]
106
%endif
107
108
%ifidn %3, pp
109
110
shl r4d, 4
111
112
%ifdef PIC
113
- lea r6, [tab_LumaCoeff]
114
+ lea r6, [h_tab_LumaCoeff]
115
mova m0, [r6 + r4]
116
%else
117
- mova m0, [tab_LumaCoeff + r4]
118
+ mova m0, [h_tab_LumaCoeff + r4]
119
%endif
120
%ifidn %3, pp
121
mova m1, [pd_32]
122
123
mov r4d, r4m
124
shl r4d, 4
125
%ifdef PIC
126
- lea r5, [tab_LumaCoeff]
127
+ lea r5, [h_tab_LumaCoeff]
128
vpbroadcastq m0, [r5 + r4]
129
vpbroadcastq m1, [r5 + r4 + 8]
130
%else
131
- vpbroadcastq m0, [tab_LumaCoeff + r4]
132
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
133
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
134
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
135
%endif
136
lea r6, [pw_pixel_max]
137
mova m3, [interp8_hpp_shuf]
138
139
;-------------------------------------------------------------------------------------------------------------
140
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
141
;-------------------------------------------------------------------------------------------------------------
142
-%macro FILTER_HOR_LUMA_W8 1
143
+%macro PROCESS_IPFILTER_LUMA_PP_8x2_AVX2 0
144
+ movu xm7, [r0]
145
+ movu xm8, [r0 + 8]
146
+ vinserti128 m7, m7, [r0 + r1], 1
147
+ vinserti128 m8, m8, [r0 + r1 + 8], 1
148
+ pshufb m10, m7, m14
149
+ pshufb m7, m13
150
+ pshufb m11, m8, m14
151
+ pshufb m8, m13
152
+
153
+ pmaddwd m7, m0
154
+ pmaddwd m10, m1
155
+ paddd m7, m10
156
+ pmaddwd m10, m11, m3
157
+ pmaddwd m9, m8, m2
158
+ paddd m10, m9
159
+ paddd m7, m10
160
+ paddd m7, m4
161
+ psrad m7, INTERP_SHIFT_PP
162
+
163
+ movu xm9, [r0 + 16]
164
+ vinserti128 m9, m9, [r0 + r1 + 16], 1
165
+ pshufb m10, m9, m14
166
+ pshufb m9, m13
167
+ pmaddwd m8, m0
168
+ pmaddwd m11, m1
169
+ paddd m8, m11
170
+ pmaddwd m10, m3
171
+ pmaddwd m9, m2
172
+ paddd m9, m10
173
+ paddd m8, m9
174
+ paddd m8, m4
175
+ psrad m8, INTERP_SHIFT_PP
176
+
177
+ packusdw m7, m8
178
+ pshufb m7, m12
179
+ CLIPW m7, m5, m6
180
+ movu [r2], xm7
181
+ vextracti128 [r2 + r3], m7, 1
182
+%endmacro
183
+
184
+%macro IPFILTER_LUMA_AVX2_8xN 1
185
INIT_YMM avx2
186
-cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
187
- add r1d, r1d
188
- add r3d, r3d
189
- sub r0, 6
190
- mov r4d, r4m
191
- shl r4d, 4
192
+cglobal interp_8tap_horiz_pp_8x%1, 5,6,15
193
+ shl r1d, 1
194
+ shl r3d, 1
195
+ sub r0, 6
196
+ mov r4d, r4m
197
+ shl r4d, 4
198
+
199
%ifdef PIC
200
- lea r5, [tab_LumaCoeff]
201
x265_2.7.tar.gz/source/common/x86/h4-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/h4-ipfilter16.asm
Changed
201
1
2
3
tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
4
5
-tab_ChromaCoeff: dw 0, 64, 0, 0
6
+h4_tab_ChromaCoeff: dw 0, 64, 0, 0
7
dw -2, 58, 10, -2
8
dw -4, 54, 16, -2
9
dw -6, 46, 28, -4
10
11
add r4d, r4d
12
13
%ifdef PIC
14
- lea r6, [tab_ChromaCoeff]
15
+ lea r6, [h4_tab_ChromaCoeff]
16
movddup m0, [r6 + r4 * 4]
17
%else
18
- movddup m0, [tab_ChromaCoeff + r4 * 4]
19
+ movddup m0, [h4_tab_ChromaCoeff + r4 * 4]
20
%endif
21
22
%ifidn %3, ps
23
24
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
25
;-----------------------------------------------------------------------------
26
27
+%if ARCH_X86_64
28
FILTER_HOR_CHROMA_sse3 2, 4, pp
29
FILTER_HOR_CHROMA_sse3 2, 8, pp
30
FILTER_HOR_CHROMA_sse3 2, 16, pp
31
32
FILTER_HOR_CHROMA_sse3 64, 32, ps
33
FILTER_HOR_CHROMA_sse3 64, 48, ps
34
FILTER_HOR_CHROMA_sse3 64, 64, ps
35
+%endif
36
37
%macro FILTER_W2_2 1
38
movu m3, [r0]
39
40
add r4d, r4d
41
42
%ifdef PIC
43
- lea r%6, [tab_ChromaCoeff]
44
+ lea r%6, [h4_tab_ChromaCoeff]
45
movh m0, [r%6 + r4 * 4]
46
%else
47
- movh m0, [tab_ChromaCoeff + r4 * 4]
48
+ movh m0, [h4_tab_ChromaCoeff + r4 * 4]
49
%endif
50
51
punpcklqdq m0, m0
52
53
add r4d, r4d
54
55
%ifdef PIC
56
- lea r%4, [tab_ChromaCoeff]
57
+ lea r%4, [h4_tab_ChromaCoeff]
58
movh m0, [r%4 + r4 * 4]
59
%else
60
- movh m0, [tab_ChromaCoeff + r4 * 4]
61
+ movh m0, [h4_tab_ChromaCoeff + r4 * 4]
62
%endif
63
64
punpcklqdq m0, m0
65
66
sub r0, 2
67
mov r4d, r4m
68
%ifdef PIC
69
- lea r5, [tab_ChromaCoeff]
70
+ lea r5, [h4_tab_ChromaCoeff]
71
vpbroadcastq m0, [r5 + r4 * 8]
72
%else
73
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
74
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
75
%endif
76
mova m1, [h4_interp8_hpp_shuf]
77
vpbroadcastd m2, [pd_32]
78
79
sub r0, 2
80
mov r4d, r4m
81
%ifdef PIC
82
- lea r5, [tab_ChromaCoeff]
83
+ lea r5, [h4_tab_ChromaCoeff]
84
vpbroadcastq m0, [r5 + r4 * 8]
85
%else
86
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
87
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
88
%endif
89
mova m1, [h4_interp8_hpp_shuf]
90
vpbroadcastd m2, [pd_32]
91
92
sub r0, 2
93
mov r4d, r4m
94
%ifdef PIC
95
- lea r5, [tab_ChromaCoeff]
96
+ lea r5, [h4_tab_ChromaCoeff]
97
vpbroadcastq m0, [r5 + r4 * 8]
98
%else
99
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
100
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
101
%endif
102
mova m1, [h4_interp8_hpp_shuf]
103
vpbroadcastd m2, [pd_32]
104
105
sub r0, 2
106
mov r4d, r4m
107
%ifdef PIC
108
- lea r5, [tab_ChromaCoeff]
109
+ lea r5, [h4_tab_ChromaCoeff]
110
vpbroadcastq m0, [r5 + r4 * 8]
111
%else
112
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
113
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
114
%endif
115
mova m1, [h4_interp8_hpp_shuf]
116
vpbroadcastd m2, [pd_32]
117
118
sub r0, 2
119
mov r4d, r4m
120
%ifdef PIC
121
- lea r5, [tab_ChromaCoeff]
122
+ lea r5, [h4_tab_ChromaCoeff]
123
vpbroadcastq m0, [r5 + r4 * 8]
124
%else
125
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
126
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
127
%endif
128
mova m1, [h4_interp8_hpp_shuf]
129
vpbroadcastd m2, [pd_32]
130
131
sub r0, 2
132
mov r4d, r4m
133
%ifdef PIC
134
- lea r5, [tab_ChromaCoeff]
135
+ lea r5, [h4_tab_ChromaCoeff]
136
vpbroadcastq m0, [r5 + r4 * 8]
137
%else
138
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
139
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
140
%endif
141
mova m1, [h4_interp8_hpp_shuf]
142
vpbroadcastd m2, [pd_32]
143
144
sub r0, 2
145
mov r4d, r4m
146
%ifdef PIC
147
- lea r5, [tab_ChromaCoeff]
148
+ lea r5, [h4_tab_ChromaCoeff]
149
vpbroadcastq m0, [r5 + r4 * 8]
150
%else
151
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
152
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
153
%endif
154
mova m1, [h4_interp8_hpp_shuf]
155
vpbroadcastd m2, [pd_32]
156
157
sub r0, 2
158
mov r4d, r4m
159
%ifdef PIC
160
- lea r5, [tab_ChromaCoeff]
161
+ lea r5, [h4_tab_ChromaCoeff]
162
vpbroadcastq m0, [r5 + r4 * 8]
163
%else
164
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
165
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
166
%endif
167
mova m1, [h4_interp8_hpp_shuf]
168
vpbroadcastd m2, [pd_32]
169
170
sub r0, 2
171
mov r4d, r4m
172
%ifdef PIC
173
- lea r5, [tab_ChromaCoeff]
174
+ lea r5, [h4_tab_ChromaCoeff]
175
vpbroadcastq m0, [r5 + r4 * 8]
176
%else
177
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
178
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
179
%endif
180
mova m1, [h4_interp8_hpp_shuf]
181
vpbroadcastd m2, [pd_32]
182
183
sub r0, 2
184
mov r4d, r4m
185
%ifdef PIC
186
- lea r5, [tab_ChromaCoeff]
187
+ lea r5, [h4_tab_ChromaCoeff]
188
vpbroadcastq m0, [r5 + r4 * 8]
189
%else
190
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
191
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
192
%endif
193
mova m1, [h4_interp8_hpp_shuf]
194
vpbroadcastd m2, [pd_32]
195
196
mov r5d, r5m
197
198
%ifdef PIC
199
- lea r6, [tab_ChromaCoeff]
200
+ lea r6, [h4_tab_ChromaCoeff]
201
x265_2.7.tar.gz/source/common/x86/intrapred.h -> x265_2.9.tar.gz/source/common/x86/intrapred.h
Changed
19
1
2
FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
3
FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
4
FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
5
-
6
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
7
FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
8
FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
9
FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
10
11
DECL_ALL(ssse3);
12
DECL_ALL(sse4);
13
DECL_ALL(avx2);
14
-
15
+DECL_ALL(avx512);
16
#undef DECL_ALL
17
#undef DECL_ANGS
18
#undef DECL_ANG
19
x265_2.7.tar.gz/source/common/x86/intrapred16.asm -> x265_2.9.tar.gz/source/common/x86/intrapred16.asm
Changed
201
1
2
const pw_ang8_16, db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1
3
const pw_ang8_17, db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1
4
const pw_swap16, times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
5
-
6
+const pw_swap16_avx512, times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
7
const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
8
const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
9
10
11
;-----------------------------------------------------------------------------------
12
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
13
;-----------------------------------------------------------------------------------
14
+%if ARCH_X86_64
15
INIT_XMM sse2
16
cglobal intra_pred_dc8, 5, 8, 2
17
movu m0, [r2 + 34]
18
19
mov [r0 + r7], r3w
20
.end:
21
RET
22
+%endif
23
24
;-------------------------------------------------------------------------------------------------------
25
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
26
;-------------------------------------------------------------------------------------------------------
27
+%if ARCH_X86_64
28
+;This code is meant for 64 bit architecture
29
INIT_XMM sse2
30
cglobal intra_pred_dc16, 5, 10, 4
31
lea r3, [r2 + 66]
32
33
mov [r9 + r1 * 8], r3w
34
.end:
35
RET
36
+%endif
37
38
;-------------------------------------------------------------------------------------------
39
; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
40
41
;-------------------------------------------------------------------------------------------------------
42
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
43
;-------------------------------------------------------------------------------------------------------
44
+%if ARCH_X86_64
45
INIT_YMM avx2
46
cglobal intra_pred_dc16, 3, 9, 4
47
mov r3d, r4m
48
49
movu [r0 + r2 * 1 + 0], m0
50
movu [r0 + r2 * 1 + mmsize], m0
51
RET
52
+INIT_ZMM avx512
53
+cglobal intra_pred_dc32, 3,3,2
54
+ add r2, 2
55
+ add r1d, r1d
56
+ movu m0, [r2]
57
+ movu m1, [r2 + 2 * mmsize]
58
+ paddw m0, m1
59
+ vextracti32x8 ym1, m0, 1
60
+ paddw ym0, ym1
61
+ vextracti32x4 xm1, m0, 1
62
+ paddw xm0, xm1
63
+ pmaddwd xm0, [pw_1]
64
+ movhlps xm1, xm0
65
+ paddd xm0, xm1
66
+ vpsrldq xm1, xm0, 4
67
+ paddd xm0, xm1
68
+ paddd xm0, [pd_32] ; sum = sum + 32
69
+ psrld xm0, 6 ; sum = sum / 64
70
+ vpbroadcastw m0, xm0
71
+ lea r2, [r1 * 3]
72
+ ; store DC 32x32
73
+ movu [r0 + r1 * 0 + 0], m0
74
+ movu [r0 + r1 * 1 + 0], m0
75
+ movu [r0 + r1 * 2 + 0], m0
76
+ movu [r0 + r2 * 1 + 0], m0
77
+ lea r0, [r0 + r1 * 4]
78
+ movu [r0 + r1 * 0 + 0], m0
79
+ movu [r0 + r1 * 1 + 0], m0
80
+ movu [r0 + r1 * 2 + 0], m0
81
+ movu [r0 + r2 * 1 + 0], m0
82
+ lea r0, [r0 + r1 * 4]
83
+ movu [r0 + r1 * 0 + 0], m0
84
+ movu [r0 + r1 * 1 + 0], m0
85
+ movu [r0 + r1 * 2 + 0], m0
86
+ movu [r0 + r2 * 1 + 0], m0
87
+ lea r0, [r0 + r1 * 4]
88
+ movu [r0 + r1 * 0 + 0], m0
89
+ movu [r0 + r1 * 1 + 0], m0
90
+ movu [r0 + r1 * 2 + 0], m0
91
+ movu [r0 + r2 * 1 + 0], m0
92
+ lea r0, [r0 + r1 * 4]
93
+ movu [r0 + r1 * 0 + 0], m0
94
+ movu [r0 + r1 * 1 + 0], m0
95
+ movu [r0 + r1 * 2 + 0], m0
96
+ movu [r0 + r2 * 1 + 0], m0
97
+ lea r0, [r0 + r1 * 4]
98
+ movu [r0 + r1 * 0 + 0], m0
99
+ movu [r0 + r1 * 1 + 0], m0
100
+ movu [r0 + r1 * 2 + 0], m0
101
+ movu [r0 + r2 * 1 + 0], m0
102
+ lea r0, [r0 + r1 * 4]
103
+ movu [r0 + r1 * 0 + 0], m0
104
+ movu [r0 + r1 * 1 + 0], m0
105
+ movu [r0 + r1 * 2 + 0], m0
106
+ movu [r0 + r2 * 1 + 0], m0
107
+ lea r0, [r0 + r1 * 4]
108
+ movu [r0 + r1 * 0 + 0], m0
109
+ movu [r0 + r1 * 1 + 0], m0
110
+ movu [r0 + r1 * 2 + 0], m0
111
+ movu [r0 + r2 * 1 + 0], m0
112
+ RET
113
+%endif
114
115
;---------------------------------------------------------------------------------------
116
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
117
118
;---------------------------------------------------------------------------------------
119
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
120
;---------------------------------------------------------------------------------------
121
+%if ARCH_X86_64
122
INIT_XMM sse2
123
cglobal intra_pred_planar32, 3,3,16
124
movd m3, [r2 + 66] ; topRight = above[32]
125
126
%endrep
127
RET
128
%endif
129
-
130
+%endif
131
;---------------------------------------------------------------------------------------
132
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
133
;---------------------------------------------------------------------------------------
134
135
STORE_4x4
136
RET
137
138
+%if ARCH_X86_64
139
cglobal intra_pred_ang4_26, 3,3,3
140
movh m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
141
add r1d, r1d
142
143
mov [r0 + r3], r2w
144
.quit:
145
RET
146
+%endif
147
148
cglobal intra_pred_ang4_27, 3,3,5
149
movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
150
151
152
%macro TRANSPOSE_STORE_AVX2 11
153
jnz .skip%11
154
- punpckhwd m%9, m%1, m%2
155
- punpcklwd m%1, m%2
156
- punpckhwd m%2, m%3, m%4
157
- punpcklwd m%3, m%4
158
-
159
- punpckldq m%4, m%1, m%3
160
- punpckhdq m%1, m%3
161
- punpckldq m%3, m%9, m%2
162
- punpckhdq m%9, m%2
163
-
164
- punpckhwd m%10, m%5, m%6
165
- punpcklwd m%5, m%6
166
- punpckhwd m%6, m%7, m%8
167
- punpcklwd m%7, m%8
168
-
169
- punpckldq m%8, m%5, m%7
170
- punpckhdq m%5, m%7
171
- punpckldq m%7, m%10, m%6
172
- punpckhdq m%10, m%6
173
-
174
- punpcklqdq m%6, m%4, m%8
175
- punpckhqdq m%2, m%4, m%8
176
- punpcklqdq m%4, m%1, m%5
177
- punpckhqdq m%8, m%1, m%5
178
-
179
- punpcklqdq m%1, m%3, m%7
180
- punpckhqdq m%5, m%3, m%7
181
- punpcklqdq m%3, m%9, m%10
182
- punpckhqdq m%7, m%9, m%10
183
+ punpckhwd ym%9, ym%1, ym%2
184
+ punpcklwd ym%1, ym%2
185
+ punpckhwd ym%2, ym%3, ym%4
186
+ punpcklwd ym%3, ym%4
187
+
188
+ punpckldq ym%4, ym%1, ym%3
189
+ punpckhdq ym%1, ym%3
190
+ punpckldq ym%3, ym%9, ym%2
191
+ punpckhdq ym%9, ym%2
192
+
193
+ punpckhwd ym%10, ym%5, ym%6
194
+ punpcklwd ym%5, ym%6
195
+ punpckhwd ym%6, ym%7, ym%8
196
+ punpcklwd ym%7, ym%8
197
+
198
+ punpckldq ym%8, ym%5, ym%7
199
+ punpckhdq ym%5, ym%7
200
+ punpckldq ym%7, ym%10, ym%6
201
x265_2.7.tar.gz/source/common/x86/ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/ipfilter16.asm
Changed
201
1
2
%endif
3
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
8
tab_c_524800: times 4 dd 524800
9
tab_c_n8192: times 8 dw -8192
10
pd_524800: times 8 dd 524800
11
12
+tab_ChromaCoeff: dw 0, 64, 0, 0
13
+ dw -2, 58, 10, -2
14
+ dw -4, 54, 16, -2
15
+ dw -6, 46, 28, -4
16
+ dw -4, 36, 36, -4
17
+ dw -4, 28, 46, -6
18
+ dw -2, 16, 54, -4
19
+ dw -2, 10, 58, -2
20
+
21
+tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
22
+ dw -1, 4, -10, 58, 17, -5, 1, 0
23
+ dw -1, 4, -11, 40, 40, -11, 4, -1
24
+ dw 0, 1, -5, 17, 58, -10, 4, -1
25
+
26
+ALIGN 64
27
+tab_LumaCoeffH_avx512:
28
+ times 4 dw 0, 0, 0, 64, 0, 0, 0, 0
29
+ times 4 dw -1, 4, -10, 58, 17, -5, 1, 0
30
+ times 4 dw -1, 4, -11, 40, 40, -11, 4, -1
31
+ times 4 dw 0, 1, -5, 17, 58, -10, 4, -1
32
+
33
ALIGN 32
34
tab_LumaCoeffV: times 4 dw 0, 0
35
times 4 dw 0, 64
36
37
times 4 dw -5, 17
38
times 4 dw 58, -10
39
times 4 dw 4, -1
40
+
41
ALIGN 32
42
tab_LumaCoeffVer: times 8 dw 0, 0
43
times 8 dw 0, 64
44
45
times 8 dw -5, 17
46
times 8 dw 58, -10
47
times 8 dw 4, -1
48
-
49
+
50
+ALIGN 64
51
+const tab_ChromaCoeffV_avx512, times 16 dw 0, 64
52
+ times 16 dw 0, 0
53
+
54
+ times 16 dw -2, 58
55
+ times 16 dw 10, -2
56
+
57
+ times 16 dw -4, 54
58
+ times 16 dw 16, -2
59
+
60
+ times 16 dw -6, 46
61
+ times 16 dw 28, -4
62
+
63
+ times 16 dw -4, 36
64
+ times 16 dw 36, -4
65
+
66
+ times 16 dw -4, 28
67
+ times 16 dw 46, -6
68
+
69
+ times 16 dw -2, 16
70
+ times 16 dw 54, -4
71
+
72
+ times 16 dw -2, 10
73
+ times 16 dw 58, -2
74
+
75
+ALIGN 64
76
+tab_LumaCoeffVer_avx512: times 16 dw 0, 0
77
+ times 16 dw 0, 64
78
+ times 16 dw 0, 0
79
+ times 16 dw 0, 0
80
+
81
+ times 16 dw -1, 4
82
+ times 16 dw -10, 58
83
+ times 16 dw 17, -5
84
+ times 16 dw 1, 0
85
+
86
+ times 16 dw -1, 4
87
+ times 16 dw -11, 40
88
+ times 16 dw 40, -11
89
+ times 16 dw 4, -1
90
+
91
+ times 16 dw 0, 1
92
+ times 16 dw -5, 17
93
+ times 16 dw 58, -10
94
+ times 16 dw 4, -1
95
+
96
+ALIGN 64
97
+const interp8_hpp_shuf1_load_avx512, times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
98
+
99
+ALIGN 64
100
+const interp8_hpp_shuf2_load_avx512, times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
101
+
102
+ALIGN 64
103
+const interp8_hpp_shuf1_store_avx512, times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
104
+
105
SECTION .text
106
cextern pd_8
107
cextern pd_32
108
109
;-------------------------------------------------------------------------------------------------------------
110
; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
111
;-------------------------------------------------------------------------------------------------------------
112
+%if ARCH_X86_64
113
FILTER_VER_LUMA_sse2 pp, 4, 4
114
FILTER_VER_LUMA_sse2 pp, 8, 8
115
FILTER_VER_LUMA_sse2 pp, 8, 4
116
117
FILTER_VER_LUMA_sse2 ps, 48, 64
118
FILTER_VER_LUMA_sse2 ps, 64, 16
119
FILTER_VER_LUMA_sse2 ps, 16, 64
120
+%endif
121
+
122
+;-----------------------------------------------------------------------------
123
+;p2s and p2s_aligned avx512 code start
124
+;-----------------------------------------------------------------------------
125
+%macro P2S_64x4_AVX512 0
126
+ movu m0, [r0]
127
+ movu m1, [r0 + r1]
128
+ movu m2, [r0 + r1 * 2]
129
+ movu m3, [r0 + r5]
130
+ psllw m0, (14 - BIT_DEPTH)
131
+ psllw m1, (14 - BIT_DEPTH)
132
+ psllw m2, (14 - BIT_DEPTH)
133
+ psllw m3, (14 - BIT_DEPTH)
134
+ psubw m0, m4
135
+ psubw m1, m4
136
+ psubw m2, m4
137
+ psubw m3, m4
138
+ movu [r2], m0
139
+ movu [r2 + r3], m1
140
+ movu [r2 + r3 * 2], m2
141
+ movu [r2 + r4], m3
142
+
143
+ movu m0, [r0 + mmsize]
144
+ movu m1, [r0 + r1 + mmsize]
145
+ movu m2, [r0 + r1 * 2 + mmsize]
146
+ movu m3, [r0 + r5 + mmsize]
147
+ psllw m0, (14 - BIT_DEPTH)
148
+ psllw m1, (14 - BIT_DEPTH)
149
+ psllw m2, (14 - BIT_DEPTH)
150
+ psllw m3, (14 - BIT_DEPTH)
151
+ psubw m0, m4
152
+ psubw m1, m4
153
+ psubw m2, m4
154
+ psubw m3, m4
155
+ movu [r2 + mmsize], m0
156
+ movu [r2 + r3 + mmsize], m1
157
+ movu [r2 + r3 * 2 + mmsize], m2
158
+ movu [r2 + r4 + mmsize], m3
159
+%endmacro
160
+
161
+%macro P2S_ALIGNED_64x4_AVX512 0
162
+ mova m0, [r0]
163
+ mova m1, [r0 + r1]
164
+ mova m2, [r0 + r1 * 2]
165
+ mova m3, [r0 + r5]
166
+ psllw m0, (14 - BIT_DEPTH)
167
+ psllw m1, (14 - BIT_DEPTH)
168
+ psllw m2, (14 - BIT_DEPTH)
169
+ psllw m3, (14 - BIT_DEPTH)
170
+ psubw m0, m4
171
+ psubw m1, m4
172
+ psubw m2, m4
173
+ psubw m3, m4
174
+ mova [r2], m0
175
+ mova [r2 + r3], m1
176
+ mova [r2 + r3 * 2], m2
177
+ mova [r2 + r4], m3
178
+
179
+ mova m0, [r0 + mmsize]
180
+ mova m1, [r0 + r1 + mmsize]
181
+ mova m2, [r0 + r1 * 2 + mmsize]
182
+ mova m3, [r0 + r5 + mmsize]
183
+ psllw m0, (14 - BIT_DEPTH)
184
+ psllw m1, (14 - BIT_DEPTH)
185
+ psllw m2, (14 - BIT_DEPTH)
186
+ psllw m3, (14 - BIT_DEPTH)
187
+ psubw m0, m4
188
+ psubw m1, m4
189
+ psubw m2, m4
190
+ psubw m3, m4
191
+ mova [r2 + mmsize], m0
192
+ mova [r2 + r3 + mmsize], m1
193
+ mova [r2 + r3 * 2 + mmsize], m2
194
+ mova [r2 + r4 + mmsize], m3
195
+%endmacro
196
+
197
+%macro P2S_32x4_AVX512 0
198
+ movu m0, [r0]
199
+ movu m1, [r0 + r1]
200
+ movu m2, [r0 + r1 * 2]
201
x265_2.7.tar.gz/source/common/x86/ipfilter8.asm -> x265_2.9.tar.gz/source/common/x86/ipfilter8.asm
Changed
201
1
2
%include "x86inc.asm"
3
%include "x86util.asm"
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
const tab_Tm, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
8
db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
9
db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
10
11
12
const pd_526336, times 8 dd 8192*64+2048
13
14
+const tab_ChromaCoeff, db 0, 64, 0, 0
15
+ db -2, 58, 10, -2
16
+ db -4, 54, 16, -2
17
+ db -6, 46, 28, -4
18
+ db -4, 36, 36, -4
19
+ db -4, 28, 46, -6
20
+ db -2, 16, 54, -4
21
+ db -2, 10, 58, -2
22
+
23
const tab_LumaCoeff, db 0, 0, 0, 64, 0, 0, 0, 0
24
db -1, 4, -10, 58, 17, -5, 1, 0
25
db -1, 4, -11, 40, 40, -11, 4, -1
26
27
times 16 db 58, -10
28
times 16 db 4, -1
29
30
+ALIGN 64
31
+const tab_ChromaCoeffVer_32_avx512, times 32 db 0, 64
32
+ times 32 db 0, 0
33
+
34
+ times 32 db -2, 58
35
+ times 32 db 10, -2
36
+
37
+ times 32 db -4, 54
38
+ times 32 db 16, -2
39
+
40
+ times 32 db -6, 46
41
+ times 32 db 28, -4
42
+
43
+ times 32 db -4, 36
44
+ times 32 db 36, -4
45
+
46
+ times 32 db -4, 28
47
+ times 32 db 46, -6
48
+
49
+ times 32 db -2, 16
50
+ times 32 db 54, -4
51
+
52
+ times 32 db -2, 10
53
+ times 32 db 58, -2
54
+
55
+ALIGN 64
56
+const pw_ChromaCoeffVer_32_avx512, times 16 dw 0, 64
57
+ times 16 dw 0, 0
58
+
59
+ times 16 dw -2, 58
60
+ times 16 dw 10, -2
61
+
62
+ times 16 dw -4, 54
63
+ times 16 dw 16, -2
64
+
65
+ times 16 dw -6, 46
66
+ times 16 dw 28, -4
67
+
68
+ times 16 dw -4, 36
69
+ times 16 dw 36, -4
70
+
71
+ times 16 dw -4, 28
72
+ times 16 dw 46, -6
73
+
74
+ times 16 dw -2, 16
75
+ times 16 dw 54, -4
76
+
77
+ times 16 dw -2, 10
78
+ times 16 dw 58, -2
79
+
80
+ALIGN 64
81
+const pw_LumaCoeffVer_avx512, times 16 dw 0, 0
82
+ times 16 dw 0, 64
83
+ times 16 dw 0, 0
84
+ times 16 dw 0, 0
85
+
86
+ times 16 dw -1, 4
87
+ times 16 dw -10, 58
88
+ times 16 dw 17, -5
89
+ times 16 dw 1, 0
90
+
91
+ times 16 dw -1, 4
92
+ times 16 dw -11, 40
93
+ times 16 dw 40, -11
94
+ times 16 dw 4, -1
95
+
96
+ times 16 dw 0, 1
97
+ times 16 dw -5, 17
98
+ times 16 dw 58, -10
99
+ times 16 dw 4, -1
100
+
101
+ALIGN 64
102
+const tab_LumaCoeffVer_32_avx512, times 32 db 0, 0
103
+ times 32 db 0, 64
104
+ times 32 db 0, 0
105
+ times 32 db 0, 0
106
+
107
+ times 32 db -1, 4
108
+ times 32 db -10, 58
109
+ times 32 db 17, -5
110
+ times 32 db 1, 0
111
+
112
+ times 32 db -1, 4
113
+ times 32 db -11, 40
114
+ times 32 db 40, -11
115
+ times 32 db 4, -1
116
+
117
+ times 32 db 0, 1
118
+ times 32 db -5, 17
119
+ times 32 db 58, -10
120
+ times 32 db 4, -1
121
+
122
const tab_c_64_n64, times 8 db 64, -64
123
124
const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
125
126
-SECTION .text
127
+const interp4_horiz_shuf_load1_avx512, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
128
+const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
129
+const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
130
+
131
+ALIGN 64
132
+interp4_vps_store1_avx512: dq 0, 1, 8, 9, 2, 3, 10, 11
133
+interp4_vps_store2_avx512: dq 4, 5, 12, 13, 6, 7, 14, 15
134
+const interp4_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
135
+const interp4_hps_store_16xN_avx512, dq 0, 2, 1, 3, 4, 6, 5, 7
136
+const interp8_hps_store_avx512, dq 0, 1, 4, 5, 2, 3, 6, 7
137
+const interp8_vsp_store_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
138
139
+SECTION .text
140
cextern pb_128
141
cextern pw_1
142
cextern pw_32
143
144
P2S_H_32xN_avx2 48
145
146
;-----------------------------------------------------------------------------
147
+;p2s and p2s_aligned 32xN avx512 code start
148
+;-----------------------------------------------------------------------------
149
+
150
+%macro PROCESS_P2S_32x4_AVX512 0
151
+ pmovzxbw m0, [r0]
152
+ pmovzxbw m1, [r0 + r1]
153
+ pmovzxbw m2, [r0 + r1 * 2]
154
+ pmovzxbw m3, [r0 + r5]
155
+
156
+ psllw m0, 6
157
+ psllw m1, 6
158
+ psllw m2, 6
159
+ psllw m3, 6
160
+ psubw m0, m4
161
+ psubw m1, m4
162
+ psubw m2, m4
163
+ psubw m3, m4
164
+
165
+ movu [r2], m0
166
+ movu [r2 + r3], m1
167
+ movu [r2 + r3 * 2], m2
168
+ movu [r2 + r6], m3
169
+%endmacro
170
+
171
+;-----------------------------------------------------------------------------
172
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
173
+;-----------------------------------------------------------------------------
174
+%if ARCH_X86_64
175
+INIT_ZMM avx512
176
+cglobal filterPixelToShort_32x8, 3, 7, 5
177
+ mov r3d, r3m
178
+ add r3d, r3d
179
+ lea r5, [r1 * 3]
180
+ lea r6, [r3 * 3]
181
+
182
+ ; load constant
183
+ vpbroadcastd m4, [pw_2000]
184
+
185
+ PROCESS_P2S_32x4_AVX512
186
+ lea r0, [r0 + r1 * 4]
187
+ lea r2, [r2 + r3 * 4]
188
+ PROCESS_P2S_32x4_AVX512
189
+ RET
190
+
191
+INIT_ZMM avx512
192
+cglobal filterPixelToShort_32x16, 3, 7, 5
193
+ mov r3d, r3m
194
+ add r3d, r3d
195
+ lea r5, [r1 * 3]
196
+ lea r6, [r3 * 3]
197
+
198
+ ; load constant
199
+ vpbroadcastd m4, [pw_2000]
200
+
201
x265_2.7.tar.gz/source/common/x86/ipfilter8.h -> x265_2.9.tar.gz/source/common/x86/ipfilter8.h
Changed
16
1
2
FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
3
FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
4
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
5
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
6
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
7
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
8
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
9
10
SETUP_FUNC_DEF(sse3);
11
SETUP_FUNC_DEF(sse4);
12
SETUP_FUNC_DEF(avx2);
13
+SETUP_FUNC_DEF(avx512);
14
15
#endif // ifndef X265_IPFILTER8_H
16
x265_2.7.tar.gz/source/common/x86/loopfilter.asm -> x265_2.9.tar.gz/source/common/x86/loopfilter.asm
Changed
50
1
2
;============================================================================================================
3
INIT_XMM sse4
4
%if HIGH_BIT_DEPTH
5
+%if ARCH_X86_64
6
cglobal saoCuOrgE0, 4,5,9
7
mov r4d, r4m
8
movh m6, [r1]
9
10
sub r4d, 16
11
jnz .loopH
12
RET
13
-
14
+%endif
15
%else ; HIGH_BIT_DEPTH == 1
16
17
cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
18
19
20
INIT_YMM avx2
21
%if HIGH_BIT_DEPTH
22
+%if ARCH_X86_64
23
cglobal saoCuOrgE0, 4,4,9
24
vbroadcasti128 m6, [r1]
25
movzx r1d, byte [r3]
26
27
dec r2d
28
jnz .loop
29
RET
30
+%endif
31
%else ; HIGH_BIT_DEPTH
32
cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
33
34
35
RET
36
%endif
37
38
+%if ARCH_X86_64
39
INIT_YMM avx2
40
%if HIGH_BIT_DEPTH
41
cglobal saoCuOrgB0, 5,7,8
42
43
.end:
44
RET
45
%endif
46
+%endif
47
48
;============================================================================================================
49
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
50
x265_2.7.tar.gz/source/common/x86/mc-a.asm -> x265_2.9.tar.gz/source/common/x86/mc-a.asm
Changed
201
1
2
%error Unsupport bit depth!
3
%endif
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
8
-ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
9
-ch_shuf_adj: times 8 db 0
10
- times 8 db 2
11
- times 8 db 4
12
- times 8 db 6
13
+ALIGN 64
14
+const shuf_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
15
16
SECTION .text
17
18
19
;------------------------------------------------------------------------------
20
; avx2 asm for addAvg high_bit_depth
21
;------------------------------------------------------------------------------
22
+%if ARCH_X86_64
23
INIT_YMM avx2
24
cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
25
movu xm0, [r0]
26
27
movu [r2], xm0
28
movu [r2 + r5], xm2
29
RET
30
+%endif
31
32
%macro ADDAVG_W8_H4_AVX2 1
33
cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
34
35
RET
36
%endmacro
37
38
+%if ARCH_X86_64
39
ADDAVG_W8_H4_AVX2 4
40
ADDAVG_W8_H4_AVX2 8
41
ADDAVG_W8_H4_AVX2 12
42
ADDAVG_W8_H4_AVX2 16
43
ADDAVG_W8_H4_AVX2 32
44
ADDAVG_W8_H4_AVX2 64
45
+%endif
46
47
+%if ARCH_X86_64
48
cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
49
mova m4, [pw_ %+ ADDAVG_ROUND]
50
mova m5, [pw_pixel_max]
51
52
dec r6d
53
jnz .loop
54
RET
55
+%endif
56
57
%macro ADDAVG_W16_H4_AVX2 1
58
cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
59
60
RET
61
%endmacro
62
63
+%if ARCH_X86_64
64
ADDAVG_W16_H4_AVX2 4
65
ADDAVG_W16_H4_AVX2 8
66
ADDAVG_W16_H4_AVX2 12
67
68
ADDAVG_W16_H4_AVX2 24
69
ADDAVG_W16_H4_AVX2 32
70
ADDAVG_W16_H4_AVX2 64
71
+%endif
72
73
+%if ARCH_X86_64
74
cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
75
mova m4, [pw_ %+ ADDAVG_ROUND]
76
mova m5, [pw_pixel_max]
77
78
dec r6d
79
jnz .loop
80
RET
81
+%endif
82
83
%macro ADDAVG_W32_H2_AVX2 1
84
cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
85
86
RET
87
%endmacro
88
89
+%if ARCH_X86_64
90
ADDAVG_W32_H2_AVX2 8
91
ADDAVG_W32_H2_AVX2 16
92
ADDAVG_W32_H2_AVX2 24
93
ADDAVG_W32_H2_AVX2 32
94
ADDAVG_W32_H2_AVX2 48
95
ADDAVG_W32_H2_AVX2 64
96
+%endif
97
98
+%if ARCH_X86_64
99
cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
100
mova m4, [pw_ %+ ADDAVG_ROUND]
101
mova m5, [pw_pixel_max]
102
103
dec r6d
104
jnz .loop
105
RET
106
+%endif
107
108
%macro ADDAVG_W64_H1_AVX2 1
109
cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
110
111
RET
112
%endmacro
113
114
+%if ARCH_X86_64
115
ADDAVG_W64_H1_AVX2 16
116
ADDAVG_W64_H1_AVX2 32
117
ADDAVG_W64_H1_AVX2 48
118
ADDAVG_W64_H1_AVX2 64
119
+%endif
120
+;-----------------------------------------------------------------------------
121
+;addAvg avx512 high bit depth code start
122
+;-----------------------------------------------------------------------------
123
+%macro PROCESS_ADDAVG_16x4_HBD_AVX512 0
124
+ movu ym0, [r0]
125
+ vinserti32x8 m0, [r0 + r3], 1
126
+ movu ym1, [r1]
127
+ vinserti32x8 m1, [r1 + r4], 1
128
+
129
+ paddw m0, m1
130
+ pmulhrsw m0, m3
131
+ paddw m0, m4
132
+ pmaxsw m0, m2
133
+ pminsw m0, m5
134
+
135
+ movu [r2], ym0
136
+ vextracti32x8 [r2 + r5], m0, 1
137
+
138
+ movu ym0, [r0 + 2 * r3]
139
+ vinserti32x8 m0, [r0 + r6], 1
140
+ movu ym1, [r1 + 2 * r4]
141
+ vinserti32x8 m1, [r1 + r7], 1
142
+
143
+ paddw m0, m1
144
+ pmulhrsw m0, m3
145
+ paddw m0, m4
146
+ pmaxsw m0, m2
147
+ pminsw m0, m5
148
+
149
+ movu [r2 + 2 * r5], ym0
150
+ vextracti32x8 [r2 + r8], m0, 1
151
+%endmacro
152
+
153
+%macro PROCESS_ADDAVG_32x4_HBD_AVX512 0
154
+ movu m0, [r0]
155
+ movu m1, [r1]
156
+ paddw m0, m1
157
+ pmulhrsw m0, m3
158
+ paddw m0, m4
159
+ pmaxsw m0, m2
160
+ pminsw m0, m5
161
+ movu [r2], m0
162
+
163
+ movu m0, [r0 + r3]
164
+ movu m1, [r1 + r4]
165
+ paddw m0, m1
166
+ pmulhrsw m0, m3
167
+ paddw m0, m4
168
+ pmaxsw m0, m2
169
+ pminsw m0, m5
170
+ movu [r2 + r5], m0
171
+
172
+ movu m0, [r0 + 2 * r3]
173
+ movu m1, [r1 + 2 * r4]
174
+ paddw m0, m1
175
+ pmulhrsw m0, m3
176
+ paddw m0, m4
177
+ pmaxsw m0, m2
178
+ pminsw m0, m5
179
+ movu [r2 + 2 * r5], m0
180
+
181
+ movu m0, [r0 + r6]
182
+ movu m1, [r1 + r7]
183
+ paddw m0, m1
184
+ pmulhrsw m0, m3
185
+ paddw m0, m4
186
+ pmaxsw m0, m2
187
+ pminsw m0, m5
188
+ movu [r2 + r8], m0
189
+%endmacro
190
+
191
+%macro PROCESS_ADDAVG_64x4_HBD_AVX512 0
192
+ movu m0, [r0]
193
+ movu m1, [r1]
194
+ paddw m0, m1
195
+ pmulhrsw m0, m3
196
+ paddw m0, m4
197
+ pmaxsw m0, m2
198
+ pminsw m0, m5
199
+ movu [r2], m0
200
+
201
x265_2.7.tar.gz/source/common/x86/pixel-a.asm -> x265_2.9.tar.gz/source/common/x86/pixel-a.asm
Changed
201
1
2
times 2 dw 1, -1
3
times 4 dw 1
4
times 2 dw 1, -1
5
+psy_pp_shuff1: dq 0, 1, 8, 9, 4, 5, 12, 13
6
+psy_pp_shuff2: dq 2, 3, 10, 11, 6, 7, 14, 15
7
+psy_pp_shuff3: dq 0, 0, 8, 8, 1, 1, 9, 9
8
9
ALIGN 32
10
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
11
12
%endif ; ARCH_X86_64=1
13
%endif ; HIGH_BIT_DEPTH
14
15
+%macro SATD_AVX512_LOAD4 2 ; size, opmask
16
+ vpbroadcast%1 m0, [r0]
17
+ vpbroadcast%1 m0 {%2}, [r0+2*r1]
18
+ vpbroadcast%1 m2, [r2]
19
+ vpbroadcast%1 m2 {%2}, [r2+2*r3]
20
+ add r0, r1
21
+ add r2, r3
22
+ vpbroadcast%1 m1, [r0]
23
+ vpbroadcast%1 m1 {%2}, [r0+2*r1]
24
+ vpbroadcast%1 m3, [r2]
25
+ vpbroadcast%1 m3 {%2}, [r2+2*r3]
26
+%endmacro
27
+
28
+%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
29
+ vpbroadcast%1 %{2}0, [r0]
30
+ vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
31
+ vpbroadcast%1 %{2}2, [r2]
32
+ vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
33
+ vpbroadcast%1 m0 {%4}, [r0+4*r1]
34
+ vpbroadcast%1 m2 {%4}, [r2+4*r3]
35
+ vpbroadcast%1 m0 {%5}, [r0+2*r4]
36
+ vpbroadcast%1 m2 {%5}, [r2+2*r5]
37
+ vpbroadcast%1 %{2}1, [r0+r1]
38
+ vpbroadcast%1 %{2}1 {%3}, [r0+r4]
39
+ vpbroadcast%1 %{2}3, [r2+r3]
40
+ vpbroadcast%1 %{2}3 {%3}, [r2+r5]
41
+ lea r0, [r0+4*r1]
42
+ lea r2, [r2+4*r3]
43
+ vpbroadcast%1 m1 {%4}, [r0+r1]
44
+ vpbroadcast%1 m3 {%4}, [r2+r3]
45
+ vpbroadcast%1 m1 {%5}, [r0+r4]
46
+ vpbroadcast%1 m3 {%5}, [r2+r5]
47
+%endmacro
48
+
49
+%macro SATD_AVX512_PACKED 0
50
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
51
+ SUMSUB_BA w, 0, 1, 2
52
+ SBUTTERFLY qdq, 0, 1, 2
53
+ SUMSUB_BA w, 0, 1, 2
54
+ HMAXABSW2 0, 1, 2, 3
55
+%endmacro
56
+
57
+%macro SATD_AVX512_END 0-1 0 ; sa8d
58
+ paddw m0 {k1}{z}, m1 ; zero-extend to dwords
59
+%if ARCH_X86_64
60
+%if mmsize == 64
61
+ vextracti32x8 ym1, m0, 1
62
+ paddd ym0, ym1
63
+%endif
64
+%if mmsize >= 32
65
+ vextracti128 xm1, ym0, 1
66
+ paddd xmm0, xm0, xm1
67
+%endif
68
+ punpckhqdq xmm1, xmm0, xmm0
69
+ paddd xmm0, xmm1
70
+ movq rax, xmm0
71
+ rorx rdx, rax, 32
72
+%if %1
73
+ lea eax, [rax+rdx+1]
74
+ shr eax, 1
75
+%else
76
+ add eax, edx
77
+%endif
78
+%else
79
+ HADDD m0, m1
80
+ movd eax, xm0
81
+%if %1
82
+ inc eax
83
+ shr eax, 1
84
+%endif
85
+%endif
86
+ RET
87
+%endmacro
88
+
89
+%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
90
+ pabsw m%1, m%1
91
+ pabsw m%2, m%2
92
+ psrldq m%3, m%1, 2
93
+ psrld m%4, m%2, 16
94
+ pmaxsw m%1, m%3
95
+ pmaxsw m%2, m%4
96
+%endmacro
97
+%if HIGH_BIT_DEPTH==0
98
+INIT_ZMM avx512
99
+cglobal pixel_satd_16x8_internal
100
+ vbroadcasti64x4 m6, [hmul_16p]
101
+ kxnorb k2, k2, k2
102
+ mov r4d, 0x55555555
103
+ knotw k2, k2
104
+ kmovd k1, r4d
105
+ lea r4, [3*r1]
106
+ lea r5, [3*r3]
107
+satd_16x8_avx512:
108
+ vbroadcasti128 ym0, [r0]
109
+ vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4
110
+ vbroadcasti128 ym4, [r2]
111
+ vbroadcasti32x4 m4 {k2}, [r2+4*r3]
112
+ vbroadcasti128 ym2, [r0+2*r1]
113
+ vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6
114
+ vbroadcasti128 ym5, [r2+2*r3]
115
+ vbroadcasti32x4 m5 {k2}, [r2+2*r5]
116
+ DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
117
+ vbroadcasti128 ym1, [r0+r1]
118
+ vbroadcasti128 ym4, [r2+r3]
119
+ vbroadcasti128 ym3, [r0+r4]
120
+ vbroadcasti128 ym5, [r2+r5]
121
+ lea r0, [r0+4*r1]
122
+ lea r2, [r2+4*r3]
123
+ vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5
124
+ vbroadcasti32x4 m4 {k2}, [r2+r3]
125
+ vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7
126
+ vbroadcasti32x4 m5 {k2}, [r2+r5]
127
+ DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
128
+ HADAMARD4_V 0, 1, 2, 3, 4
129
+ HMAXABSW2 0, 2, 4, 5
130
+ HMAXABSW2 1, 3, 4, 5
131
+ paddw m4, m0, m2 ; m1
132
+ paddw m2, m1, m3 ; m0
133
+ ret
134
+
135
+cglobal pixel_satd_8x8_internal
136
+ vbroadcasti64x4 m4, [hmul_16p]
137
+ mov r4d, 0x55555555
138
+ kmovd k1, r4d ; 01010101
139
+ kshiftlb k2, k1, 5 ; 10100000
140
+ kshiftlb k3, k1, 4 ; 01010000
141
+ lea r4, [3*r1]
142
+ lea r5, [3*r3]
143
+satd_8x8_avx512:
144
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
145
+ SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5
146
+ ret
147
+
148
+cglobal pixel_satd_16x8, 4,6
149
+ call pixel_satd_16x8_internal_avx512
150
+ jmp satd_zmm_avx512_end
151
+
152
+cglobal pixel_satd_16x16, 4,6
153
+ call pixel_satd_16x8_internal_avx512
154
+ lea r0, [r0+4*r1]
155
+ lea r2, [r2+4*r3]
156
+ paddw m7, m0, m1
157
+ call satd_16x8_avx512
158
+ paddw m1, m7
159
+ jmp satd_zmm_avx512_end
160
+
161
+cglobal pixel_satd_8x8, 4,6
162
+ call pixel_satd_8x8_internal_avx512
163
+satd_zmm_avx512_end:
164
+ SATD_AVX512_END
165
+
166
+cglobal pixel_satd_8x16, 4,6
167
+ call pixel_satd_8x8_internal_avx512
168
+ lea r0, [r0+4*r1]
169
+ lea r2, [r2+4*r3]
170
+ paddw m5, m0, m1
171
+ call satd_8x8_avx512
172
+ paddw m1, m5
173
+ jmp satd_zmm_avx512_end
174
+
175
+INIT_YMM avx512
176
+cglobal pixel_satd_4x8_internal
177
+ vbroadcasti128 m4, [hmul_4p]
178
+ mov r4d, 0x55550c
179
+ kmovd k2, r4d ; 00001100
180
+ kshiftlb k3, k2, 2 ; 00110000
181
+ kshiftlb k4, k2, 4 ; 11000000
182
+ kshiftrd k1, k2, 8 ; 01010101
183
+ lea r4, [3*r1]
184
+ lea r5, [3*r3]
185
+satd_4x8_avx512:
186
+ SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
187
+satd_ymm_avx512: ; 1 1 3 3 5 5 7 7
188
+ SATD_AVX512_PACKED
189
+ ret
190
+
191
+cglobal pixel_satd_8x4, 4,5
192
+ mova m4, [hmul_16p]
193
+ mov r4d, 0x5555
194
+ kmovw k1, r4d
195
+ SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
196
+ call satd_ymm_avx512 ; 3 1 3 1
197
+ jmp satd_ymm_avx512_end2
198
+
199
+cglobal pixel_satd_4x8, 4,6
200
+ call pixel_satd_4x8_internal_avx512
201
x265_2.7.tar.gz/source/common/x86/pixel-util.h -> x265_2.9.tar.gz/source/common/x86/pixel-util.h
Changed
33
1
2
3
#define DEFINE_UTILS(cpu) \
4
FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
5
+ FUNCDEF_TU_S2(void, getResidual_aligned, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
6
FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
7
FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
8
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
9
10
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
11
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
12
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
13
+ void PFX(scale1D_128to64_aligned_ ## cpu(pixel*, const pixel*)); \
14
void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
15
uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
16
uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
17
18
DEFINE_UTILS(ssse3);
19
DEFINE_UTILS(sse4);
20
DEFINE_UTILS(avx2);
21
+DEFINE_UTILS(avx512);
22
23
#undef DEFINE_UTILS
24
25
26
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
27
uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
28
29
+int PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
30
+int PFX(count_nonzero_32x32_avx512(const int16_t* quantCoeff));
31
+
32
#endif // ifndef X265_PIXEL_UTIL_H
33
x265_2.7.tar.gz/source/common/x86/pixel-util8.asm -> x265_2.9.tar.gz/source/common/x86/pixel-util8.asm
Changed
201
1
2
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
3
;* Nabajit Deka <nabajit@multicorewareinc.com>
4
;* Rajesh Paulraj <rajesh@multicorewareinc.com>
5
+;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6
;*
7
;* This program is free software; you can redistribute it and/or modify
8
;* it under the terms of the GNU General Public License as published by
9
10
%include "x86inc.asm"
11
%include "x86util.asm"
12
13
-SECTION_RODATA 32
14
+SECTION_RODATA 64
15
+
16
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
17
+ db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
18
+ALIGN 64
19
+const dequant_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
20
+const dequant_shuf2_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
21
22
%if BIT_DEPTH == 12
23
ssim_c1: times 4 dd 107321.76 ; .01*.01*4095*4095*64
24
25
%endrep
26
RET
27
%endif
28
+
29
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0
30
+ movu m0, [r0]
31
+ movu m1, [r0 + r3]
32
+ movu m2, [r0 + r3 * 2]
33
+ movu m3, [r0 + r4]
34
+ lea r0, [r0 + r3 * 4]
35
+
36
+ movu m4, [r1]
37
+ movu m5, [r1 + r3]
38
+ movu m6, [r1 + r3 * 2]
39
+ movu m7, [r1 + r4]
40
+ lea r1, [r1 + r3 * 4]
41
+
42
+ psubw m0, m4
43
+ psubw m1, m5
44
+ psubw m2, m6
45
+ psubw m3, m7
46
+
47
+ movu [r2], m0
48
+ movu [r2 + r3], m1
49
+ movu [r2 + r3 * 2], m2
50
+ movu [r2 + r4], m3
51
+ lea r2, [r2 + r3 * 4]
52
+%endmacro
53
+
54
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0
55
+ movu m0, [r0]
56
+ movu m1, [r0 + r3]
57
+ movu m2, [r0 + r3 * 2]
58
+ movu m3, [r0 + r4]
59
+
60
+ movu m4, [r1]
61
+ movu m5, [r1 + r3]
62
+ movu m6, [r1 + r3 * 2]
63
+ movu m7, [r1 + r4]
64
+
65
+ psubw m0, m4
66
+ psubw m1, m5
67
+ psubw m2, m6
68
+ psubw m3, m7
69
+
70
+ movu [r2], m0
71
+ movu [r2 + r3], m1
72
+ movu [r2 + r3 * 2], m2
73
+ movu [r2 + r4], m3
74
+%endmacro
75
+
76
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0
77
+ pmovzxbw m0, [r0]
78
+ pmovzxbw m1, [r0 + r3]
79
+ pmovzxbw m2, [r0 + r3 * 2]
80
+ pmovzxbw m3, [r0 + r4]
81
+ lea r0, [r0 + r3 * 4]
82
+
83
+ pmovzxbw m4, [r1]
84
+ pmovzxbw m5, [r1 + r3]
85
+ pmovzxbw m6, [r1 + r3 * 2]
86
+ pmovzxbw m7, [r1 + r4]
87
+ lea r1, [r1 + r3 * 4]
88
+
89
+ psubw m0, m4
90
+ psubw m1, m5
91
+ psubw m2, m6
92
+ psubw m3, m7
93
+
94
+ movu [r2], m0
95
+ movu [r2 + r3 * 2], m1
96
+ lea r2, [r2 + r3 * 4]
97
+ movu [r2], m2
98
+ movu [r2 + r3 * 2], m3
99
+ lea r2, [r2 + r3 * 4]
100
+%endmacro
101
+
102
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0
103
+ pmovzxbw m0, [r0]
104
+ pmovzxbw m1, [r0 + r3]
105
+ pmovzxbw m2, [r0 + r3 * 2]
106
+ pmovzxbw m3, [r0 + r4]
107
+
108
+ pmovzxbw m4, [r1]
109
+ pmovzxbw m5, [r1 + r3]
110
+ pmovzxbw m6, [r1 + r3 * 2]
111
+ pmovzxbw m7, [r1 + r4]
112
+
113
+ psubw m0, m4
114
+ psubw m1, m5
115
+ psubw m2, m6
116
+ psubw m3, m7
117
+
118
+ movu [r2], m0
119
+ movu [r2 + r3 * 2], m1
120
+ lea r2, [r2 + r3 * 4]
121
+ movu [r2], m2
122
+ movu [r2 + r3 * 2], m3
123
+%endmacro
124
+
125
+
126
+%if HIGH_BIT_DEPTH
127
+INIT_ZMM avx512
128
+cglobal getResidual32, 4,5,8
129
+ add r3, r3
130
+ lea r4, [r3 * 3]
131
+
132
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
133
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
134
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
135
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
136
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
137
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
138
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
139
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END
140
+ RET
141
+%else
142
+INIT_ZMM avx512
143
+cglobal getResidual32, 4,5,8
144
+ lea r4, [r3 * 3]
145
+
146
+ PROCESS_GETRESIDUAL32_W4_AVX512
147
+ PROCESS_GETRESIDUAL32_W4_AVX512
148
+ PROCESS_GETRESIDUAL32_W4_AVX512
149
+ PROCESS_GETRESIDUAL32_W4_AVX512
150
+ PROCESS_GETRESIDUAL32_W4_AVX512
151
+ PROCESS_GETRESIDUAL32_W4_AVX512
152
+ PROCESS_GETRESIDUAL32_W4_AVX512
153
+ PROCESS_GETRESIDUAL32_W4_AVX512_END
154
+ RET
155
+%endif
156
+
157
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512 0
158
+ movu m0, [r0]
159
+ movu m1, [r0 + r3]
160
+ movu m2, [r0 + r3 * 2]
161
+ movu m3, [r0 + r4]
162
+ lea r0, [r0 + r3 * 4]
163
+
164
+ movu m4, [r1]
165
+ movu m5, [r1 + r3]
166
+ movu m6, [r1 + r3 * 2]
167
+ movu m7, [r1 + r4]
168
+ lea r1, [r1 + r3 * 4]
169
+
170
+ psubw m0, m4
171
+ psubw m1, m5
172
+ psubw m2, m6
173
+ psubw m3, m7
174
+
175
+ movu [r2], m0
176
+ movu [r2 + r3], m1
177
+ movu [r2 + r3 * 2], m2
178
+ movu [r2 + r4], m3
179
+ lea r2, [r2 + r3 * 4]
180
+%endmacro
181
+
182
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END 0
183
+ movu m0, [r0]
184
+ movu m1, [r0 + r3]
185
+ movu m2, [r0 + r3 * 2]
186
+ movu m3, [r0 + r4]
187
+
188
+ movu m4, [r1]
189
+ movu m5, [r1 + r3]
190
+ movu m6, [r1 + r3 * 2]
191
+ movu m7, [r1 + r4]
192
+
193
+ psubw m0, m4
194
+ psubw m1, m5
195
+ psubw m2, m6
196
+ psubw m3, m7
197
+
198
+ movu [r2], m0
199
+ movu [r2 + r3], m1
200
+ movu [r2 + r3 * 2], m2
201
x265_2.7.tar.gz/source/common/x86/pixel.h -> x265_2.9.tar.gz/source/common/x86/pixel.h
Changed
37
1
2
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
3
void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
4
void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
5
+void PFX(upShift_16_avx512)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
6
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
7
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
8
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
9
10
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
11
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
12
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
13
+ FUNCDEF_PU(void, pixel_avg_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
14
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
15
+ FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
16
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
17
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
18
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
19
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
20
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
21
+ FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
22
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
23
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
24
FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
25
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
26
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
27
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
28
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
29
30
DECL_PIXELS(avx);
31
DECL_PIXELS(xop);
32
DECL_PIXELS(avx2);
33
+DECL_PIXELS(avx512);
34
35
#undef DECL_PIXELS
36
37
x265_2.7.tar.gz/source/common/x86/pixeladd8.asm -> x265_2.9.tar.gz/source/common/x86/pixeladd8.asm
Changed
201
1
2
3
%include "x86inc.asm"
4
%include "x86util.asm"
5
+SECTION_RODATA 64
6
7
-SECTION_RODATA 32
8
-
9
+ALIGN 64
10
+const store_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
11
SECTION .text
12
-
13
cextern pw_pixel_max
14
15
;-----------------------------------------------------------------------------
16
17
PIXEL_ADD_PS_W32_H4_avx2 32
18
PIXEL_ADD_PS_W32_H4_avx2 64
19
20
-
21
;-----------------------------------------------------------------------------
22
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
23
;-----------------------------------------------------------------------------
24
25
RET
26
27
%endif
28
+
29
+;-----------------------------------------------------------------------------
30
+; pixel_add_ps avx512 code start
31
+;-----------------------------------------------------------------------------
32
+%macro PROCESS_ADD_PS_64x4_AVX512 0
33
+ pmovzxbw m0, [r2]
34
+ pmovzxbw m1, [r2 + mmsize/2]
35
+ movu m2, [r3]
36
+ movu m3, [r3 + mmsize]
37
+ paddw m0, m2
38
+ paddw m1, m3
39
+ packuswb m0, m1
40
+ vpermq m0, m4, m0
41
+ movu [r0], m0
42
+ pmovzxbw m0, [r2 + r4]
43
+ pmovzxbw m1, [r2 + r4 + mmsize/2]
44
+ movu m2, [r3 + r5]
45
+ movu m3, [r3 + r5 + mmsize]
46
+ paddw m0, m2
47
+ paddw m1, m3
48
+ packuswb m0, m1
49
+ vpermq m0, m4, m0
50
+ movu [r0 + r1], m0
51
+ pmovzxbw m0, [r2 + 2 * r4]
52
+ pmovzxbw m1, [r2 + 2 * r4 + mmsize/2]
53
+ movu m2, [r3 + 2 * r5]
54
+ movu m3, [r3 + 2 * r5 + mmsize]
55
+ paddw m0, m2
56
+ paddw m1, m3
57
+ packuswb m0, m1
58
+ vpermq m0, m4, m0
59
+ movu [r0 + 2 * r1], m0
60
+
61
+ pmovzxbw m0, [r2 + r7]
62
+ pmovzxbw m1, [r2 + r7 + mmsize/2]
63
+ movu m2, [r3 + r8]
64
+ movu m3, [r3 + r8 + mmsize]
65
+ paddw m0, m2
66
+ paddw m1, m3
67
+ packuswb m0, m1
68
+ vpermq m0, m4, m0
69
+ movu [r0 + r6], m0
70
+%endmacro
71
+
72
+%macro PROCESS_ADD_PS_64x4_HBD_AVX512 0
73
+ movu m0, [r2]
74
+ movu m1, [r2 + mmsize]
75
+ movu m2, [r3]
76
+ movu m3, [r3 + mmsize]
77
+ paddw m0, m2
78
+ paddw m1, m3
79
+
80
+ CLIPW2 m0, m1, m4, m5
81
+ movu [r0], m0
82
+ movu [r0 + mmsize], m1
83
+
84
+ movu m0, [r2 + r4]
85
+ movu m1, [r2 + r4 + mmsize]
86
+ movu m2, [r3 + r5]
87
+ movu m3, [r3 + r5 + mmsize]
88
+ paddw m0, m2
89
+ paddw m1, m3
90
+
91
+ CLIPW2 m0, m1, m4, m5
92
+ movu [r0 + r1], m0
93
+ movu [r0 + r1 + mmsize], m1
94
+
95
+ movu m0, [r2 + r4 * 2]
96
+ movu m1, [r2 + r4 * 2 + mmsize]
97
+ movu m2, [r3 + r5 * 2]
98
+ movu m3, [r3 + r5 * 2 + mmsize]
99
+ paddw m0, m2
100
+ paddw m1, m3
101
+
102
+ CLIPW2 m0, m1, m4, m5
103
+ movu [r0 + r1 * 2], m0
104
+ movu [r0 + r1 * 2 + mmsize], m1
105
+
106
+ movu m0, [r2 + r6]
107
+ movu m1, [r2 + r6 + mmsize]
108
+ movu m2, [r3 + r7]
109
+ movu m3, [r3 + r7 + mmsize]
110
+ paddw m0, m2
111
+ paddw m1, m3
112
+
113
+ CLIPW2 m0, m1, m4, m5
114
+ movu [r0 + r8], m0
115
+ movu [r0 + r8 + mmsize], m1
116
+%endmacro
117
+
118
+%macro PROCESS_ADD_PS_64x4_ALIGNED_AVX512 0
119
+ pmovzxbw m0, [r2]
120
+ pmovzxbw m1, [r2 + mmsize/2]
121
+ mova m2, [r3]
122
+ mova m3, [r3 + mmsize]
123
+ paddw m0, m2
124
+ paddw m1, m3
125
+ packuswb m0, m1
126
+ vpermq m0, m4, m0
127
+ mova [r0], m0
128
+ pmovzxbw m0, [r2 + r4]
129
+ pmovzxbw m1, [r2 + r4 + mmsize/2]
130
+ mova m2, [r3 + r5]
131
+ mova m3, [r3 + r5 + mmsize]
132
+ paddw m0, m2
133
+ paddw m1, m3
134
+ packuswb m0, m1
135
+ vpermq m0, m4, m0
136
+ mova [r0 + r1], m0
137
+ pmovzxbw m0, [r2 + 2 * r4]
138
+ pmovzxbw m1, [r2 + 2 * r4 + mmsize/2]
139
+ mova m2, [r3 + 2 * r5]
140
+ mova m3, [r3 + 2 * r5 + mmsize]
141
+ paddw m0, m2
142
+ paddw m1, m3
143
+ packuswb m0, m1
144
+ vpermq m0, m4, m0
145
+ mova [r0 + 2 * r1], m0
146
+
147
+ pmovzxbw m0, [r2 + r7]
148
+ pmovzxbw m1, [r2 + r7 + mmsize/2]
149
+ mova m2, [r3 + r8]
150
+ mova m3, [r3 + r8 + mmsize]
151
+ paddw m0, m2
152
+ paddw m1, m3
153
+ packuswb m0, m1
154
+ vpermq m0, m4, m0
155
+ mova [r0 + r6], m0
156
+%endmacro
157
+
158
+%macro PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 0
159
+ mova m0, [r2]
160
+ mova m1, [r2 + mmsize]
161
+ mova m2, [r3]
162
+ mova m3, [r3 + mmsize]
163
+ paddw m0, m2
164
+ paddw m1, m3
165
+
166
+ CLIPW2 m0, m1, m4, m5
167
+ mova [r0], m0
168
+ mova [r0 + mmsize], m1
169
+
170
+ mova m0, [r2 + r4]
171
+ mova m1, [r2 + r4 + mmsize]
172
+ mova m2, [r3 + r5]
173
+ mova m3, [r3 + r5 + mmsize]
174
+ paddw m0, m2
175
+ paddw m1, m3
176
+
177
+ CLIPW2 m0, m1, m4, m5
178
+ mova [r0 + r1], m0
179
+ mova [r0 + r1 + mmsize], m1
180
+
181
+ mova m0, [r2 + r4 * 2]
182
+ mova m1, [r2 + r4 * 2 + mmsize]
183
+ mova m2, [r3 + r5 * 2]
184
+ mova m3, [r3 + r5 * 2 + mmsize]
185
+ paddw m0, m2
186
+ paddw m1, m3
187
+
188
+ CLIPW2 m0, m1, m4, m5
189
+ mova [r0 + r1 * 2], m0
190
+ mova [r0 + r1 * 2 + mmsize], m1
191
+
192
+ mova m0, [r2 + r6]
193
+ mova m1, [r2 + r6 + mmsize]
194
+ mova m2, [r3 + r7]
195
+ mova m3, [r3 + r7 + mmsize]
196
+ paddw m0, m2
197
+ paddw m1, m3
198
+
199
+ CLIPW2 m0, m1, m4, m5
200
+ mova [r0 + r8], m0
201
x265_2.7.tar.gz/source/common/x86/sad-a.asm -> x265_2.9.tar.gz/source/common/x86/sad-a.asm
Changed
201
1
2
lea r0, [r0 + r1]
3
%endmacro
4
5
-%macro SAD_W16 0
6
-;-----------------------------------------------------------------------------
7
-; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
8
-;-----------------------------------------------------------------------------
9
-cglobal pixel_sad_16x16, 4,4,8
10
- movu m0, [r2]
11
- movu m1, [r2+r3]
12
- lea r2, [r2+2*r3]
13
- movu m2, [r2]
14
- movu m3, [r2+r3]
15
- lea r2, [r2+2*r3]
16
- psadbw m0, [r0]
17
- psadbw m1, [r0+r1]
18
- lea r0, [r0+2*r1]
19
- movu m4, [r2]
20
- paddw m0, m1
21
- psadbw m2, [r0]
22
- psadbw m3, [r0+r1]
23
- lea r0, [r0+2*r1]
24
- movu m5, [r2+r3]
25
- lea r2, [r2+2*r3]
26
- paddw m2, m3
27
- movu m6, [r2]
28
- movu m7, [r2+r3]
29
- lea r2, [r2+2*r3]
30
- paddw m0, m2
31
- psadbw m4, [r0]
32
- psadbw m5, [r0+r1]
33
- lea r0, [r0+2*r1]
34
- movu m1, [r2]
35
- paddw m4, m5
36
- psadbw m6, [r0]
37
- psadbw m7, [r0+r1]
38
- lea r0, [r0+2*r1]
39
- movu m2, [r2+r3]
40
- lea r2, [r2+2*r3]
41
- paddw m6, m7
42
- movu m3, [r2]
43
- paddw m0, m4
44
- movu m4, [r2+r3]
45
- lea r2, [r2+2*r3]
46
- paddw m0, m6
47
- psadbw m1, [r0]
48
- psadbw m2, [r0+r1]
49
- lea r0, [r0+2*r1]
50
- movu m5, [r2]
51
- paddw m1, m2
52
- psadbw m3, [r0]
53
- psadbw m4, [r0+r1]
54
- lea r0, [r0+2*r1]
55
- movu m6, [r2+r3]
56
- lea r2, [r2+2*r3]
57
- paddw m3, m4
58
- movu m7, [r2]
59
- paddw m0, m1
60
- movu m1, [r2+r3]
61
- paddw m0, m3
62
- psadbw m5, [r0]
63
- psadbw m6, [r0+r1]
64
- lea r0, [r0+2*r1]
65
- paddw m5, m6
66
- psadbw m7, [r0]
67
- psadbw m1, [r0+r1]
68
- paddw m7, m1
69
- paddw m0, m5
70
- paddw m0, m7
71
- SAD_END_SSE2
72
+%macro SAD_W16 1 ; h
73
+cglobal pixel_sad_16x%1, 4,4
74
+%ifidn cpuname, sse2
75
+.skip_prologue:
76
+%endif
77
+%assign %%i 0
78
+%if ARCH_X86_64
79
+ lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
80
+ lea r5, [3*r3]
81
+%rep %1/4
82
+ movu m1, [r2]
83
+ psadbw m1, [r0]
84
+ movu m3, [r2+r3]
85
+ psadbw m3, [r0+r1]
86
+ movu m2, [r2+2*r3]
87
+ psadbw m2, [r0+2*r1]
88
+ movu m4, [r2+r5]
89
+ psadbw m4, [r0+r6]
90
+%if %%i != %1/4-1
91
+ lea r2, [r2+4*r3]
92
+ lea r0, [r0+4*r1]
93
+%endif
94
+ paddw m1, m3
95
+ paddw m2, m4
96
+ ACCUM paddw, 0, 1, %%i
97
+ paddw m0, m2
98
+ %assign %%i %%i+1
99
+%endrep
100
+%else ; The cost of having to save and restore registers on x86-32
101
+%rep %1/2 ; nullifies the benefit of having 3*stride in registers.
102
+ movu m1, [r2]
103
+ psadbw m1, [r0]
104
+ movu m2, [r2+r3]
105
+ psadbw m2, [r0+r1]
106
+%if %%i != %1/2-1
107
+ lea r2, [r2+2*r3]
108
+ lea r0, [r0+2*r1]
109
+%endif
110
+ ACCUM paddw, 0, 1, %%i
111
+ paddw m0, m2
112
+ %assign %%i %%i+1
113
+%endrep
114
+%endif
115
+ SAD_END_SSE2
116
+ %endmacro
117
118
-;-----------------------------------------------------------------------------
119
-; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
120
-;-----------------------------------------------------------------------------
121
-cglobal pixel_sad_16x8, 4,4
122
- movu m0, [r2]
123
- movu m2, [r2+r3]
124
- lea r2, [r2+2*r3]
125
- movu m3, [r2]
126
- movu m4, [r2+r3]
127
- psadbw m0, [r0]
128
- psadbw m2, [r0+r1]
129
- lea r0, [r0+2*r1]
130
- psadbw m3, [r0]
131
- psadbw m4, [r0+r1]
132
- lea r0, [r0+2*r1]
133
- lea r2, [r2+2*r3]
134
- paddw m0, m2
135
- paddw m3, m4
136
- paddw m0, m3
137
- movu m1, [r2]
138
- movu m2, [r2+r3]
139
- lea r2, [r2+2*r3]
140
- movu m3, [r2]
141
- movu m4, [r2+r3]
142
- psadbw m1, [r0]
143
- psadbw m2, [r0+r1]
144
- lea r0, [r0+2*r1]
145
- psadbw m3, [r0]
146
- psadbw m4, [r0+r1]
147
- lea r0, [r0+2*r1]
148
- lea r2, [r2+2*r3]
149
- paddw m1, m2
150
- paddw m3, m4
151
- paddw m0, m1
152
- paddw m0, m3
153
- SAD_END_SSE2
154
+INIT_XMM sse2
155
+SAD_W16 8
156
+SAD_W16 16
157
+INIT_XMM sse3
158
+SAD_W16 8
159
+SAD_W16 16
160
+INIT_XMM sse2, aligned
161
+SAD_W16 8
162
+SAD_W16 16
163
164
+%macro SAD_Wx 0
165
;-----------------------------------------------------------------------------
166
; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
167
;-----------------------------------------------------------------------------
168
169
%endmacro
170
171
INIT_XMM sse2
172
-SAD_W16
173
+SAD_Wx
174
INIT_XMM sse3
175
-SAD_W16
176
+SAD_Wx
177
INIT_XMM sse2, aligned
178
-SAD_W16
179
+SAD_Wx
180
181
%macro SAD_INC_4x8P_SSE 1
182
movq m1, [r0]
183
184
SAD_INC_4x8P_SSE 1
185
SAD_INC_4x8P_SSE 1
186
SAD_END_SSE2
187
+
188
+%macro SAD_W48_AVX512 3 ; w, h, d/q
189
+cglobal pixel_sad_%1x%2, 4,4
190
+ kxnorb k1, k1, k1
191
+ kaddb k1, k1, k1
192
+%assign %%i 0
193
+%if ARCH_X86_64 && %2 != 4
194
+ lea r6, [3*r1]
195
+ lea r5, [3*r3]
196
+%rep %2/4
197
+ mov%3 m1, [r0]
198
+ vpbroadcast%3 m1 {k1}, [r0+r1]
199
+ mov%3 m3, [r2]
200
+ vpbroadcast%3 m3 {k1}, [r2+r3]
201
x265_2.7.tar.gz/source/common/x86/sad16-a.asm -> x265_2.9.tar.gz/source/common/x86/sad16-a.asm
Changed
201
1
2
SAD_12 12, 16
3
4
5
+%macro PROCESS_SAD_64x8_AVX512 0
6
+ movu m1, [r2]
7
+ movu m2, [r2 + mmsize]
8
+ movu m3, [r2 + r3]
9
+ movu m4, [r2 + r3 + mmsize]
10
+ psubw m1, [r0]
11
+ psubw m2, [r0 + mmsize]
12
+ psubw m3, [r0 + r1]
13
+ psubw m4, [r0 + r1 + mmsize]
14
+ pabsw m1, m1
15
+ pabsw m2, m2
16
+ pabsw m3, m3
17
+ pabsw m4, m4
18
+ paddw m1, m2
19
+ paddw m3, m4
20
+ paddw m5, m1, m3
21
+
22
+ movu m1, [r2 + 2 * r3]
23
+ movu m2, [r2 + 2 * r3 + mmsize]
24
+ movu m3, [r2 + r5]
25
+ movu m4, [r2 + r5 + mmsize]
26
+ psubw m1, [r0 + 2 * r1]
27
+ psubw m2, [r0 + 2 * r1 + mmsize]
28
+ psubw m3, [r0 + r4]
29
+ psubw m4, [r0 + r4 + mmsize]
30
+ pabsw m1, m1
31
+ pabsw m2, m2
32
+ pabsw m3, m3
33
+ pabsw m4, m4
34
+ paddw m1, m2
35
+ paddw m3, m4
36
+ paddw m1, m3
37
+
38
+ lea r0, [r0 + 4 * r1]
39
+ lea r2, [r2 + 4 * r3]
40
+
41
+ pmaddwd m5, m6
42
+ paddd m0, m5
43
+ pmaddwd m1, m6
44
+ paddd m0, m1
45
+
46
+ movu m1, [r2]
47
+ movu m2, [r2 + mmsize]
48
+ movu m3, [r2 + r3]
49
+ movu m4, [r2 + r3 + mmsize]
50
+ psubw m1, [r0]
51
+ psubw m2, [r0 + mmsize]
52
+ psubw m3, [r0 + r1]
53
+ psubw m4, [r0 + r1 + mmsize]
54
+ pabsw m1, m1
55
+ pabsw m2, m2
56
+ pabsw m3, m3
57
+ pabsw m4, m4
58
+ paddw m1, m2
59
+ paddw m3, m4
60
+ paddw m5, m1, m3
61
+
62
+ movu m1, [r2 + 2 * r3]
63
+ movu m2, [r2 + 2 * r3 + mmsize]
64
+ movu m3, [r2 + r5]
65
+ movu m4, [r2 + r5 + mmsize]
66
+ psubw m1, [r0 + 2 * r1]
67
+ psubw m2, [r0 + 2 * r1 + mmsize]
68
+ psubw m3, [r0 + r4]
69
+ psubw m4, [r0 + r4 + mmsize]
70
+ pabsw m1, m1
71
+ pabsw m2, m2
72
+ pabsw m3, m3
73
+ pabsw m4, m4
74
+ paddw m1, m2
75
+ paddw m3, m4
76
+ paddw m1, m3
77
+
78
+ pmaddwd m5, m6
79
+ paddd m0, m5
80
+ pmaddwd m1, m6
81
+ paddd m0, m1
82
+%endmacro
83
+
84
+
85
+%macro PROCESS_SAD_32x8_AVX512 0
86
+ movu m1, [r2]
87
+ movu m2, [r2 + r3]
88
+ movu m3, [r2 + 2 * r3]
89
+ movu m4, [r2 + r5]
90
+ psubw m1, [r0]
91
+ psubw m2, [r0 + r1]
92
+ psubw m3, [r0 + 2 * r1]
93
+ psubw m4, [r0 + r4]
94
+ pabsw m1, m1
95
+ pabsw m2, m2
96
+ pabsw m3, m3
97
+ pabsw m4, m4
98
+ paddw m1, m2
99
+ paddw m3, m4
100
+ paddw m5, m1, m3
101
+
102
+ lea r0, [r0 + 4 * r1]
103
+ lea r2, [r2 + 4 * r3]
104
+
105
+ movu m1, [r2]
106
+ movu m2, [r2 + r3]
107
+ movu m3, [r2 + 2 * r3]
108
+ movu m4, [r2 + r5]
109
+ psubw m1, [r0]
110
+ psubw m2, [r0 + r1]
111
+ psubw m3, [r0 + 2 * r1]
112
+ psubw m4, [r0 + r4]
113
+ pabsw m1, m1
114
+ pabsw m2, m2
115
+ pabsw m3, m3
116
+ pabsw m4, m4
117
+ paddw m1, m2
118
+ paddw m3, m4
119
+ paddw m1, m3
120
+
121
+ pmaddwd m5, m6
122
+ paddd m0, m5
123
+ pmaddwd m1, m6
124
+ paddd m0, m1
125
+%endmacro
126
+
127
+%macro PROCESS_SAD_16x8_AVX512 0
128
+ movu ym1, [r2]
129
+ vinserti64x4 m1, [r2 + r3], 1
130
+ movu ym2, [r2 + 2 * r3]
131
+ vinserti64x4 m2, [r2 + r5], 1
132
+ movu ym3, [r0]
133
+ vinserti64x4 m3, [r0 + r1], 1
134
+ movu ym4, [r0 + 2 * r1]
135
+ vinserti64x4 m4, [r0 + r4], 1
136
+
137
+ psubw m1, m3
138
+ psubw m2, m4
139
+ pabsw m1, m1
140
+ pabsw m2, m2
141
+ paddw m5, m1, m2
142
+
143
+ lea r0, [r0 + 4 * r1]
144
+ lea r2, [r2 + 4 * r3]
145
+
146
+ movu ym1, [r2]
147
+ vinserti64x4 m1, [r2 + r3], 1
148
+ movu ym2, [r2 + 2 * r3]
149
+ vinserti64x4 m2, [r2 + r5], 1
150
+ movu ym3, [r0]
151
+ vinserti64x4 m3, [r0 + r1], 1
152
+ movu ym4, [r0 + 2 * r1]
153
+ vinserti64x4 m4, [r0 + r4], 1
154
+
155
+ psubw m1, m3
156
+ psubw m2, m4
157
+ pabsw m1, m1
158
+ pabsw m2, m2
159
+ paddw m1, m2
160
+
161
+ pmaddwd m5, m6
162
+ paddd m0, m5
163
+ pmaddwd m1, m6
164
+ paddd m0, m1
165
+%endmacro
166
+
167
+%macro PROCESS_SAD_AVX512_END 0
168
+ vextracti32x8 ym1, m0, 1
169
+ paddd ym0, ym1
170
+ vextracti64x2 xm1, m0, 1
171
+ paddd xm0, xm1
172
+ pshufd xm1, xm0, 00001110b
173
+ paddd xm0, xm1
174
+ pshufd xm1, xm0, 00000001b
175
+ paddd xm0, xm1
176
+ movd eax, xm0
177
+%endmacro
178
+
179
+;-----------------------------------------------------------------------------
180
+; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
181
+;-----------------------------------------------------------------------------
182
+%if ARCH_X86_64
183
+INIT_ZMM avx512
184
+cglobal pixel_sad_64x16, 4,6,7
185
+ pxor m0, m0
186
+
187
+ vbroadcasti32x8 m6, [pw_1]
188
+
189
+ add r3d, r3d
190
+ add r1d, r1d
191
+ lea r4d, [r1 * 3]
192
+ lea r5d, [r3 * 3]
193
+
194
+ PROCESS_SAD_64x8_AVX512
195
+ lea r2, [r2 + 4 * r3]
196
+ lea r0, [r0 + 4 * r1]
197
+ PROCESS_SAD_64x8_AVX512
198
+ PROCESS_SAD_AVX512_END
199
+ RET
200
+
201
x265_2.7.tar.gz/source/common/x86/ssd-a.asm -> x265_2.9.tar.gz/source/common/x86/ssd-a.asm
Changed
201
1
2
3
; Function to find ssd for 32x16 block, sse2, 12 bit depth
4
; Defined sepeartely to be called from SSD_ONE_32 macro
5
+%if ARCH_X86_64
6
+;This code is written for 64 bit architecture
7
INIT_XMM sse2
8
cglobal ssd_ss_32x16
9
pxor m8, m8
10
11
paddq m4, m5
12
paddq m9, m4
13
ret
14
+%endif
15
16
%macro SSD_ONE_32 0
17
+%if ARCH_X86_64
18
cglobal pixel_ssd_ss_32x64, 4,7,10
19
add r1d, r1d
20
add r3d, r3d
21
22
call ssd_ss_32x16
23
movq rax, m9
24
RET
25
+%endif
26
%endmacro
27
+
28
%macro SSD_ONE_SS_32 0
29
cglobal pixel_ssd_ss_32x32, 4,5,8
30
add r1d, r1d
31
32
RET
33
%endmacro
34
35
+%if ARCH_X86_64
36
INIT_YMM avx2
37
cglobal pixel_ssd_16x16, 4,7,3
38
FIX_STRIDES r1, r3
39
40
movq rax, xm3
41
RET
42
43
+INIT_ZMM avx512
44
+cglobal pixel_ssd_32x2
45
+ pxor m0, m0
46
+ movu m1, [r0]
47
+ psubw m1, [r2]
48
+ pmaddwd m1, m1
49
+ paddd m0, m1
50
+ movu m1, [r0 + r1]
51
+ psubw m1, [r2 + r3]
52
+ pmaddwd m1, m1
53
+ paddd m0, m1
54
+ lea r0, [r0 + r1 * 2]
55
+ lea r2, [r2 + r3 * 2]
56
+
57
+ mova m1, m0
58
+ pxor m2, m2
59
+ punpckldq m0, m2
60
+ punpckhdq m1, m2
61
+
62
+ paddq m3, m0
63
+ paddq m3, m1
64
+ret
65
+
66
+INIT_ZMM avx512
67
+cglobal pixel_ssd_32x32, 4,5,5
68
+ shl r1d, 1
69
+ shl r3d, 1
70
+ pxor m3, m3
71
+ mov r4, 16
72
+.iterate:
73
+ call pixel_ssd_32x2
74
+ dec r4d
75
+ jne .iterate
76
+
77
+ vextracti32x8 ym4, m3, 1
78
+ paddq ym3, ym4
79
+ vextracti32x4 xm4, m3, 1
80
+ paddq xm3, xm4
81
+ movhlps xm4, xm3
82
+ paddq xm3, xm4
83
+ movq rax, xm3
84
+RET
85
+
86
+INIT_ZMM avx512
87
+cglobal pixel_ssd_32x64, 4,5,5
88
+ shl r1d, 1
89
+ shl r3d, 1
90
+ pxor m3, m3
91
+ mov r4, 32
92
+.iterate:
93
+ call pixel_ssd_32x2
94
+ dec r4d
95
+ jne .iterate
96
+
97
+ vextracti32x8 ym4, m3, 1
98
+ paddq ym3, ym4
99
+ vextracti32x4 xm4, m3, 1
100
+ paddq xm3, xm4
101
+ movhlps xm4, xm3
102
+ paddq xm3, xm4
103
+ movq rax, xm3
104
+RET
105
+
106
+INIT_ZMM avx512
107
+cglobal pixel_ssd_64x64, 4,5,5
108
+ FIX_STRIDES r1, r3
109
+ mov r4d, 64
110
+ pxor m3, m3
111
+
112
+.loop:
113
+ pxor m0, m0
114
+ movu m1, [r0]
115
+ psubw m1, [r2]
116
+ pmaddwd m1, m1
117
+ paddd m0, m1
118
+ movu m1, [r0 + mmsize]
119
+ psubw m1, [r2 + mmsize]
120
+ pmaddwd m1, m1
121
+ paddd m0, m1
122
+
123
+ lea r0, [r0 + r1]
124
+ lea r2, [r2 + r3]
125
+
126
+ mova m1, m0
127
+ pxor m2, m2
128
+ punpckldq m0, m2
129
+ punpckhdq m1, m2
130
+ paddq m3, m0
131
+ paddq m3, m1
132
+
133
+ dec r4d
134
+ jg .loop
135
+
136
+ vextracti32x8 ym4, m3, 1
137
+ paddq ym3, ym4
138
+ vextracti32x4 xm4, m3, 1
139
+ paddq xm3, xm4
140
+ movhlps xm4, xm3
141
+ paddq xm3, xm4
142
+ movq rax, xm3
143
+ RET
144
+%endif
145
INIT_MMX mmx2
146
SSD_ONE 4, 4
147
SSD_ONE 4, 8
148
149
%if BIT_DEPTH <= 10
150
SSD_ONE 32, 64
151
SSD_ONE 32, 32
152
+%if ARCH_X86_64
153
SSD_TWO 64, 64
154
+%endif
155
%else
156
SSD_ONE_32
157
SSD_ONE_SS_32
158
159
HADDD m2, m0
160
movd eax, xm2
161
RET
162
+;-----------------------------------------------------------------------------
163
+; ssd_ss avx512 code start
164
+;-----------------------------------------------------------------------------
165
+%if ARCH_X86_64
166
+%macro PROCESS_SSD_SS_64x4_AVX512 0
167
+ movu m0, [r0]
168
+ movu m1, [r0 + mmsize]
169
+ movu m2, [r0 + r1]
170
+ movu m3, [r0 + r1 + mmsize]
171
+ movu m4, [r2]
172
+ movu m5, [r2 + mmsize]
173
+ movu m6, [r2 + r3]
174
+ movu m7, [r2 + r3 + mmsize]
175
+
176
+ psubw m0, m4
177
+ psubw m1, m5
178
+ psubw m2, m6
179
+ psubw m3, m7
180
+ pmaddwd m0, m0
181
+ pmaddwd m1, m1
182
+ pmaddwd m2, m2
183
+ pmaddwd m3, m3
184
+ paddd m8, m0
185
+ paddd m8, m1
186
+ paddd m8, m2
187
+ paddd m8, m3
188
189
+ movu m0, [r0 + 2 * r1]
190
+ movu m1, [r0 + 2 * r1 + mmsize]
191
+ movu m2, [r0 + r5]
192
+ movu m3, [r0 + r5 + mmsize]
193
+ movu m4, [r2 + 2 * r3]
194
+ movu m5, [r2 + 2 * r3 + mmsize]
195
+ movu m6, [r2 + r6]
196
+ movu m7, [r2 + r6 + mmsize]
197
+
198
+ psubw m0, m4
199
+ psubw m1, m5
200
+ psubw m2, m6
201
x265_2.7.tar.gz/source/common/x86/v4-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/v4-ipfilter16.asm
Changed
17
1
2
RET
3
%endmacro
4
5
+%if ARCH_X86_64
6
FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
7
FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
8
FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
9
10
FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
11
FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
12
FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
13
+%endif
14
15
%macro FILTER_VER_CHROMA_AVX2_8x8 3
16
INIT_YMM avx2
17
x265_2.7.tar.gz/source/common/x86/v4-ipfilter8.asm -> x265_2.9.tar.gz/source/common/x86/v4-ipfilter8.asm
Changed
201
1
2
const v4_interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
3
dd 2, 3, 3, 4, 4, 5, 5, 6
4
5
-const tab_ChromaCoeff, db 0, 64, 0, 0
6
+const v4_tab_ChromaCoeff, db 0, 64, 0, 0
7
db -2, 58, 10, -2
8
db -4, 54, 16, -2
9
db -6, 46, 28, -4
10
11
mova m6, [r5 + r4]
12
mova m5, [r5 + r4 + 16]
13
%else
14
- mova m6, [tab_ChromaCoeff + r4]
15
- mova m5, [tab_ChromaCoeff + r4 + 16]
16
+ mova m6, [v4_tab_ChromaCoeff + r4]
17
+ mova m5, [v4_tab_ChromaCoeff + r4 + 16]
18
%endif
19
20
%ifidn %1,pp
21
22
sub r0, r1
23
24
%ifdef PIC
25
- lea r5, [tab_ChromaCoeff]
26
+ lea r5, [v4_tab_ChromaCoeff]
27
movd m0, [r5 + r4 * 4]
28
%else
29
- movd m0, [tab_ChromaCoeff + r4 * 4]
30
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
31
%endif
32
lea r4, [r1 * 3]
33
lea r5, [r0 + 4 * r1]
34
35
sub r0, r1
36
37
%ifdef PIC
38
- lea r5, [tab_ChromaCoeff]
39
+ lea r5, [v4_tab_ChromaCoeff]
40
movd m0, [r5 + r4 * 4]
41
%else
42
- movd m0, [tab_ChromaCoeff + r4 * 4]
43
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
44
%endif
45
46
pshufb m0, [tab_Cm]
47
48
sub r0, r1
49
50
%ifdef PIC
51
- lea r5, [tab_ChromaCoeff]
52
+ lea r5, [v4_tab_ChromaCoeff]
53
movd m0, [r5 + r4 * 4]
54
%else
55
- movd m0, [tab_ChromaCoeff + r4 * 4]
56
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
57
%endif
58
59
pshufb m0, [tab_Cm]
60
61
sub r0, r1
62
63
%ifdef PIC
64
- lea r5, [tab_ChromaCoeff]
65
+ lea r5, [v4_tab_ChromaCoeff]
66
movd m0, [r5 + r4 * 4]
67
%else
68
- movd m0, [tab_ChromaCoeff + r4 * 4]
69
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
70
%endif
71
72
pshufb m0, [tab_Cm]
73
74
sub r0, r1
75
76
%ifdef PIC
77
- lea r5, [tab_ChromaCoeff]
78
+ lea r5, [v4_tab_ChromaCoeff]
79
movd m0, [r5 + r4 * 4]
80
%else
81
- movd m0, [tab_ChromaCoeff + r4 * 4]
82
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
83
%endif
84
85
pshufb m0, [tab_Cm]
86
87
punpcklbw m4, m2, m3
88
89
%ifdef PIC
90
- lea r6, [tab_ChromaCoeff]
91
+ lea r6, [v4_tab_ChromaCoeff]
92
movd m5, [r6 + r4 * 4]
93
%else
94
- movd m5, [tab_ChromaCoeff + r4 * 4]
95
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
96
%endif
97
98
pshufb m6, m5, [tab_Vm]
99
100
add r3d, r3d
101
102
%ifdef PIC
103
- lea r5, [tab_ChromaCoeff]
104
+ lea r5, [v4_tab_ChromaCoeff]
105
movd m0, [r5 + r4 * 4]
106
%else
107
- movd m0, [tab_ChromaCoeff + r4 * 4]
108
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
109
%endif
110
111
pshufb m0, [tab_Cm]
112
113
add r3d, r3d
114
115
%ifdef PIC
116
- lea r5, [tab_ChromaCoeff]
117
+ lea r5, [v4_tab_ChromaCoeff]
118
movd m0, [r5 + r4 * 4]
119
%else
120
- movd m0, [tab_ChromaCoeff + r4 * 4]
121
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
122
%endif
123
124
pshufb m0, [tab_Cm]
125
126
add r3d, r3d
127
128
%ifdef PIC
129
- lea r5, [tab_ChromaCoeff]
130
+ lea r5, [v4_tab_ChromaCoeff]
131
movd m0, [r5 + r4 * 4]
132
%else
133
- movd m0, [tab_ChromaCoeff + r4 * 4]
134
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
135
%endif
136
137
pshufb m0, [tab_Cm]
138
139
add r3d, r3d
140
141
%ifdef PIC
142
- lea r5, [tab_ChromaCoeff]
143
+ lea r5, [v4_tab_ChromaCoeff]
144
movd m5, [r5 + r4 * 4]
145
%else
146
- movd m5, [tab_ChromaCoeff + r4 * 4]
147
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
148
%endif
149
150
pshufb m6, m5, [tab_Vm]
151
152
add r3d, r3d
153
154
%ifdef PIC
155
- lea r5, [tab_ChromaCoeff]
156
+ lea r5, [v4_tab_ChromaCoeff]
157
movd m5, [r5 + r4 * 4]
158
%else
159
- movd m5, [tab_ChromaCoeff + r4 * 4]
160
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
161
%endif
162
163
pshufb m6, m5, [tab_Vm]
164
165
add r3d, r3d
166
167
%ifdef PIC
168
- lea r5, [tab_ChromaCoeff]
169
+ lea r5, [v4_tab_ChromaCoeff]
170
movd m5, [r5 + r4 * 4]
171
%else
172
- movd m5, [tab_ChromaCoeff + r4 * 4]
173
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
174
%endif
175
176
pshufb m6, m5, [tab_Vm]
177
178
add r3d, r3d
179
180
%ifdef PIC
181
- lea r5, [tab_ChromaCoeff]
182
+ lea r5, [v4_tab_ChromaCoeff]
183
movd m0, [r5 + r4 * 4]
184
%else
185
- movd m0, [tab_ChromaCoeff + r4 * 4]
186
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
187
%endif
188
189
pshufb m1, m0, [tab_Vm]
190
191
add r3d, r3d
192
193
%ifdef PIC
194
- lea r5, [tab_ChromaCoeff]
195
+ lea r5, [v4_tab_ChromaCoeff]
196
movd m0, [r5 + r4 * 4]
197
%else
198
- movd m0, [tab_ChromaCoeff + r4 * 4]
199
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
200
%endif
201
x265_2.7.tar.gz/source/common/x86/x86inc.asm -> x265_2.9.tar.gz/source/common/x86/x86inc.asm
Changed
201
1
2
%endif
3
4
%macro SECTION_RODATA 0-1 32
5
- SECTION .rodata align=%1
6
+ %ifidn __OUTPUT_FORMAT__,win32
7
+ SECTION .rdata align=%1
8
+ %elif WIN64
9
+ SECTION .rdata align=%1
10
+ %else
11
+ SECTION .rodata align=%1
12
+ %endif
13
%endmacro
14
15
%if WIN64
16
17
%endmacro
18
19
%define required_stack_alignment ((mmsize + 15) & ~15)
20
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
21
+%define high_mm_regs (16*cpuflag(avx512))
22
23
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
24
%ifnum %1
25
26
27
%macro WIN64_PUSH_XMM 0
28
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
29
- %if xmm_regs_used > 6
30
+ %if xmm_regs_used > 6 + high_mm_regs
31
movaps [rstk + stack_offset + 8], xmm6
32
%endif
33
- %if xmm_regs_used > 7
34
+ %if xmm_regs_used > 7 + high_mm_regs
35
movaps [rstk + stack_offset + 24], xmm7
36
%endif
37
- %if xmm_regs_used > 8
38
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
39
+ %if %%xmm_regs_on_stack > 0
40
%assign %%i 8
41
- %rep xmm_regs_used-8
42
+ %rep %%xmm_regs_on_stack
43
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
44
%assign %%i %%i+1
45
%endrep
46
47
48
%macro WIN64_SPILL_XMM 1
49
%assign xmm_regs_used %1
50
- ASSERT xmm_regs_used <= 16
51
- %if xmm_regs_used > 8
52
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
53
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
54
+ %if %%xmm_regs_on_stack > 0
55
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
56
%assign %%pad (xmm_regs_used-8)*16 + 32
57
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
58
59
60
%macro WIN64_RESTORE_XMM_INTERNAL 0
61
%assign %%pad_size 0
62
- %if xmm_regs_used > 8
63
- %assign %%i xmm_regs_used
64
- %rep xmm_regs_used-8
65
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
66
+ %if %%xmm_regs_on_stack > 0
67
+ %assign %%i xmm_regs_used - high_mm_regs
68
+ %rep %%xmm_regs_on_stack
69
%assign %%i %%i-1
70
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
71
%endrep
72
73
%assign %%pad_size stack_size_padded
74
%endif
75
%endif
76
- %if xmm_regs_used > 7
77
+ %if xmm_regs_used > 7 + high_mm_regs
78
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
79
%endif
80
- %if xmm_regs_used > 6
81
+ %if xmm_regs_used > 6 + high_mm_regs
82
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
83
%endif
84
%endmacro
85
86
%assign xmm_regs_used 0
87
%endmacro
88
89
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
90
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6 + high_mm_regs
91
92
%macro RET 0
93
WIN64_RESTORE_XMM_INTERNAL
94
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
95
- %if mmsize == 32
96
+ %if vzeroupper_required
97
vzeroupper
98
%endif
99
AUTO_REP_RET
100
101
DECLARE_REG 13, R12, 64
102
DECLARE_REG 14, R13, 72
103
104
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
105
+%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, [stack_size,] arg_names...
106
%assign num_args %1
107
%assign regs_used %2
108
+ %assign xmm_regs_used %3
109
ASSERT regs_used >= num_args
110
SETUP_STACK_POINTER %4
111
ASSERT regs_used <= 15
112
113
DEFINE_ARGS_INTERNAL %0, %4, %5
114
%endmacro
115
116
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
117
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
118
119
%macro RET 0
120
%if stack_size_padded > 0
121
122
%endif
123
%endif
124
POP_IF_USED 14, 13, 12, 11, 10, 9
125
- %if mmsize == 32
126
+ %if vzeroupper_required
127
vzeroupper
128
%endif
129
AUTO_REP_RET
130
131
DEFINE_ARGS_INTERNAL %0, %4, %5
132
%endmacro
133
134
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
135
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
136
137
%macro RET 0
138
%if stack_size_padded > 0
139
140
%endif
141
%endif
142
POP_IF_USED 6, 5, 4, 3
143
- %if mmsize == 32
144
+ %if vzeroupper_required
145
vzeroupper
146
%endif
147
AUTO_REP_RET
148
149
%assign stack_offset 0 ; stack pointer offset relative to the return address
150
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
151
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
152
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
153
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
154
%ifnidn %3, ""
155
PROLOGUE %3
156
%endif
157
%endmacro
158
159
+; Create a global symbol from a local label with the correct name mangling and type
160
+%macro cglobal_label 1
161
+ %if FORMAT_ELF
162
+ global current_function %+ %1:function hidden
163
+ %else
164
+ global current_function %+ %1
165
+ %endif
166
+ %1:
167
+%endmacro
168
+
169
%macro cextern 1
170
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
171
CAT_XDEFINE cglobaled_, %1, 1
172
173
%assign cpuflags_bmi1 (1<<16)| cpuflags_avx | cpuflags_lzcnt
174
%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1
175
%assign cpuflags_avx2 (1<<18)| cpuflags_fma3 | cpuflags_bmi2
176
+%assign cpuflags_avx512 (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
177
178
-%assign cpuflags_cache32 (1<<19)
179
-%assign cpuflags_cache64 (1<<20)
180
-%assign cpuflags_slowctz (1<<21)
181
+%assign cpuflags_cache32 (1<<20)
182
+%assign cpuflags_cache64 (1<<21)
183
%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant
184
%assign cpuflags_atom (1<<23)
185
186
187
%endif
188
%endmacro
189
190
-; Merge mmx and sse*
191
+; Merge mmx and sse*, and avx*
192
; m# is a simd register of the currently selected size
193
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
194
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
195
-; (All 3 remain in sync through SWAP.)
196
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
197
+; (All 4 remain in sync through SWAP.)
198
199
%macro CAT_XDEFINE 3
200
%xdefine %1%2 %3
201
x265_2.7.tar.gz/source/common/x86/x86util.asm -> x265_2.9.tar.gz/source/common/x86/x86util.asm
Changed
101
1
2
pminsw %2, %4
3
%endmacro
4
5
+%macro MOVHL 2 ; dst, src
6
+%ifidn %1, %2
7
+ punpckhqdq %1, %2
8
+%elif cpuflag(avx)
9
+ punpckhqdq %1, %2, %2
10
+%elif cpuflag(sse4)
11
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
12
+%else
13
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
14
+%endif
15
+%endmacro
16
+
17
%macro HADDD 2 ; sum junk
18
-%if sizeof%1 == 32
19
-%define %2 xmm%2
20
- vextracti128 %2, %1, 1
21
-%define %1 xmm%1
22
- paddd %1, %2
23
+%if sizeof%1 >= 64
24
+ vextracti32x8 ymm%2, zmm%1, 1
25
+ paddd ymm%1, ymm%2
26
%endif
27
-%if mmsize >= 16
28
-%if cpuflag(xop) && sizeof%1 == 16
29
- vphadddq %1, %1
30
+%if sizeof%1 >= 32
31
+ vextracti128 xmm%2, ymm%1, 1
32
+ paddd xmm%1, xmm%2
33
+%endif
34
+%if sizeof%1 >= 16
35
+ MOVHL xmm%2, xmm%1
36
+ paddd xmm%1, xmm%2
37
%endif
38
- movhlps %2, %1
39
- paddd %1, %2
40
+%if cpuflag(xop) && sizeof%1 == 16
41
+ vphadddq xmm%1, xmm%1
42
%endif
43
%if notcpuflag(xop)
44
- PSHUFLW %2, %1, q0032
45
- paddd %1, %2
46
+ PSHUFLW xmm%2, xmm%1, q1032
47
+ paddd xmm%1, xmm%2
48
%endif
49
-%undef %1
50
-%undef %2
51
%endmacro
52
53
%macro HADDW 2 ; reg, tmp
54
%if cpuflag(xop) && sizeof%1 == 16
55
vphaddwq %1, %1
56
- movhlps %2, %1
57
+ MOVHL %2, %1
58
paddd %1, %2
59
%else
60
pmaddwd %1, [pw_1]
61
62
%macro HADDUW 2
63
%if cpuflag(xop) && sizeof%1 == 16
64
vphadduwq %1, %1
65
- movhlps %2, %1
66
+ MOVHL %2, %1
67
paddd %1, %2
68
%else
69
HADDUWD %1, %2
70
71
%if %6 ; %5 aligned?
72
mova %1, %4
73
psubw %1, %5
74
+%elif cpuflag(avx)
75
+ movu %1, %4
76
+ psubw %1, %5
77
%else
78
movu %1, %4
79
movu %2, %5
80
psubw %1, %2
81
%endif
82
%else ; !HIGH_BIT_DEPTH
83
-%ifidn %3, none
84
movh %1, %4
85
movh %2, %5
86
+%ifidn %3, none
87
punpcklbw %1, %2
88
punpcklbw %2, %2
89
- psubw %1, %2
90
%else
91
- movh %1, %4
92
punpcklbw %1, %3
93
- movh %2, %5
94
punpcklbw %2, %3
95
- psubw %1, %2
96
%endif
97
+ psubw %1, %2
98
%endif ; HIGH_BIT_DEPTH
99
%endmacro
100
101
x265_2.7.tar.gz/source/common/yuv.cpp -> x265_2.9.tar.gz/source/common/yuv.cpp
Changed
39
1
2
3
void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp)
4
{
5
- primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
6
+ primitives.cu[log2SizeL - 2].add_ps[(m_size % 64 == 0) && (srcYuv0.m_size % 64 == 0) && (srcYuv1.m_size % 64 == 0)](m_buf[0],
7
+ m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
8
if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
9
{
10
- primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
11
- primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
12
+ primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps[(m_csize % 64 == 0) && (srcYuv0.m_csize % 64 ==0) && (srcYuv1.m_csize % 64 == 0)](m_buf[1],
13
+ m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
14
+ primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps[(m_csize % 64 == 0) && (srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0)](m_buf[2],
15
+ m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
16
}
17
if (picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
18
{
19
20
const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
21
const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
22
pixel* dstY = getLumaAddr(absPartIdx);
23
- primitives.pu[part].addAvg(srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
24
+ primitives.pu[part].addAvg[(srcYuv0.m_size % 64 == 0) && (srcYuv1.m_size % 64 == 0) && (m_size % 64 == 0)](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
25
}
26
if (bChroma)
27
{
28
29
const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
30
pixel* dstU = getCbAddr(absPartIdx);
31
pixel* dstV = getCrAddr(absPartIdx);
32
- primitives.chroma[m_csp].pu[part].addAvg(srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
33
- primitives.chroma[m_csp].pu[part].addAvg(srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
34
+ primitives.chroma[m_csp].pu[part].addAvg[(srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0) && (m_csize % 64 == 0)](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
35
+ primitives.chroma[m_csp].pu[part].addAvg[(srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0) && (m_csize % 64 == 0)](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
36
}
37
}
38
39
x265_2.7.tar.gz/source/common/yuv.h -> x265_2.9.tar.gz/source/common/yuv.h
Changed
9
1
2
class Yuv
3
{
4
public:
5
-
6
pixel* m_buf[3];
7
8
uint32_t m_size;
9
x265_2.7.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.cpp -> x265_2.9.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.cpp
Changed
28
1
2
const std::string BezierCurveNames::NumberOfAnchors = std::string("NumberOfAnchors");
3
const std::string BezierCurveNames::KneePointX = std::string("KneePointX");
4
const std::string BezierCurveNames::KneePointY = std::string("KneePointY");
5
+const std::string BezierCurveNames::AnchorsTag = std::string("Anchors");
6
const std::string BezierCurveNames::Anchors[] = {std::string("Anchor0"),
7
std::string("Anchor1"),
8
std::string("Anchor2"),
9
10
11
const std::string PercentileNames::TagName = std::string("PercentileLuminance");
12
const std::string PercentileNames::NumberOfPercentiles = std::string("NumberOfPercentiles");
13
+const std::string PercentileNames::DistributionIndex = std::string("DistributionIndex");
14
+const std::string PercentileNames::DistributionValues = std::string("DistributionValues");
15
const std::string PercentileNames::PercentilePercentageValue[] = {std::string("PercentilePercentage0"),
16
std::string("PercentilePercentage1"),
17
std::string("PercentilePercentage2"),
18
19
20
21
const std::string LuminanceNames::TagName = std::string("LuminanceParameters");
22
+const std::string LuminanceNames::LlcTagName = std::string("LuminanceDistributions");
23
const std::string LuminanceNames::AverageRGB = std::string("AverageRGB");
24
+const std::string LuminanceNames::MaxSCL = std::string("MaxScl");
25
const std::string LuminanceNames::MaxSCL0 = std::string("MaxScl0");
26
const std::string LuminanceNames::MaxSCL1 = std::string("MaxScl1");
27
const std::string LuminanceNames::MaxSCL2 = std::string("MaxScl2");
28
x265_2.7.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.h -> x265_2.9.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.h
Changed
28
1
2
static const std::string NumberOfAnchors;
3
static const std::string KneePointX;
4
static const std::string KneePointY;
5
+ static const std::string AnchorsTag;
6
static const std::string Anchors[14];
7
};
8
//Ellipse Selection Data
9
10
public:
11
static const std::string TagName;
12
static const std::string NumberOfPercentiles;
13
+ static const std::string DistributionIndex;
14
+ static const std::string DistributionValues;
15
static const std::string PercentilePercentageValue[15];
16
static const std::string PercentileLuminanceValue[15];
17
};
18
19
{
20
public:
21
static const std::string TagName;
22
+ static const std::string LlcTagName;
23
static const std::string AverageRGB;
24
+ static const std::string MaxSCL;
25
static const std::string MaxSCL0;
26
static const std::string MaxSCL1;
27
static const std::string MaxSCL2;
28
x265_2.7.tar.gz/source/dynamicHDR10/metadataFromJson.cpp -> x265_2.9.tar.gz/source/dynamicHDR10/metadataFromJson.cpp
Changed
201
1
2
int mCurrentStreamBit;
3
int mCurrentStreamByte;
4
5
- bool luminanceParamFromJson(const Json &data, LuminanceParameters &obj)
6
+ bool luminanceParamFromJson(const Json &data, LuminanceParameters &obj, const JsonType jsonType)
7
{
8
JsonObject lumJsonData = data.object_items();
9
if(!lumJsonData.empty())
10
{
11
- JsonObject percentileData = lumJsonData[PercentileNames::TagName].object_items();
12
- obj.order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
13
-
14
- obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
15
- obj.maxRLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL0].number_value());
16
- obj.maxGLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL1].number_value());
17
- obj.maxBLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL2].number_value());
18
-
19
- if(!percentileData.empty())
20
- {
21
- obj.percentiles.resize(obj.order);
22
- for(int i = 0; i < obj.order; ++i)
23
- {
24
- std::string percentileTag = PercentileNames::TagName;
25
- percentileTag += std::to_string(i);
26
- obj.percentiles[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
27
- }
28
- }
29
-
30
- return true;
31
- }
32
- return false;
33
- }
34
-
35
- bool percentagesFromJson(const Json &data, std::vector<unsigned int> &percentages)
36
- {
37
- JsonObject jsonData = data.object_items();
38
- if(!jsonData.empty())
39
- {
40
- JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
41
- int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
42
-
43
- percentages.resize(order);
44
- for(int i = 0; i < order; ++i)
45
- {
46
- std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
47
- percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
48
- }
49
-
50
- return true;
51
- }
52
+ switch(jsonType)
53
+ {
54
+ case LEGACY:
55
+ {
56
+ obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
57
+ obj.maxRLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL0].number_value());
58
+ obj.maxGLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL1].number_value());
59
+ obj.maxBLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL2].number_value());
60
+
61
+ JsonObject percentileData = lumJsonData[PercentileNames::TagName].object_items();
62
+ obj.order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
63
+ if(!percentileData.empty())
64
+ {
65
+ obj.percentiles.resize(obj.order);
66
+ for(int i = 0; i < obj.order; ++i)
67
+ {
68
+ std::string percentileTag = PercentileNames::TagName;
69
+ percentileTag += std::to_string(i);
70
+ obj.percentiles[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
71
+ }
72
+ }
73
+ return true;
74
+ } break;
75
+ case LLC:
76
+ {
77
+ obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
78
+ JsonArray maxScl = lumJsonData[LuminanceNames::MaxSCL].array_items();
79
+ obj.maxRLuminance = static_cast<float>(maxScl[0].number_value());
80
+ obj.maxGLuminance = static_cast<float>(maxScl[1].number_value());
81
+ obj.maxBLuminance = static_cast<float>(maxScl[2].number_value());
82
+
83
+ JsonObject percentileData = lumJsonData[LuminanceNames::LlcTagName].object_items();
84
+ if(!percentileData.empty())
85
+ {
86
+ JsonArray distributionValues = percentileData[PercentileNames::DistributionValues].array_items();
87
+ obj.order = static_cast<int>(distributionValues.size());
88
+ obj.percentiles.resize(obj.order);
89
+ for(int i = 0; i < obj.order; ++i)
90
+ {
91
+ obj.percentiles[i] = static_cast<unsigned int>(distributionValues[i].int_value());
92
+ }
93
+ }
94
+ return true;
95
+ } break;
96
+ }
97
+ }
98
return false;
99
}
100
101
- bool percentagesFromJson(const Json &data, unsigned int *percentages)
102
+ bool percentagesFromJson(const Json &data, std::vector<unsigned int> &percentages, const JsonType jsonType)
103
{
104
JsonObject jsonData = data.object_items();
105
if(!jsonData.empty())
106
{
107
- JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
108
- int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
109
-
110
- for(int i = 0; i < order; ++i)
111
- {
112
- std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
113
- percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
114
- }
115
+ switch(jsonType)
116
+ {
117
+ case LEGACY:
118
+ {
119
+ JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
120
+ int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
121
+ percentages.resize(order);
122
+ for(int i = 0; i < order; ++i)
123
+ {
124
+ std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
125
+ percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
126
+ }
127
+ return true;
128
+ } break;
129
+ case LLC:
130
+ {
131
+ JsonObject percentileData = jsonData[LuminanceNames::LlcTagName].object_items();
132
+ if(!percentileData.empty())
133
+ {
134
+ JsonArray percentageValues = percentileData[PercentileNames::DistributionIndex].array_items();
135
+ int order = static_cast<int>(percentageValues.size());
136
+ percentages.resize(order);
137
+ for(int i = 0; i < order; ++i)
138
+ {
139
+ percentages[i] = static_cast<unsigned int>(percentageValues[i].int_value());
140
+ }
141
+ }
142
+ return true;
143
+ } break;
144
+ }
145
146
- return true;
147
}
148
return false;
149
}
150
151
- bool bezierCurveFromJson(const Json &data, BezierCurveData &obj)
152
+ bool bezierCurveFromJson(const Json &data, BezierCurveData &obj, const JsonType jsonType)
153
{
154
JsonObject jsonData = data.object_items();
155
if(!jsonData.empty())
156
{
157
- obj.order = jsonData[BezierCurveNames::NumberOfAnchors].int_value();
158
- obj.coeff.resize(obj.order);
159
- obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
160
- obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
161
- for(int i = 0; i < obj.order; ++i)
162
- {
163
- obj.coeff[i] = jsonData[BezierCurveNames::Anchors[i]].int_value();
164
- }
165
-
166
- return true;
167
+ switch(jsonType)
168
+ {
169
+ case LEGACY:
170
+ {
171
+ obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
172
+ obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
173
+ obj.order = jsonData[BezierCurveNames::NumberOfAnchors].int_value();
174
+ obj.coeff.resize(obj.order);
175
+ for(int i = 0; i < obj.order; ++i)
176
+ {
177
+ obj.coeff[i] = jsonData[BezierCurveNames::Anchors[i]].int_value();
178
+ }
179
+ return true;
180
+ } break;
181
+ case LLC:
182
+ {
183
+ obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
184
+ obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
185
+ JsonArray anchorValues = data[BezierCurveNames::AnchorsTag].array_items();
186
+ obj.order = static_cast<int>(anchorValues.size());
187
+ obj.coeff.resize(obj.order);
188
+ for(int i = 0; i < obj.order; ++i)
189
+ {
190
+ obj.coeff[i] = anchorValues[i].int_value();
191
+ }
192
+ return true;
193
+ } break;
194
+ }
195
}
196
return false;
197
}
198
199
void setPayloadSize(uint8_t *dataStream, int positionOnStream, int payload)
200
{
201
x265_2.7.tar.gz/source/dynamicHDR10/metadataFromJson.h -> x265_2.9.tar.gz/source/dynamicHDR10/metadataFromJson.h
Changed
31
1
2
#define METADATAFROMJSON_H
3
4
#include<stdint.h>
5
-#include "string"
6
+#include<cstring>
7
#include "JsonHelper.h"
8
9
class metadataFromJson
10
11
metadataFromJson();
12
~metadataFromJson();
13
14
+ enum JsonType{
15
+ LEGACY,
16
+ LLC
17
+ };
18
+
19
20
/**
21
* @brief frameMetadataFromJson: Generates a sigle frame metadata array from Json file with all
22
23
24
class DynamicMetaIO;
25
DynamicMetaIO *mPimpl;
26
- void fillMetadataArray(const JsonArray &fileData, int frame, uint8_t *&metadata);
27
+ void fillMetadataArray(const JsonArray &fileData, int frame, const JsonType jsonType, uint8_t *&metadata);
28
};
29
30
#endif // METADATAFROMJSON_H
31
x265_2.7.tar.gz/source/encoder/analysis.cpp -> x265_2.9.tar.gz/source/encoder/analysis.cpp
Changed
201
1
2
using namespace X265_NS;
3
4
/* An explanation of rate distortion levels (--rd-level)
5
- *
6
+ *
7
* rd-level 0 generates no recon per CU (NO RDO or Quant)
8
*
9
* sa8d selection between merge / skip / inter / intra and split
10
11
for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
12
ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
13
}
14
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead)
15
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
16
{
17
- m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata;
18
- m_multipassDepth = &m_multipassAnalysis->depth[ctu.m_cuAddr * ctu.m_numPartitions];
19
- if (m_slice->m_sliceType != I_SLICE)
20
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
21
+ m_reuseInterDataCTU = m_frame->m_analysisData.interData;
22
+ for (int dir = 0; dir < numPredDir; dir++)
23
{
24
- int numPredDir = m_slice->isInterP() ? 1 : 2;
25
- for (int dir = 0; dir < numPredDir; dir++)
26
- {
27
- m_multipassMv[dir] = &m_multipassAnalysis->m_mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
28
- m_multipassMvpIdx[dir] = &m_multipassAnalysis->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
29
- m_multipassRef[dir] = &m_multipassAnalysis->ref[dir][ctu.m_cuAddr * ctu.m_numPartitions];
30
- }
31
- m_multipassModes = &m_multipassAnalysis->modes[ctu.m_cuAddr * ctu.m_numPartitions];
32
+ m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
33
+ m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
34
}
35
+ m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
36
+ m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
37
+ m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
38
}
39
-
40
+
41
if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel < 10)
42
{
43
int numPredDir = m_slice->isInterP() ? 1 : 2;
44
- m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
45
+ m_reuseInterDataCTU = m_frame->m_analysisData.interData;
46
m_reuseRef = &m_reuseInterDataCTU->ref [ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
47
m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
48
m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
49
50
51
if (m_slice->m_sliceType == I_SLICE)
52
{
53
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
54
+ x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
55
if (m_param->analysisLoad && m_param->analysisReuseLevel > 1)
56
{
57
memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
58
59
60
if (bCopyAnalysis)
61
{
62
- analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
63
+ x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
64
int posCTU = ctu.m_cuAddr * numPartition;
65
memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
66
memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
67
68
69
if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !m_param->bMVType)
70
{
71
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
72
+ x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
73
memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
74
memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
75
}
76
77
}
78
else if ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16))
79
{
80
- analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
81
+ x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
82
int posCTU = ctu.m_cuAddr * numPartition;
83
memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
84
memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
85
memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
86
if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bMVType == AVC_INFO))
87
{
88
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
89
+ x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
90
memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
91
memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
92
}
93
94
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
95
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
96
97
- bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
98
- bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
99
+ bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
100
+ bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
101
int split = 0;
102
- if (m_param->intraRefine)
103
+ if (m_param->intraRefine && m_param->intraRefine != 4)
104
{
105
- split = ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)) && bDecidedDepth);
106
+ split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
107
+ ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
108
if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
109
bAlreadyDecided = false;
110
}
111
112
if (bAlreadyDecided)
113
{
114
- if (bDecidedDepth)
115
+ if (bDecidedDepth && mightNotSplit)
116
{
117
Mode& mode = md.pred[0];
118
md.bestMode = &mode;
119
120
121
if (m_evaluateInter)
122
{
123
- if (m_param->interRefine == 2)
124
+ if (m_refineLevel == 2)
125
{
126
if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
127
skipModes = true;
128
129
}
130
}
131
}
132
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
133
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
134
{
135
- if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
136
+ if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
137
{
138
- if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
139
+ if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
140
{
141
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
142
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
143
144
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
145
checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
146
if (m_param->rdLevel)
147
- skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2)
148
+ skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
149
&& md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
150
}
151
if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
152
153
154
if (m_evaluateInter)
155
{
156
- if (m_param->interRefine == 2)
157
+ if (m_refineLevel == 2)
158
{
159
if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
160
skipModes = true;
161
162
}
163
}
164
165
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
166
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
167
{
168
- if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
169
+ if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
170
{
171
- if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
172
+ if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
173
{
174
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
175
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
176
177
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
178
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
179
checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
180
- skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) &&
181
+ skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
182
md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
183
refMasks[0] = allSplitRefs;
184
md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
185
186
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
187
bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
188
189
- int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth);
190
+ TrainingData td;
191
+ td.init(parentCTU, cuGeom);
192
193
- if (bDecidedDepth)
194
+ if (!m_param->bDynamicRefine)
195
+ m_refineLevel = m_param->interRefine;
196
+ else
197
+ m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
198
+ int split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
199
+ (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
200
+ td.split = split;
201
x265_2.7.tar.gz/source/encoder/analysis.h -> x265_2.9.tar.gz/source/encoder/analysis.h
Changed
69
1
2
3
protected:
4
/* Analysis data for save/load mode, writes/reads data based on absPartIdx */
5
- analysis_inter_data* m_reuseInterDataCTU;
6
- int32_t* m_reuseRef;
7
- uint8_t* m_reuseDepth;
8
- uint8_t* m_reuseModes;
9
- uint8_t* m_reusePartSize;
10
- uint8_t* m_reuseMergeFlag;
11
+ x265_analysis_inter_data* m_reuseInterDataCTU;
12
+ int32_t* m_reuseRef;
13
+ uint8_t* m_reuseDepth;
14
+ uint8_t* m_reuseModes;
15
+ uint8_t* m_reusePartSize;
16
+ uint8_t* m_reuseMergeFlag;
17
+ x265_analysis_MV* m_reuseMv[2];
18
+ uint8_t* m_reuseMvpIdx[2];
19
20
uint32_t m_splitRefIdx[4];
21
uint64_t* cacheCost;
22
23
-
24
- analysis2PassFrameData* m_multipassAnalysis;
25
- uint8_t* m_multipassDepth;
26
- MV* m_multipassMv[2];
27
- int* m_multipassMvpIdx[2];
28
- int32_t* m_multipassRef[2];
29
- uint8_t* m_multipassModes;
30
-
31
uint8_t m_evaluateInter;
32
+ int32_t m_refineLevel;
33
+
34
uint8_t* m_additionalCtuInfo;
35
int* m_prevCtuInfoChange;
36
+
37
+ struct TrainingData
38
+ {
39
+ uint32_t cuVariance;
40
+ uint8_t predMode;
41
+ uint8_t partSize;
42
+ uint8_t mergeFlag;
43
+ int split;
44
+
45
+ void init(const CUData& parentCTU, const CUGeom& cuGeom)
46
+ {
47
+ cuVariance = 0;
48
+ predMode = parentCTU.m_predMode[cuGeom.absPartIdx];
49
+ partSize = parentCTU.m_partSize[cuGeom.absPartIdx];
50
+ mergeFlag = parentCTU.m_mergeFlag[cuGeom.absPartIdx];
51
+ split = 0;
52
+ }
53
+ };
54
+
55
/* refine RD based on QP for rd-levels 5 and 6 */
56
void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
57
58
59
void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
60
61
int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck = 0, double baseQP = -1);
62
+ uint32_t calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom);
63
+
64
+ void classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
65
+ void trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
66
67
void calculateNormFactor(CUData& ctu, int qp);
68
void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype);
69
x265_2.7.tar.gz/source/encoder/api.cpp -> x265_2.9.tar.gz/source/encoder/api.cpp
Changed
201
1
2
#include "nal.h"
3
#include "bitcost.h"
4
5
+#if ENABLE_LIBVMAF
6
+#include "libvmaf.h"
7
+#endif
8
+
9
/* multilib namespace reflectors */
10
#if LINKED_8BIT
11
namespace x265_8bit {
12
13
pic_in->analysisData.wt = NULL;
14
pic_in->analysisData.intraData = NULL;
15
pic_in->analysisData.interData = NULL;
16
- pic_in->analysis2Pass.analysisFramedata = NULL;
17
+ pic_in->analysisData.distortionData = NULL;
18
}
19
20
- if (pp_nal && numEncoded > 0)
21
+ if (pp_nal && numEncoded > 0 && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
22
{
23
*pp_nal = &encoder->m_nalList.m_nal[0];
24
if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
25
26
else if (pi_nal)
27
*pi_nal = 0;
28
29
- if (numEncoded && encoder->m_param->csvLogLevel)
30
+ if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
31
x265_csvlog_frame(encoder->m_param, pic_out);
32
33
if (numEncoded < 0)
34
35
encoder->fetchStats(outputStats, statsSizeBytes);
36
}
37
}
38
+#if ENABLE_LIBVMAF
39
+void x265_vmaf_encoder_log(x265_encoder* enc, int argc, char **argv, x265_param *param, x265_vmaf_data *vmafdata)
40
+{
41
+ if (enc)
42
+ {
43
+ Encoder *encoder = static_cast<Encoder*>(enc);
44
+ x265_stats stats;
45
+ stats.aggregateVmafScore = x265_calculate_vmafscore(param, vmafdata);
46
+ if(vmafdata->reference_file)
47
+ fclose(vmafdata->reference_file);
48
+ if(vmafdata->distorted_file)
49
+ fclose(vmafdata->distorted_file);
50
+ if(vmafdata)
51
+ x265_free(vmafdata);
52
+ encoder->fetchStats(&stats, sizeof(stats));
53
+ int padx = encoder->m_sps.conformanceWindow.rightOffset;
54
+ int pady = encoder->m_sps.conformanceWindow.bottomOffset;
55
+ x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv);
56
+ }
57
+}
58
+#endif
59
60
void x265_encoder_log(x265_encoder* enc, int argc, char **argv)
61
{
62
if (enc)
63
{
64
Encoder *encoder = static_cast<Encoder*>(enc);
65
- x265_stats stats;
66
+ x265_stats stats;
67
encoder->fetchStats(&stats, sizeof(stats));
68
int padx = encoder->m_sps.conformanceWindow.rightOffset;
69
int pady = encoder->m_sps.conformanceWindow.bottomOffset;
70
71
return -1;
72
}
73
74
+void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis)
75
+{
76
+ x265_analysis_inter_data *interData = analysis->interData = NULL;
77
+ x265_analysis_intra_data *intraData = analysis->intraData = NULL;
78
+ x265_analysis_distortion_data *distortionData = analysis->distortionData = NULL;
79
+ bool isVbv = param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0;
80
+ int numDir = 2; //irrespective of P or B slices set direction as 2
81
+ uint32_t numPlanes = param->internalCsp == X265_CSP_I400 ? 1 : 3;
82
+
83
+#if X265_DEPTH < 10 && (LINKED_10BIT || LINKED_12BIT)
84
+ uint32_t numCUs_sse_t = param->internalBitDepth > 8 ? analysis->numCUsInFrame << 1 : analysis->numCUsInFrame;
85
+#elif X265_DEPTH >= 10 && LINKED_8BIT
86
+ uint32_t numCUs_sse_t = param->internalBitDepth > 8 ? analysis->numCUsInFrame : (analysis->numCUsInFrame + 1U) >> 1;
87
+#else
88
+ uint32_t numCUs_sse_t = analysis->numCUsInFrame;
89
+#endif
90
+
91
+ //Allocate memory for distortionData pointer
92
+ CHECKED_MALLOC_ZERO(distortionData, x265_analysis_distortion_data, 1);
93
+ CHECKED_MALLOC_ZERO(distortionData->distortion, sse_t, analysis->numPartitions * numCUs_sse_t);
94
+ if (param->rc.bStatRead)
95
+ {
96
+ CHECKED_MALLOC_ZERO(distortionData->ctuDistortion, sse_t, numCUs_sse_t);
97
+ CHECKED_MALLOC_ZERO(distortionData->scaledDistortion, double, analysis->numCUsInFrame);
98
+ CHECKED_MALLOC_ZERO(distortionData->offset, double, analysis->numCUsInFrame);
99
+ CHECKED_MALLOC_ZERO(distortionData->threshold, double, analysis->numCUsInFrame);
100
+ }
101
+ analysis->distortionData = distortionData;
102
+
103
+ if (param->bDisableLookahead && isVbv)
104
+ {
105
+ CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight);
106
+ CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight);
107
+ CHECKED_MALLOC_ZERO(analysis->lookahead.intraVbvCost, uint32_t, analysis->numCUsInFrame);
108
+ CHECKED_MALLOC_ZERO(analysis->lookahead.vbvCost, uint32_t, analysis->numCUsInFrame);
109
+ }
110
+
111
+ //Allocate memory for weightParam pointer
112
+ if (!(param->bMVType == AVC_INFO))
113
+ CHECKED_MALLOC_ZERO(analysis->wt, x265_weight_param, numPlanes * numDir);
114
+
115
+ if (param->analysisReuseLevel < 2)
116
+ return;
117
+
118
+ //Allocate memory for intraData pointer
119
+ CHECKED_MALLOC_ZERO(intraData, x265_analysis_intra_data, 1);
120
+ CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
121
+ CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
122
+ CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
123
+ CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
124
+ analysis->intraData = intraData;
125
+
126
+ //Allocate memory for interData pointer based on ReuseLevels
127
+ CHECKED_MALLOC_ZERO(interData, x265_analysis_inter_data, 1);
128
+ CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
129
+ CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
130
+
131
+ CHECKED_MALLOC_ZERO(interData->mvpIdx[0], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
132
+ CHECKED_MALLOC_ZERO(interData->mvpIdx[1], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
133
+ CHECKED_MALLOC_ZERO(interData->mv[0], x265_analysis_MV, analysis->numPartitions * analysis->numCUsInFrame);
134
+ CHECKED_MALLOC_ZERO(interData->mv[1], x265_analysis_MV, analysis->numPartitions * analysis->numCUsInFrame);
135
+
136
+ if (param->analysisReuseLevel > 4)
137
+ {
138
+ CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
139
+ CHECKED_MALLOC_ZERO(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
140
+ }
141
+ if (param->analysisReuseLevel >= 7)
142
+ {
143
+ CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
144
+ CHECKED_MALLOC(interData->sadCost, int64_t, analysis->numPartitions * analysis->numCUsInFrame);
145
+ for (int dir = 0; dir < numDir; dir++)
146
+ {
147
+ CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame);
148
+ CHECKED_MALLOC_ZERO(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
149
+ }
150
+ }
151
+ else
152
+ {
153
+ if (param->analysisMultiPassRefine || param->analysisMultiPassDistortion){
154
+ CHECKED_MALLOC_ZERO(interData->ref, int32_t, 2 * analysis->numPartitions * analysis->numCUsInFrame);
155
+ }
156
+ else
157
+ CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
158
+ }
159
+ analysis->interData = interData;
160
+
161
+ return;
162
+
163
+fail:
164
+ x265_free_analysis_data(param, analysis);
165
+}
166
+
167
+void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis)
168
+{
169
+ bool isVbv = param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0;
170
+
171
+ //Free memory for Lookahead pointers
172
+ if (param->bDisableLookahead && isVbv)
173
+ {
174
+ X265_FREE(analysis->lookahead.satdForVbv);
175
+ X265_FREE(analysis->lookahead.intraSatdForVbv);
176
+ X265_FREE(analysis->lookahead.vbvCost);
177
+ X265_FREE(analysis->lookahead.intraVbvCost);
178
+ }
179
+
180
+ //Free memory for distortionData pointers
181
+ if (analysis->distortionData)
182
+ {
183
+ X265_FREE((analysis->distortionData)->distortion);
184
+ if (param->rc.bStatRead)
185
+ {
186
+ X265_FREE((analysis->distortionData)->ctuDistortion);
187
+ X265_FREE((analysis->distortionData)->scaledDistortion);
188
+ X265_FREE((analysis->distortionData)->offset);
189
+ X265_FREE((analysis->distortionData)->threshold);
190
+ }
191
+ X265_FREE(analysis->distortionData);
192
+ }
193
+
194
+ /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */
195
+ if (analysis->wt && !(param->bMVType == AVC_INFO))
196
+ X265_FREE(analysis->wt);
197
+
198
+ if (param->analysisReuseLevel < 2)
199
+ return;
200
+
201
x265_2.7.tar.gz/source/encoder/dpb.cpp -> x265_2.9.tar.gz/source/encoder/dpb.cpp
Changed
35
1
2
int pocCurr = slice->m_poc;
3
int type = newFrame->m_lowres.sliceType;
4
bool bIsKeyFrame = newFrame->m_lowres.bKeyframe;
5
-
6
slice->m_nalUnitType = getNalUnitType(pocCurr, bIsKeyFrame);
7
- if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
8
+ if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP)
9
m_lastIDR = pocCurr;
10
slice->m_lastIDR = m_lastIDR;
11
slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
12
13
/* Marking reference pictures when an IDR/CRA is encountered. */
14
void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
15
{
16
- if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
17
+ if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL || nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP)
18
{
19
/* If the nal_unit_type is IDR, all pictures in the reference picture
20
* list are marked as "unused for reference" */
21
22
NalUnitType DPB::getNalUnitType(int curPOC, bool bIsKeyFrame)
23
{
24
if (!curPOC)
25
- return NAL_UNIT_CODED_SLICE_IDR_W_RADL;
26
-
27
+ return NAL_UNIT_CODED_SLICE_IDR_N_LP;
28
if (bIsKeyFrame)
29
- return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : NAL_UNIT_CODED_SLICE_IDR_W_RADL;
30
-
31
+ return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
32
if (m_pocCRA && curPOC < m_pocCRA)
33
// All leading pictures are being marked as TFD pictures here since
34
// current encoder uses all reference pictures while encoding leading
35
x265_2.7.tar.gz/source/encoder/dpb.h -> x265_2.9.tar.gz/source/encoder/dpb.h
Changed
17
1
2
int m_lastIDR;
3
int m_pocCRA;
4
int m_bOpenGOP;
5
+ int m_bhasLeadingPicture;
6
bool m_bRefreshPending;
7
bool m_bTemporalSublayer;
8
PicList m_picList;
9
10
{
11
m_lastIDR = 0;
12
m_pocCRA = 0;
13
+ m_bhasLeadingPicture = param->radl;
14
m_bRefreshPending = false;
15
m_frameDataFreeList = NULL;
16
m_bOpenGOP = param->bOpenGOP;
17
x265_2.7.tar.gz/source/encoder/encoder.cpp -> x265_2.9.tar.gz/source/encoder/encoder.cpp
Changed
201
1
2
m_threadPool = NULL;
3
m_analysisFileIn = NULL;
4
m_analysisFileOut = NULL;
5
+ m_naluFile = NULL;
6
m_offsetEmergency = NULL;
7
m_iFrameNum = 0;
8
m_iPPSQpMinus26 = 0;
9
10
#endif
11
12
m_prevTonemapPayload.payload = NULL;
13
+ m_startPoint = 0;
14
+ m_saveCTUSize = 0;
15
}
16
inline char *strcatFilename(const char *input, const char *suffix)
17
{
18
19
20
if (m_param->bEmitHRDSEI)
21
m_rateControl->initHRD(m_sps);
22
+
23
if (!m_rateControl->init(m_sps))
24
m_aborted = true;
25
if (!m_lookahead->create())
26
m_aborted = true;
27
+
28
initRefIdx();
29
if (m_param->analysisSave && m_param->bUseAnalysisFile)
30
{
31
32
33
m_emitCLLSEI = p->maxCLL || p->maxFALL;
34
35
+ if (m_param->naluFile)
36
+ {
37
+ m_naluFile = x265_fopen(m_param->naluFile, "r");
38
+ if (!m_naluFile)
39
+ {
40
+ x265_log_file(NULL, X265_LOG_ERROR, "%s file not found or Failed to open\n", m_param->naluFile);
41
+ m_aborted = true;
42
+ }
43
+ else
44
+ m_enableNal = 1;
45
+ }
46
+ else
47
+ m_enableNal = 0;
48
+
49
#if ENABLE_HDR10_PLUS
50
if (m_bToneMap)
51
m_numCimInfo = m_hdr10plus_api->hdr10plus_json_to_movie_cim(m_param->toneMapFile, m_cim);
52
#endif
53
+ if (m_param->bDynamicRefine)
54
+ {
55
+ /* Allocate memory for 1 GOP and reuse it for the subsequent GOPs */
56
+ int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
57
+ CHECKED_MALLOC_ZERO(m_variance, uint64_t, size);
58
+ CHECKED_MALLOC_ZERO(m_rdCost, uint64_t, size);
59
+ CHECKED_MALLOC_ZERO(m_trainingCount, uint32_t, size);
60
+ return;
61
+ fail:
62
+ m_aborted = true;
63
+ }
64
}
65
66
void Encoder::stopJobs()
67
68
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
69
int num16x16inCUWidth = m_param->maxCUSize >> 4;
70
uint32_t ctuAddr, offset, cuPos;
71
- analysis_intra_data * intraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
72
- analysis_intra_data * srcIntraData = (analysis_intra_data *)analysis_data->intraData;
73
+ x265_analysis_intra_data * intraData = curFrame->m_analysisData.intraData;
74
+ x265_analysis_intra_data * srcIntraData = analysis_data->intraData;
75
for (int i = 0; i < mbImageHeight; i++)
76
{
77
for (int j = 0; j < mbImageWidth; j++)
78
79
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
80
int num16x16inCUWidth = m_param->maxCUSize >> 4;
81
uint32_t ctuAddr, offset, cuPos;
82
- analysis_inter_data * interData = (analysis_inter_data *)curFrame->m_analysisData.interData;
83
- analysis_inter_data * srcInterData = (analysis_inter_data*)analysis_data->interData;
84
+ x265_analysis_inter_data * interData = curFrame->m_analysisData.interData;
85
+ x265_analysis_inter_data * srcInterData = analysis_data->interData;
86
for (int i = 0; i < mbImageHeight; i++)
87
{
88
for (int j = 0; j < mbImageWidth; j++)
89
90
curFrame->m_analysisData = (*analysis_data);
91
curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
92
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
93
- allocAnalysis(&curFrame->m_analysisData);
94
+ x265_alloc_analysis_data(m_param, &curFrame->m_analysisData);
95
if (m_param->maxCUSize == 16)
96
{
97
if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I)
98
99
100
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
101
size_t count = 0;
102
- analysis_intra_data * currIntraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
103
- analysis_intra_data * intraData = (analysis_intra_data *)analysis_data->intraData;
104
+ x265_analysis_intra_data * currIntraData = curFrame->m_analysisData.intraData;
105
+ x265_analysis_intra_data * intraData = analysis_data->intraData;
106
for (uint32_t d = 0; d < cuBytes; d++)
107
{
108
int bytes = curFrame->m_analysisData.numPartitions >> ((intraData)->depth[d] * 2);
109
110
111
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
112
size_t count = 0;
113
- analysis_inter_data * currInterData = (analysis_inter_data *)curFrame->m_analysisData.interData;
114
- analysis_inter_data * interData = (analysis_inter_data *)analysis_data->interData;
115
+ x265_analysis_inter_data * currInterData = curFrame->m_analysisData.interData;
116
+ x265_analysis_inter_data * interData = analysis_data->interData;
117
for (uint32_t d = 0; d < cuBytes; d++)
118
{
119
int bytes = curFrame->m_analysisData.numPartitions >> ((interData)->depth[d] * 2);
120
memset(&(currInterData)->depth[count], (interData)->depth[d], bytes);
121
memset(&(currInterData)->modes[count], (interData)->modes[d], bytes);
122
- memcpy(&(currInterData)->sadCost[count], &((analysis_inter_data*)analysis_data->interData)->sadCost[d], bytes);
123
+ memcpy(&(currInterData)->sadCost[count], &(analysis_data->interData)->sadCost[d], bytes);
124
if (m_param->analysisReuseLevel > 4)
125
{
126
memset(&(currInterData)->partSize[count], (interData)->partSize[d], bytes);
127
128
if (m_bToneMap)
129
m_hdr10plus_api->hdr10plus_clear_movie(m_cim, m_numCimInfo);
130
#endif
131
-
132
+
133
+ if (m_param->bDynamicRefine)
134
+ {
135
+ X265_FREE(m_variance);
136
+ X265_FREE(m_rdCost);
137
+ X265_FREE(m_trainingCount);
138
+ }
139
if (m_exportedPic)
140
{
141
ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
142
143
}
144
X265_FREE(temp);
145
}
146
+ if (m_naluFile)
147
+ fclose(m_naluFile);
148
if (m_param)
149
{
150
if (m_param->csvfpt)
151
152
}
153
}
154
155
+void Encoder::copyUserSEIMessages(Frame *frame, const x265_picture* pic_in)
156
+{
157
+ x265_sei_payload toneMap;
158
+ toneMap.payload = NULL;
159
+ int toneMapPayload = 0;
160
+
161
+#if ENABLE_HDR10_PLUS
162
+ if (m_bToneMap)
163
+ {
164
+ int currentPOC = m_pocLast;
165
+ if (currentPOC < m_numCimInfo)
166
+ {
167
+ int32_t i = 0;
168
+ toneMap.payloadSize = 0;
169
+ while (m_cim[currentPOC][i] == 0xFF)
170
+ toneMap.payloadSize += m_cim[currentPOC][i++];
171
+ toneMap.payloadSize += m_cim[currentPOC][i];
172
+
173
+ toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize);
174
+ toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
175
+ memcpy(toneMap.payload, &m_cim[currentPOC][i + 1], toneMap.payloadSize);
176
+ toneMapPayload = 1;
177
+ }
178
+ }
179
+#endif
180
+ /* seiMsg will contain SEI messages specified in a fixed file format in POC order.
181
+ * Format of the file : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload> */
182
+ x265_sei_payload seiMsg;
183
+ seiMsg.payload = NULL;
184
+ int userPayload = 0;
185
+ if (m_enableNal)
186
+ {
187
+ readUserSeiFile(seiMsg, m_pocLast);
188
+ if (seiMsg.payload)
189
+ userPayload = 1;;
190
+ }
191
+
192
+ int numPayloads = pic_in->userSEI.numPayloads + toneMapPayload + userPayload;
193
+ frame->m_userSEI.numPayloads = numPayloads;
194
+
195
+ if (frame->m_userSEI.numPayloads)
196
+ {
197
+ if (!frame->m_userSEI.payloads)
198
+ {
199
+ frame->m_userSEI.payloads = new x265_sei_payload[numPayloads];
200
+ for (int i = 0; i < numPayloads; i++)
201
x265_2.7.tar.gz/source/encoder/encoder.h -> x265_2.9.tar.gz/source/encoder/encoder.h
Changed
121
1
2
RPSListNode* prior;
3
};
4
5
+struct cuLocation
6
+{
7
+ bool skipWidth;
8
+ bool skipHeight;
9
+ uint32_t heightInCU;
10
+ uint32_t widthInCU;
11
+ uint32_t oddRowIndex;
12
+ uint32_t evenRowIndex;
13
+ uint32_t switchCondition;
14
+
15
+ void init(x265_param* param)
16
+ {
17
+ skipHeight = false;
18
+ skipWidth = false;
19
+ heightInCU = (param->sourceHeight + param->maxCUSize - 1) >> param->maxLog2CUSize;
20
+ widthInCU = (param->sourceWidth + param->maxCUSize - 1) >> param->maxLog2CUSize;
21
+ evenRowIndex = 0;
22
+ oddRowIndex = param->num4x4Partitions * widthInCU;
23
+ switchCondition = 0; // To switch between odd and even rows
24
+ }
25
+};
26
+
27
+struct puOrientation
28
+{
29
+ bool isVert;
30
+ bool isRect;
31
+ bool isAmp;
32
+
33
+ void init()
34
+ {
35
+ isRect = false;
36
+ isAmp = false;
37
+ isVert = false;
38
+ }
39
+};
40
+
41
+
42
class FrameEncoder;
43
class DPB;
44
class Lookahead;
45
46
Frame* m_exportedPic;
47
FILE* m_analysisFileIn;
48
FILE* m_analysisFileOut;
49
+ FILE* m_naluFile;
50
x265_param* m_param;
51
x265_param* m_latestParam; // Holds latest param during a reconfigure
52
RateControl* m_rateControl;
53
54
double m_cR;
55
56
int m_bToneMap; // Enables tone-mapping
57
+ int m_enableNal;
58
59
#ifdef ENABLE_HDR10_PLUS
60
const hdr10plus_api *m_hdr10plus_api;
61
62
63
x265_sei_payload m_prevTonemapPayload;
64
65
+ /* Collect frame level feature data */
66
+ uint64_t* m_rdCost;
67
+ uint64_t* m_variance;
68
+ uint32_t* m_trainingCount;
69
+ int32_t m_startPoint;
70
+ Lock m_dynamicRefineLock;
71
+
72
+ bool m_saveCTUSize;
73
+
74
Encoder();
75
~Encoder()
76
{
77
78
79
void updateVbvPlan(RateControl* rc);
80
81
- void allocAnalysis(x265_analysis_data* analysis);
82
+ void readAnalysisFile(x265_analysis_data* analysis, int poc, int sliceType);
83
+
84
+ void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn, int paramBytes);
85
86
- void freeAnalysis(x265_analysis_data* analysis);
87
+ void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn, int paramBytes, cuLocation cuLoc);
88
89
- void allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType);
90
+ int getCUIndex(cuLocation* cuLoc, uint32_t* count, int bytes, int flag);
91
92
- void freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType);
93
+ int getPuShape(puOrientation* puOrient, int partSize, int numCTU);
94
95
- void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn);
96
+ void writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncData);
97
+
98
+ void writeAnalysisFileRefine(x265_analysis_data* analysis, FrameData &curEncData);
99
100
- void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData);
101
- void readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int poc, int sliceType);
102
- void writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype);
103
void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
104
105
+ int validateAnalysisData(x265_analysis_data* analysis, int readWriteFlag);
106
+
107
+ void readUserSeiFile(x265_sei_payload& seiMsg, int poc);
108
+
109
void calcRefreshInterval(Frame* frameEnc);
110
111
void initRefIdx();
112
113
void updateRefIdx();
114
bool computeSPSRPSIndex();
115
116
+ void copyUserSEIMessages(Frame *frame, const x265_picture* pic_in);
117
+
118
protected:
119
120
void initVPS(VPS *vps);
121
x265_2.7.tar.gz/source/encoder/entropy.cpp -> x265_2.9.tar.gz/source/encoder/entropy.cpp
Changed
40
1
2
}
3
bDenomCoded = true;
4
}
5
- WRITE_FLAG(wp[0].bPresentFlag, "luma_weight_lX_flag");
6
- totalSignalledWeightFlags += wp[0].bPresentFlag;
7
+ WRITE_FLAG(!!wp[0].wtPresent, "luma_weight_lX_flag");
8
+ totalSignalledWeightFlags += wp[0].wtPresent;
9
}
10
11
if (bChroma)
12
13
for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++)
14
{
15
wp = slice.m_weightPredTable[list][ref];
16
- WRITE_FLAG(wp[1].bPresentFlag, "chroma_weight_lX_flag");
17
- totalSignalledWeightFlags += 2 * wp[1].bPresentFlag;
18
+ WRITE_FLAG(!!wp[1].wtPresent, "chroma_weight_lX_flag");
19
+ totalSignalledWeightFlags += 2 * wp[1].wtPresent;
20
}
21
}
22
23
for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++)
24
{
25
wp = slice.m_weightPredTable[list][ref];
26
- if (wp[0].bPresentFlag)
27
+ if (wp[0].wtPresent)
28
{
29
int deltaWeight = (wp[0].inputWeight - (1 << wp[0].log2WeightDenom));
30
WRITE_SVLC(deltaWeight, "delta_luma_weight_lX");
31
32
33
if (bChroma)
34
{
35
- if (wp[1].bPresentFlag)
36
+ if (wp[1].wtPresent)
37
{
38
for (int plane = 1; plane < 3; plane++)
39
{
40
x265_2.7.tar.gz/source/encoder/frameencoder.cpp -> x265_2.9.tar.gz/source/encoder/frameencoder.cpp
Changed
201
1
2
ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
3
}
4
5
- if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
6
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
7
m_nr = X265_MALLOC(NoiseReduction, 1);
8
if (m_nr)
9
memset(m_nr, 0, sizeof(NoiseReduction));
10
11
return length;
12
}
13
14
+bool FrameEncoder::writeToneMapInfo(x265_sei_payload *payload)
15
+{
16
+ bool payloadChange = false;
17
+ if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
18
+ {
19
+ if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
20
+ payloadChange = true;
21
+ }
22
+ else
23
+ {
24
+ payloadChange = true;
25
+ if (m_top->m_prevTonemapPayload.payload != NULL)
26
+ x265_free(m_top->m_prevTonemapPayload.payload);
27
+ m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t)* payload->payloadSize);
28
+ }
29
+
30
+ if (payloadChange)
31
+ {
32
+ m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
33
+ m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
34
+ memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
35
+ }
36
+
37
+ bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
38
+ return (payloadChange || isIDR);
39
+}
40
+
41
+void FrameEncoder::writeTrailingSEIMessages()
42
+{
43
+ Slice* slice = m_frame->m_encData->m_slice;
44
+ int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
45
+ int32_t payloadSize = 0;
46
+
47
+ if (m_param->decodedPictureHashSEI == 1)
48
+ {
49
+ m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
50
+ for (int i = 0; i < planes; i++)
51
+ MD5Final(&m_seiReconPictureDigest.m_state[i], m_seiReconPictureDigest.m_digest[i]);
52
+ payloadSize = 1 + 16 * planes;
53
+ }
54
+ else if (m_param->decodedPictureHashSEI == 2)
55
+ {
56
+ m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
57
+ for (int i = 0; i < planes; i++)
58
+ crcFinish(m_seiReconPictureDigest.m_crc[i], m_seiReconPictureDigest.m_digest[i]);
59
+ payloadSize = 1 + 2 * planes;
60
+ }
61
+ else if (m_param->decodedPictureHashSEI == 3)
62
+ {
63
+ m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
64
+ for (int i = 0; i < planes; i++)
65
+ checksumFinish(m_seiReconPictureDigest.m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
66
+ payloadSize = 1 + 4 * planes;
67
+ }
68
+
69
+ m_seiReconPictureDigest.setSize(payloadSize);
70
+ m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
71
+}
72
+
73
void FrameEncoder::compressFrame()
74
{
75
ProfileScopeEvent(frameThread);
76
77
* not repeating headers (since AUD is supposed to be the first NAL in the access
78
* unit) */
79
Slice* slice = m_frame->m_encData->m_slice;
80
+
81
if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
82
{
83
m_bs.resetBits();
84
85
m_entropyCoder.codeAUD(*slice);
86
m_bs.writeByteAlignment();
87
m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
88
+ if (m_param->bSingleSeiNal)
89
+ m_bs.resetBits();
90
}
91
if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
92
{
93
94
wa.waitForExit();
95
else
96
weightAnalyse(*slice, *m_frame, *m_param);
97
-
98
}
99
-
100
}
101
else
102
slice->disableWeights();
103
104
for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
105
{
106
WeightParam *w = NULL;
107
- if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
108
+ if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].wtPresent)
109
w = slice->m_weightPredTable[l][ref];
110
slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
111
m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
112
113
114
/* Get the QP for this frame from rate control. This call may block until
115
* frames ahead of it in encode order have called rateControlEnd() */
116
- m_rce.encodeOrder = m_frame->m_encodeOrder;
117
- bool payloadChange = false;
118
- bool writeSei = true;
119
- if (m_param->bDhdr10opt)
120
- {
121
- for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
122
- {
123
- x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
124
- if(payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
125
- {
126
- if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
127
- {
128
- if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
129
- payloadChange = true;
130
- }
131
- else
132
- {
133
- payloadChange = true;
134
- if (m_top->m_prevTonemapPayload.payload != NULL)
135
- x265_free(m_top->m_prevTonemapPayload.payload);
136
- m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * payload->payloadSize);
137
- }
138
-
139
- if (payloadChange)
140
- {
141
- m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
142
- m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
143
- memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
144
- }
145
-
146
- bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
147
- writeSei = payloadChange || isIDR;
148
- }
149
- }
150
- }
151
int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
152
m_rce.newQp = qp;
153
154
155
156
/* reset entropy coders and compute slice id */
157
m_entropyCoder.load(m_initSliceContext);
158
-
159
for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
160
for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
161
m_rows[row].init(m_initSliceContext, sliceId);
162
163
m_outStreams[i].resetBits();
164
}
165
166
+ m_rce.encodeOrder = m_frame->m_encodeOrder;
167
int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
168
169
if (m_frame->m_lowres.bKeyframe)
170
171
bpSei->m_auCpbRemovalDelayDelta = 1;
172
bpSei->m_cpbDelayOffset = 0;
173
bpSei->m_dpbDelayOffset = 0;
174
-
175
// hrdFullness() calculates the initial CPB removal delay and offset
176
m_top->m_rateControl->hrdFullness(bpSei);
177
-
178
- m_bs.resetBits();
179
- bpSei->write(m_bs, *slice->m_sps);
180
- m_bs.writeByteAlignment();
181
-
182
- m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
183
+ bpSei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
184
185
m_top->m_lastBPSEI = m_rce.encodeOrder;
186
}
187
+
188
+ if (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI)
189
+ {
190
+ /* Recovery Point SEI require the SPS to be "activated" */
191
+ SEIRecoveryPoint sei;
192
+ sei.m_recoveryPocCnt = 0;
193
+ sei.m_exactMatchingFlag = true;
194
+ sei.m_brokenLinkFlag = false;
195
+ sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
196
+ }
197
}
198
199
if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
200
201
x265_2.7.tar.gz/source/encoder/frameencoder.h -> x265_2.9.tar.gz/source/encoder/frameencoder.h
Changed
42
1
2
/* blocks until worker thread is done, returns access unit */
3
Frame *getEncodedPicture(NALList& list);
4
5
+ void initDecodedPictureHashSEI(int row, int cuAddr, int height);
6
+
7
Event m_enable;
8
Event m_done;
9
Event m_completionEvent;
10
11
double m_ssim;
12
uint64_t m_accessUnitBits;
13
uint32_t m_ssimCnt;
14
- MD5Context m_state[3];
15
- uint32_t m_crc[3];
16
- uint32_t m_checksum[3];
17
18
volatile int m_activeWorkerCount; // count of workers currently encoding or filtering CTUs
19
volatile int m_totalActiveWorkerCount; // sum of m_activeWorkerCount sampled at end of each CTU
20
21
void threadMain();
22
int collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
23
void noiseReductionUpdate();
24
+ void writeTrailingSEIMessages();
25
+ bool writeToneMapInfo(x265_sei_payload *payload);
26
27
/* Called by WaveFront::findJob() */
28
virtual void processRow(int row, int threadId);
29
30
void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); }
31
void enableRowEncoder(int row) { WaveFront::enableRow(row * 2 + 0); }
32
void enableRowFilter(int row) { WaveFront::enableRow(row * 2 + 1); }
33
+#if ENABLE_LIBVMAF
34
+ void vmafFrameLevelScore();
35
+#endif
36
+ void collectDynDataFrame();
37
+ void computeAvgTrainingData();
38
+ void collectDynDataRow(CUData& ctu, FrameStats* rowStats);
39
};
40
}
41
42
x265_2.7.tar.gz/source/encoder/framefilter.cpp -> x265_2.9.tar.gz/source/encoder/framefilter.cpp
Changed
82
1
2
3
if (m_param->maxSlices == 1)
4
{
5
- if (m_param->decodedPictureHashSEI == 1)
6
- {
7
- uint32_t height = m_parallelFilter[row].getCUHeight();
8
- uint32_t width = reconPic->m_picWidth;
9
- intptr_t stride = reconPic->m_stride;
10
-
11
- if (!row)
12
- MD5Init(&m_frameEncoder->m_state[0]);
13
-
14
- updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
15
- if (m_param->internalCsp != X265_CSP_I400)
16
- {
17
- if (!row)
18
- {
19
- MD5Init(&m_frameEncoder->m_state[1]);
20
- MD5Init(&m_frameEncoder->m_state[2]);
21
- }
22
-
23
- width >>= m_hChromaShift;
24
- height >>= m_vChromaShift;
25
- stride = reconPic->m_strideC;
26
-
27
- updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
28
- updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
29
- }
30
- }
31
- else if (m_param->decodedPictureHashSEI == 2)
32
- {
33
- uint32_t height = m_parallelFilter[row].getCUHeight();
34
- uint32_t width = reconPic->m_picWidth;
35
- intptr_t stride = reconPic->m_stride;
36
-
37
- if (!row)
38
- m_frameEncoder->m_crc[0] = 0xffff;
39
-
40
- updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
41
- if (m_param->internalCsp != X265_CSP_I400)
42
- {
43
- width >>= m_hChromaShift;
44
- height >>= m_vChromaShift;
45
- stride = reconPic->m_strideC;
46
- m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
47
-
48
- updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
49
- updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
50
- }
51
- }
52
- else if (m_param->decodedPictureHashSEI == 3)
53
- {
54
- uint32_t width = reconPic->m_picWidth;
55
- uint32_t height = m_parallelFilter[row].getCUHeight();
56
- intptr_t stride = reconPic->m_stride;
57
- uint32_t cuHeight = m_param->maxCUSize;
58
-
59
- if (!row)
60
- m_frameEncoder->m_checksum[0] = 0;
61
-
62
- updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
63
- if (m_param->internalCsp != X265_CSP_I400)
64
- {
65
- width >>= m_hChromaShift;
66
- height >>= m_vChromaShift;
67
- stride = reconPic->m_strideC;
68
- cuHeight >>= m_vChromaShift;
69
-
70
- if (!row)
71
- m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
72
-
73
- updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
74
- updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
75
- }
76
- }
77
+ uint32_t height = m_parallelFilter[row].getCUHeight();
78
+ m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height);
79
} // end of (m_param->maxSlices == 1)
80
81
if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
82
x265_2.7.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.9.tar.gz/source/encoder/ratecontrol.cpp
Changed
114
1
2
m_predictedBits = m_totalBits;
3
updateVbvPlan(enc);
4
rce->bufferFill = m_bufferFill;
5
+ rce->vbvEndAdj = false;
6
+ if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
7
+ {
8
+ rce->vbvEndAdj = true;
9
+ rce->targetFill = 0;
10
+ }
11
12
int mincr = enc->m_vps.ptl.minCrForLevel;
13
/* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */
14
15
else
16
{
17
/* The spec has a special case for the first frame. */
18
- if (rce->encodeOrder == 0)
19
+ if (curFrame->m_lowres.bKeyframe)
20
{
21
/* 1.5 * (Max( PicSizeInSamplesY, fR * MaxLumaSr) + MaxLumaSr * (AuCpbRemovalTime[ 0 ] -AuNominalRemovalTime[ 0 ])) ? MinCr */
22
double fr = 1. / 300;
23
24
/* 1.5 * MaxLumaSr * (AuCpbRemovalTime[ n ] - AuCpbRemovalTime[ n - 1 ]) / MinCr */
25
rce->frameSizeMaximum = 8 * 1.5 * enc->m_vps.ptl.maxLumaSrForLevel * m_frameDuration / mincr;
26
}
27
+ rce->frameSizeMaximum *= m_param->maxAUSizeFactor;
28
}
29
}
30
if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
31
32
curBits = predictSize(&m_pred[predType], frameQ[type], (double)satd);
33
bufferFillCur -= curBits;
34
}
35
- if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
36
+ if (rce->vbvEndAdj)
37
{
38
bool loopBreak = false;
39
double bufferDiff = m_param->vbvBufferEnd - (m_bufferFill / m_bufferSize);
40
- targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
41
- if (bufferFillCur < targetFill)
42
+ rce->targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
43
+ if (bufferFillCur < rce->targetFill)
44
{
45
q *= 1.01;
46
loopTerminate |= 1;
47
48
double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_rateTolerance;
49
int32_t encodedBitsSoFar = 0;
50
double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
51
+ double vbvEndBias = 0.95;
52
53
/* * Don't increase the row QPs until a sufficent amount of the bits of
54
* the frame have been processed, in case a flat area at the top of the
55
56
while (qpVbv < qpMax
57
&& (((accFrameBits > rce->frameSizePlanned + rcTol) ||
58
(rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) ||
59
- (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv))
60
+ (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv) ||
61
+ (rce->vbvEndAdj && ((rce->bufferFill - accFrameBits) < (rce->targetFill * vbvEndBias))))
62
&& (!m_param->rc.bStrictCbr ? 1 : abrOvershoot > 0.1)))
63
{
64
qpVbv += stepSize;
65
66
while (qpVbv > qpMin
67
&& (qpVbv > curEncData.m_rowStat[0].rowQp || m_singleFrameVbv)
68
&& (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
69
- || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
70
+ || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1
71
+ || (rce->vbvEndAdj && ((rce->bufferFill - accFrameBits) > (rce->targetFill * vbvEndBias))))
72
&& (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
73
{
74
qpVbv -= stepSize;
75
76
FrameData& curEncData = *curFrame->m_encData;
77
int64_t actualBits = bits;
78
Slice *slice = curEncData.m_slice;
79
+ bool bEnableDistOffset = m_param->analysisMultiPassDistortion && m_param->rc.bStatRead;
80
81
- if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion)
82
+ if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion || bEnableDistOffset)
83
{
84
if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
85
{
86
87
rce->qpaRc = curEncData.m_avgQpRc;
88
}
89
90
- if (m_param->rc.aqMode || m_param->bAQMotion)
91
+ if (m_param->rc.aqMode || m_param->bAQMotion || bEnableDistOffset)
92
{
93
double avgQpAq = 0;
94
- /* determine actual avg encoded QP, after AQ/cutree adjustments */
95
+ /* determine actual avg encoded QP, after AQ/cutree/distortion adjustments */
96
for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++)
97
avgQpAq += curEncData.m_rowStat[i].sumQpAq;
98
99
100
/* called to write out the rate control frame stats info in multipass encodes */
101
int RateControl::writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce)
102
{
103
- FrameData& curEncData = *curFrame->m_encData;
104
- int ncu;
105
- if (m_param->rc.qgSize == 8)
106
- ncu = m_ncu * 4;
107
- else
108
- ncu = m_ncu;
109
+ FrameData& curEncData = *curFrame->m_encData;
110
+ int ncu = (m_param->rc.qgSize == 8) ? m_ncu * 4 : m_ncu;
111
char cType = rce->sliceType == I_SLICE ? (curFrame->m_lowres.sliceType == X265_TYPE_IDR ? 'I' : 'i')
112
: rce->sliceType == P_SLICE ? 'P'
113
: IS_REFERENCED(curFrame) ? 'B' : 'b';
114
x265_2.7.tar.gz/source/encoder/ratecontrol.h -> x265_2.9.tar.gz/source/encoder/ratecontrol.h
Changed
10
1
2
double rowCplxrSum;
3
double qpNoVbv;
4
double bufferFill;
5
+ double targetFill;
6
+ bool vbvEndAdj;
7
double frameDuration;
8
double clippedDuration;
9
double frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */
10
x265_2.7.tar.gz/source/encoder/reference.cpp -> x265_2.9.tar.gz/source/encoder/reference.cpp
Changed
24
1
2
cuHeight >>= reconPic->m_vChromaShift;
3
}
4
5
- if (wp[c].bPresentFlag)
6
+ if (wp[c].wtPresent)
7
{
8
if (!weightBuffer[c])
9
{
10
11
12
const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
13
pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
14
-
15
// Computing weighted CU rows
16
int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
17
- int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
18
+ int padwidth = (width + 31) & ~31; // weightp assembly needs even 32 byte widths
19
primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
20
-
21
// Extending Left & Right
22
primitives.extendRowBorder(dst, stride, width, height, marginX);
23
24
x265_2.7.tar.gz/source/encoder/search.cpp -> x265_2.9.tar.gz/source/encoder/search.cpp
Changed
201
1
2
m_me.init(param.internalCsp);
3
4
bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
5
- if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
6
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
7
ok &= m_quant.allocNoiseReduction(param);
8
9
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
10
11
// store original entropy coding status
12
if (bEnableRDOQ)
13
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
14
-
15
- primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
16
+ primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
17
18
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
19
if (numSig)
20
{
21
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
22
- primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
23
+ bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
24
+ bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
25
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
26
+ bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
27
+ primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
28
}
29
else
30
// no coded residual, recon = pred
31
32
33
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
34
pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
35
+ bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
36
uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
37
38
- primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
39
+ primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
40
41
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
42
if (numSig)
43
{
44
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
45
- primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
46
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
47
+ bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
48
+ bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
49
+ primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
50
}
51
else if (useTSkip)
52
{
53
54
coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY;
55
56
uint32_t sizeIdx = log2TrSize - 2;
57
- primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
58
+ primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
59
60
PicYuv* reconPic = m_frame->m_reconPic;
61
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
62
63
if (numSig)
64
{
65
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
66
- primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
67
+ bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
68
+ bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
69
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
70
+ bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
71
+ primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
72
cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
73
}
74
else
75
76
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
77
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
78
79
- primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
80
+ primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
81
+
82
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
83
if (numSig)
84
{
85
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
86
- primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
87
+ bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
88
+ bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
89
+ bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
90
+ bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
91
+ primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
92
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
93
}
94
else
95
96
pixel* recon = (useTSkip ? m_tsRecon : reconQt);
97
uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
98
99
- primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
100
+ primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
101
102
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
103
if (numSig)
104
{
105
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
106
- primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
107
+ bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
108
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
109
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
110
+ bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
111
+ primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
112
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
113
}
114
else if (useTSkip)
115
116
117
X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
118
119
- primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
120
+ primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
121
+
122
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
123
if (numSig)
124
{
125
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
126
- primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
127
+ bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
128
+ bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
129
+ bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0;
130
+ bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0);
131
+ primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
132
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
133
}
134
else
135
136
137
pixel nScale[129];
138
intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
139
- primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
140
+ primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
141
142
// we do not estimate filtering for downscaled samples
143
memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel)); // Top & Left pixels
144
145
bestME[list].mvCost = mvCost;
146
}
147
}
148
-
149
-void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv)
150
+void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc)
151
{
152
CUData& cu = interMode.cu;
153
const Slice *slice = m_slice;
154
- MV mv = cu.m_mv[list][pu.puAbsPartIdx];
155
+ MV mv;
156
+ if (m_param->interRefine == 1)
157
+ mv = mvp;
158
+ else
159
+ mv = cu.m_mv[list][pu.puAbsPartIdx];
160
cu.clipMv(mv);
161
MV mvmin, mvmax;
162
setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
163
- m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
164
+ if (m_param->interRefine == 1)
165
+ m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
166
+ m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
167
+ else
168
+ m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
169
}
170
-
171
/* find the best inter prediction for each PU of specified mode */
172
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
173
{
174
175
int totalmebits = 0;
176
MV mvzero(0, 0);
177
Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
178
-
179
MergeData merge;
180
memset(&merge, 0, sizeof(merge));
181
-
182
+ bool useAsMVP = false;
183
for (int puIdx = 0; puIdx < numPart; puIdx++)
184
{
185
MotionData* bestME = interMode.bestME[puIdx];
186
PredictionUnit pu(cu, cuGeom, puIdx);
187
-
188
m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
189
-
190
+ useAsMVP = false;
191
+ x265_analysis_inter_data* interDataCTU = NULL;
192
+ int cuIdx;
193
+ cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
194
+ if (m_param->analysisReuseLevel == 10 && m_param->interRefine > 1)
195
+ {
196
+ interDataCTU = m_frame->m_analysisData.interData;
197
+ if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
198
+ && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
199
+ && !(interDataCTU->mergeFlag[cuIdx + puIdx])
200
+ && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
201
x265_2.7.tar.gz/source/encoder/search.h -> x265_2.9.tar.gz/source/encoder/search.h
Changed
11
1
2
3
// estimation inter prediction (non-skip)
4
void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
5
-
6
- void searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv);
7
+ void searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc);
8
// encode residual and compute rd-cost for inter mode
9
void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
10
void encodeResAndCalcRdSkipCU(Mode& interMode);
11
x265_2.7.tar.gz/source/encoder/sei.cpp -> x265_2.9.tar.gz/source/encoder/sei.cpp
Changed
141
1
2
};
3
4
/* marshal a single SEI message sei, storing the marshalled representation
5
- * in bitstream bs */
6
-void SEI::write(Bitstream& bs, const SPS& sps)
7
+* in bitstream bs */
8
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
9
{
10
- uint32_t type = m_payloadType;
11
+ if (!isNested)
12
+ bs.resetBits();
13
+
14
+ BitCounter counter;
15
+ m_bitIf = &counter;
16
+ writeSEI(sps);
17
+ /* count the size of the payload and return the size in bits */
18
+ X265_CHECK(0 == (counter.getNumberOfWrittenBits() & 7), "payload unaligned\n");
19
+ uint32_t payloadData = counter.getNumberOfWrittenBits() >> 3;
20
+
21
+ // set bitstream
22
m_bitIf = &bs;
23
- BitCounter count;
24
- bool hrdTypes = (m_payloadType == ACTIVE_PARAMETER_SETS || m_payloadType == PICTURE_TIMING || m_payloadType == BUFFERING_PERIOD);
25
- if (hrdTypes)
26
- {
27
- m_bitIf = &count;
28
- /* virtual writeSEI method, write to bit counter to determine size */
29
- writeSEI(sps);
30
- m_bitIf = &bs;
31
- uint32_t payloadType = m_payloadType;
32
- for (; payloadType >= 0xff; payloadType -= 0xff)
33
- WRITE_CODE(0xff, 8, "payload_type");
34
- }
35
- WRITE_CODE(type, 8, "payload_type");
36
- uint32_t payloadSize;
37
- if (hrdTypes || m_payloadType == USER_DATA_UNREGISTERED || m_payloadType == USER_DATA_REGISTERED_ITU_T_T35)
38
+
39
+ uint32_t payloadType = m_payloadType;
40
+ for (; payloadType >= 0xff; payloadType -= 0xff)
41
+ WRITE_CODE(0xff, 8, "payload_type");
42
+ WRITE_CODE(payloadType, 8, "payload_type");
43
+
44
+ uint32_t payloadSize = payloadData;
45
+ for (; payloadSize >= 0xff; payloadSize -= 0xff)
46
+ WRITE_CODE(0xff, 8, "payload_size");
47
+ WRITE_CODE(payloadSize, 8, "payload_size");
48
+
49
+ // virtual writeSEI method, write to bs
50
+ writeSEI(sps);
51
+
52
+ if (!isNested)
53
{
54
- if (hrdTypes)
55
- {
56
- X265_CHECK(0 == (count.getNumberOfWrittenBits() & 7), "payload unaligned\n");
57
- payloadSize = count.getNumberOfWrittenBits() >> 3;
58
- }
59
- else if (m_payloadType == USER_DATA_UNREGISTERED)
60
- payloadSize = m_payloadSize + 16;
61
- else
62
- payloadSize = m_payloadSize;
63
-
64
- for (; payloadSize >= 0xff; payloadSize -= 0xff)
65
- WRITE_CODE(0xff, 8, "payload_size");
66
- WRITE_CODE(payloadSize, 8, "payload_size");
67
+ bs.writeByteAlignment();
68
+ list.serialize(nalUnitType, bs);
69
}
70
- else
71
- WRITE_CODE(m_payloadSize, 8, "payload_size");
72
- /* virtual writeSEI method, write to bs */
73
- writeSEI(sps);
74
}
75
76
void SEI::writeByteAlign()
77
78
{
79
m_payloadSize = size;
80
}
81
+
82
+/* charSet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" */
83
+
84
+char* SEI::base64Decode(char encodedString[], int base64EncodeLength)
85
+{
86
+ char* decodedString;
87
+ decodedString = (char*)malloc(sizeof(char) * ((base64EncodeLength / 4) * 3));
88
+ int i, j, k = 0;
89
+ // stores the bitstream
90
+ int bitstream = 0;
91
+ // countBits stores current number of bits in bitstream
92
+ int countBits = 0;
93
+ // selects 4 characters from encodedString at a time. Find the position of each encoded character in charSet and stores in bitstream
94
+ for (i = 0; i < base64EncodeLength; i += 4)
95
+ {
96
+ bitstream = 0, countBits = 0;
97
+ for (j = 0; j < 4; j++)
98
+ {
99
+ // make space for 6 bits
100
+ if (encodedString[i + j] != '=')
101
+ {
102
+ bitstream = bitstream << 6;
103
+ countBits += 6;
104
+ }
105
+ // Finding the position of each encoded character in charSet and storing in bitstream, use OR '|' operator to store bits
106
+
107
+ if (encodedString[i + j] >= 'A' && encodedString[i + j] <= 'Z')
108
+ bitstream = bitstream | (encodedString[i + j] - 'A');
109
+
110
+ else if (encodedString[i + j] >= 'a' && encodedString[i + j] <= 'z')
111
+ bitstream = bitstream | (encodedString[i + j] - 'a' + 26);
112
+
113
+ else if (encodedString[i + j] >= '0' && encodedString[i + j] <= '9')
114
+ bitstream = bitstream | (encodedString[i + j] - '0' + 52);
115
+
116
+ // '+' occurs in 62nd position in charSet
117
+ else if (encodedString[i + j] == '+')
118
+ bitstream = bitstream | 62;
119
+
120
+ // '/' occurs in 63rd position in charSet
121
+ else if (encodedString[i + j] == '/')
122
+ bitstream = bitstream | 63;
123
+
124
+ // to delete appended bits during encoding
125
+ else
126
+ {
127
+ bitstream = bitstream >> 2;
128
+ countBits -= 2;
129
+ }
130
+ }
131
+
132
+ while (countBits != 0)
133
+ {
134
+ countBits -= 8;
135
+ decodedString[k++] = (bitstream >> countBits) & 255;
136
+ }
137
+ }
138
+ return decodedString;
139
+}
140
+
141
x265_2.7.tar.gz/source/encoder/sei.h -> x265_2.9.tar.gz/source/encoder/sei.h
Changed
135
1
2
#include "common.h"
3
#include "bitstream.h"
4
#include "slice.h"
5
+#include "nal.h"
6
+#include "md5.h"
7
8
namespace X265_NS {
9
// private namespace
10
11
class SEI : public SyntaxElementWriter
12
{
13
public:
14
- /* SEI users call write() to marshal an SEI to a bitstream.
15
- * The write() method calls writeSEI() which encodes the header */
16
- void write(Bitstream& bs, const SPS& sps);
17
-
18
+ /* SEI users call writeSEImessages() to marshal an SEI to a bitstream.
19
+ * The writeSEImessages() method calls writeSEI() which encodes the header */
20
+ void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested);
21
void setSize(uint32_t size);
22
+ static char* base64Decode(char encodedString[], int base64EncodeLength);
23
virtual ~SEI() {}
24
protected:
25
SEIPayloadType m_payloadType;
26
27
void writeByteAlign();
28
};
29
30
+//seongnam.oh@samsung.com :: for the Creative Intent Meta Data Encoding
31
+class SEIuserDataRegistered : public SEI
32
+{
33
+public:
34
+ SEIuserDataRegistered()
35
+ {
36
+ m_payloadType = USER_DATA_REGISTERED_ITU_T_T35;
37
+ m_payloadSize = 0;
38
+ }
39
+
40
+ uint8_t *m_userData;
41
+
42
+ // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com )
43
+ void writeSEI(const SPS&)
44
+ {
45
+ if (!m_userData)
46
+ return;
47
+
48
+ uint32_t i = 0;
49
+ for (; i < m_payloadSize; ++i)
50
+ WRITE_CODE(m_userData[i], 8, "creative_intent_metadata");
51
+ }
52
+};
53
+
54
+static const uint32_t ISO_IEC_11578_LEN = 16;
55
+
56
class SEIuserDataUnregistered : public SEI
57
{
58
public:
59
60
m_payloadType = USER_DATA_UNREGISTERED;
61
m_payloadSize = 0;
62
}
63
- static const uint8_t m_uuid_iso_iec_11578[16];
64
+ static const uint8_t m_uuid_iso_iec_11578[ISO_IEC_11578_LEN];
65
uint8_t *m_userData;
66
void writeSEI(const SPS&)
67
{
68
- for (uint32_t i = 0; i < 16; i++)
69
+ for (uint32_t i = 0; i < ISO_IEC_11578_LEN; i++)
70
WRITE_CODE(m_uuid_iso_iec_11578[i], 8, "sei.uuid_iso_iec_11578[i]");
71
for (uint32_t i = 0; i < m_payloadSize; i++)
72
WRITE_CODE(m_userData[i], 8, "user_data");
73
74
CRC,
75
CHECKSUM,
76
} m_method;
77
- uint8_t m_digest[3][16];
78
+
79
+ MD5Context m_state[3];
80
+ uint32_t m_crc[3];
81
+ uint32_t m_checksum[3];
82
+ uint8_t m_digest[3][16];
83
+
84
void writeSEI(const SPS& sps)
85
{
86
int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
87
88
class SEIRecoveryPoint : public SEI
89
{
90
public:
91
+ SEIRecoveryPoint()
92
+ {
93
+ m_payloadType = RECOVERY_POINT;
94
+ m_payloadSize = 0;
95
+ }
96
int m_recoveryPocCnt;
97
bool m_exactMatchingFlag;
98
bool m_brokenLinkFlag;
99
100
}
101
};
102
103
-//seongnam.oh@samsung.com :: for the Creative Intent Meta Data Encoding
104
-class SEICreativeIntentMeta : public SEI
105
+class SEIAlternativeTC : public SEI
106
{
107
public:
108
- SEICreativeIntentMeta()
109
+ int m_preferredTransferCharacteristics;
110
+ SEIAlternativeTC()
111
{
112
- m_payloadType = USER_DATA_REGISTERED_ITU_T_T35;
113
+ m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
114
m_payloadSize = 0;
115
+ m_preferredTransferCharacteristics = -1;
116
}
117
118
- uint8_t *m_payload;
119
-
120
- // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com )
121
void writeSEI(const SPS&)
122
{
123
- if (!m_payload)
124
- return;
125
-
126
- uint32_t i = 0;
127
- for (; i < m_payloadSize; ++i)
128
- WRITE_CODE(m_payload[i], 8, "creative_intent_metadata");
129
+ WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics");
130
}
131
};
132
+
133
}
134
#endif // ifndef X265_SEI_H
135
x265_2.7.tar.gz/source/encoder/slicetype.cpp -> x265_2.9.tar.gz/source/encoder/slicetype.cpp
Changed
201
1
2
curFrame->m_lowres.wp_sum[y] = 0;
3
}
4
5
- /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
6
- int blockXY = 0;
7
- int blockX = 0, blockY = 0;
8
- double strength = 0.f;
9
+ /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
10
if ((param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) || (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
11
{
12
- /* Need to init it anyways for CU tree */
13
- int cuCount = blockCount;
14
-
15
if (param->rc.aqMode && param->rc.aqStrength == 0)
16
{
17
if (quantOffsets)
18
{
19
- for (int cuxy = 0; cuxy < cuCount; cuxy++)
20
+ for (int cuxy = 0; cuxy < blockCount; cuxy++)
21
{
22
curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
23
curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
24
25
}
26
else
27
{
28
- memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
29
- memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
30
- for (int cuxy = 0; cuxy < cuCount; cuxy++)
31
- curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
32
+ memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
33
+ memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
34
+ for (int cuxy = 0; cuxy < blockCount; cuxy++)
35
+ curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
36
}
37
}
38
39
- /* Need variance data for weighted prediction */
40
+ /* Need variance data for weighted prediction and dynamic refinement*/
41
if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
42
{
43
- for (blockY = 0; blockY < maxRow; blockY += loopIncr)
44
- for (blockX = 0; blockX < maxCol; blockX += loopIncr)
45
- acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
46
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
47
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
48
+ acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
49
}
50
}
51
else
52
{
53
- blockXY = 0;
54
- double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
55
- double bias_strength = 0.f;
56
+ int blockXY = 0;
57
+ double avg_adj_pow2 = 0.f, avg_adj = 0.f, qp_adj = 0.f;
58
+ double bias_strength = 0.f, strength = 0.f;
59
if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
60
{
61
- double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
62
- curFrame->m_lowres.frameVariance = 0;
63
- uint64_t rowVariance = 0;
64
- for (blockY = 0; blockY < maxRow; blockY += loopIncr)
65
- {
66
- rowVariance = 0;
67
- for (blockX = 0; blockX < maxCol; blockX += loopIncr)
68
- {
69
- uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
70
- curFrame->m_lowres.blockVariance[blockXY] = energy;
71
- rowVariance += energy;
72
+ double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
73
+
74
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
75
+ {
76
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
77
+ {
78
+ uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
79
qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
80
curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
81
avg_adj += qp_adj;
82
avg_adj_pow2 += qp_adj * qp_adj;
83
blockXY++;
84
}
85
- curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
86
}
87
- curFrame->m_lowres.frameVariance /= maxRow;
88
avg_adj /= blockCount;
89
avg_adj_pow2 /= blockCount;
90
strength = param->rc.aqStrength * avg_adj;
91
- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (modeTwoConst)) / avg_adj;
92
+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
93
bias_strength = param->rc.aqStrength;
94
}
95
else
96
strength = param->rc.aqStrength * 1.0397f;
97
98
blockXY = 0;
99
- for (blockY = 0; blockY < maxRow; blockY += loopIncr)
100
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
101
{
102
- for (blockX = 0; blockX < maxCol; blockX += loopIncr)
103
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
104
{
105
if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
106
{
107
108
else
109
{
110
uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize);
111
- qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
112
+ qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
113
}
114
115
if (param->bHDROpt)
116
117
curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
118
}
119
}
120
+
121
+ if (param->bDynamicRefine)
122
+ {
123
+ int blockXY = 0;
124
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
125
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
126
+ {
127
+ curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
128
+ blockXY++;
129
+ }
130
+ }
131
}
132
133
void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
134
135
pixel *src = ref.fpelPlane[0];
136
intptr_t stride = fenc.lumaStride;
137
138
- if (wp.bPresentFlag)
139
+ if (wp.wtPresent)
140
{
141
int offset = wp.inputOffset << (X265_DEPTH - 8);
142
int scale = wp.inputWeight;
143
144
int deltaIndex = fenc.frameNum - ref.frameNum;
145
146
WeightParam wp;
147
- wp.bPresentFlag = false;
148
+ wp.wtPresent = 0;
149
150
if (!wbuffer[0])
151
{
152
153
}
154
155
int bframes, brefs;
156
- for (bframes = 0, brefs = 0;; bframes++)
157
+ if (!m_param->analysisLoad)
158
{
159
- Lowres& frm = list[bframes]->m_lowres;
160
-
161
- if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
162
+ for (bframes = 0, brefs = 0;; bframes++)
163
{
164
- frm.sliceType = X265_TYPE_B;
165
- x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
166
- frm.frameNum);
167
- }
168
+ Lowres& frm = list[bframes]->m_lowres;
169
170
- /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
171
- * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
172
- else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
173
- m_param->maxNumReferences <= (brefs + 3))
174
- {
175
- frm.sliceType = X265_TYPE_B;
176
- x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
177
- frm.sliceType, m_param->maxNumReferences);
178
- }
179
- if ((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax &&
180
- (!m_extendGopBoundary || frm.frameNum - m_lastKeyframe >= m_param->keyframeMax + m_param->gopLookahead))
181
- {
182
- if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
183
- frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
184
- bool warn = frm.sliceType != X265_TYPE_IDR;
185
- if (warn && m_param->bOpenGOP)
186
- warn &= frm.sliceType != X265_TYPE_I;
187
- if (warn)
188
+ if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
189
{
190
- x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
191
- frm.sliceType, frm.frameNum);
192
- frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
193
+ frm.sliceType = X265_TYPE_B;
194
+ x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
195
+ frm.frameNum);
196
}
197
- }
198
- if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
199
- {
200
- if (m_param->bOpenGOP)
201
x265_2.7.tar.gz/source/encoder/weightPrediction.cpp -> x265_2.9.tar.gz/source/encoder/weightPrediction.cpp
Changed
56
1
2
int denom = w->log2WeightDenom;
3
int round = denom ? 1 << (denom - 1) : 0;
4
int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
5
- int pwidth = ((width + 15) >> 4) << 4;
6
-
7
+ int pwidth = ((width + 31) >> 5) << 5;
8
primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
9
weight, round << correction, denom + correction, offset);
10
ref = weightTemp;
11
12
for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
13
{
14
denom = plane ? chromaDenom : lumaDenom;
15
- if (plane && !weights[0].bPresentFlag)
16
+ if (plane && !weights[0].wtPresent)
17
break;
18
19
/* Early termination */
20
21
}
22
}
23
24
- if (weights[0].bPresentFlag)
25
+ if (weights[0].wtPresent)
26
{
27
// Make sure both chroma channels match
28
- if (weights[1].bPresentFlag != weights[2].bPresentFlag)
29
+ if (weights[1].wtPresent != weights[2].wtPresent)
30
{
31
- if (weights[1].bPresentFlag)
32
+ if (weights[1].wtPresent)
33
weights[2] = weights[1];
34
else
35
weights[1] = weights[2];
36
37
for (int list = 0; list < numPredDir; list++)
38
{
39
WeightParam* w = &wp[list][0][0];
40
- if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
41
+ if (w[0].wtPresent || w[1].wtPresent || w[2].wtPresent)
42
{
43
bWeighted = true;
44
p += sprintf(buf + p, " [L%d:R0 ", list);
45
- if (w[0].bPresentFlag)
46
+ if (w[0].wtPresent)
47
p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
48
- if (w[1].bPresentFlag)
49
+ if (w[1].wtPresent)
50
p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
51
- if (w[2].bPresentFlag)
52
+ if (w[2].wtPresent)
53
p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
54
p += sprintf(buf + p, "]");
55
}
56
x265_2.7.tar.gz/source/test/ipfilterharness.cpp -> x265_2.9.tar.gz/source/test/ipfilterharness.cpp
Changed
201
1
2
return true;
3
}
4
5
+bool IPFilterHarness::check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
6
+{
7
+ for (int i = 0; i < TEST_CASES; i++)
8
+ {
9
+ int index = i % TEST_CASES;
10
+ intptr_t rand_srcStride[] = { 128, 192, 256, 512 };
11
+ intptr_t dstStride[] = { 192, 256, 512, 576 };
12
+ for (int p = 0; p < 4; p++)
13
+ {
14
+ ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
15
+ checked(opt, pixel_test_buff[index] + (64 * i), rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
16
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
17
+ return false;
18
+ }
19
+ reportfail();
20
+ }
21
+
22
+ return true;
23
+}
24
+
25
bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
26
{
27
for (int i = 0; i < ITERS; i++)
28
29
return true;
30
}
31
32
+bool IPFilterHarness::check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
33
+{
34
+ for (int i = 0; i < TEST_CASES; i++)
35
+ {
36
+ int index = i % TEST_CASES;
37
+ intptr_t rand_srcStride[] = { 128, 192, 256, 512};
38
+ intptr_t dstStride[] = { 192, 256, 512, 576 };
39
+
40
+ for (int p = 0; p < 4; p++)
41
+ {
42
+ ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
43
+
44
+ checked(opt, pixel_test_buff[index], rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
45
+
46
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
47
+ return false;
48
+ }
49
+ reportfail();
50
+ }
51
+
52
+ return true;
53
+}
54
+
55
bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
56
{
57
58
59
return false;
60
}
61
}
62
- if (opt.pu[value].convert_p2s)
63
+ if (opt.pu[value].convert_p2s[NONALIGNED])
64
{
65
- if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s, opt.pu[value].convert_p2s))
66
+ if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s[NONALIGNED], opt.pu[value].convert_p2s[NONALIGNED]))
67
{
68
printf("convert_p2s[%s]", lumaPartStr[value]);
69
return false;
70
}
71
}
72
+ if (opt.pu[value].convert_p2s[ALIGNED])
73
+ {
74
+ if (!check_IPFilterLumaP2S_aligned_primitive(ref.pu[value].convert_p2s[ALIGNED], opt.pu[value].convert_p2s[ALIGNED]))
75
+ {
76
+ printf("convert_p2s_aligned[%s]", lumaPartStr[value]);
77
+ return false;
78
+ }
79
+ }
80
}
81
82
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
83
84
return false;
85
}
86
}
87
- if (opt.chroma[csp].pu[value].p2s)
88
+ if (opt.chroma[csp].pu[value].p2s[ALIGNED])
89
+ {
90
+ if (!check_IPFilterChromaP2S_aligned_primitive(ref.chroma[csp].pu[value].p2s[ALIGNED], opt.chroma[csp].pu[value].p2s[ALIGNED]))
91
+ {
92
+ printf("chroma_p2s_aligned[%s]", chromaPartStr[csp][value]);
93
+ return false;
94
+ }
95
+ }
96
+ if (opt.chroma[csp].pu[value].p2s[NONALIGNED])
97
{
98
- if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s, opt.chroma[csp].pu[value].p2s))
99
+ if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s[NONALIGNED], opt.chroma[csp].pu[value].p2s[NONALIGNED]))
100
{
101
printf("chroma_p2s[%s]", chromaPartStr[csp][value]);
102
return false;
103
104
105
void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
106
{
107
- int16_t srcStride = 96;
108
- int16_t dstStride = 96;
109
+ int16_t srcStride = 192; /* Multiple of 64 */
110
+ int16_t dstStride = 192;
111
int maxVerticalfilterHalfDistance = 3;
112
113
for (int value = 0; value < NUM_PU_SIZES; value++)
114
115
{
116
printf("luma_hpp[%s]\t", lumaPartStr[value]);
117
REPORT_SPEEDUP(opt.pu[value].luma_hpp, ref.pu[value].luma_hpp,
118
- pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
119
+ pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
120
}
121
122
if (opt.pu[value].luma_hps)
123
{
124
printf("luma_hps[%s]\t", lumaPartStr[value]);
125
REPORT_SPEEDUP(opt.pu[value].luma_hps, ref.pu[value].luma_hps,
126
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
127
- IPF_vec_output_s, dstStride, 1, 1);
128
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
129
+ IPF_vec_output_s, dstStride, 1, 1);
130
}
131
132
if (opt.pu[value].luma_vpp)
133
{
134
printf("luma_vpp[%s]\t", lumaPartStr[value]);
135
REPORT_SPEEDUP(opt.pu[value].luma_vpp, ref.pu[value].luma_vpp,
136
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
137
- IPF_vec_output_p, dstStride, 1);
138
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
139
+ IPF_vec_output_p, dstStride, 1);
140
}
141
142
if (opt.pu[value].luma_vps)
143
{
144
printf("luma_vps[%s]\t", lumaPartStr[value]);
145
REPORT_SPEEDUP(opt.pu[value].luma_vps, ref.pu[value].luma_vps,
146
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
147
- IPF_vec_output_s, dstStride, 1);
148
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
149
+ IPF_vec_output_s, dstStride, 1);
150
}
151
152
if (opt.pu[value].luma_vsp)
153
{
154
printf("luma_vsp[%s]\t", lumaPartStr[value]);
155
REPORT_SPEEDUP(opt.pu[value].luma_vsp, ref.pu[value].luma_vsp,
156
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
157
- IPF_vec_output_p, dstStride, 1);
158
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
159
+ IPF_vec_output_p, dstStride, 1);
160
}
161
162
if (opt.pu[value].luma_vss)
163
{
164
printf("luma_vss[%s]\t", lumaPartStr[value]);
165
REPORT_SPEEDUP(opt.pu[value].luma_vss, ref.pu[value].luma_vss,
166
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
167
- IPF_vec_output_s, dstStride, 1);
168
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
169
+ IPF_vec_output_s, dstStride, 1);
170
}
171
172
if (opt.pu[value].luma_hvpp)
173
{
174
printf("luma_hv [%s]\t", lumaPartStr[value]);
175
REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
176
- pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
177
+ pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
178
}
179
180
- if (opt.pu[value].convert_p2s)
181
+ if (opt.pu[value].convert_p2s[NONALIGNED])
182
{
183
printf("convert_p2s[%s]\t", lumaPartStr[value]);
184
- REPORT_SPEEDUP(opt.pu[value].convert_p2s, ref.pu[value].convert_p2s,
185
- pixel_buff, srcStride,
186
- IPF_vec_output_s, dstStride);
187
+ REPORT_SPEEDUP(opt.pu[value].convert_p2s[NONALIGNED], ref.pu[value].convert_p2s[NONALIGNED],
188
+ pixel_buff, srcStride,
189
+ IPF_vec_output_s, dstStride);
190
+ }
191
+
192
+ if (opt.pu[value].convert_p2s[ALIGNED])
193
+ {
194
+ printf("convert_p2s_aligned[%s]\t", lumaPartStr[value]);
195
+ REPORT_SPEEDUP(opt.pu[value].convert_p2s[ALIGNED], ref.pu[value].convert_p2s[ALIGNED],
196
+ pixel_buff, srcStride,
197
+ IPF_vec_output_s, dstStride);
198
}
199
}
200
201
x265_2.7.tar.gz/source/test/ipfilterharness.h -> x265_2.9.tar.gz/source/test/ipfilterharness.h
Changed
35
1
2
enum { TEST_CASES = 3 };
3
enum { SMAX = 1 << 12 };
4
enum { SMIN = (unsigned)-1 << 12 };
5
- ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]);
6
- int16_t short_buff[TEST_BUF_SIZE];
7
- int16_t IPF_vec_output_s[TEST_BUF_SIZE];
8
- int16_t IPF_C_output_s[TEST_BUF_SIZE];
9
- pixel IPF_vec_output_p[TEST_BUF_SIZE];
10
- pixel IPF_C_output_p[TEST_BUF_SIZE];
11
+ ALIGN_VAR_64(pixel, pixel_buff[TEST_BUF_SIZE]);
12
+ ALIGN_VAR_64(int16_t, short_buff[TEST_BUF_SIZE]);
13
+ ALIGN_VAR_64(int16_t, IPF_vec_output_s[TEST_BUF_SIZE]);
14
+ ALIGN_VAR_64(int16_t, IPF_C_output_s[TEST_BUF_SIZE]);
15
+ ALIGN_VAR_64(pixel, IPF_vec_output_p[TEST_BUF_SIZE]);
16
+ ALIGN_VAR_64(pixel, IPF_C_output_p[TEST_BUF_SIZE]);
17
18
- pixel pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
19
- int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
20
+ ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]);
21
+ ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][TEST_BUF_SIZE]);
22
23
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
24
bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
25
26
bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt);
27
bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
28
bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
29
+ bool check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
30
bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
31
+ bool check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
32
33
public:
34
35
x265_2.7.tar.gz/source/test/mbdstharness.cpp -> x265_2.9.tar.gz/source/test/mbdstharness.cpp
Changed
201
1
2
for (int i = 0; i < TEST_BUF_SIZE; i++)
3
{
4
short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
5
+ short_test_buff1[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
6
int_test_buff[0][i] = rand() % PIXEL_MAX;
7
int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
8
short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
9
-
10
short_test_buff[1][i] = -PIXEL_MAX;
11
+ short_test_buff1[1][i] = -PIXEL_MAX;
12
int_test_buff[1][i] = -PIXEL_MAX;
13
int_idct_test_buff[1][i] = SHORT_MIN;
14
short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
15
-
16
short_test_buff[2][i] = PIXEL_MAX;
17
+ short_test_buff1[2][i] = PIXEL_MAX;
18
int_test_buff[2][i] = PIXEL_MAX;
19
int_idct_test_buff[2][i] = SHORT_MAX;
20
short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
21
22
bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
23
{
24
int j = 0;
25
-
26
for (int i = 0; i < ITERS; i++)
27
{
28
- int width = (rand() % 4 + 1) * 4;
29
+ int width = 1 << (rand() % 4 + 2);
30
int height = width;
31
-
32
uint32_t optReturnValue = 0;
33
uint32_t refReturnValue = 0;
34
35
36
reportfail();
37
j += INCR;
38
}
39
+ return true;
40
+}
41
+
42
+bool MBDstHarness::check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt)
43
+{
44
+ int j = 0;
45
+ int trSize[4] = { 16, 64, 256, 1024 };
46
+
47
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
48
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
49
+
50
+ for (int i = 0; i < ITERS; i++)
51
+ {
52
+ int64_t totalRdCostRef = rand();
53
+ int64_t totalUncodedCostRef = rand();
54
+ int64_t totalRdCostOpt = totalRdCostRef;
55
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
56
+
57
+ int index = rand() % 4;
58
+ uint32_t blkPos = trSize[index];
59
+ int cmp_size = 4 * MAX_TU_SIZE;
60
+
61
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
62
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
63
+
64
+ int index1 = rand() % TEST_CASES;
65
+
66
+ ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
67
+ checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
68
+
69
+ if (memcmp(ref_dest, opt_dest, cmp_size))
70
+ return false;
71
+
72
+ if (totalUncodedCostRef != totalUncodedCostOpt)
73
+ return false;
74
+
75
+ if (totalRdCostRef != totalRdCostOpt)
76
+ return false;
77
+
78
+ reportfail();
79
+ j += INCR;
80
+ }
81
+
82
+ return true;
83
+}
84
+bool MBDstHarness::check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt)
85
+{
86
+ int j = 0;
87
+ int trSize[4] = { 16, 64, 256, 1024 };
88
+
89
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
90
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
91
+
92
+ for (int i = 0; i < ITERS; i++)
93
+ {
94
+ int64_t totalRdCostRef = rand();
95
+ int64_t totalUncodedCostRef = rand();
96
+ int64_t totalRdCostOpt = totalRdCostRef;
97
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
98
+ int64_t *psyScale = X265_MALLOC(int64_t, 1);
99
+ *psyScale = rand();
100
+
101
+ int index = rand() % 4;
102
+ uint32_t blkPos = trSize[index];
103
+ int cmp_size = 4 * MAX_TU_SIZE;
104
+
105
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
106
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
107
+
108
+ int index1 = rand() % TEST_CASES;
109
+
110
+ ref(short_test_buff[index1] + j, short_test_buff1[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, psyScale, blkPos);
111
+ checked(opt, short_test_buff[index1] + j, short_test_buff1[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, psyScale, blkPos);
112
+
113
+ X265_FREE(psyScale);
114
+ if (memcmp(ref_dest, opt_dest, cmp_size))
115
+ return false;
116
+
117
+ if (totalUncodedCostRef != totalUncodedCostOpt)
118
+ return false;
119
+
120
+ if (totalRdCostRef != totalRdCostOpt)
121
+ return false;
122
+
123
+ reportfail();
124
+ j += INCR;
125
+ }
126
+
127
+ return true;
128
+}
129
+bool MBDstHarness::check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt)
130
+{
131
+ int j = 0;
132
+ int trSize[4] = { 16, 64, 256, 1024 };
133
+
134
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
135
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
136
+
137
+ for (int i = 0; i < ITERS; i++)
138
+ {
139
+ int64_t totalRdCostRef = rand();
140
+ int64_t totalUncodedCostRef = rand();
141
+ int64_t totalRdCostOpt = totalRdCostRef;
142
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
143
+
144
+ int index = rand() % 4;
145
+ uint32_t blkPos = trSize[index];
146
+ int cmp_size = 4 * MAX_TU_SIZE;
147
+
148
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
149
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
150
+
151
+ int index1 = rand() % TEST_CASES;
152
+
153
+ ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
154
+ checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
155
+
156
+
157
+ if (memcmp(ref_dest, opt_dest, cmp_size))
158
+ return false;
159
+
160
+ if (totalUncodedCostRef != totalUncodedCostOpt)
161
+ return false;
162
+
163
+ if (totalRdCostRef != totalRdCostOpt)
164
+ return false;
165
+
166
+ reportfail();
167
+ j += INCR;
168
+ }
169
170
return true;
171
}
172
173
return false;
174
}
175
}
176
+
177
+ for (int i = 0; i < NUM_TR_SIZE; i++)
178
+ {
179
+ if (opt.cu[i].nonPsyRdoQuant)
180
+ {
181
+ if (!check_nonPsyRdoQuant_primitive(ref.cu[i].nonPsyRdoQuant, opt.cu[i].nonPsyRdoQuant))
182
+ {
183
+ printf("nonPsyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
184
+ return false;
185
+ }
186
+ }
187
+ }
188
+ for (int i = 0; i < NUM_TR_SIZE; i++)
189
+ {
190
+ if (opt.cu[i].psyRdoQuant)
191
+ {
192
+ if (!check_psyRdoQuant_primitive(ref.cu[i].psyRdoQuant, opt.cu[i].psyRdoQuant))
193
+ {
194
+ printf("psyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
195
+ return false;
196
+ }
197
+ }
198
+ }
199
+ for (int i = 0; i < NUM_TR_SIZE; i++)
200
+ {
201
x265_2.7.tar.gz/source/test/mbdstharness.h -> x265_2.9.tar.gz/source/test/mbdstharness.h
Changed
32
1
2
int mintbuf2[MAX_TU_SIZE];
3
int mintbuf3[MAX_TU_SIZE];
4
int mintbuf4[MAX_TU_SIZE];
5
-
6
int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
7
+ int16_t short_test_buff1[TEST_CASES][TEST_BUF_SIZE];
8
int int_test_buff[TEST_CASES][TEST_BUF_SIZE];
9
int int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE];
10
-
11
uint32_t mubuf1[MAX_TU_SIZE];
12
uint32_t mubuf2[MAX_TU_SIZE];
13
uint16_t mushortbuf1[MAX_TU_SIZE];
14
15
int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
16
int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
17
-
18
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
19
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
20
+ bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt);
21
+ bool check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt);
22
bool check_quant_primitive(quant_t ref, quant_t opt);
23
bool check_nquant_primitive(nquant_t ref, nquant_t opt);
24
bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
25
bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
26
bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
27
bool check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt);
28
+ bool check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt);
29
30
public:
31
32
x265_2.7.tar.gz/source/test/pixelharness.cpp -> x265_2.9.tar.gz/source/test/pixelharness.cpp
Changed
201
1
2
return true;
3
}
4
5
+bool PixelHarness::check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt)
6
+{
7
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
8
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
9
+ memset(ref_dest, 0, 64 * 64 * sizeof(int16_t));
10
+ memset(opt_dest, 0, 64 * 64 * sizeof(int16_t));
11
+
12
+ int j = 0;
13
+ intptr_t stride = STRIDE;
14
+ for (int i = 0; i < ITERS; i++)
15
+ {
16
+ int index = i % TEST_CASES;
17
+ checked(opt, pbuf1 + j, pixel_test_buff[index] + j, opt_dest, stride);
18
+ ref(pbuf1 + j, pixel_test_buff[index] + j, ref_dest, stride);
19
+
20
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
21
+ return false;
22
+
23
+ reportfail();
24
+ j += INCR;
25
+ }
26
+
27
+ return true;
28
+}
29
+
30
bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
31
{
32
int j = 0;
33
34
reportfail();
35
j += INCR;
36
}
37
-
38
return true;
39
}
40
+bool PixelHarness::check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
41
+{
42
+ int j = 0;
43
+ for (int i = 0; i < ITERS; i++)
44
+ {
45
+ // NOTE: stride must be multiple of 16, because minimum block is 4x4
46
+ int stride = STRIDE;
47
+ sse_t cres = ref(sbuf1 + j, stride);
48
+ sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
49
+
50
+ if (cres != vres)
51
+ return false;
52
+
53
+ reportfail();
54
+ j += INCR+32;
55
+ }
56
57
+ return true;
58
+}
59
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
60
{
61
ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
62
63
memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
64
memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
65
int j = 0;
66
+ bool enableavx512 = true;
67
int width = 16 * (rand() % 4 + 1);
68
+ int cpuid = X265_NS::cpu_detect(enableavx512);
69
+ if (cpuid & X265_CPU_AVX512)
70
+ width = 32 * (rand() % 2 + 1);
71
int height = 8;
72
int w0 = rand() % 128;
73
int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
74
75
76
return true;
77
}
78
-
79
bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
80
{
81
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
82
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
83
-
84
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
85
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
86
memset(ref_dest, 0xCD, sizeof(ref_dest));
87
memset(opt_dest, 0xCD, sizeof(opt_dest));
88
89
90
91
return true;
92
}
93
+bool PixelHarness::check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
94
+{
95
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
96
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
97
+
98
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
99
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
100
+
101
+ int j = 0;
102
+ intptr_t stride = STRIDE;
103
+ for (int i = 0; i < ITERS; i++)
104
+ {
105
+ int shift = (rand() % 7 + 1);
106
+
107
+ int index = i % TEST_CASES;
108
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
109
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
110
+
111
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
112
+ return false;
113
+
114
+ reportfail();
115
+ j += INCR + 32;
116
+ }
117
+
118
+ return true;
119
+}
120
121
bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
122
{
123
124
125
return true;
126
}
127
-
128
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
129
{
130
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
131
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
132
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
133
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
134
+ int j = 0;
135
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
136
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
137
+
138
+ intptr_t stride = STRIDE;
139
+ for (int i = 0; i < ITERS; i++)
140
+ {
141
+ int index1 = rand() % TEST_CASES;
142
+ int index2 = rand() % TEST_CASES;
143
+ checked(ref, ref_dest, stride, pixel_test_buff[index1] + j,
144
+ stride, pixel_test_buff[index2] + j, stride, 32);
145
+ opt(opt_dest, stride, pixel_test_buff[index1] + j,
146
+ stride, pixel_test_buff[index2] + j, stride, 32);
147
+
148
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
149
+ return false;
150
+
151
+ reportfail();
152
+ j += INCR;
153
+ }
154
+
155
+ return true;
156
+}
157
+bool PixelHarness::check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt)
158
+{
159
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
160
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
161
162
int j = 0;
163
164
165
return false;
166
167
reportfail();
168
- j += INCR;
169
+ j += INCR + 32;
170
}
171
172
return true;
173
174
175
bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt)
176
{
177
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
178
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
179
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
180
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
181
+
182
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
183
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
184
+
185
+ intptr_t stride = 64;
186
+ for (int i = 0; i < ITERS; i++)
187
+ {
188
+ int16_t value = (rand() % SHORT_MAX) + 1;
189
+
190
+ checked(opt, opt_dest, stride, value);
191
+ ref(ref_dest, stride, value);
192
+
193
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
194
+ return false;
195
+
196
+ reportfail();
197
+ }
198
+
199
+ return true;
200
+}
201
x265_2.7.tar.gz/source/test/pixelharness.h -> x265_2.9.tar.gz/source/test/pixelharness.h
Changed
89
1
2
enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
3
enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
4
5
- ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
6
- pixel pbuf2[BUFFSIZE];
7
- pixel pbuf3[BUFFSIZE];
8
- pixel pbuf4[BUFFSIZE];
9
- int ibuf1[BUFFSIZE];
10
- int8_t psbuf1[BUFFSIZE];
11
- int8_t psbuf2[BUFFSIZE];
12
- int8_t psbuf3[BUFFSIZE];
13
- int8_t psbuf4[BUFFSIZE];
14
- int8_t psbuf5[BUFFSIZE];
15
+ ALIGN_VAR_64(pixel, pbuf1[BUFFSIZE]);
16
+ ALIGN_VAR_64(pixel, pbuf2[BUFFSIZE]);
17
+ ALIGN_VAR_64(pixel, pbuf3[BUFFSIZE]);
18
+ ALIGN_VAR_64(pixel, pbuf4[BUFFSIZE]);
19
+ ALIGN_VAR_64(int, ibuf1[BUFFSIZE]);
20
+ ALIGN_VAR_64(int8_t, psbuf1[BUFFSIZE]);
21
+ ALIGN_VAR_64(int8_t, psbuf2[BUFFSIZE]);
22
+ ALIGN_VAR_64(int8_t, psbuf3[BUFFSIZE]);
23
+ ALIGN_VAR_64(int8_t, psbuf4[BUFFSIZE]);
24
+ ALIGN_VAR_64(int8_t, psbuf5[BUFFSIZE]);
25
26
- int16_t sbuf1[BUFFSIZE];
27
- int16_t sbuf2[BUFFSIZE];
28
- int16_t sbuf3[BUFFSIZE];
29
+ ALIGN_VAR_64(int16_t, sbuf1[BUFFSIZE]);
30
+ ALIGN_VAR_64(int16_t, sbuf2[BUFFSIZE]);
31
+ ALIGN_VAR_64(int16_t, sbuf3[BUFFSIZE]);
32
33
- pixel pixel_test_buff[TEST_CASES][BUFFSIZE];
34
- int16_t short_test_buff[TEST_CASES][BUFFSIZE];
35
- int16_t short_test_buff1[TEST_CASES][BUFFSIZE];
36
- int16_t short_test_buff2[TEST_CASES][BUFFSIZE];
37
- int int_test_buff[TEST_CASES][BUFFSIZE];
38
- uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
39
- uint8_t uchar_test_buff[TEST_CASES][BUFFSIZE];
40
- double double_test_buff[TEST_CASES][BUFFSIZE];
41
- int16_t residual_test_buff[TEST_CASES][BUFFSIZE];
42
+ ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][BUFFSIZE]);
43
+ ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][BUFFSIZE]);
44
+ ALIGN_VAR_64(int16_t, short_test_buff1[TEST_CASES][BUFFSIZE]);
45
+ ALIGN_VAR_64(int16_t, short_test_buff2[TEST_CASES][BUFFSIZE]);
46
+ ALIGN_VAR_64(int, int_test_buff[TEST_CASES][BUFFSIZE]);
47
+ ALIGN_VAR_64(uint16_t, ushort_test_buff[TEST_CASES][BUFFSIZE]);
48
+ ALIGN_VAR_64(uint8_t, uchar_test_buff[TEST_CASES][BUFFSIZE]);
49
+ ALIGN_VAR_64(double, double_test_buff[TEST_CASES][BUFFSIZE]);
50
+ ALIGN_VAR_64(int16_t, residual_test_buff[TEST_CASES][BUFFSIZE]);
51
52
bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
53
bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
54
55
bool check_copy_ps(copy_ps_t ref, copy_ps_t opt);
56
bool check_copy_ss(copy_ss_t ref, copy_ss_t opt);
57
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
58
+ bool check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt);
59
bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
60
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
61
+ bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt);
62
bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
63
+ bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
64
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
65
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
66
+ bool check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
67
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
68
+ bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
69
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
70
+ bool check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt);
71
bool check_transpose(transpose_t ref, transpose_t opt);
72
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
73
bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
74
75
bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
76
bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
77
bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
78
+ bool check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
79
bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
80
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
81
bool check_pixel_var(var_t ref, var_t opt);
82
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
83
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
84
bool check_addAvg(addAvg_t, addAvg_t);
85
+ bool check_addAvg_aligned(addAvg_t, addAvg_t);
86
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
87
bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
88
bool check_saoCuOrgE2_t(saoCuOrgE2_t ref[], saoCuOrgE2_t opt[]);
89
x265_2.7.tar.gz/source/test/regression-tests.txt -> x265_2.9.tar.gz/source/test/regression-tests.txt
Changed
63
1
2
BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
3
BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0
4
BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
5
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2
6
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --crf 18 --tskip-fast --limit-tu 2
7
BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
8
Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
9
Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
10
Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190
11
-Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000
12
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --qp 35::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --qp 35
13
Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2
14
Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
15
CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
16
17
KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1
18
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
19
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
20
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000
21
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000
22
News-4k.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000::--preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000
23
News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
24
News-4k.y4m,--preset superfast --slices 4 --aq-mode 0
25
News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
26
-News-4k.y4m,--preset slower --opt-cu-delta-qp
27
News-4k.y4m,--preset veryslow --no-rskip
28
News-4k.y4m,--preset veryslow --pme --crf 40
29
OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
30
31
city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
32
city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
33
city_4cif_60fps.y4m,--preset slower --scaling-list default
34
-city_4cif_60fps.y4m,--preset veryslow --opt-cu-delta-qp
35
city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
36
ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
37
ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
38
39
Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes
40
Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2
41
big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14
42
-BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2
43
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 --vbv-maxrate 5000 --vbv-bufsize 5000
44
45
# Main12 intraCost overflow bug test
46
720p50_parkrun_ter.y4m,--preset medium
47
48
#low-pass dct test
49
720p50_parkrun_ter.y4m,--preset medium --lowpass-dct
50
51
+#scaled save/load test
52
+crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
53
+crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
54
+crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 5 --scale-factor 2 --qp 18
55
+crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
56
+RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat --analysis-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
57
+ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat --refine-intra 4 --refine-inter 2
58
+
59
+#segment encoding
60
+BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
61
+
62
# vim: tw=200
63
x265_2.7.tar.gz/source/test/smoke-tests.txt -> x265_2.9.tar.gz/source/test/smoke-tests.txt
Changed
10
1
2
old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
3
old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode --qg-size 32
4
RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
5
-RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1 --opt-cu-delta-qp
6
+RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
7
CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
8
CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 --tu-inter-depth 2 --limit-tu 3
9
DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
10
x265_2.7.tar.gz/source/test/testbench.cpp -> x265_2.9.tar.gz/source/test/testbench.cpp
Changed
28
1
2
3
int main(int argc, char *argv[])
4
{
5
- int cpuid = X265_NS::cpu_detect();
6
+ bool enableavx512 = true;
7
+ int cpuid = X265_NS::cpu_detect(enableavx512);
8
const char *testname = 0;
9
10
if (!(argc & 1))
11
12
if (!strncmp(name, "cpuid", strlen(name)))
13
{
14
bool bError = false;
15
- cpuid = parseCpuName(value, bError);
16
+ cpuid = parseCpuName(value, bError, enableavx512);
17
if (bError)
18
{
19
printf("Invalid CPU name: %s\n", value);
20
21
{ "XOP", X265_CPU_XOP },
22
{ "AVX2", X265_CPU_AVX2 },
23
{ "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
24
+ { "AVX512", X265_CPU_AVX512 },
25
{ "ARMv6", X265_CPU_ARMV6 },
26
{ "NEON", X265_CPU_NEON },
27
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
28
x265_2.7.tar.gz/source/test/testharness.h -> x265_2.9.tar.gz/source/test/testharness.h
Changed
19
1
2
#include <x86intrin.h>
3
#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
#include <arm_neon.h>
5
-#elif defined(__GNUC__)
6
+#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
7
/* fallback for older GCC/MinGW */
8
static inline uint32_t __rdtsc(void)
9
{
10
11
}
12
#endif // ifdef _MSC_VER
13
14
-#define BENCH_RUNS 1000
15
+#define BENCH_RUNS 2000
16
17
// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
18
// and discards invalid times. Repeats 1000 times to get a good average. Then measures
19
x265_2.7.tar.gz/source/x265.cpp -> x265_2.9.tar.gz/source/x265.cpp
Changed
121
1
2
const char* reconPlayCmd;
3
const x265_api* api;
4
x265_param* param;
5
+ x265_vmaf_data* vmafData;
6
bool bProgress;
7
bool bForceY4m;
8
bool bDither;
9
10
reconPlayCmd = NULL;
11
api = NULL;
12
param = NULL;
13
+ vmafData = NULL;
14
framesToBeEncoded = seek = 0;
15
totalbytes = 0;
16
bProgress = true;
17
18
{
19
int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
20
sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
21
- 100. * frameNum / framesToBeEncoded, frameNum, framesToBeEncoded, fps, bitrate,
22
+ 100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
23
eta / 3600, (eta / 60) % 60, eta % 60);
24
}
25
else
26
27
x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
28
return true;
29
}
30
+#if ENABLE_LIBVMAF
31
+ vmafData = (x265_vmaf_data*)x265_malloc(sizeof(x265_vmaf_data));
32
+ if(!vmafData)
33
+ {
34
+ x265_log(NULL, X265_LOG_ERROR, "vmaf data alloc failed\n");
35
+ return true;
36
+ }
37
+#endif
38
39
if (api->param_default_preset(param, preset, tune) < 0)
40
{
41
42
info.frameCount = 0;
43
getParamAspectRatio(param, info.sarWidth, info.sarHeight);
44
45
+
46
this->input = InputFile::open(info, this->bForceY4m);
47
if (!this->input || this->input->isFail())
48
{
49
50
if (this->framesToBeEncoded == 0 && info.frameCount > (int)seek)
51
this->framesToBeEncoded = info.frameCount - seek;
52
param->totalFrames = this->framesToBeEncoded;
53
-
54
+
55
/* Force CFR until we have support for VFR */
56
info.timebaseNum = param->fpsDenom;
57
info.timebaseDenom = param->fpsNum;
58
59
param->sourceWidth, param->sourceHeight, param->fpsNum, param->fpsDenom,
60
x265_source_csp_names[param->internalCsp]);
61
}
62
+#if ENABLE_LIBVMAF
63
+ if (!reconfn)
64
+ {
65
+ x265_log(param, X265_LOG_ERROR, "recon file must be specified to get VMAF score, try --help for help\n");
66
+ return true;
67
+ }
68
+ const char *str = strrchr(info.filename, '.');
69
70
+ if (!strcmp(str, ".y4m"))
71
+ {
72
+ x265_log(param, X265_LOG_ERROR, "VMAF supports YUV file format only.\n");
73
+ return true;
74
+ }
75
+ if(param->internalCsp == X265_CSP_I420 || param->internalCsp == X265_CSP_I422 || param->internalCsp == X265_CSP_I444)
76
+ {
77
+ vmafData->reference_file = x265_fopen(inputfn, "rb");
78
+ vmafData->distorted_file = x265_fopen(reconfn, "rb");
79
+ }
80
+ else
81
+ {
82
+ x265_log(param, X265_LOG_ERROR, "VMAF will support only yuv420p, yu422p, yu444p, yuv420p10le, yuv422p10le, yuv444p10le formats.\n");
83
+ return true;
84
+ }
85
+#endif
86
this->output = OutputFile::open(outputfn, info);
87
if (this->output->isFail())
88
{
89
90
91
x265_param* param = cliopt.param;
92
const x265_api* api = cliopt.api;
93
-
94
+#if ENABLE_LIBVMAF
95
+ x265_vmaf_data* vmafdata = cliopt.vmafData;
96
+#endif
97
/* This allows muxers to modify bitstream format */
98
cliopt.output->setParam(param);
99
100
101
if (!numEncoded)
102
break;
103
}
104
-
105
+
106
/* clear progress report */
107
if (cliopt.bProgress)
108
fprintf(stderr, "%*s\r", 80, " ");
109
110
111
api->encoder_get_stats(encoder, &stats, sizeof(stats));
112
if (param->csvfn && !b_ctrl_c)
113
+#if ENABLE_LIBVMAF
114
+ api->vmaf_encoder_log(encoder, argc, argv, param, vmafdata);
115
+#else
116
api->encoder_log(encoder, argc, argv);
117
+#endif
118
api->encoder_close(encoder);
119
120
int64_t second_largest_pts = 0;
121
x265_2.7.tar.gz/source/x265.h -> x265_2.9.tar.gz/source/x265.h
Changed
201
1
2
extern "C" {
3
#endif
4
5
+#if _MSC_VER
6
+#pragma warning(disable: 4201) // non-standard extension used (nameless struct/union)
7
+#endif
8
+
9
/* x265_encoder:
10
* opaque handler for encoder */
11
typedef struct x265_encoder x265_encoder;
12
13
int lastMiniGopBFrame;
14
int plannedType[X265_LOOKAHEAD_MAX + 1];
15
int64_t dts;
16
+ int64_t reorderedPts;
17
} x265_lookahead_data;
18
19
+typedef struct x265_analysis_validate
20
+{
21
+ int maxNumReferences;
22
+ int analysisReuseLevel;
23
+ int sourceWidth;
24
+ int sourceHeight;
25
+ int keyframeMax;
26
+ int keyframeMin;
27
+ int openGOP;
28
+ int bframes;
29
+ int bPyramid;
30
+ int maxCUSize;
31
+ int minCUSize;
32
+ int intraRefresh;
33
+ int lookaheadDepth;
34
+ int chunkStart;
35
+ int chunkEnd;
36
+}x265_analysis_validate;
37
+
38
+/* Stores intra analysis data for a single frame. This struct needs better packing */
39
+typedef struct x265_analysis_intra_data
40
+{
41
+ uint8_t* depth;
42
+ uint8_t* modes;
43
+ char* partSizes;
44
+ uint8_t* chromaModes;
45
+}x265_analysis_intra_data;
46
+
47
+typedef struct x265_analysis_MV
48
+{
49
+ union{
50
+ struct { int16_t x, y; };
51
+
52
+ int32_t word;
53
+ };
54
+}x265_analysis_MV;
55
+
56
+/* Stores inter analysis data for a single frame */
57
+typedef struct x265_analysis_inter_data
58
+{
59
+ int32_t* ref;
60
+ uint8_t* depth;
61
+ uint8_t* modes;
62
+ uint8_t* partSize;
63
+ uint8_t* mergeFlag;
64
+ uint8_t* interDir;
65
+ uint8_t* mvpIdx[2];
66
+ int8_t* refIdx[2];
67
+ x265_analysis_MV* mv[2];
68
+ int64_t* sadCost;
69
+}x265_analysis_inter_data;
70
+
71
+typedef struct x265_weight_param
72
+{
73
+ uint32_t log2WeightDenom;
74
+ int inputWeight;
75
+ int inputOffset;
76
+ int wtPresent;
77
+}x265_weight_param;
78
+
79
+#if X265_DEPTH < 10
80
+typedef uint32_t sse_t;
81
+#else
82
+typedef uint64_t sse_t;
83
+#endif
84
+
85
+typedef struct x265_analysis_distortion_data
86
+{
87
+ sse_t* distortion;
88
+ sse_t* ctuDistortion;
89
+ double* scaledDistortion;
90
+ double averageDistortion;
91
+ double sdDistortion;
92
+ uint32_t highDistortionCtuCount;
93
+ uint32_t lowDistortionCtuCount;
94
+ double* offset;
95
+ double* threshold;
96
+}x265_analysis_distortion_data;
97
+
98
/* Stores all analysis data for a single frame */
99
typedef struct x265_analysis_data
100
{
101
- int64_t satdCost;
102
- uint32_t frameRecordSize;
103
- uint32_t poc;
104
- uint32_t sliceType;
105
- uint32_t numCUsInFrame;
106
- uint32_t numPartitions;
107
- uint32_t depthBytes;
108
- int bScenecut;
109
- void* wt;
110
- void* interData;
111
- void* intraData;
112
- uint32_t numCuInHeight;
113
- x265_lookahead_data lookahead;
114
- uint8_t* modeFlag[2];
115
+ int64_t satdCost;
116
+ uint32_t frameRecordSize;
117
+ uint32_t poc;
118
+ uint32_t sliceType;
119
+ uint32_t numCUsInFrame;
120
+ uint32_t numPartitions;
121
+ uint32_t depthBytes;
122
+ int bScenecut;
123
+ x265_weight_param* wt;
124
+ x265_analysis_inter_data* interData;
125
+ x265_analysis_intra_data* intraData;
126
+ uint32_t numCuInHeight;
127
+ x265_lookahead_data lookahead;
128
+ uint8_t* modeFlag[2];
129
+ x265_analysis_validate saveParam;
130
+ x265_analysis_distortion_data* distortionData;
131
} x265_analysis_data;
132
133
/* cu statistics */
134
135
/* All the above values will add up to 100%. */
136
} x265_pu_stats;
137
138
-
139
-typedef struct x265_analysis_2Pass
140
-{
141
- uint32_t poc;
142
- uint32_t frameRecordSize;
143
- void* analysisFramedata;
144
-}x265_analysis_2Pass;
145
-
146
/* Frame level statistics */
147
typedef struct x265_frame_stats
148
{
149
150
x265_cu_stats cuStats;
151
x265_pu_stats puStats;
152
double totalFrameTime;
153
+ double vmafFrameScore;
154
+ double bufferFillFinal;
155
} x265_frame_stats;
156
157
typedef struct x265_ctu_info_t
158
159
REGION_REFRESH_INFO = 134,
160
MASTERING_DISPLAY_INFO = 137,
161
CONTENT_LIGHT_LEVEL_INFO = 144,
162
+ ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
163
} SEIPayloadType;
164
165
typedef struct x265_sei_payload
166
167
168
int height;
169
170
- x265_analysis_2Pass analysis2Pass;
171
+ // pts is reordered in the order of encoding.
172
+ int64_t reorderedPts;
173
} x265_picture;
174
175
typedef enum
176
177
/* CPU flags */
178
179
/* x86 */
180
-#define X265_CPU_CMOV 0x0000001
181
-#define X265_CPU_MMX 0x0000002
182
-#define X265_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
183
+#define X265_CPU_MMX (1 << 0)
184
+#define X265_CPU_MMX2 (1 << 1) /* MMX2 aka MMXEXT aka ISSE */
185
#define X265_CPU_MMXEXT X265_CPU_MMX2
186
-#define X265_CPU_SSE 0x0000008
187
-#define X265_CPU_SSE2 0x0000010
188
-#define X265_CPU_SSE3 0x0000020
189
-#define X265_CPU_SSSE3 0x0000040
190
-#define X265_CPU_SSE4 0x0000080 /* SSE4.1 */
191
-#define X265_CPU_SSE42 0x0000100 /* SSE4.2 */
192
-#define X265_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
193
-#define X265_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
194
-#define X265_CPU_XOP 0x0000800 /* AMD XOP */
195
-#define X265_CPU_FMA4 0x0001000 /* AMD FMA4 */
196
-#define X265_CPU_AVX2 0x0002000 /* AVX2 */
197
-#define X265_CPU_FMA3 0x0004000 /* Intel FMA3 */
198
-#define X265_CPU_BMI1 0x0008000 /* BMI1 */
199
-#define X265_CPU_BMI2 0x0010000 /* BMI2 */
200
+#define X265_CPU_SSE (1 << 2)
201
x265_2.7.tar.gz/source/x265cli.h -> x265_2.9.tar.gz/source/x265cli.h
Changed
104
1
2
{ "vbv-init", required_argument, NULL, 0 },
3
{ "vbv-end", required_argument, NULL, 0 },
4
{ "vbv-end-fr-adj", required_argument, NULL, 0 },
5
+ { "chunk-start", required_argument, NULL, 0 },
6
+ { "chunk-end", required_argument, NULL, 0 },
7
{ "bitrate", required_argument, NULL, 0 },
8
{ "qp", required_argument, NULL, 'q' },
9
{ "aq-mode", required_argument, NULL, 0 },
10
11
{ "scale-factor", required_argument, NULL, 0 },
12
{ "refine-intra", required_argument, NULL, 0 },
13
{ "refine-inter", required_argument, NULL, 0 },
14
+ { "dynamic-refine", no_argument, NULL, 0 },
15
+ { "no-dynamic-refine", no_argument, NULL, 0 },
16
{ "strict-cbr", no_argument, NULL, 0 },
17
{ "temporal-layers", no_argument, NULL, 0 },
18
{ "no-temporal-layers", no_argument, NULL, 0 },
19
20
{ "refine-mv-type", required_argument, NULL, 0 },
21
{ "copy-pic", no_argument, NULL, 0 },
22
{ "no-copy-pic", no_argument, NULL, 0 },
23
+ { "max-ausize-factor", required_argument, NULL, 0 },
24
+ { "idr-recovery-sei", no_argument, NULL, 0 },
25
+ { "no-idr-recovery-sei", no_argument, NULL, 0 },
26
+ { "single-sei", no_argument, NULL, 0 },
27
+ { "no-single-sei", no_argument, NULL, 0 },
28
+ { "atc-sei", required_argument, NULL, 0 },
29
+ { "pic-struct", required_argument, NULL, 0 },
30
+ { "nalu-file", required_argument, NULL, 0 },
31
{ 0, 0, 0, 0 },
32
{ 0, 0, 0, 0 },
33
{ 0, 0, 0, 0 },
34
35
H0(" --dhdr10-info <filename> JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
36
H0(" --[no-]dhdr10-opt Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
37
#endif
38
+ H0(" --nalu-file <filename> Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
39
H0("-f/--frames <integer> Maximum number of frames to encode. Default all\n");
40
H0(" --seek <integer> First frame to encode\n");
41
H1(" --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive\n");
42
43
H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
44
H0(" --[no-]rskip Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
45
H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
46
- H1(" --[no-]splitrd-skip Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
47
+ H1(" --[no-]splitrd-skip Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
48
H1(" --nr-intra <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
49
H1(" --nr-inter <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
50
H0(" --ctu-info <integer> Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
51
52
H0(" --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
53
H0(" --vbv-end <float> Final VBV buffer emptiness (fraction of bufsize or in kbits). Default 0 (disabled)\n");
54
H0(" --vbv-end-fr-adj <float> Frame from which qp has to be adjusted to achieve final decode buffer emptiness. Default 0\n");
55
+ H0(" --chunk-start <integer> First frame of the chunk. Default 0 (disabled)\n");
56
+ H0(" --chunk-end <integer> Last frame of the chunk. Default 0 (disabled)\n");
57
H0(" --pass Multi pass rate control.\n"
58
" - 1 : First pass, creates stats file\n"
59
" - 2 : Last pass, does not overwrite stats file\n"
60
61
H0(" --analysis-reuse-level <1..10> Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Default %d\n", param->analysisReuseLevel);
62
H0(" --refine-mv-type <string> Reuse MV information received through API call. Supported option is avc. Default disabled - %d\n", param->bMVType);
63
H0(" --scale-factor <int> Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
64
- H0(" --refine-intra <0..3> Enable intra refinement for encode that uses analysis-load.\n"
65
+ H0(" --refine-intra <0..4> Enable intra refinement for encode that uses analysis-load.\n"
66
" - 0 : Forces both mode and depth from the save encode.\n"
67
" - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n"
68
" - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n"
69
" - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n"
70
+ " - 4 : Re-evaluate all intra blocks, does not reuse data from save encode.\n"
71
" Default:%d\n", param->intraRefine);
72
H0(" --refine-inter <0..3> Enable inter refinement for encode that uses analysis-load.\n"
73
" - 0 : Forces both mode and depth from the save encode.\n"
74
75
" - 2 : Functionality of (1) + irrespective of size restrict the modes evaluated when specific modes are decided as the best mode by the save encode.\n"
76
" - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
77
" Default:%d\n", param->interRefine);
78
+ H0(" --[no-]dynamic-refine Dynamically changes refine-inter level for each CU. Default %s\n", OPT(param->bDynamicRefine));
79
H0(" --[no-]refine-mv Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine));
80
H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
81
H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
82
83
H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
84
H1(" Blank lines and lines starting with hash(#) are ignored\n");
85
H1(" Comma is considered to be white-space\n");
86
+ H0(" --max-ausize-factor <float> This value controls the maximum AU size defined in specification.\n");
87
+ H0(" It represents the percentage of maximum AU size used. Default %.1f\n", param->maxAUSizeFactor);
88
H0("\nLoop filters (deblock and SAO):\n");
89
H0(" --[no-]deblock Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
90
H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
91
92
H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
93
H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
94
H0(" --[no-]hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
95
+ H0(" --[no-]idr-recovery-sei Emit recovery point infor SEI at each IDR frame \n");
96
H0(" --[no-]temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
97
H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
98
H1(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
99
+ H0(" --atc-sei <integer> Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
100
+ H0(" --pic-struct <integer> Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
101
H0(" --log2-max-poc-lsb <integer> Maximum of the picture order count\n");
102
H0(" --[no-]vui-timing-info Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
103
H0(" --[no-]vui-hrd-info Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
104